In [5]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from codebleu import calc_codebleu

In [7]:
import ollama
import pandas as pd
from datasets import load_dataset

from datasets import Dataset 
from ragas.metrics import summarization_score
from ragas import evaluate
import import_ipynb


### Word Counter and Generate Summary Functions

In [8]:
def word_counter(text):
    
    counter = len(text.split())
    
    return counter

def generate_summary(text, model):
    
    summary_word_limit = int(0.2*word_counter(text))
    
    prompt =  f"Summarize the given text in maximum {summary_word_limit} words. \
               Extract the most important information. \
               Only output the summary without any additional text."
    
    response = ollama.chat(model=model, messages=[
        {
            'role': 'system',
            'content': prompt
        },
        {
            'role': 'user',
            'content': text,
        },
    ])
    
    summary = response['message']['content']
    
    return summary

#### Loading sujayC66/text_summarization_512_length_1_4000 dataset from Hugging Face

In [9]:
raw_data_set = load_dataset("sujayC66/text_summarization_512_length_1_4000")


#### Data Preprocessing

In [10]:
df=pd.DataFrame(raw_data_set['train'])
df=df.rename(columns={"content":"text","summary":"model_summary"})
train_df = df.drop(columns = [ "__index_level_0__"])
train_df['original_count'] = train_df['text'].apply(word_counter)
train_df['model_count'] = train_df['model_summary'].apply(word_counter)
train_df['pct_model_count'] = 100*(train_df['model_count']/train_df['original_count'])


train_df.head()

Unnamed: 0,text,model_summary,original_count,model_count,pct_model_count
0,"LONDON - Hunting PLC (LSE: HTG), a precision e...",Hunting PLC's 2023 financial performance align...,410,54,13.170732
1,Promoter entity of Sapphire Foods India Arinja...,"Sapphire Foods India's promoter, Arinjaya (Mau...",317,47,14.826498
2,"Gold price climbed Rs 410 to Rs 61,210 per 10 ...","Gold prices rose by Rs 410 to Rs 61,210 per 10...",325,54,16.615385
3,"New Delhi, Jan 11 (IANS) Life Insurance Corp...","LIC received orders for Rs 3,528 crore from In...",218,60,27.522936
4,"“I don’t want to sound alarmist, but it pays t...",MFIs have become the largest providers of micr...,93,29,31.182796


In [11]:
train_df.shape

(3377, 5)

In [12]:
df= train_df[:3200]
df.shape

(3200, 5)

#### LLM Models to be tested

In [15]:
# LLM Models 

llm_models = ["phi3:latest", "deepseek-llm:latest", "mistral:latest", "llama3.1:latest",  "qwen2:latest"]  

#### Generating Summaries for each LLM Model

In [12]:
df= train_df[:3200]

for llm in llm_models: 
    print(llm)
    df[llm + "_GenSummary"] = df["text"].apply(lambda x: generate_summary(x, model=llm))
    df.to_csv(f'test_3200.csv')

phi3:latest


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[llm + "_GenSummary"] = df["text"].apply(lambda x: generate_summary(x, model=llm))


deepseek-llm:latest


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[llm + "_GenSummary"] = df["text"].apply(lambda x: generate_summary(x, model=llm))


mistral:latest


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[llm + "_GenSummary"] = df["text"].apply(lambda x: generate_summary(x, model=llm))


llama3.1:latest


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[llm + "_GenSummary"] = df["text"].apply(lambda x: generate_summary(x, model=llm))


qwen2:latest


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[llm + "_GenSummary"] = df["text"].apply(lambda x: generate_summary(x, model=llm))


In [14]:
df.to_csv(f'test_3200_backup.csv')
df.to_excel(f'test_3200.xlsx')

In [106]:
df.shape

(3200, 15)

#### Get Word Count 

In [13]:
#Get word_count_pct
for llm in llm_models: 
    column_name = llm + "_GenSummary"
    pct_count = llm + "_pct_count"
    df[pct_count] = df[column_name].apply(word_counter)
    df[pct_count] = 100*df[pct_count]/df['original_count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[pct_count] = df[column_name].apply(word_counter)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[pct_count] = 100*df[pct_count]/df['original_count']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[pct_count] = df[column_name].apply(word_counter)
A value is trying to be set on a copy of a sli

In [16]:
word_count_df = pd.DataFrame()
word_count_df['phi'] = df['phi3:latest_pct_count']
word_count_df['deepseek'] = df['deepseek-llm:latest_pct_count']
word_count_df['mistral'] = df['mistral:latest_pct_count']
word_count_df['llama3.1'] = df['llama3.1:latest_pct_count']
word_count_df['qwen2'] = df['qwen2:latest_pct_count']

In [17]:
df.columns

Index(['text', 'model_summary', 'original_count', 'model_count',
       'pct_model_count', 'phi3:latest_GenSummary',
       'deepseek-llm:latest_GenSummary', 'mistral:latest_GenSummary',
       'llama3.1:latest_GenSummary', 'qwen2:latest_GenSummary',
       'phi3:latest_pct_count', 'deepseek-llm:latest_pct_count',
       'mistral:latest_pct_count', 'llama3.1:latest_pct_count',
       'qwen2:latest_pct_count'],
      dtype='object')

### Word Count Comparisons

In [18]:
word_count_df.describe()

Unnamed: 0,phi,deepseek,mistral,llama3.1,qwen2
count,3200.0,3200.0,3200.0,3200.0,3200.0
mean,14.454865,16.566206,29.886796,15.218574,31.795555
std,62.62953,7.269763,9.135821,2.112653,11.828262
min,0.0,2.908277,10.454545,6.98324,0.273224
25%,2.439024,11.519341,23.387378,13.760698,24.091205
50%,7.481654,15.411912,28.312957,15.147873,30.285428
75%,16.666667,20.263646,34.554924,16.568047,37.698943
max,3184.042553,67.307692,74.489796,25.0,168.518519


## Evaluation

In [13]:
import import_ipynb
import Eval_Metrics
import numpy as np
eval_metrics = Eval_Metrics.Evaluation_Metrics()
Eval_df_copy =df.copy(deep=True)

In [None]:

df.to_csv(f'test_3200_FINAL_backup.csv')
df.to_excel(f'test_3200_FINAL.xlsx')

#### Getting ROUGE Scores

In [68]:
## Rouge Scores
AllRouge_Scores = pd.DataFrame()
AllRouge_Scores = pd.DataFrame(columns=['LLM_Model','rouge-1-r','rouge-1-p','rouge-1-f','rouge-2-r','rouge-2-p','rouge-2-f','rouge-l-r','rouge-l-p','rouge-l-f'])

# llm_model = ['phi3:latest','llama3.1:latest']
for llm in llm_models: 
# for llm in llm_model: 
    print(llm)
    GeneratedSummary_col = llm + "_GenSummary"
    ModelSummary_col = 'model_summary'
    
    test_rouge_df = Eval_df_copy[[GeneratedSummary_col,ModelSummary_col]].copy(deep=True)
    test_rouge_df[GeneratedSummary_col] = test_rouge_df[GeneratedSummary_col].replace('', np.nan)
    test_rouge_df = test_rouge_df.dropna(subset=[GeneratedSummary_col])
    test_rouge_df = test_rouge_df.reset_index(drop=True)
    
    rouge_scores = eval_metrics.get_rouge_scores(test_rouge_df[ModelSummary_col],test_rouge_df[GeneratedSummary_col])
    
    print(rouge_scores)
    ref_row = [llm , 
               rouge_scores['rouge-1']['r'] , 
               rouge_scores['rouge-1']['p'], 
               rouge_scores['rouge-1']['f'], 
               rouge_scores['rouge-2']['r'], 
               rouge_scores['rouge-2']['p'], 
               rouge_scores['rouge-2']['f'],
               rouge_scores['rouge-l']['r'], 
               rouge_scores['rouge-l']['p'], 
               rouge_scores['rouge-l']['f']
               ]
    
    AllRouge_Scores.loc[-1] = ref_row
    AllRouge_Scores.index = AllRouge_Scores.index + 1  #shift index
    AllRouge_Scores = AllRouge_Scores.sort_index()  #sort by index
    

phi3:latest
{'rouge-1': {'r': 0.07559622704413912, 'p': 0.09085900441705777, 'f': 0.07202600748898173}, 'rouge-2': {'r': 0.010513045440923866, 'p': 0.009372265398385074, 'f': 0.009198374266425939}, 'rouge-l': {'r': 0.06615216251378056, 'p': 0.08153424272049929, 'f': 0.06327465564639882}}
deepseek-llm:latest
{'rouge-1': {'r': 0.3984040852265913, 'p': 0.4504441678098226, 'f': 0.4019042115689887}, 'rouge-2': {'r': 0.18189963691944783, 'p': 0.21264128206886695, 'f': 0.18479483180486755}, 'rouge-l': {'r': 0.36103406136402055, 'p': 0.40811292894938683, 'f': 0.36398686011750464}}
mistral:latest
{'rouge-1': {'r': 0.5675780646734188, 'p': 0.38540428072344257, 'f': 0.4446109864414285}, 'rouge-2': {'r': 0.2881548902176165, 'p': 0.19232923667430415, 'f': 0.2217566776633234}, 'rouge-l': {'r': 0.5254460893796375, 'p': 0.35623218266650397, 'f': 0.4112549076362883}}
llama3.1:latest
{'rouge-1': {'r': 0.46811666903497645, 'p': 0.5448599500066181, 'f': 0.48796842803339524}, 'rouge-2': {'r': 0.25303461739

In [72]:
AllRouge_Scores

Unnamed: 0,LLM_Model,rouge-1-r,rouge-1-p,rouge-1-f,rouge-2-r,rouge-2-p,rouge-2-f,rouge-l-r,rouge-l-p,rouge-l-f
0,qwen2:latest,0.583885,0.380998,0.439601,0.293958,0.188656,0.217142,0.53632,0.349282,0.403292
1,llama3.1:latest,0.468117,0.54486,0.487968,0.253035,0.311468,0.2693,0.430178,0.500524,0.448395
2,mistral:latest,0.567578,0.385404,0.444611,0.288155,0.192329,0.221757,0.525446,0.356232,0.411255
3,deepseek-llm:latest,0.398404,0.450444,0.401904,0.1819,0.212641,0.184795,0.361034,0.408113,0.363987
4,phi3:latest,0.075596,0.090859,0.072026,0.010513,0.009372,0.009198,0.066152,0.081534,0.063275


#### Getting BERTScore

In [22]:

import numpy as np
from evaluate import load
import statistics
import importlib
importlib.reload(Eval_Metrics)
bertscore = load("bertscore")
eval_metrics = Eval_Metrics.Evaluation_Metrics()

In [None]:
df = pd.read_csv('test_3200_FINAL_backup.csv')
Eval_df_copy =df.copy(deep=True)
Eval_df_copy = Eval_df_copy.drop('Unnamed: 0', axis=1)

In [35]:
## BERTScore Scores
AllBERT_Scores = pd.DataFrame()
AllBERT_Scores = pd.DataFrame(columns=['LLM_Model','Precision Mean','Precision Stddev','Recall Mean','Recall Stddev','F1 Score Mean','F1 Score Stddev','hashcode'])

# llm_model = ['phi3:latest','llama3.1:latest']
for llm in llm_models: 
# for llm in llm_model: 
    print(llm)
    GeneratedSummary_col = llm + "_GenSummary"
    ModelSummary_col = 'model_summary'
    
    # results = bertscore.compute(predictions=predictions, references=references, lang="en")
    
    test_bert_df = Eval_df_copy[[GeneratedSummary_col,ModelSummary_col]].copy(deep=True)
    test_bert_df[GeneratedSummary_col] = test_bert_df[GeneratedSummary_col].replace('', np.nan)
    test_bert_df = test_bert_df.dropna(subset=[GeneratedSummary_col])
    test_bert_df = test_bert_df.reset_index(drop=True)
    
    BERT_Scores = bertscore.compute(predictions=test_bert_df[GeneratedSummary_col], references=test_bert_df[ModelSummary_col], lang="en")
    precision_mean = statistics.mean(BERT_Scores['precision'])
    precision_stdev = statistics.stdev(BERT_Scores['precision'])
    recall_mean = statistics.mean(BERT_Scores['recall'])
    recall_stdev = statistics.stdev(BERT_Scores['recall'])
    f1_mean = statistics.mean(BERT_Scores['f1'])
    f1_stdev = statistics.stdev(BERT_Scores['f1'])
    hash_code = BERT_Scores['hashcode']
    
    print('precision_mean',precision_mean)
    print('precision_stdev',precision_stdev)
    print('recall_mean',recall_mean)
    print('recall_stdev',recall_stdev)
    print('f1_mean',f1_mean)
    print('f1_stdev',f1_stdev)
    ref_row = [llm , 
               precision_mean, 
               precision_stdev, 
               recall_mean, 
               recall_stdev, 
               f1_mean, 
               f1_stdev,
               hash_code
               ]
    
    AllBERT_Scores.loc[-1] = ref_row
    AllBERT_Scores.index = AllBERT_Scores.index + 1  #shift index
    AllBERT_Scores = AllBERT_Scores.sort_index()  #sort by index

phi3:latest




precision_mean 0.7769536617337596
precision_stdev 0.05196536175369054
recall_mean 0.7884865583088865
recall_stdev 0.051000813490244226
f1_mean 0.7823995193490242
f1_stdev 0.04928785532957433
deepseek-llm:latest
precision_mean 0.9065995134972036
precision_stdev 0.029721384020192563
recall_mean 0.8983758135698736
recall_stdev 0.029521019105387965
f1_mean 0.9021323166601359
f1_stdev 0.024136782895646584
mistral:latest
precision_mean 0.8908155338466167
precision_stdev 0.030417561403784703
recall_mean 0.9232952133752406
recall_stdev 0.02261668397260543
f1_mean 0.9065113559551538
f1_stdev 0.022490289846493698
llama3.1:latest
precision_mean 0.9262234579399228
precision_stdev 0.02671300475743821
recall_mean 0.9094036938063801
recall_stdev 0.02875995661510874
f1_mean 0.9174677292071283
f1_stdev 0.023032587254509476
qwen2:latest
precision_mean 0.8904012389853597
precision_stdev 0.03378242280398264
recall_mean 0.9240397435612977
recall_stdev 0.025273881654074626
f1_mean 0.9065637845732272
f1_stde

In [36]:
AllBERT_Scores

Unnamed: 0,LLM_Model,Precision Mean,Precision Stddev,Recall Mean,Recall Stddev,F1 Score Mean,F1 Score Stddev,hashcode
0,qwen2:latest,0.890401,0.033782,0.92404,0.025274,0.906564,0.024592,roberta-large_L17_no-idf_version=0.3.12(hug_tr...
1,llama3.1:latest,0.926223,0.026713,0.909404,0.02876,0.917468,0.023033,roberta-large_L17_no-idf_version=0.3.12(hug_tr...
2,mistral:latest,0.890816,0.030418,0.923295,0.022617,0.906511,0.02249,roberta-large_L17_no-idf_version=0.3.12(hug_tr...
3,deepseek-llm:latest,0.9066,0.029721,0.898376,0.029521,0.902132,0.024137,roberta-large_L17_no-idf_version=0.3.12(hug_tr...
4,phi3:latest,0.776954,0.051965,0.788487,0.051001,0.7824,0.049288,roberta-large_L17_no-idf_version=0.3.12(hug_tr...
