**Importing Dependencies and Dataset**

In [2]:
!pip install --upgrade pip

[0m

In [3]:
pip install nltk

[0m

In [4]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3
[0m

In [5]:
import pandas as pd
import nltk
import numpy as np
import os
from transformers import pipeline
from nltk.translate.bleu_score import corpus_bleu
from textstat import flesch_kincaid_grade
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from math import exp
from tqdm import tqdm

nltk.download('punkt')

# from google.colab import files
# # Upload files
# uploaded = files.upload()
# # List uploaded files
# for filename in uploaded.keys():
#     print(f'Uploaded file: {filename}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Lading data and selecting pretranied models**

In [7]:
# Load Data
train_df_ = pd.read_csv("/content/train.csv")
train_df = train_df_.head(150)

# Select Pretrained Models
models = [
    "google-t5/t5-large",
    "pszemraj/led-base-book-summary",
    "hardikJ11/bart-base-finetuned-cnn-news",
    "philschmid/bart-large-cnn-samsum",
    "BigSneed/autotrain-sima-2512277279",
]

**Calculating BLEU Score, Semantic Coherence, Factual Accuracy and Content Coverage to compare the above 5 models**

In [8]:
# Initialize BLEU Scores list
bleu_scores = []

# Semantic Coherence (Example implementation)
def semantic_coherence(generated_summary, dialogue):
    # Your semantic coherence metric calculation logic
    summary_tokens = word_tokenize(generated_summary.lower())
    dialogue_tokens = word_tokenize(dialogue.lower())

    # Calculate the intersection of tokens
    common_tokens = set(summary_tokens) & set(dialogue_tokens)

    # Calculate semantic coherence score based on the ratio of common tokens to summary length
    coherence_score = len(common_tokens) / len(summary_tokens)

    return coherence_score

# Factual Accuracy (Example implementation)
def factual_accuracy(generated_summary, reference_summary):
    # Your factual accuracy metric calculation logic
    gen_tokens = set(word_tokenize(generated_summary.lower()))
    ref_tokens = set(word_tokenize(reference_summary.lower()))

    # Calculate the intersection of tokens
    common_tokens = gen_tokens & ref_tokens

    # Calculate factual accuracy score based on the ratio of common tokens to reference summary length
    accuracy_score = len(common_tokens) / len(ref_tokens) if len(ref_tokens) != 0 else 0
    return accuracy_score

# Content Coverage (Example implementation)
def content_coverage(generated_summary, dialogue):
    # Your content coverage metric calculation logic

    summary_tokens = set(word_tokenize(generated_summary.lower()))
    dialogue_tokens = set(word_tokenize(dialogue.lower()))

    # Calculate the intersection of tokens
    common_tokens = summary_tokens & dialogue_tokens

    # Calculate the content coverage score based on the ratio of common tokens to dialogue length
    coverage_score = len(common_tokens) / len(dialogue_tokens) if len(dialogue_tokens) != 0 else 0
    return coverage_score

# Initialize evaluation results DataFrame
evaluation_results = pd.DataFrame(columns=["Model", "BLEU Score", "Semantic Coherence", "Factual Accuracy", "Content Coverage", "Readability"])

# Initialize empty list to store evaluation results
evaluation_results_list = []



In [9]:
# Initialize evaluation metric accumulators
semantic_coherence_scores = []
factual_accuracy_scores = []
content_coverage_scores = []
readability_scores = []
bleu_scores = []
evaluation_results_list = []

# Apply Models and Evaluate
for model_name in models:
    print(f"Evaluating model: {model_name}")

    # Initialize the summarization pipeline
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)

    # Initialize batch size
    batch_size = 8  # Adjust this based on your available memory

    # Generate summaries in batches
    generated_summaries = []
    semantic_coherence_scores_batch = []
    factual_accuracy_scores_batch = []
    content_coverage_scores_batch = []
    readability_scores_batch = []
    for index in tqdm(range(0, len(train_df), batch_size)):
        batch = train_df.iloc[index:index + batch_size]
        dialogues = batch['dialogue'].tolist()
        summaries = batch['summary'].tolist()

        # Generate summaries
        generated_summaries_batch = summarizer(dialogues, max_length= 60, min_length= 20, do_sample=False)
        generated_summaries.extend([generated_summary["summary_text"] for generated_summary in generated_summaries_batch])

        # Evaluate metrics in batch
        for i, generated_summary in enumerate(generated_summaries_batch):
            coherence_score = semantic_coherence(generated_summary["summary_text"], dialogues[i])
            semantic_coherence_scores_batch.append(coherence_score)

            accuracy_score = factual_accuracy(generated_summary["summary_text"], summaries[i])
            factual_accuracy_scores_batch.append(accuracy_score)

            coverage_score = content_coverage(generated_summary["summary_text"], dialogues[i])
            content_coverage_scores_batch.append(coverage_score)

            readability_score = flesch_kincaid_grade(generated_summary["summary_text"])
            readability_scores_batch.append(readability_score)

    # Calculate BLEU Score
    reference_summaries = train_df["summary"].tolist()
    bleu_score = corpus_bleu([[summary] for summary in reference_summaries], generated_summaries)
    bleu_scores.append(bleu_score)

    # Append results to the evaluation results list
    evaluation_results_list.append({
        "Model": model_name,
        "BLEU Score": bleu_score,
        "Semantic Coherence": sum(semantic_coherence_scores_batch) / len(semantic_coherence_scores_batch),
        "Factual Accuracy": sum(factual_accuracy_scores_batch) / len(factual_accuracy_scores_batch),
        "Content Coverage": sum(content_coverage_scores_batch) / len(content_coverage_scores_batch),
        "Readability": sum(readability_scores_batch) / len(readability_scores_batch),
    })

    # Print a separator for clarity
    print("=" * 50)


Evaluating model: google-t5/t5-large


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

100%|██████████| 19/19 [50:44<00:00, 160.26s/it]


Evaluating model: pszemraj/led-base-book-summary


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

100%|██████████| 19/19 [16:41<00:00, 52.72s/it]


Evaluating model: hardikJ11/bart-base-finetuned-cnn-news


config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

100%|██████████| 19/19 [09:58<00:00, 31.50s/it]


Evaluating model: philschmid/bart-large-cnn-samsum


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

100%|██████████| 19/19 [22:21<00:00, 70.62s/it]


Evaluating model: BigSneed/autotrain-sima-2512277279


config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

 37%|███▋      | 7/19 [06:10<10:45, 53.81s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 19/19 [16:47<00:00, 53.04s/it]






In [10]:
# Concatenate the evaluation results list into a DataFrame
evaluation_results = pd.concat([pd.DataFrame(item, index=[0]) for item in evaluation_results_list], ignore_index=True)

# Compare Results
print("BLEU Scores:", bleu_scores)

BLEU Scores: [0.3521851333075347, 0.31491796092967095, 0.3343905857837462, 0.439666132253334, 0.3548851115348075]


**Saving results to csv, which will be used as input for calculating topsis score for each model**


In [11]:
# Save evaluation results to a CSV file
evaluation_results.to_csv("topsis_input.csv", index=False)

In [12]:
evaluation_results.head()

Unnamed: 0,Model,BLEU Score,Semantic Coherence,Factual Accuracy,Content Coverage,Readability
0,google-t5/t5-large,0.352185,0.768574,0.447016,0.391597,3.856667
1,pszemraj/led-base-book-summary,0.314918,0.688963,0.544512,0.482278,3.578667
2,hardikJ11/bart-base-finetuned-cnn-news,0.334391,0.710904,0.390311,0.342793,3.858
3,philschmid/bart-large-cnn-samsum,0.439666,0.63289,0.539266,0.288917,5.119333
4,BigSneed/autotrain-sima-2512277279,0.354885,0.771122,0.42866,0.382961,3.981333
