In [2]:
import warnings
warnings.filterwarnings('ignore')


In [3]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk 
nltk.download('punkt')  # Download for sentence tokenization
nltk.download('stopwords')  # Download for stopword removal


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv('Reviews.csv')


In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import string

# Initialize NLTK resources before your functions
nltk.download('punkt')
nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)

def clean_html(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def tokenize_and_remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
    return filtered_tokens

# Define the enhanced preprocessing function
def preprocess_text(text):
    text = text.lower()
    tokens = tokenize_and_remove_stopwords(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word.translate(punctuation_table) for word in tokens]
    tokens = [word for word in tokens if word.strip()]
    return ' '.join(tokens)

# Apply preprocessing to your DataFrame
df['Processed_Summary'] = df['Summary'].apply(clean_html)
df['Processed_Summary'] = df['Processed_Summary'].apply(preprocess_text)  # Enhanced preprocessing

df['Processed_Text'] = df['Text'].apply(clean_html)
df['Processed_Text'] = df['Processed_Text'].apply(preprocess_text)  # Enhanced preprocessing


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df = df.reset_index(drop=True)


In [8]:
from sklearn.model_selection import train_test_split

# Split the data
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)


In [9]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [10]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Now you can use the tokenizer with padding
inputs = tokenizer("Example text", return_tensors="pt", padding=True)

In [11]:
from torch.utils.data import Dataset

class SummaryDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=512):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        encoding = tokenizer(
            text, summary,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs = {
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten(),
            'labels': encoding.input_ids.flatten()  # Using input_ids for labels for simplicity
        }
        return inputs

# Initialize datasets
train_dataset = SummaryDataset(train_df['Processed_Text'], train_df['Processed_Summary'], tokenizer)
test_dataset = SummaryDataset(test_df['Processed_Text'], test_df['Processed_Summary'], tokenizer)


In [None]:
train_df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Processed_Summary,Processed_Text
82,83,B003ZFRKGO,A2VOZX7YBT0D6D,"Johnnycakes ""Johnnycakes""",15,15,5,1325635200,Forget Molecular Gastronomy - this stuff rocke...,I know the product title says Molecular Gastro...,forget molecular gastronomy stuff rockes coffe...,know product title says molecular gastronomy n...
254,255,B0048IC328,A1W867A8DSHFHC,no name,1,3,1,1339804800,Don't Waste Your Money,"I felt energized within five minutes, but it l...",nt waste money,felt energized within five minutes lasted 45 m...
922,923,B000ER6YO0,A3SFS8R1T6JJM9,PolyOWannaCracker,0,0,3,1318032000,Meh. My daughter eats most everything else......,I introduced my daughter to Earth's Best Organ...,meh daughter eats everything else nt thing,introduced daughter earth s best organic turke...
408,409,B001IZM8A6,AVI998S4IX2Y1,"Melissa N. Connor ""rare maven""",1,1,5,1257897600,A Fantastic & Healthy Product,A grande item ! Delicious fish with the ultim...,fantastic healthy product,grande item delicious fish ultimate flavor bes...
653,654,B002BCD2OG,A2ZE5ICI6LWAZ0,"Debra Schiff ""http://hereandthere123.blogspot...",27,27,5,1259193600,"Caramel flavor, excellent for baking and toppi...",If you want to find a good flavor substitute f...,caramel flavor excellent baking toppings tips ...,want find good flavor substitute maple syrup b...
...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,B004K2IHUO,A1ZKFQLHFZAEH9,"S. J. Monson ""world citizen""",2,8,3,1236384000,disappointing,not what I was expecting in terms of the compa...,disappointing,expecting terms company s reputation excellent...
270,271,B000LKZK7C,A2VUB6YYTF234O,Anel Lopez,0,0,4,1302739200,Good product,"I only use raw sugar, it did seem a little sma...",good product,use raw sugar seem little smaller normal cryst...
860,861,B000VKYKTG,A1O9O18AE5HIOL,Sweet Tooth,2,2,3,1283472000,Chocolate was all melt,The chocolate on the stick was all melted and ...,chocolate melt,chocolate stick melted sticks stuck oneanother...
435,436,B000G6RYNE,A15USNEAJUXOSH,L. Schrank,0,0,5,1326067200,Delicious,"I love these chips, I buy the 24 pack once a m...",delicious,love chips buy 24 pack month bags right size s...


In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Start training
trainer.train()


  2%|▏         | 10/567 [01:07<1:03:31,  6.84s/it]

{'loss': 7.4026, 'grad_norm': 216.87892150878906, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.05}


  4%|▎         | 20/567 [02:16<1:02:41,  6.88s/it]

{'loss': 6.5381, 'grad_norm': 208.92771911621094, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.11}


  5%|▌         | 30/567 [03:25<1:02:12,  6.95s/it]

{'loss': 4.9314, 'grad_norm': 198.79556274414062, 'learning_rate': 3e-06, 'epoch': 0.16}


  7%|▋         | 40/567 [04:40<1:06:53,  7.62s/it]

{'loss': 3.2735, 'grad_norm': 178.6471405029297, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.21}


  9%|▉         | 50/567 [05:56<1:04:50,  7.53s/it]

{'loss': 1.6676, 'grad_norm': 38.228736877441406, 'learning_rate': 5e-06, 'epoch': 0.26}


 11%|█         | 60/567 [07:10<1:02:49,  7.43s/it]

{'loss': 0.7403, 'grad_norm': 2.8461127281188965, 'learning_rate': 6e-06, 'epoch': 0.32}


 12%|█▏        | 66/567 [07:55<1:01:55,  7.42s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|█▏        | 70/567 [08:25<1:02:02,  7.49s/it]

{'loss': 0.7677, 'grad_norm': 1.3115177154541016, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.37}


 14%|█▍        | 80/567 [09:40<1:00:29,  7.45s/it]

{'loss': 0.5744, 'grad_norm': 1.3718820810317993, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.42}


 16%|█▌        | 90/567 [10:54<59:15,  7.45s/it]  

{'loss': 0.6119, 'grad_norm': 1.9546984434127808, 'learning_rate': 9e-06, 'epoch': 0.48}


 18%|█▊        | 100/567 [12:09<58:32,  7.52s/it]

{'loss': 0.8257, 'grad_norm': 1.3986252546310425, 'learning_rate': 1e-05, 'epoch': 0.53}


 19%|█▉        | 110/567 [13:24<57:11,  7.51s/it]

{'loss': 0.582, 'grad_norm': 1.954835295677185, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.58}


 21%|██        | 120/567 [14:39<55:54,  7.50s/it]

{'loss': 0.6103, 'grad_norm': 2.5664618015289307, 'learning_rate': 1.2e-05, 'epoch': 0.63}


 23%|██▎       | 130/567 [15:54<54:40,  7.51s/it]

{'loss': 0.6051, 'grad_norm': 1.6952883005142212, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.69}


 25%|██▍       | 140/567 [17:09<53:18,  7.49s/it]

{'loss': 0.6749, 'grad_norm': 2.689178466796875, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.74}


 26%|██▋       | 150/567 [18:24<51:48,  7.46s/it]

{'loss': 0.6956, 'grad_norm': 1.4631489515304565, 'learning_rate': 1.5e-05, 'epoch': 0.79}


 28%|██▊       | 160/567 [19:39<50:49,  7.49s/it]

{'loss': 0.5472, 'grad_norm': 1.3991875648498535, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.85}


 30%|██▉       | 170/567 [20:54<49:21,  7.46s/it]

{'loss': 0.4588, 'grad_norm': 2.1405367851257324, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.9}


 32%|███▏      | 180/567 [22:08<48:07,  7.46s/it]

{'loss': 0.6652, 'grad_norm': 1.7884125709533691, 'learning_rate': 1.8e-05, 'epoch': 0.95}


 34%|███▎      | 190/567 [23:18<40:10,  6.39s/it]

{'loss': 0.6544, 'grad_norm': 1.5784542560577393, 'learning_rate': 1.9e-05, 'epoch': 1.01}


 35%|███▌      | 200/567 [24:33<45:33,  7.45s/it]

{'loss': 0.5802, 'grad_norm': 2.0496230125427246, 'learning_rate': 2e-05, 'epoch': 1.06}


 37%|███▋      | 210/567 [25:48<44:26,  7.47s/it]

{'loss': 0.5246, 'grad_norm': 2.059903144836426, 'learning_rate': 2.1e-05, 'epoch': 1.11}


 39%|███▉      | 220/567 [27:02<43:17,  7.48s/it]

{'loss': 0.5885, 'grad_norm': 1.6818119287490845, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.16}


 41%|████      | 230/567 [28:18<42:06,  7.50s/it]

{'loss': 0.6564, 'grad_norm': 3.4429285526275635, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.22}


 42%|████▏     | 240/567 [29:32<40:24,  7.42s/it]

{'loss': 0.5067, 'grad_norm': 2.1143100261688232, 'learning_rate': 2.4e-05, 'epoch': 1.27}


 44%|████▎     | 247/567 [30:24<39:16,  7.36s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 44%|████▍     | 250/567 [30:46<38:53,  7.36s/it]

{'loss': 0.6579, 'grad_norm': 3.065006971359253, 'learning_rate': 2.5e-05, 'epoch': 1.32}


 46%|████▌     | 260/567 [32:00<37:59,  7.43s/it]

{'loss': 0.4879, 'grad_norm': 1.7014919519424438, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.38}


 48%|████▊     | 270/567 [33:13<36:20,  7.34s/it]

{'loss': 0.6145, 'grad_norm': 2.1341464519500732, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.43}


 49%|████▉     | 280/567 [34:27<35:21,  7.39s/it]

{'loss': 0.5179, 'grad_norm': 2.2580528259277344, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.48}


 51%|█████     | 290/567 [35:41<34:07,  7.39s/it]

{'loss': 0.588, 'grad_norm': 3.230095386505127, 'learning_rate': 2.9e-05, 'epoch': 1.53}


 53%|█████▎    | 300/567 [36:55<32:48,  7.37s/it]

{'loss': 0.63, 'grad_norm': 2.038292646408081, 'learning_rate': 3e-05, 'epoch': 1.59}


 55%|█████▍    | 310/567 [38:09<32:05,  7.49s/it]

{'loss': 0.6926, 'grad_norm': 2.385385513305664, 'learning_rate': 3.1e-05, 'epoch': 1.64}


 56%|█████▋    | 320/567 [39:24<30:30,  7.41s/it]

{'loss': 0.5061, 'grad_norm': 1.7106350660324097, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.69}


 58%|█████▊    | 330/567 [40:38<29:04,  7.36s/it]

{'loss': 0.5186, 'grad_norm': 2.665177345275879, 'learning_rate': 3.3e-05, 'epoch': 1.75}


 60%|█████▉    | 340/567 [41:51<27:58,  7.40s/it]

{'loss': 0.5704, 'grad_norm': 1.9637267589569092, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.8}


 62%|██████▏   | 350/567 [43:05<26:47,  7.41s/it]

{'loss': 0.5326, 'grad_norm': 2.523322820663452, 'learning_rate': 3.5e-05, 'epoch': 1.85}


 63%|██████▎   | 360/567 [44:19<25:30,  7.40s/it]

{'loss': 0.5458, 'grad_norm': 2.019186496734619, 'learning_rate': 3.6e-05, 'epoch': 1.9}


 65%|██████▌   | 370/567 [45:34<24:27,  7.45s/it]

{'loss': 0.5756, 'grad_norm': 1.7187049388885498, 'learning_rate': 3.7e-05, 'epoch': 1.96}


 67%|██████▋   | 380/567 [46:42<20:35,  6.60s/it]

{'loss': 0.599, 'grad_norm': 1.534719705581665, 'learning_rate': 3.8e-05, 'epoch': 2.01}


 69%|██████▉   | 390/567 [47:56<21:41,  7.35s/it]

{'loss': 0.4573, 'grad_norm': 1.4700524806976318, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.06}


 71%|███████   | 400/567 [49:10<20:30,  7.37s/it]

{'loss': 0.5058, 'grad_norm': 2.5243613719940186, 'learning_rate': 4e-05, 'epoch': 2.12}


 72%|███████▏  | 410/567 [50:24<19:19,  7.39s/it]

{'loss': 0.4643, 'grad_norm': 1.8400628566741943, 'learning_rate': 4.1e-05, 'epoch': 2.17}


 74%|███████▍  | 420/567 [51:37<18:07,  7.40s/it]

{'loss': 0.5401, 'grad_norm': 2.220425844192505, 'learning_rate': 4.2e-05, 'epoch': 2.22}


 76%|███████▌  | 430/567 [52:51<16:43,  7.33s/it]

{'loss': 0.5297, 'grad_norm': 2.5896694660186768, 'learning_rate': 4.3e-05, 'epoch': 2.28}


 78%|███████▊  | 440/567 [54:05<15:34,  7.36s/it]

{'loss': 0.4346, 'grad_norm': 1.7705626487731934, 'learning_rate': 4.4000000000000006e-05, 'epoch': 2.33}


 78%|███████▊  | 442/567 [54:19<15:14,  7.31s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 79%|███████▉  | 450/567 [55:18<14:25,  7.39s/it]

{'loss': 0.706, 'grad_norm': 1.4628478288650513, 'learning_rate': 4.5e-05, 'epoch': 2.38}


 81%|████████  | 460/567 [56:31<13:05,  7.34s/it]

{'loss': 0.4366, 'grad_norm': 2.2308361530303955, 'learning_rate': 4.600000000000001e-05, 'epoch': 2.43}


 83%|████████▎ | 470/567 [57:47<12:46,  7.90s/it]

{'loss': 0.783, 'grad_norm': 2.462557315826416, 'learning_rate': 4.7e-05, 'epoch': 2.49}


 85%|████████▍ | 480/567 [59:05<11:14,  7.76s/it]

{'loss': 0.5243, 'grad_norm': 1.8439524173736572, 'learning_rate': 4.8e-05, 'epoch': 2.54}


 86%|████████▋ | 490/567 [1:00:23<09:54,  7.72s/it]

{'loss': 0.4952, 'grad_norm': 2.1726644039154053, 'learning_rate': 4.9e-05, 'epoch': 2.59}


 88%|████████▊ | 500/567 [1:01:37<08:18,  7.44s/it]

{'loss': 0.4964, 'grad_norm': 1.6504364013671875, 'learning_rate': 5e-05, 'epoch': 2.65}


 90%|████████▉ | 510/567 [1:02:54<07:07,  7.51s/it]

{'loss': 0.4111, 'grad_norm': 1.6298922300338745, 'learning_rate': 4.253731343283582e-05, 'epoch': 2.7}


 92%|█████████▏| 520/567 [1:04:08<05:50,  7.45s/it]

{'loss': 0.6735, 'grad_norm': 1.9726948738098145, 'learning_rate': 3.5074626865671645e-05, 'epoch': 2.75}


 93%|█████████▎| 530/567 [1:05:23<04:35,  7.45s/it]

{'loss': 0.4891, 'grad_norm': 2.9114246368408203, 'learning_rate': 2.7611940298507467e-05, 'epoch': 2.8}


 95%|█████████▌| 540/567 [1:06:37<03:21,  7.45s/it]

{'loss': 0.5595, 'grad_norm': 1.8491947650909424, 'learning_rate': 2.0149253731343285e-05, 'epoch': 2.86}


 97%|█████████▋| 550/567 [1:07:52<02:07,  7.52s/it]

{'loss': 0.5761, 'grad_norm': 3.111241340637207, 'learning_rate': 1.2686567164179105e-05, 'epoch': 2.91}


 99%|█████████▉| 560/567 [1:09:06<00:52,  7.44s/it]

{'loss': 0.5064, 'grad_norm': 1.8076826333999634, 'learning_rate': 5.2238805970149255e-06, 'epoch': 2.96}


100%|██████████| 567/567 [1:09:53<00:00,  7.40s/it]

{'train_runtime': 4193.8343, 'train_samples_per_second': 0.539, 'train_steps_per_second': 0.135, 'train_loss': 0.9464302580192606, 'epoch': 3.0}





TrainOutput(global_step=567, training_loss=0.9464302580192606, metrics={'train_runtime': 4193.8343, 'train_samples_per_second': 0.539, 'train_steps_per_second': 0.135, 'train_loss': 0.9464302580192606, 'epoch': 3.0})

In [15]:
!pip install rouge-score



Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
                                              0.0/133.7 kB ? eta -:--:--
     -------------------------------------- 133.7/133.7 kB 7.7 MB/s eta 0:00:00
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=56b8578d9400395b7102fefe4a1b7541aebbacb0db77039c0398b244c4fbca36
  Stored in directory: c:\users\ecesoclab\appdata\local\pip\cache\wheels\5f\dd\89\461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: absl-py, rouge-score
Successfully installed absl-p


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: C:\Users\ecesoclab\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [20]:
from rouge_score import rouge_scorer

# Function to generate summaries
def generate_summary(model, tokenizer, text, max_length=100):
    try:
        input_ids = tokenizer.encode(text, return_tensors='pt')
        if input_ids.size(1) > max_length:
            print(f"Skipping generation for input of length {input_ids.size(1)} as it exceeds max_length of {max_length}.")
            return None

        summary_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None


# Function to compute ROUGE scores
def compute_rouge_scores(model, dataset, tokenizer):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    # Loop through the dataset
    for i in range(len(dataset)):
        text = dataset.texts[i]
        reference_summary = dataset.summaries[i]
        # Generate summary
        generated_summary = generate_summary(model, tokenizer, text)

        # Compute ROUGE scores
        score = scorer.score(reference_summary, generated_summary)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)

    # Calculate average scores
    avg_scores = {
        'rouge1': sum(scores['rouge1']) / len(scores['rouge1']),
        'rouge2': sum(scores['rouge2']) / len(scores['rouge2']),
        'rougeL': sum(scores['rougeL']) / len(scores['rougeL'])
    }

    return avg_scores

# Example of how to call the compute_rouge_scores function
# This assumes you have a trained model and tokenizer ready, and a dataset prepared
avg_rouge_scores = compute_rouge_scores(model, test_dataset, tokenizer)
print("Average ROUGE Scores:", avg_rouge_scores)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Average ROUGE Scores: {'rouge1': 0.07907748833580741, 'rouge2': 0.017950841147095414, 'rougeL': 0.07279729568842672}


In [21]:
from rouge_score import rouge_scorer

def compute_rouge_scores(model, dataset, tokenizer, dataframe):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores_list = []

    # Loop through the dataset
    for i in range(len(dataset)):
        text = dataset.texts[i]
        reference_summary = dataset.summaries[i]

        # Generate summary
        generated_summary = generate_summary(model, tokenizer, text)

        # Compute ROUGE scores
        scores = scorer.score(reference_summary, generated_summary)
        
        # Create a detailed score dictionary
        detailed_scores = {
            'ROUGE-1 Precision': scores['rouge1'].precision,
            'ROUGE-1 Recall': scores['rouge1'].recall,
            'ROUGE-1 F1': scores['rouge1'].fmeasure,
            'ROUGE-2 Precision': scores['rouge2'].precision,
            'ROUGE-2 Recall': scores['rouge2'].recall,
            'ROUGE-2 F1': scores['rouge2'].fmeasure,
            'ROUGE-L Precision': scores['rougeL'].precision,
            'ROUGE-L Recall': scores['rougeL'].recall,
            'ROUGE-L F1': scores['rougeL'].fmeasure
        }

        # Append the score dictionary to the list
        scores_list.append(detailed_scores)

    # Append scores to the dataframe
    score_df = pd.DataFrame(scores_list)
    enhanced_df = pd.concat([dataframe.reset_index(drop=True), score_df], axis=1)
    return enhanced_df

# Call the function and pass the DataFrame
enhanced_df = compute_rouge_scores(model, test_dataset, tokenizer, test_df)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [24]:
# Save to CSV
enhanced_df.to_csv("detailed_rouge_scores_output.csv", index=False)

enhanced_df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,Processed_Text,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 F1,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-2 F1,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L F1
0,927,B000ER6YO0,A2F0WNTW3QQZYS,Ruth,0,0,5,1298505600,One of our favorites,This is one of my son's favorite baby foods. ...,...,one son s favorite baby foods bit sweetness sw...,0.024390,1.000000,0.047619,0.000000,0.000000,0.000000,0.024390,1.000000,0.047619
1,631,B000G6RYNE,A1IVFBJA9KAI1M,Shane Martin,2,3,4,1191369600,Tasty!,The chips come in a large box with individuall...,...,chips come large box individually wrapped bags...,0.022727,1.000000,0.044444,0.000000,0.000000,0.000000,0.022727,1.000000,0.044444
2,683,B000G6MBX2,A1UDFKEKTO70NX,Tiffany,1,1,4,1245110400,tasty,These chips are perfect for snacking with or w...,...,chips perfect snacking without salsa texture b...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,515,B000G6RYNE,A16S0NQ3MWJXFL,ObsidianGT,0,1,5,1291248000,"Delicious, what else did you expect?","Seriously, now. Tasty, tasty, tasty. Fresh, to...",...,seriously tasty tasty tasty fresh problem pack...,0.015152,0.333333,0.028986,0.000000,0.000000,0.000000,0.015152,0.333333,0.028986
4,366,B00437JI8Q,A4TLHOUT1PHTL,Alison,0,0,5,1346198400,Can't find anywhere else!,My fiance loves these dark chocolate and nut g...,...,fiance loves dark chocolate nut granola bars g...,0.045455,0.200000,0.074074,0.000000,0.000000,0.000000,0.045455,0.200000,0.074074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,110,B001REEG6C,AY12DBB0U420B,Gary Peterson,0,0,5,1316390400,My Idea of a Good Diet Food.,I'm presently on a diet and I was at my Fresh ...,...,m presently diet fresh easy neighborhood groce...,0.050000,1.000000,0.095238,0.037975,1.000000,0.073171,0.050000,1.000000,0.095238
248,941,B000ER6YO0,A1ESDLEDR9Y0JX,A. Spencer,1,2,1,1310256000,the garbanzo beans in it give horrible gas,To be fair only one of my twins got gas from t...,...,fair one twins got gas horrible night screamin...,0.064516,0.800000,0.119403,0.016393,0.250000,0.030769,0.032258,0.400000,0.059701
249,78,B004X2KR36,A26M5O53PHZTKN,"Debs ""peanut""",0,0,5,1348185600,Taste great,These taste really good. I have been purchasin...,...,taste really good purchasing different brand s...,0.021739,0.500000,0.041667,0.000000,0.000000,0.000000,0.021739,0.500000,0.041667
250,85,B0019CW0HE,AK2CXHH9VRZ2A,I. GLENN,4,4,3,1313193600,INCREASED MY DOGS ITCHING,"Awesome dog food. However, when given to my ""B...",...,awesome dog food however given boston severe r...,0.088235,1.000000,0.162162,0.000000,0.000000,0.000000,0.058824,0.666667,0.108108
