In [241]:
import pandas as pd
import numpy as np

df1= pd.read_csv("/kaggle/input/amazon-fine-food-reviews/Reviews.csv")
df1.dropna(inplace=True)

In [242]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 568401 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568401 non-null  int64 
 1   ProductId               568401 non-null  object
 2   UserId                  568401 non-null  object
 3   ProfileName             568401 non-null  object
 4   HelpfulnessNumerator    568401 non-null  int64 
 5   HelpfulnessDenominator  568401 non-null  int64 
 6   Score                   568401 non-null  int64 
 7   Time                    568401 non-null  int64 
 8   Summary                 568401 non-null  object
 9   Text                    568401 non-null  object
dtypes: int64(5), object(5)
memory usage: 47.7+ MB


In [243]:
df=df1[["Summary","Text"]].head(10000)

In [244]:
df.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [245]:
df["Summary"] = df["Summary"].map(lambda x: x.lower())
df["Text"] = df["Text"].map(lambda x: x.lower())

In [246]:
df.isna().sum()

Summary    0
Text       0
dtype: int64

In [247]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords
nltk.download('stopwords')

# Remove special characters
def remove_special_characters(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Remove HTML tags
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Remove stop words
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Summary'] = df['Summary'].apply(remove_special_characters)
df['Text'] = df['Text'].apply(remove_special_characters)

df['Summary'] = df['Summary'].apply(remove_html_tags)
df['Text'] = df['Text'].apply(remove_html_tags)

df['Summary'] = df['Summary'].apply(remove_stopwords)
df['Text'] = df['Text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [248]:
df.head()

Unnamed: 0,Summary,Text
0,good quality dog food,bought several vitality canned dog food produc...
1,advertised,product arrived labeled jumbo salted peanutsth...
2,delight says,confection around centuries light pillowy citr...
3,cough medicine,looking secret ingredient robitussin believe f...
4,great taffy,great taffy great price wide assortment yummy ...


In [249]:
df.to_csv("Reviews_cleaned.csv")
df.reset_index(drop=True, inplace=True)


In [250]:
from sklearn.model_selection import train_test_split
# Step 2: Divide the dataset into training and testing (75:25)
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

In [251]:
train_df.head()

Unnamed: 0,Summary,Text
4901,best decaf ever,far best decaf ive ever tried cant brew large ...
4375,yummmmm,great product hats betty crocker becoming main...
6698,good,drink good especially long hot workout quenche...
9805,liquid heaven,stephens absolutely best hot chocolate ever ta...
1101,sure dont taste like ketchup,loved ketchup flavored potato chips since kid ...


In [252]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [253]:
import torch
from torch.utils.data import Dataset

import pandas as pd

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data.dropna()  # Remove rows with any NaN values
        self.data = self.data[self.data.astype(str).ne('').all(1)]  # Remove rows with any empty string values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = row['Text']
        summary = row['Summary']

        input_encoding = self.tokenizer(input_text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        summary_encoding = self.tokenizer(summary, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': summary_encoding['input_ids'].squeeze(0)  # Shifted labels for language modeling
        }


In [254]:
train_dataset = CustomDataset(train_df, tokenizer)
eval_dataset = CustomDataset(test_df, tokenizer)

In [255]:
print(train_dataset[0])

{'input_ids': tensor([50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 16370,  1266,   875,  1878,   220,   425,
         1683,  3088, 18548,  9059,  1588,  6508,  4634,  9059, 15649,  4634,
        17666,  1949,  7539, 15649,  4939,  2158, 40163, 15649,  7209, 25103,
        18221,   588,  6891, 12922, 18548,  1560, 

In [279]:
%env WANDB_MODE=disabled
from transformers import Trainer, TrainingArguments,DataCollatorForLanguageModeling

def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator


data_collator = load_data_collator(tokenizer)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    save_strategy="no",
    logging_dir=None,
    learning_rate=2e-6
    
)





# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Fine-tune the model
trainer.train()

output_dir = "./saved_model"
trainer.save_model(output_dir)

env: WANDB_MODE=disabled


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,6.2656
1000,6.2359
1500,6.2167
2000,6.2298
2500,6.2429
3000,6.2505
3500,6.2308
4000,6.2397
4500,6.2376
5000,6.2317


In [None]:
!pip install rouge

In [281]:


# Save the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.save_pretrained(output_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json')

In [282]:
tokenizer = GPT2Tokenizer.from_pretrained("/kaggle/working/saved_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model = GPT2LMHeadModel.from_pretrained("/kaggle/working/saved_model")

In [283]:
def generate_summary(input_text, max_length=10):
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=128, truncation=True)
    attention_mask = input_ids.ne(tokenizer.pad_token_id) 
    # Generate summary
    summary_ids = model.generate(input_ids,attention_mask=attention_mask, max_length=len(input_ids[0])+10,length_penalty=0.8, num_beams=4, early_stopping=True)

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
input_text="I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most."
input_text=remove_special_characters(input_text)
input_text=remove_html_tags(input_text)
input_text=remove_stopwords(input_text)
generated_summary = generate_summary(input_text)
print("Generated Summary:", generated_summary)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Summary: bought several Vitality canned dog food products found good quality product looks like stew processed meat smells better Labrador finicky appreciates product better dog food dog food dog food dog food dog food


In [284]:
print("Generated Summary:", generated_summary)

Generated Summary: bought several Vitality canned dog food products found good quality product looks like stew processed meat smells better Labrador finicky appreciates product better dog food dog food dog food dog food dog food


In [285]:
from rouge_score import rouge_scorer

def compute_rouge_scores(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Example usage
reference_summary = "good quality dog food"


scores = compute_rouge_scores(reference_summary, generated_summary)

print("ROUGE-1 Precision:", scores['rouge1'].precision)
print("ROUGE-1 Recall:", scores['rouge1'].recall)
print("ROUGE-1 F1 Score:", scores['rouge1'].fmeasure)

print("ROUGE-L Precision:", scores['rougeL'].precision)
print("ROUGE-L Recall:", scores['rougeL'].recall)
print("ROUGE-L F1 Score:", scores['rougeL'].fmeasure)


ROUGE-1 Precision: 0.12121212121212122
ROUGE-1 Recall: 1.0
ROUGE-1 F1 Score: 0.21621621621621626
ROUGE-L Precision: 0.12121212121212122
ROUGE-L Recall: 1.0
ROUGE-L F1 Score: 0.21621621621621626


In [286]:
test_df["Generated_Summary"] = test_df["Text"].apply(lambda row: generate_summary(row))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [302]:
from rouge import Rouge




def calculate_rcs(data_frame, generated_col, reference_col):
    rouge_scores = []
    for idx, row in data_frame.iterrows():
        generated_summary = str(row[generated_col]).strip() 
        reference_summary = str(row[reference_col]).strip() 
        if generated_summary and reference_summary:  
            scores = rouge.get_scores(generated_summary, reference_summary)
            rouge_scores.append(scores)
    return rouge_scores



rouge = Rouge()

rouge_scores = calculate_rcs(test_df, "Generated_Summary", "Summary")


# for scores in rouge_scores:
#     print(scores)


In [305]:
import pickle

# Save rouge_scores to a pickle file
with open("rouge_scores.pkl", "wb") as f:
    pickle.dump(rouge_scores, f)

In [307]:
import pickle

# Load rouge_scores from the pickle file
with open("rouge_scores.pkl", "rb") as f:
    rouge_scores = pickle.load(f)

# Print loaded_rouge_scores
for scores in rouge_scores:
    print(scores)


[{'rouge-1': {'r': 1.0, 'p': 0.03225806451612903, 'f': 0.06249999939453126}, 'rouge-2': {'r': 1.0, 'p': 0.014492753623188406, 'f': 0.028571428289795923}, 'rouge-l': {'r': 1.0, 'p': 0.03225806451612903, 'f': 0.06249999939453126}}]
[{'rouge-1': {'r': 0.5, 'p': 0.020833333333333332, 'f': 0.039999999232000004}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.5, 'p': 0.020833333333333332, 'f': 0.039999999232000004}}]
[{'rouge-1': {'r': 1.0, 'p': 0.07142857142857142, 'f': 0.1333333320888889}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 1.0, 'p': 0.07142857142857142, 'f': 0.1333333320888889}}]
[{'rouge-1': {'r': 0.6666666666666666, 'p': 0.18181818181818182, 'f': 0.28571428234693885}, 'rouge-2': {'r': 0.5, 'p': 0.08333333333333333, 'f': 0.14285714040816327}, 'rouge-l': {'r': 0.6666666666666666, 'p': 0.18181818181818182, 'f': 0.28571428234693885}}]
[{'rouge-1': {'r': 0.6, 'p': 0.12, 'f': 0.19999999722222223}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'

In [308]:

rouge_scores_sum = {'rouge-1': {'p': 0, 'r': 0, 'f': 0},
                    'rouge-2': {'p': 0, 'r': 0, 'f': 0},
                    'rouge-l': {'p': 0, 'r': 0, 'f': 0}}

num_examples = len(rouge_scores)

# Calculate the sum of ROUGE scores
for scores in rouge_scores:
    for rouge_type in rouge_scores_sum.keys():
        for metric in ['p', 'r', 'f']:
            rouge_scores_sum[rouge_type][metric] += scores[0][rouge_type][metric]

# Calculate average ROUGE scores
avg_rouge_scores = {rouge_type: {metric: rouge_scores_sum[rouge_type][metric] / num_examples
                                  for metric in ['p', 'r', 'f']}
                    for rouge_type in rouge_scores_sum.keys()}

# Print the average ROUGE scores
for rouge_type, metrics in avg_rouge_scores.items():
    print(f" {rouge_type}:")
    print(f"Precision: {metrics['p']:.2f}, Recall: {metrics['r']:.2f}, F1-Score: {metrics['f']:.2f}")


 rouge-1:
Precision: 0.05, Recall: 0.44, F1-Score: 0.08
 rouge-2:
Precision: 0.01, Recall: 0.13, F1-Score: 0.02
 rouge-l:
Precision: 0.04, Recall: 0.41, F1-Score: 0.08


In [309]:
def calculate_average_rouge_scores(rouge_scores):
    # Initialize dictionaries to store cumulative scores and average scores
    rouge_totals = {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0}
    num_scores = len(rouge_scores)

    # Calculate cumulative scores
    for score_dict in rouge_scores:
        for rouge_type in rouge_totals.keys():
            rouge_totals[rouge_type] += score_dict[0][rouge_type]['f']

    # Calculate average scores
    avg_rouge_scores = {rouge_type: rouge_totals[rouge_type] / num_scores for rouge_type in rouge_totals.keys()}

    return avg_rouge_scores


average_rouge_scores = calculate_average_rouge_scores(rouge_scores)

# Print the average ROUGE scores
for rouge_type, score in average_rouge_scores.items():
    print(f"Average {rouge_type} score:", score)


Average rouge-1 score: 0.0826138011637614
Average rouge-2 score: 0.017829062615820387
Average rouge-l score: 0.0774726200968081
