In [3]:
import pandas as pd
import numpy as np
import torch
import re
import random


Loading Dataset

In [4]:
# Load the dataset:
import pandas as pd
df = pd.read_csv('./Reviews.csv')

In [5]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [6]:
df = df.dropna()


In [7]:
df.shape

(568411, 10)

Cleaning and preprocessing

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/najiya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/najiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/najiya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing
    tokens = [word.lower() for word in tokens]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]



    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [10]:
# Combining the two columns review and summary:
df['combined'] = df['Text']  + 'TL;DR' + df['Summary']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined'] = df['Text']  + 'TL;DR' + df['Summary']


In [11]:
# reviews = df.head(50000)
reviews=df
reviews= reviews['combined'].tolist()


Model Training

In [12]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")



Dividing the dataset into training and testing (75:25) 

In [13]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df,test_size=0.25)

In [14]:
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

In [15]:
tokenizer.encode(" TL;DR ")
extra_length = len(tokenizer.encode(" TL;DR "))

Custom Data Class

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPT2ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            # Encode the text using tokenizer.encode(). We add EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos, truncation=True, max_length=self.max_len)

            # Padding/truncating the encoded sequence to max_len
            padded = self.pad_truncate(tokenized)

            # Creating a tensor and adding to the result
            self.result.append(torch.tensor(padded, dtype=torch.long))

    def __len__(self):
        return len(self.result)


    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):

        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            result = name[:self.max_len + 3]+[self.eos_id]
        else:
            result = name
        return result

In [17]:
max_length = 100
reviews_dataset = GPT2ReviewDataset(tokenizer, reviews, max_length)


In [18]:
dataloader = DataLoader(reviews_dataset, batch_size=32, shuffle=True,drop_last=True)

Fine-tuning the GPT-2 model on the review dataset to generate summaries.

In [19]:
def train(model, optimizer, dl, epochs):
    for epoch in range(epochs):
        print("Epoch :",(epoch+1))
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 5000 == 0:
                    # print("loss: %f"%(loss))
                    print(f"loss: {loss}, {idx}")
    torch.save(model.state_dict(), 'model_completeDataset.pth')  # Save the trained model

In [20]:
train(model=model, optimizer=optimizer, dl=dataloader, epochs=5)

Epoch : 1


loss: 6.765002727508545, 0
loss: 2.2353031635284424, 5000
loss: 2.0522336959838867, 10000
loss: 1.942301630973816, 15000
Epoch : 2
loss: 1.8105591535568237, 0
loss: 1.8702399730682373, 5000
loss: 1.4940941333770752, 10000
loss: 1.5958985090255737, 15000
Epoch : 3
loss: 1.5737464427947998, 0
loss: 1.3289415836334229, 5000
loss: 1.408329963684082, 10000
loss: 1.3203850984573364, 15000
Epoch : 4
loss: 1.3660566806793213, 0
loss: 1.6720166206359863, 5000
loss: 1.543911099433899, 10000
loss: 1.6843066215515137, 15000
Epoch : 5
loss: 1.0744960308074951, 0
loss: 1.0217071771621704, 5000
loss: 1.4755713939666748, 10000
loss: 1.2410725355148315, 15000


Model inference

In [30]:
def topk(probs, n=9):
    # The scores are initially softmaxed to convert to probabilities
    probs = torch.softmax(probs, dim= -1)

    # PyTorch has its own topk method, which we use here
    tokensProb, topIx = torch.topk(probs, k=n)

    # The new selection pool (9 choices) is normalized
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Send to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Make a random choice from the pool based on the new prob distribution
    choice = np.random.choice(n, 1, p = tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)

In [32]:
def model_infer(model, tokenizer, review, max_length=15):
    # Preprocess the init token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the init token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0,-1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else: # Append to the sequence
                result.append(res_id)
    # IF no EOS is generated, return after the max_len
    return tokenizer.decode(result)

Evaluation


In [33]:
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")
model.load_state_dict(torch.load('model50000.pth'))  # Load the saved model
# model.eval()  # Set the model to evaluation mode


<All keys matched successfully>

In [34]:
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy as np

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

In [35]:
Given_Review_Text =  "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability."
# Given_Review_Text=input()

In [36]:
Given_Summary = "Good for beginners but has tuning stability issues."
# Given_Summary=input()

In [175]:

generated_summary = model_infer(model, tokenizer, Given_Review_Text + " TL;DR ").split(" TL;DR ")[1].strip()
print("Generated Summary :",generated_summary)


Generated Summary : Great Guitar for Beginner


In [177]:
from rouge import Rouge

# Initialize Rouge
rouge = Rouge()

# Assuming you have generated summaries and actual summaries
generated_summaries = [generated_summary]  # Replace [...] with your generated summaries
actual_summaries = [Given_Summary]     # Replace [...] with your actual summaries

# Compute ROUGE scores
scores = rouge.get_scores(generated_summaries, actual_summaries, avg=False)[0]

# Print ROUGE scores
print("Rouge Scores")
# print(scores)
print(f"ROUGE-1 : Precision:{scores['rouge-1']['p']}, Recall :{scores['rouge-1']['r']}, F1-Score :{scores['rouge-1']['f']}")
print(f"ROUGE-2 : Precision:{scores['rouge-2']['p']}, Recall :{scores['rouge-2']['r']}, F1-Score:{scores['rouge-2']['f']}")
print(f"ROUGE-l : Precision:{scores['rouge-l']['p']}, Recall :{scores['rouge-l']['r']}, F1-Score:{scores['rouge-l']['f']}")


Rouge Scores
ROUGE-1 : Precision:0.25, Recall :0.125, F1-Score :0.16666666222222234
ROUGE-2 : Precision:0.0, Recall :0.0, F1-Score:0.0
ROUGE-l : Precision:0.25, Recall :0.125, F1-Score:0.16666666222222234
