In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')  # Download for sentence tokenization
nltk.download('stopwords')  # Download for stopword removal


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv('Reviews.csv')


In [4]:
df.dropna(subset=['Summary'], inplace=True)

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import string

# Initialize NLTK resources before your functions
nltk.download('punkt')
nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)

def clean_html(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def tokenize_and_remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
    return filtered_tokens

# Define the enhanced preprocessing function
def preprocess_text(text):
    text = text.lower()
    tokens = tokenize_and_remove_stopwords(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word.translate(punctuation_table) for word in tokens]
    tokens = [word for word in tokens if word.strip()]
    return ' '.join(tokens)

# Apply preprocessing to your DataFrame
df['Processed_Summary'] = df['Summary'].apply(clean_html)
df['Processed_Summary'] = df['Processed_Summary'].apply(preprocess_text)  # Enhanced preprocessing

df['Processed_Text'] = df['Text'].apply(clean_html)
df['Processed_Text'] = df['Processed_Text'].apply(preprocess_text)  # Enhanced preprocessing


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ecesoclab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df = df.reset_index(drop=True)


In [7]:
from sklearn.model_selection import train_test_split

# Split the data
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)


In [8]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [9]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Now you can use the tokenizer with padding
inputs = tokenizer("Example text", return_tensors="pt", padding=True)

In [10]:
from torch.utils.data import Dataset

class SummaryDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=256):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        encoding = tokenizer.encode_plus(
            text, summary,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten(),
            'labels': encoding.input_ids.flatten()
        }



In [11]:
from transformers import TrainingArguments, Trainer

# Adjusted training arguments for CPU
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Reduced for faster testing cycles
    per_device_train_batch_size=2,  # Reduced to ensure it fits into CPU memory
    per_device_eval_batch_size=2,
    warmup_steps=100,  # Reduced warmup steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch to save time during training
    save_strategy="no", 
    load_best_model_at_end=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=SummaryDataset(train_df['Processed_Text'], train_df['Processed_Summary'], tokenizer),
    eval_dataset=SummaryDataset(test_df['Processed_Text'], test_df['Processed_Summary'], tokenizer)
)

# Start training
trainer.train()


  1%|          | 7/1125 [00:12<33:41,  1.81s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  1%|          | 10/1125 [00:18<33:38,  1.81s/it]

{'loss': 6.8677, 'grad_norm': 196.80755615234375, 'learning_rate': 5e-06, 'epoch': 0.01}


  2%|▏         | 20/1125 [00:38<38:15,  2.08s/it]

{'loss': 4.4982, 'grad_norm': 136.36334228515625, 'learning_rate': 1e-05, 'epoch': 0.02}


  2%|▏         | 27/1125 [00:52<38:03,  2.08s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  3%|▎         | 30/1125 [00:59<37:53,  2.08s/it]

{'loss': 2.1354, 'grad_norm': 17.10883903503418, 'learning_rate': 1.5e-05, 'epoch': 0.03}


  4%|▎         | 40/1125 [01:19<37:26,  2.07s/it]

{'loss': 1.3154, 'grad_norm': 5.728706359863281, 'learning_rate': 2e-05, 'epoch': 0.04}


  4%|▍         | 50/1125 [01:40<37:22,  2.09s/it]

{'loss': 1.6443, 'grad_norm': 4.784907341003418, 'learning_rate': 2.5e-05, 'epoch': 0.04}


  5%|▌         | 60/1125 [02:01<36:33,  2.06s/it]

{'loss': 1.3028, 'grad_norm': 2.415741443634033, 'learning_rate': 3e-05, 'epoch': 0.05}


  6%|▌         | 70/1125 [02:21<36:11,  2.06s/it]

{'loss': 1.3933, 'grad_norm': 3.530714988708496, 'learning_rate': 3.5e-05, 'epoch': 0.06}


  7%|▋         | 80/1125 [02:42<35:45,  2.05s/it]

{'loss': 1.3121, 'grad_norm': 3.8117802143096924, 'learning_rate': 4e-05, 'epoch': 0.07}


  8%|▊         | 88/1125 [02:58<35:42,  2.07s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|▊         | 90/1125 [03:03<35:43,  2.07s/it]

{'loss': 1.235, 'grad_norm': 12.521965026855469, 'learning_rate': 4.5e-05, 'epoch': 0.08}


  8%|▊         | 95/1125 [03:13<35:44,  2.08s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▉         | 100/1125 [03:23<35:16,  2.07s/it]

{'loss': 1.1439, 'grad_norm': 3.4287562370300293, 'learning_rate': 5e-05, 'epoch': 0.09}


 10%|▉         | 110/1125 [03:44<34:38,  2.05s/it]

{'loss': 1.0339, 'grad_norm': 7.32097053527832, 'learning_rate': 4.951219512195122e-05, 'epoch': 0.1}


 11%|█         | 120/1125 [04:05<35:25,  2.11s/it]

{'loss': 1.0046, 'grad_norm': 4.141312122344971, 'learning_rate': 4.902439024390244e-05, 'epoch': 0.11}


 12%|█▏        | 130/1125 [04:25<33:36,  2.03s/it]

{'loss': 1.6902, 'grad_norm': 7.125929355621338, 'learning_rate': 4.853658536585366e-05, 'epoch': 0.12}


 12%|█▏        | 140/1125 [04:46<33:17,  2.03s/it]

{'loss': 1.2928, 'grad_norm': 5.795243740081787, 'learning_rate': 4.804878048780488e-05, 'epoch': 0.12}


 13%|█▎        | 150/1125 [05:06<33:05,  2.04s/it]

{'loss': 0.933, 'grad_norm': 4.6902666091918945, 'learning_rate': 4.75609756097561e-05, 'epoch': 0.13}


 14%|█▍        | 160/1125 [05:27<32:41,  2.03s/it]

{'loss': 1.3889, 'grad_norm': 4.8847575187683105, 'learning_rate': 4.707317073170732e-05, 'epoch': 0.14}


 15%|█▌        | 170/1125 [05:47<32:34,  2.05s/it]

{'loss': 1.1076, 'grad_norm': 3.40095853805542, 'learning_rate': 4.658536585365854e-05, 'epoch': 0.15}


 15%|█▌        | 171/1125 [05:49<32:13,  2.03s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█▌        | 173/1125 [05:53<32:15,  2.03s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 180/1125 [06:07<32:12,  2.05s/it]

{'loss': 1.9277, 'grad_norm': 3.4292473793029785, 'learning_rate': 4.609756097560976e-05, 'epoch': 0.16}


 17%|█▋        | 190/1125 [06:28<31:11,  2.00s/it]

{'loss': 1.1662, 'grad_norm': 4.771222114562988, 'learning_rate': 4.560975609756098e-05, 'epoch': 0.17}


 18%|█▊        | 200/1125 [06:48<31:17,  2.03s/it]

{'loss': 1.278, 'grad_norm': 4.037363529205322, 'learning_rate': 4.51219512195122e-05, 'epoch': 0.18}


 19%|█▊        | 210/1125 [07:10<32:00,  2.10s/it]

{'loss': 1.1243, 'grad_norm': 4.940978050231934, 'learning_rate': 4.4634146341463416e-05, 'epoch': 0.19}


 20%|█▉        | 220/1125 [07:33<37:20,  2.48s/it]

{'loss': 1.2882, 'grad_norm': 4.714722156524658, 'learning_rate': 4.414634146341464e-05, 'epoch': 0.2}


 20%|██        | 230/1125 [07:55<31:58,  2.14s/it]

{'loss': 1.254, 'grad_norm': 3.74481201171875, 'learning_rate': 4.3658536585365856e-05, 'epoch': 0.2}


 21%|██▏       | 240/1125 [08:16<31:26,  2.13s/it]

{'loss': 0.9751, 'grad_norm': 3.2940866947174072, 'learning_rate': 4.317073170731707e-05, 'epoch': 0.21}


 22%|██▏       | 250/1125 [08:37<29:54,  2.05s/it]

{'loss': 0.9638, 'grad_norm': 3.063807249069214, 'learning_rate': 4.26829268292683e-05, 'epoch': 0.22}


 23%|██▎       | 260/1125 [08:57<29:14,  2.03s/it]

{'loss': 1.0552, 'grad_norm': 3.3354597091674805, 'learning_rate': 4.2195121951219514e-05, 'epoch': 0.23}


 24%|██▍       | 270/1125 [09:17<28:58,  2.03s/it]

{'loss': 1.2628, 'grad_norm': 5.261321544647217, 'learning_rate': 4.170731707317073e-05, 'epoch': 0.24}


 25%|██▍       | 280/1125 [09:39<30:36,  2.17s/it]

{'loss': 1.2163, 'grad_norm': 2.672294855117798, 'learning_rate': 4.1219512195121954e-05, 'epoch': 0.25}


 26%|██▌       | 290/1125 [09:59<27:58,  2.01s/it]

{'loss': 1.0759, 'grad_norm': 2.0492522716522217, 'learning_rate': 4.073170731707317e-05, 'epoch': 0.26}


 27%|██▋       | 300/1125 [10:19<27:32,  2.00s/it]

{'loss': 1.7524, 'grad_norm': 2.859259843826294, 'learning_rate': 4.0243902439024395e-05, 'epoch': 0.27}


 28%|██▊       | 310/1125 [10:39<27:44,  2.04s/it]

{'loss': 1.4029, 'grad_norm': 3.2344107627868652, 'learning_rate': 3.975609756097561e-05, 'epoch': 0.28}


 28%|██▊       | 320/1125 [10:59<27:00,  2.01s/it]

{'loss': 1.3446, 'grad_norm': 3.5142500400543213, 'learning_rate': 3.9268292682926835e-05, 'epoch': 0.28}


 29%|██▉       | 330/1125 [11:20<26:43,  2.02s/it]

{'loss': 1.018, 'grad_norm': 2.919651985168457, 'learning_rate': 3.878048780487805e-05, 'epoch': 0.29}


 30%|███       | 340/1125 [11:40<26:12,  2.00s/it]

{'loss': 1.2315, 'grad_norm': 1.6393564939498901, 'learning_rate': 3.829268292682927e-05, 'epoch': 0.3}


 31%|███       | 350/1125 [12:00<25:42,  1.99s/it]

{'loss': 1.357, 'grad_norm': 6.476970195770264, 'learning_rate': 3.780487804878049e-05, 'epoch': 0.31}


 32%|███▏      | 360/1125 [12:20<25:29,  2.00s/it]

{'loss': 0.7691, 'grad_norm': 2.7523651123046875, 'learning_rate': 3.731707317073171e-05, 'epoch': 0.32}


 33%|███▎      | 370/1125 [12:40<25:23,  2.02s/it]

{'loss': 1.3274, 'grad_norm': 2.460200786590576, 'learning_rate': 3.682926829268293e-05, 'epoch': 0.33}


 34%|███▍      | 380/1125 [13:01<25:39,  2.07s/it]

{'loss': 1.1868, 'grad_norm': 4.077037334442139, 'learning_rate': 3.634146341463415e-05, 'epoch': 0.34}


 35%|███▍      | 390/1125 [13:21<24:57,  2.04s/it]

{'loss': 1.3867, 'grad_norm': 2.542285919189453, 'learning_rate': 3.585365853658537e-05, 'epoch': 0.35}


 36%|███▌      | 400/1125 [13:41<24:33,  2.03s/it]

{'loss': 1.0533, 'grad_norm': 2.3285491466522217, 'learning_rate': 3.5365853658536584e-05, 'epoch': 0.36}


 36%|███▋      | 410/1125 [14:02<24:14,  2.03s/it]

{'loss': 1.0583, 'grad_norm': 2.585453987121582, 'learning_rate': 3.48780487804878e-05, 'epoch': 0.36}


 37%|███▋      | 420/1125 [14:22<23:47,  2.03s/it]

{'loss': 1.2426, 'grad_norm': 3.832702875137329, 'learning_rate': 3.4390243902439025e-05, 'epoch': 0.37}


 38%|███▊      | 430/1125 [14:42<23:36,  2.04s/it]

{'loss': 1.0407, 'grad_norm': 2.7333967685699463, 'learning_rate': 3.390243902439025e-05, 'epoch': 0.38}


 39%|███▉      | 440/1125 [15:03<23:11,  2.03s/it]

{'loss': 0.9846, 'grad_norm': 1.6761990785598755, 'learning_rate': 3.3414634146341465e-05, 'epoch': 0.39}


 40%|████      | 450/1125 [15:23<22:54,  2.04s/it]

{'loss': 1.2603, 'grad_norm': 3.158294200897217, 'learning_rate': 3.292682926829269e-05, 'epoch': 0.4}


 41%|████      | 460/1125 [15:44<22:48,  2.06s/it]

{'loss': 1.2199, 'grad_norm': 3.9327540397644043, 'learning_rate': 3.2439024390243906e-05, 'epoch': 0.41}


 41%|████      | 464/1125 [15:52<22:51,  2.07s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|████▏     | 470/1125 [16:04<22:00,  2.02s/it]

{'loss': 1.3089, 'grad_norm': 2.277754306793213, 'learning_rate': 3.195121951219512e-05, 'epoch': 0.42}


 43%|████▎     | 480/1125 [16:24<21:29,  2.00s/it]

{'loss': 1.0667, 'grad_norm': 1.97587251663208, 'learning_rate': 3.146341463414634e-05, 'epoch': 0.43}


 44%|████▎     | 490/1125 [16:44<21:19,  2.02s/it]

{'loss': 1.4443, 'grad_norm': 2.6271896362304688, 'learning_rate': 3.0975609756097564e-05, 'epoch': 0.44}


 44%|████▍     | 500/1125 [17:04<20:59,  2.02s/it]

{'loss': 1.2943, 'grad_norm': 2.377643346786499, 'learning_rate': 3.048780487804878e-05, 'epoch': 0.44}


 45%|████▌     | 510/1125 [17:25<20:40,  2.02s/it]

{'loss': 1.0774, 'grad_norm': 1.7367082834243774, 'learning_rate': 3e-05, 'epoch': 0.45}


 46%|████▌     | 520/1125 [17:45<20:19,  2.02s/it]

{'loss': 1.3583, 'grad_norm': 3.54937744140625, 'learning_rate': 2.951219512195122e-05, 'epoch': 0.46}


 46%|████▋     | 523/1125 [17:51<20:14,  2.02s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 47%|████▋     | 530/1125 [18:05<19:54,  2.01s/it]

{'loss': 1.4887, 'grad_norm': 1.7751518487930298, 'learning_rate': 2.9024390243902438e-05, 'epoch': 0.47}


 48%|████▊     | 540/1125 [18:25<19:27,  2.00s/it]

{'loss': 1.2468, 'grad_norm': 1.636317253112793, 'learning_rate': 2.8536585365853658e-05, 'epoch': 0.48}


 49%|████▉     | 550/1125 [18:45<19:22,  2.02s/it]

{'loss': 1.2477, 'grad_norm': 2.303053855895996, 'learning_rate': 2.8048780487804882e-05, 'epoch': 0.49}


 49%|████▉     | 553/1125 [18:51<19:21,  2.03s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 50%|████▉     | 560/1125 [19:05<19:08,  2.03s/it]

{'loss': 1.2663, 'grad_norm': 3.099046230316162, 'learning_rate': 2.7560975609756102e-05, 'epoch': 0.5}


 51%|█████     | 570/1125 [19:25<18:22,  1.99s/it]

{'loss': 1.3094, 'grad_norm': 1.8812451362609863, 'learning_rate': 2.707317073170732e-05, 'epoch': 0.51}


 52%|█████▏    | 580/1125 [19:46<18:14,  2.01s/it]

{'loss': 1.0893, 'grad_norm': 1.9136016368865967, 'learning_rate': 2.658536585365854e-05, 'epoch': 0.52}


 52%|█████▏    | 590/1125 [20:06<17:51,  2.00s/it]

{'loss': 1.3582, 'grad_norm': 4.501379013061523, 'learning_rate': 2.609756097560976e-05, 'epoch': 0.52}


 53%|█████▎    | 600/1125 [20:26<17:39,  2.02s/it]

{'loss': 1.2105, 'grad_norm': 2.318634033203125, 'learning_rate': 2.5609756097560977e-05, 'epoch': 0.53}


 54%|█████▍    | 610/1125 [20:46<17:32,  2.04s/it]

{'loss': 1.3127, 'grad_norm': 3.2392473220825195, 'learning_rate': 2.5121951219512197e-05, 'epoch': 0.54}


 55%|█████▌    | 620/1125 [21:06<16:52,  2.01s/it]

{'loss': 1.4327, 'grad_norm': 2.8960399627685547, 'learning_rate': 2.4634146341463414e-05, 'epoch': 0.55}


 56%|█████▌    | 630/1125 [21:26<16:34,  2.01s/it]

{'loss': 1.2943, 'grad_norm': 3.2989492416381836, 'learning_rate': 2.4146341463414634e-05, 'epoch': 0.56}


 57%|█████▋    | 640/1125 [21:46<16:18,  2.02s/it]

{'loss': 1.0082, 'grad_norm': 2.34958553314209, 'learning_rate': 2.3658536585365854e-05, 'epoch': 0.57}


 57%|█████▋    | 644/1125 [21:55<16:11,  2.02s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 58%|█████▊    | 650/1125 [22:07<15:49,  2.00s/it]

{'loss': 1.2995, 'grad_norm': 2.411652088165283, 'learning_rate': 2.3170731707317075e-05, 'epoch': 0.58}


 59%|█████▊    | 660/1125 [22:27<15:43,  2.03s/it]

{'loss': 1.1866, 'grad_norm': 3.716769218444824, 'learning_rate': 2.2682926829268295e-05, 'epoch': 0.59}


 60%|█████▉    | 670/1125 [22:47<15:05,  1.99s/it]

{'loss': 1.1605, 'grad_norm': 1.9937233924865723, 'learning_rate': 2.2195121951219512e-05, 'epoch': 0.6}


 60%|██████    | 680/1125 [23:07<14:45,  1.99s/it]

{'loss': 1.3951, 'grad_norm': 2.3112142086029053, 'learning_rate': 2.1707317073170732e-05, 'epoch': 0.6}


 61%|██████▏   | 690/1125 [23:27<14:36,  2.01s/it]

{'loss': 0.9537, 'grad_norm': 2.4533627033233643, 'learning_rate': 2.1219512195121953e-05, 'epoch': 0.61}


 62%|██████▏   | 700/1125 [23:47<14:15,  2.01s/it]

{'loss': 1.2821, 'grad_norm': 1.7136433124542236, 'learning_rate': 2.073170731707317e-05, 'epoch': 0.62}


 63%|██████▎   | 710/1125 [24:08<14:11,  2.05s/it]

{'loss': 1.2337, 'grad_norm': 1.9217866659164429, 'learning_rate': 2.0243902439024393e-05, 'epoch': 0.63}


 64%|██████▍   | 720/1125 [24:28<13:39,  2.02s/it]

{'loss': 1.3343, 'grad_norm': 2.8721237182617188, 'learning_rate': 1.975609756097561e-05, 'epoch': 0.64}


 65%|██████▍   | 730/1125 [24:48<13:23,  2.03s/it]

{'loss': 0.9915, 'grad_norm': 2.2739622592926025, 'learning_rate': 1.926829268292683e-05, 'epoch': 0.65}


 66%|██████▌   | 740/1125 [25:09<13:08,  2.05s/it]

{'loss': 1.2056, 'grad_norm': 1.5390937328338623, 'learning_rate': 1.878048780487805e-05, 'epoch': 0.66}


 66%|██████▌   | 741/1125 [25:11<13:05,  2.05s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 67%|██████▋   | 750/1125 [25:29<12:43,  2.03s/it]

{'loss': 1.6852, 'grad_norm': 1.5964077711105347, 'learning_rate': 1.8292682926829268e-05, 'epoch': 0.67}


 68%|██████▊   | 760/1125 [25:50<12:22,  2.03s/it]

{'loss': 1.5548, 'grad_norm': 2.9360086917877197, 'learning_rate': 1.7804878048780488e-05, 'epoch': 0.68}


 68%|██████▊   | 770/1125 [26:10<11:59,  2.03s/it]

{'loss': 1.0757, 'grad_norm': 1.9172416925430298, 'learning_rate': 1.7317073170731708e-05, 'epoch': 0.68}


 69%|██████▉   | 780/1125 [26:30<11:41,  2.03s/it]

{'loss': 0.9047, 'grad_norm': 1.6949537992477417, 'learning_rate': 1.682926829268293e-05, 'epoch': 0.69}


 70%|██████▉   | 786/1125 [26:43<11:36,  2.05s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 70%|███████   | 790/1125 [26:51<11:21,  2.04s/it]

{'loss': 1.1563, 'grad_norm': 2.079102039337158, 'learning_rate': 1.634146341463415e-05, 'epoch': 0.7}


 71%|███████   | 800/1125 [27:11<11:00,  2.03s/it]

{'loss': 1.1104, 'grad_norm': 1.6943649053573608, 'learning_rate': 1.5853658536585366e-05, 'epoch': 0.71}


 72%|███████▏  | 810/1125 [27:31<10:42,  2.04s/it]

{'loss': 1.2847, 'grad_norm': 1.5994163751602173, 'learning_rate': 1.5365853658536586e-05, 'epoch': 0.72}


 73%|███████▎  | 820/1125 [27:52<10:18,  2.03s/it]

{'loss': 1.4465, 'grad_norm': 3.0059804916381836, 'learning_rate': 1.4878048780487805e-05, 'epoch': 0.73}


 74%|███████▍  | 830/1125 [28:12<10:01,  2.04s/it]

{'loss': 1.3301, 'grad_norm': 2.1125380992889404, 'learning_rate': 1.4390243902439023e-05, 'epoch': 0.74}


 75%|███████▍  | 840/1125 [28:33<10:26,  2.20s/it]

{'loss': 1.3243, 'grad_norm': 1.1970617771148682, 'learning_rate': 1.3902439024390245e-05, 'epoch': 0.75}


 76%|███████▌  | 850/1125 [28:55<09:32,  2.08s/it]

{'loss': 1.3733, 'grad_norm': 2.2675747871398926, 'learning_rate': 1.3414634146341466e-05, 'epoch': 0.76}


 76%|███████▋  | 860/1125 [29:15<09:12,  2.09s/it]

{'loss': 0.9928, 'grad_norm': 1.9638851881027222, 'learning_rate': 1.2926829268292684e-05, 'epoch': 0.76}


 77%|███████▋  | 870/1125 [29:36<08:47,  2.07s/it]

{'loss': 1.0847, 'grad_norm': 1.5074142217636108, 'learning_rate': 1.2439024390243903e-05, 'epoch': 0.77}


 78%|███████▊  | 880/1125 [29:57<08:23,  2.06s/it]

{'loss': 1.0751, 'grad_norm': 2.14778208732605, 'learning_rate': 1.1951219512195121e-05, 'epoch': 0.78}


 79%|███████▊  | 884/1125 [30:05<08:11,  2.04s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 79%|███████▉  | 890/1125 [30:17<07:56,  2.03s/it]

{'loss': 1.7583, 'grad_norm': 2.7500343322753906, 'learning_rate': 1.1463414634146343e-05, 'epoch': 0.79}


 80%|████████  | 900/1125 [30:37<07:38,  2.04s/it]

{'loss': 0.9824, 'grad_norm': 2.2663142681121826, 'learning_rate': 1.0975609756097562e-05, 'epoch': 0.8}


 81%|████████  | 910/1125 [30:58<07:17,  2.03s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'loss': 0.8807, 'grad_norm': 2.059798240661621, 'learning_rate': 1.048780487804878e-05, 'epoch': 0.81}


 82%|████████▏ | 920/1125 [31:18<06:53,  2.02s/it]

{'loss': 1.7421, 'grad_norm': 1.6094987392425537, 'learning_rate': 1e-05, 'epoch': 0.82}


 83%|████████▎ | 930/1125 [31:38<06:32,  2.01s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'loss': 0.9684, 'grad_norm': 1.3759864568710327, 'learning_rate': 9.51219512195122e-06, 'epoch': 0.83}


 84%|████████▎ | 940/1125 [31:58<06:15,  2.03s/it]

{'loss': 1.0981, 'grad_norm': 1.8295528888702393, 'learning_rate': 9.02439024390244e-06, 'epoch': 0.84}


 84%|████████▍ | 950/1125 [32:19<05:56,  2.04s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


{'loss': 1.346, 'grad_norm': 2.0438761711120605, 'learning_rate': 8.53658536585366e-06, 'epoch': 0.84}


 85%|████████▌ | 960/1125 [32:39<05:34,  2.03s/it]

{'loss': 1.1233, 'grad_norm': 2.0559582710266113, 'learning_rate': 8.048780487804879e-06, 'epoch': 0.85}


 86%|████████▌ | 970/1125 [33:00<05:26,  2.11s/it]

{'loss': 0.9717, 'grad_norm': 2.768094301223755, 'learning_rate': 7.560975609756098e-06, 'epoch': 0.86}


 87%|████████▋ | 980/1125 [33:20<04:58,  2.06s/it]

{'loss': 1.3346, 'grad_norm': 2.8994603157043457, 'learning_rate': 7.073170731707317e-06, 'epoch': 0.87}


 88%|████████▊ | 990/1125 [33:41<04:34,  2.03s/it]

{'loss': 1.1365, 'grad_norm': 1.582295536994934, 'learning_rate': 6.585365853658537e-06, 'epoch': 0.88}


 89%|████████▉ | 1000/1125 [34:01<04:13,  2.03s/it]

{'loss': 1.2097, 'grad_norm': 1.547122597694397, 'learning_rate': 6.0975609756097564e-06, 'epoch': 0.89}


 90%|████████▉ | 1009/1125 [34:19<03:56,  2.04s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 90%|████████▉ | 1010/1125 [34:21<03:54,  2.04s/it]

{'loss': 0.9477, 'grad_norm': 3.490709066390991, 'learning_rate': 5.609756097560976e-06, 'epoch': 0.9}


 91%|█████████ | 1020/1125 [34:42<03:33,  2.03s/it]

{'loss': 1.1333, 'grad_norm': 1.2116446495056152, 'learning_rate': 5.121951219512195e-06, 'epoch': 0.91}


 92%|█████████▏| 1030/1125 [35:02<03:11,  2.02s/it]

{'loss': 1.1319, 'grad_norm': 1.7570533752441406, 'learning_rate': 4.634146341463415e-06, 'epoch': 0.92}


 92%|█████████▏| 1040/1125 [35:22<02:53,  2.04s/it]

{'loss': 1.3461, 'grad_norm': 1.8326247930526733, 'learning_rate': 4.146341463414634e-06, 'epoch': 0.92}


 93%|█████████▎| 1050/1125 [35:43<02:32,  2.04s/it]

{'loss': 1.0737, 'grad_norm': 1.6635725498199463, 'learning_rate': 3.6585365853658537e-06, 'epoch': 0.93}


 94%|█████████▍| 1060/1125 [36:03<02:13,  2.05s/it]

{'loss': 1.694, 'grad_norm': 2.1294546127319336, 'learning_rate': 3.1707317073170736e-06, 'epoch': 0.94}


 95%|█████████▌| 1070/1125 [36:24<01:52,  2.04s/it]

{'loss': 1.4459, 'grad_norm': 2.8880467414855957, 'learning_rate': 2.6829268292682926e-06, 'epoch': 0.95}


 96%|█████████▌| 1080/1125 [36:44<01:31,  2.04s/it]

{'loss': 1.4014, 'grad_norm': 3.579045295715332, 'learning_rate': 2.195121951219512e-06, 'epoch': 0.96}


 97%|█████████▋| 1090/1125 [37:05<01:11,  2.03s/it]

{'loss': 1.0035, 'grad_norm': 1.3954253196716309, 'learning_rate': 1.707317073170732e-06, 'epoch': 0.97}


 98%|█████████▊| 1100/1125 [37:25<00:50,  2.03s/it]

{'loss': 0.8816, 'grad_norm': 1.2859424352645874, 'learning_rate': 1.2195121951219514e-06, 'epoch': 0.98}


 99%|█████████▊| 1110/1125 [37:45<00:30,  2.03s/it]

{'loss': 1.0029, 'grad_norm': 2.0692379474639893, 'learning_rate': 7.317073170731708e-07, 'epoch': 0.99}


100%|█████████▉| 1120/1125 [38:06<00:10,  2.03s/it]

{'loss': 1.3586, 'grad_norm': 1.4782109260559082, 'learning_rate': 2.439024390243903e-07, 'epoch': 1.0}


100%|██████████| 1125/1125 [38:16<00:00,  2.02s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
                                             

{'eval_loss': 1.1394414901733398, 'eval_runtime': 210.7468, 'eval_samples_per_second': 3.559, 'eval_steps_per_second': 1.779, 'epoch': 1.0}
{'train_runtime': 2506.9478, 'train_samples_per_second': 0.898, 'train_steps_per_second': 0.449, 'train_loss': 1.3194318618774414, 'epoch': 1.0}





TrainOutput(global_step=1125, training_loss=1.3194318618774414, metrics={'train_runtime': 2506.9478, 'train_samples_per_second': 0.898, 'train_steps_per_second': 0.449, 'train_loss': 1.3194318618774414, 'epoch': 1.0})

In [None]:
!pip install rouge-score



In [14]:
from rouge_score import rouge_scorer



# Function to generate summaries
def generate_summary(model, tokenizer, text, max_length=256):
    try:
        input_ids = tokenizer.encode(text, return_tensors='pt')
        if input_ids.size(1) > max_length:
            print(f"Skipping generation for input of length {input_ids.size(1)} as it exceeds max_length of {max_length}.")
            return None

        summary_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
def compute_rouge_scores(model, dataset, tokenizer, dataframe):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores_list = []
    for i in range(len(dataset)):
        text = dataset.texts[i]
        reference_summary = dataset.summaries[i]
        generated_summary = generate_summary(model, tokenizer, text)
        if generated_summary is None:
            scores_list.append({key: None for key in ['ROUGE-1 Precision', 'ROUGE-1 Recall', 'ROUGE-1 F1', 'ROUGE-2 Precision', 'ROUGE-2 Recall', 'ROUGE-2 F1', 'ROUGE-L Precision', 'ROUGE-L Recall', 'ROUGE-L F1']})
            continue
        scores = scorer.score(reference_summary, generated_summary)
        scores_list.append({
            'ROUGE-1 Precision': scores['rouge1'].precision,            
            'ROUGE-1 Recall': scores['rouge1'].recall,
            'ROUGE-1 F1': scores['rouge1'].fmeasure,
            'ROUGE-2 Precision': scores['rouge2'].precision,
            'ROUGE-2 Recall': scores['rouge2'].recall,
            'ROUGE-2 F1': scores['rouge2'].fmeasure,
            'ROUGE-L Precision': scores['rougeL'].precision,
            'ROUGE-L Recall': scores['rougeL'].recall,
            'ROUGE-L F1': scores['rougeL'].fmeasure
        })

    # Append scores to the dataframe
    score_df = pd.DataFrame(scores_list)
    enhanced_df = pd.concat([dataframe.reset_index(drop=True), score_df], axis=1)
    return enhanced_df



# Initialize the datasets
train_dataset = SummaryDataset(train_df['Processed_Text'], train_df['Processed_Summary'], tokenizer)
test_dataset = SummaryDataset(test_df['Processed_Text'], test_df['Processed_Summary'], tokenizer)

# Evaluate ROUGE scores on the test set
final_scores_df = compute_rouge_scores(model, test_dataset, tokenizer, test_df)

# Save the final DataFrame with ROUGE scores to CSV
final_scores_df.to_csv("final_rouge_scores_2nd_Setting.csv", index=False)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Skipping generation for input of length 329 as it exceeds max_length of 256.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Skipping generation for input of length 359 as it exceeds max_length of 256.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Skipping generation for input of length 276 as it exceeds max_length of 256.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Skipping generation for input of length 531 as it exceeds max_length of 256.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [15]:
final_scores_df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,Processed_Text,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-1 F1,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-2 F1,ROUGE-L Precision,ROUGE-L Recall,ROUGE-L F1
0,476269,B000SQLQ0Y,A31RSJTGLVV3TR,T. Wayne,5,8,1,1304812800,Made in China - With CANCER,If you don't mind the inevitable increased can...,...,nt mind inevitable increased cancer risk andor...,0.062500,1.000000,0.117647,0.021277,0.5,0.040816,0.041667,0.666667,0.078431
1,288473,B000ENUC3S,A2QN7FECIWB7D2,"Pm Rodgers ""pmiker""",0,1,5,1312070400,Real Cherry Flavor,While I did not at any time imagine that I was...,...,time imagine eating real piece pie flavor real...,0.078947,1.000000,0.146341,0.027027,0.5,0.051282,0.052632,0.666667,0.097561
2,199206,B0002NV04K,A26EXMDN188M0,Lysan,2,2,4,1299715200,A good buy...,"...money wise, in any case. They're much cheap...",...,money wise case re much cheaper stores area dr...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,229917,B000ZSX4GE,AM9LRYG6YXV83,"P. Verkhovensky ""uebermensch""",5,5,5,1212451200,"Unique product, well executed",The flavor of these nuts is similar to cinnamo...,...,flavor nuts similar cinnamon toast cereal yet ...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,271145,B000GG0BQ6,ABDDWK3DS2S5C,Amy Y. Chung,0,0,5,1308873600,Great Green Tea by Bigelow,This is a wonderful product by Bigelow. It he...,...,wonderful product bigelow helps keep bones str...,0.117647,0.500000,0.190476,0.000000,0.0,0.000000,0.058824,0.250000,0.095238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,558549,B000A11IRI,AL6V1X50WLAMY,Michelle Rama,8,8,5,1194048000,Awesome!,All of Ferdies products are delicious and well...,...,ferdies products delicious well worth price or...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
746,337262,B007OSBE1U,A2OEBM6XH6K7P4,The Glasscocks,2,3,5,1348531200,Great coffee experience,"Great coffee, very bold and smooth. It was ex...",...,great coffee bold smooth excellent price best ...,0.133333,0.666667,0.222222,0.071429,0.5,0.125000,0.133333,0.666667,0.222222
747,192723,B0031UBWLM,AOOIQU9NS7XU5,Jenny Miller,2,2,5,1330387200,Like Crack for Babies,My 6 month old son's favorite flavor. Wheneve...,...,6 month old son s favorite flavor whenever tee...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
748,426034,B000TQEWM2,APHXWZJ22M17M,T. Stratton,0,0,4,1210204800,Keurig Coffee,I like this coffee a lot. Also the Gloria Jea...,...,like coffee lot also gloria jean variety pack ...,0.034483,0.500000,0.064516,0.000000,0.0,0.000000,0.034483,0.500000,0.064516
