In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm.auto import tqdm

file_path = '/content/drive/MyDrive/IR_Assignment4/Reviews.csv'
df = pd.read_csv(file_path)

In [None]:

import nltk
nltk.download('punkt')
nltk.download('stopwords')

def clean_and_tokenize_text(text):

    processed_text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = word_tokenize(processed_text)
    filtered_words = [word for word in words if word not in stopwords_list]
    return ' '.join(filtered_words)


stopwords_list = set(stopwords.words('english'))

df.dropna(subset=['Text', 'Summary'], inplace=True)

from tqdm import tqdm
tqdm.pandas(desc="Processing 'Text' column")
df['Cleaned_Text'] = df['Text'].progress_apply(clean_and_tokenize_text)

tqdm.pandas(desc="Processing 'Summary' column")
df['Cleaned_Summary'] = df['Summary'].progress_apply(clean_and_tokenize_text)

df.to_csv('/content/drive/MyDrive/IR_Assignment4/Cleaned_Reviews.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Processing 'Text' column: 100%|██████████| 568427/568427 [03:42<00:00, 2559.91it/s]
Processing 'Summary' column: 100%|██████████| 568427/568427 [00:59<00:00, 9497.66it/s] 


In [13]:
pip uninstall torch torchvision torchaudio -y


Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Successfully uninstalled torch-2.2.2
Found existing installation: torchvision 0.17.2
Uninstalling torchvision-0.17.2:
  Successfully uninstalled torchvision-0.17.2
Found existing installation: torchaudio 2.2.2
Uninstalling torchaudio-2.2.2:
  Successfully uninstalled torchaudio-2.2.2


In [14]:
pip cache purge


Files removed: 93


In [15]:
pip install torch torchvision torchaudio


Collecting torch
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading torchvision-0.17.2-cp310-cp310-manylinux1_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.2.2-cp310-cp310-manylinux1_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision, torchaudio
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchtext 0.17.1 requires torch==2.2.1, but you have torch 2.2.2 which is incompatible.[0m[31m
[0mSu

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split

import pandas as pd

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

sample_size = 20000
max_length = 512

data_frame = pd.read_csv('/content/drive/MyDrive/IR_Assignment4/Cleaned_Reviews.csv')
sampled_data = data_frame.sample(n=sample_size, random_state=42)

class TextSummaryDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512, compute_device='cpu'):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length
        self.compute_device = compute_device

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts.iloc[index]['Cleaned_Text'])
        summary = str(self.texts.iloc[index]['Cleaned_Summary'])

        try:
            input_encodings = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
            target_encodings = self.tokenizer(summary, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        except Exception as error:
            print(f"Error while encoding text: {text} or summary: {summary}")
            raise error

        input_encodings = {key: value.squeeze().to(self.compute_device) for key, value in input_encodings.items()}
        input_encodings['labels'] = target_encodings['input_ids'].squeeze().to(self.compute_device)
        return input_encodings


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_data, eval_data = train_test_split(sampled_data, test_size=0.25, random_state=42)

train_dataset = TextSummaryDataset(train_data, tokenizer, max_length, compute_device=device)
eval_dataset = TextSummaryDataset(eval_data, tokenizer, max_length, compute_device=device)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

train_args = TrainingArguments(
    output_dir='./model_results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./training_logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

model.save_pretrained('/content/drive/MyDrive/IR_Assignment4/finetuned_gpt2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,5.669,5.568074
2,5.3782,5.463712


In [3]:
!pip install rouge


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from rouge import Rouge

model_dir = '/content/drive/MyDrive/IR_Assignment4/finetuned_gpt2'


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token

def create_summary(input_text, summary_length=512):
    model.eval()
    compute_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(compute_device)

    encoded_inputs = tokenizer.encode_plus(
        input_text,
        return_tensors='pt',
        max_length=summary_length,
        pad_to_max_length=True,
        truncation=True
    ).to(compute_device)

    input_ids = encoded_inputs['input_ids']
    attention_masks = encoded_inputs['attention_mask']

    while len(input_ids[0]) < summary_length:
        output = model(input_ids, attention_mask=attention_masks)
        predictions = output.logits[:, -1, :]
        next_token = torch.argmax(predictions, dim=-1).unsqueeze(-1)

        if next_token == tokenizer.eos_token_id:
            break

        input_ids = torch.cat([input_ids, next_token], dim=-1)
        new_attention_mask = torch.ones((1, 1), device=compute_device)
        attention_masks = torch.cat([attention_masks, new_attention_mask], dim=-1)

    generated_text = tokenizer.decode(input_ids.squeeze(), skip_special_tokens=True)
    return generated_text

def evaluate_summary(true_summary, pred_summary):
    evaluation = Rouge()
    result = evaluation.get_scores(pred_summary, true_summary, avg=True)
    return result

test_input = "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability."
expected_summary = "Good for beginners but has tuning stability issues."

computed_summary = create_summary(test_input)
print("Computed Summary:", computed_summary)

rouge_results = evaluate_summary(expected_summary, computed_summary)
print("ROUGE Scores:", rouge_results)


Computed Summary: The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability.
ROUGE Scores: {'rouge-1': {'r': 0.75, 'p': 0.17647058823529413, 'f': 0.2857142826303855}, 'rouge-2': {'r': 0.2857142857142857, 'p': 0.05714285714285714, 'f': 0.09523809246031753}, 'rouge-l': {'r': 0.625, 'p': 0.14705882352941177, 'f': 0.2380952350113379}}


