<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Avantika/Implement_a_Transformer_Model_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
#Import the necessary Libraries
import re
import pandas as pd
import torch
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, logging
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
logging.set_verbosity_error()

# 1. Load Train and Test data
train_df = pd.read_csv('https://raw.githubusercontent.com/appliedcode/mthree-c422/refs/heads/main/Exercises/day-10/Data/Train.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/appliedcode/mthree-c422/refs/heads/main/Exercises/day-10/Data/Test.csv')

train_df.head(5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [10]:
print(f"Total rows in dataset = {len(train_df)} \n")                                        # total rows in dataset
print(f"Total negative and positive in dataset {train_df['label'].value_counts()} \n")      # total unqiue values of label

Total rows in dataset = 500 

Total negative and positive in dataset label
0    260
1    240
Name: count, dtype: int64 



In [11]:
# Check for GPU availability
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [12]:
# 2. Clean text
def clean_text(text):
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df["cleaned_text"] = train_df["text"].apply(clean_text)
test_df["cleaned_text"] = test_df["text"].apply(clean_text)


In [13]:
# 3. Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class TweetDataset(Dataset):
    def __init__(self, df):
        self.encodings = tokenizer(df["cleaned_text"].tolist(), truncation=True, padding=True)
        self.labels = df["label"].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = TweetDataset(train_df)
test_dataset = TweetDataset(test_df)

In [14]:
# 4. Load model
num_labels = len(set(train_df["label"]) | set(test_df["label"]))
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# 5. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,    # Increase batch size if memory allows
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

# 6. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# 7. Train
trainer.train()

  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 0.7359, 'grad_norm': 5.21240234375, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.15873015873015872}
{'loss': 0.6711, 'grad_norm': 1.7415807247161865, 'learning_rate': 3.492063492063492e-05, 'epoch': 0.31746031746031744}
{'loss': 0.7127, 'grad_norm': 13.125226020812988, 'learning_rate': 2.6984126984126984e-05, 'epoch': 0.47619047619047616}
{'loss': 0.6669, 'grad_norm': 4.672375679016113, 'learning_rate': 1.9047619047619046e-05, 'epoch': 0.6349206349206349}
{'loss': 0.6237, 'grad_norm': 4.720478534698486, 'learning_rate': 1.1111111111111112e-05, 'epoch': 0.7936507936507936}
{'loss': 0.5832, 'grad_norm': 5.1618499755859375, 'learning_rate': 3.1746031746031746e-06, 'epoch': 0.9523809523809523}
{'train_runtime': 58.068, 'train_samples_per_second': 8.611, 'train_steps_per_second': 1.085, 'train_loss': 0.6604682642316061, 'epoch': 1.0}


TrainOutput(global_step=63, training_loss=0.6604682642316061, metrics={'train_runtime': 58.068, 'train_samples_per_second': 8.611, 'train_steps_per_second': 1.085, 'train_loss': 0.6604682642316061, 'epoch': 1.0})

In [15]:
# 8. Evaluate
predictions = trainer.predict(test_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# 9. Report
print("\nClassification Report:\n")
print(classification_report(test_df["label"], preds))


  return forward_call(*args, **kwargs)



Classification Report:

              precision    recall  f1-score   support

           0       0.78      0.94      0.86        85
           1       0.90      0.66      0.76        65

    accuracy                           0.82       150
   macro avg       0.84      0.80      0.81       150
weighted avg       0.83      0.82      0.81       150



In [16]:
# Convert predictions and actual labels to lists
predicted_labels = preds.tolist()
actual_labels = test_df["label"].tolist()

# Compare in a DataFrame
comparison_df = pd.DataFrame({
    "text": test_df["cleaned_text"].tolist(),
    "actual": actual_labels,
    "predicted": predicted_labels
})

# Print a sample comparison
comparison_df.head(20)


Unnamed: 0,text,actual,predicted
0,I always wrote this series off as being a comp...,0,0
1,1st watched 1272002 3 out of 10DirSteve Purcel...,0,0
2,This movie was so poorly written and directed ...,0,0
3,The most interesting thing about Miryang Secre...,1,1
4,when i first read about berlin am meer i didnt...,0,0
5,I saw this film on September 1st 2005 in India...,1,1
6,I saw a screening of this movie last night I h...,0,0
7,William Hurt may not be an American matinee id...,1,1
8,IT IS A PIECE OF CRAP not funny at all during ...,0,0
9,IM BOUT IT1997br br Developed published by No ...,0,0
