In [1]:
# Install the third-party libraries this notebook relies on.
# `transformers` and `datasets` provide the model/tokenizer and dataset wrappers,
# `scikit-learn` provides the train/validation split and metrics helpers.
# NOTE(review): no versions are pinned here, so a re-run may pull newer releases
# than the ones that produced the recorded outputs — consider pinning.
!pip install transformers datasets scikit-learn --quiet
# `evaluate` is installed separately; nothing in the visible cells imports it.
!pip install evaluate --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Check whether PyTorch can see a CUDA device before starting the fine-tuning run.
import torch
# Left as the last expression of the cell so the notebook displays the boolean.
torch.cuda.is_available()

True

In [3]:
# 📚 Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [4]:
# 📚 Load the Amazon Fine Food Reviews dataset
import pandas as pd

print("🔵 Loading dataset...")

# Only the review body and the star rating are needed downstream, so ask the
# CSV parser for just those two columns instead of loading every column and
# discarding the rest afterwards. The explicit `.loc` selection pins the
# column order to (Text, Score) regardless of their order in the file, and
# `rename` switches to the lowercase names the later cells use.
df = (
    pd.read_csv('/content/amazon_reviews.csv', usecols=['Text', 'Score'])
    .loc[:, ['Text', 'Score']]
    .rename(columns={'Text': 'text', 'Score': 'score'})
)

# 🔥 Map Sentiment (0: Negative, 1: Neutral, 2: Positive)
def map_sentiment(score):
    """Convert a review score into a sentiment class id.

    Args:
        score: Numeric review score.

    Returns:
        0 (negative) when ``score <= 2``, 1 (neutral) when ``score == 3``,
        and 2 (positive) for every other value.
    """
    # The two explicit conditions are mutually exclusive, so checking the
    # neutral case first does not change which branch any value takes.
    if score == 3:
        return 1  # Neutral
    return 0 if score <= 2 else 2  # Negative, otherwise Positive

# Derive the integer class label for every review from its score.
df['label'] = df['score'].map(map_sentiment)

# Report the dataset size, then show the first rows as the cell output.
print(f"✅ Dataset loaded! Total samples: {len(df)}")
df.head()


🔵 Loading dataset...
✅ Dataset loaded! Total samples: 568454


Unnamed: 0,text,score,label
0,I have bought several of the Vitality canned d...,5,2
1,Product arrived labeled as Jumbo Salted Peanut...,1,0
2,This is a confection that has been around a fe...,4,2
3,If you are looking for the secret ingredient i...,2,0
4,Great taffy at a great price. There was a wid...,5,2


In [5]:
# No sub-sampling is applied: every row of `df` is used for fine-tuning.
# This cell only reports the row count and previews the frame; it does not
# modify `df`.
print(f"✅ Using full dataset with {len(df)} reviews for fine-tuning")
df.head()


✅ Using full dataset with 568454 reviews for fine-tuning


Unnamed: 0,text,score,label
0,I have bought several of the Vitality canned d...,5,2
1,Product arrived labeled as Jumbo Salted Peanut...,1,0
2,This is a confection that has been around a fe...,4,2
3,If you are looking for the secret ingredient i...,2,0
4,Great taffy at a great price. There was a wid...,5,2


In [6]:
# 🔥 Load RoBERTa tokenizer
from transformers import AutoTokenizer

print("🛠️ Loading RoBERTa tokenizer...")
# Load the tokenizer published with the checkpoint that is fine-tuned below;
# the same repository id is passed to the model loader in a later cell.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
print("✅ Loaded RoBERTa tokenizer...")


🛠️ Loading RoBERTa tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

✅ Loaded RoBERTa tokenizer...


In [7]:
# 🧹 Split dataset into train and validation
from sklearn.model_selection import train_test_split

print("🛠️ Splitting dataset into train and validation sets...")

# Hold out 20% of the reviews for validation. The seed is fixed so the split
# is reproducible. Texts and labels are passed together so they stay aligned.
# NOTE(review): the split is not stratified by label — confirm that is intended.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    *(df[column].tolist() for column in ('text', 'label')),
    random_state=42,
    test_size=0.2,
)

print(f"✅ Training samples: {len(train_texts)}")
print(f"✅ Validation samples: {len(val_texts)}")


🛠️ Splitting dataset into train and validation sets...
✅ Training samples: 454763
✅ Validation samples: 113691


In [8]:
# ✏️ Define tokenization function
def tokenize(batch):
    """Tokenize the ``'text'`` entries of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``'text'`` key holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        every sequence truncated and padded to exactly 128 tokens.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(batch['text'], **encode_options)

# 🛠️ Format as Huggingface Datasets
from datasets import Dataset

# Wrap each split in a Hugging Face Dataset and tokenize it in batches.
# The train split is processed first, then the validation split.
train_dataset = Dataset.from_dict(
    {'text': train_texts, 'label': train_labels}
).map(tokenize, batched=True)

val_dataset = Dataset.from_dict(
    {'text': val_texts, 'label': val_labels}
).map(tokenize, batched=True)

print("✅ Tokenization and dataset formatting complete.")


Map:   0%|          | 0/454763 [00:00<?, ? examples/s]

Map:   0%|          | 0/113691 [00:00<?, ? examples/s]

✅ Tokenization and dataset formatting complete.


In [9]:
# 🔥 Load RoBERTa model
from transformers import AutoModelForSequenceClassification

print("🛠️ Loading RoBERTa model...")
# Load the sequence-classification model from the same repository id used for
# the tokenizer. `num_labels=3` matches the label ids 0/1/2 that
# `map_sentiment` assigns (negative / neutral / positive).
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment",
    num_labels=3  # 3 classes: Negative, Neutral, Positive
)


🛠️ Loading RoBERTa model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [10]:
from transformers import TrainingArguments

# Hyper-parameters for a single-epoch fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Schedule and batch sizes.
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Optimisation settings.
    warmup_steps=50,
    weight_decay=0.01,
    # Log the training loss every 500 steps and checkpoint every 10000 steps.
    logging_steps=500,
    save_steps=10000,
    # Do not report to any external experiment tracker.
    report_to="none",
)


In [11]:
# Define compute_metrics function
from sklearn.metrics import accuracy_score
import torch

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores along the last axis; ``labels`` is array-like
            with the true class ids.

    Returns:
        dict: ``{"accuracy": <fraction of correct predictions as a float>}``.
    """
    logits, labels = eval_pred
    # Work directly in NumPy: there is no need to build a torch tensor just
    # to take an argmax, and the result stays a plain array for comparison.
    predictions = np.argmax(np.asarray(logits), axis=-1)
    # Fraction of positions where the predicted class equals the true class,
    # returned as a built-in float so it logs/serialises cleanly.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}


In [12]:
from transformers import Trainer

# Bundle the model, its hyper-parameters, both data splits and the accuracy
# metric into a single Trainer that drives training and evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [13]:
# Mount Google Drive so the fine-tuned artefacts outlive the Colab session.
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Fine-tune the model.
trainer.train()

# Evaluate on the validation split once training has finished.
results = trainer.evaluate()
print(f"✅ Final Validation Accuracy: {results['eval_accuracy'] * 100:.2f}%")

# Target directory on Google Drive for the saved model.
drive_model_dir = "/content/drive/MyDrive/roberta_finetuned/"
os.makedirs(drive_model_dir, exist_ok=True)

# Save the model and tokenizer directly to Google Drive.
model.save_pretrained(drive_model_dir)
tokenizer.save_pretrained(drive_model_dir)

# Store the validation accuracy next to the model files.
accuracy = results['eval_accuracy']
with open(os.path.join(drive_model_dir, "accuracy.txt"), "w") as f:
    f.write(str(accuracy))

print(f"✅ Validation Accuracy Saved to Drive: {accuracy:.4f}")

# Zip the directory that was just written. The previous shell command zipped
# ./models/roberta_finetuned/, which nothing in this cell creates, so zip had
# nothing to archive while the success message was still printed.
# `make_archive` raises on failure, so the message below is only reached when
# the archive really exists. `normpath` drops the trailing slash so that
# dirname/basename split the path into parent folder and model folder.
_model_dir = os.path.normpath(drive_model_dir)
archive_path = shutil.make_archive(
    base_name="/content/drive/MyDrive/roberta_finetuned",
    format="zip",
    root_dir=os.path.dirname(_model_dir),
    base_dir=os.path.basename(_model_dir),
)

print("✅ Zipped model saved to Google Drive.")


Mounted at /content/drive


Step,Training Loss
500,0.3561
1000,0.3096
1500,0.2975
2000,0.2953
2500,0.2882
3000,0.2614
3500,0.2588
4000,0.2639
4500,0.2601
5000,0.2538


✅ Final Validation Accuracy: 92.92%
✅ Validation Accuracy Saved to Drive: 0.9292

zip error: Nothing to do! (try: zip -r /content/drive/MyDrive/roberta_finetuned.zip . -i ./models/roberta_finetuned/)
✅ Zipped model saved to Google Drive.


In [14]:
# Re-run evaluation on the validation split and report the accuracy again.
# NOTE(review): the training cell already stores the same metrics in `results`
# and the recorded outputs match, so this second full evaluation pass looks
# redundant — confirm whether it is still needed.
results = trainer.evaluate()
print(f"✅ Final Validation Accuracy: {results['eval_accuracy']*100:.2f}%")


✅ Final Validation Accuracy: 92.92%


In [None]:
# Save a second copy of the model and tokenizer on the local runtime disk;
# the later cells write accuracy.txt into this directory and zip it.
model.save_pretrained("./models/roberta_finetuned/")
# Last expression of the cell: its return value is what the notebook displays.
tokenizer.save_pretrained("./models/roberta_finetuned/")


In [16]:
# Store the validation accuracy alongside the locally saved model files.
# Relies on `results` from an earlier evaluation cell and on the target
# directory already existing (it is created by the save cell above).
with open('./models/roberta_finetuned/accuracy.txt', 'w') as f:
    f.write(str(results['eval_accuracy']))


In [17]:
# Zip the locally saved model directory into the current working directory.
!zip -r roberta_finetuned.zip ./models/roberta_finetuned/
# Colab-only helper: hands the archive to the browser as a file download.
from google.colab import files
files.download('roberta_finetuned.zip')


  adding: models/roberta_finetuned/ (stored 0%)
  adding: models/roberta_finetuned/model.safetensors (deflated 7%)
  adding: models/roberta_finetuned/tokenizer_config.json (deflated 75%)
  adding: models/roberta_finetuned/special_tokens_map.json (deflated 84%)
  adding: models/roberta_finetuned/merges.txt (deflated 53%)
  adding: models/roberta_finetuned/vocab.json (deflated 59%)
  adding: models/roberta_finetuned/config.json (deflated 52%)
  adding: models/roberta_finetuned/accuracy.txt (stored 0%)
  adding: models/roberta_finetuned/tokenizer.json (deflated 82%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>