In [None]:
!pip install transformers datasets torch scikit-learn accelerate



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only) so the dataset CSV under
# MyDrive can be read by path further down.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re
import string
import torch
from sklearn.model_selection import train_test_split

# Read the IMDb reviews CSV from the mounted Drive folder
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")

# Encode the string sentiment as an integer class id: negative -> 0, positive -> 1
df["sentiment"] = df["sentiment"].map(dict(negative=0, positive=1))

# Function to clean text
def clean_text(text):
    """Normalise a raw review string.

    Steps, in order: lowercase, strip HTML tags, drop digit runs, drop ASCII
    punctuation, collapse whitespace.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()
    # Replace tags with a space rather than nothing: reviews separate sentences
    # with "<br /><br />", and deleting the tag outright would fuse the words on
    # either side ("movie.<br /><br />loved" -> "movieloved").
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    # Collapse the whitespace runs left behind by the removals above
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalise every review and store the result next to the raw text
df["clean_text"] = df["review"].map(clean_text)

# Stage 1: hold out 30% of the rows (stratified on the label), leaving 70% for training
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["clean_text"],
    df["sentiment"],
    test_size=0.30,
    stratify=df["sentiment"],
    random_state=42,
)

# Stage 2: halve the held-out 30% into validation and test (15% each overall)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.50,
    stratify=temp_labels,
    random_state=24,
)

# Turn the pandas Series into plain Python lists
train_texts, val_texts, test_texts = (
    train_texts.tolist(),
    val_texts.tolist(),
    test_texts.tolist(),
)
train_labels, val_labels, test_labels = (
    train_labels.tolist(),
    val_labels.tolist(),
    test_labels.tolist(),
)


In [None]:
from transformers import RobertaTokenizer

# Load the tokenizer that belongs to the pretrained "roberta-base" checkpoint;
# `tokenizer` is used as a module-level global by the cells that follow.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenization function
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Every sequence is truncated to at most 512 tokens and padded up to exactly
    512, so all encoded examples have the same length.

    Args:
        texts: The text(s) to encode, passed straight through to ``tokenizer``.

    Returns:
        Whatever ``tokenizer(...)`` returns for these inputs.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encode_options)

# Encode the three splits in train -> validation -> test order
train_encodings, val_encodings, test_encodings = (
    tokenize_function(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)


In [None]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence aligned with it by position.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an IMDbDataset (train, validation, test)
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)


In [None]:
from transformers import RobertaForSequenceClassification

# Pretrained RoBERTa encoder with a two-class sequence-classification head on top
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Use the GPU when one is visible to torch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the weights and displays the module
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Resolve the compute device and report which one training will run on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# device.type is "cuda" exactly when torch.cuda.is_available() was true above
print(
    "CUDA is available. Training on GPU."
    if device.type == "cuda"
    else "No GPU detected. Training on CPU."
)

Using device: cuda
CUDA is available. Training on GPU.


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    # Half precision needs a CUDA device; only enable it when one is present so
    # the CPU fallback handled elsewhere in this notebook keeps working.
    fp16=torch.cuda.is_available(),
    fp16_full_eval=torch.cuda.is_available(),
    save_total_limit=2,
    # Validation loss was lowest after the first epoch and rose afterwards in the
    # logged run, so reload the checkpoint with the best eval loss after training
    # instead of keeping the final-epoch weights.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    gradient_accumulation_steps=2,  # effective batch = 32 * 2 per device
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    auto_find_batch_size=True,
)

print("GPU training optimized with Mixed Precision (fp16) and memory-efficient settings!")


🚀 GPU training optimized with Mixed Precision (fp16) and memory-efficient settings!




In [None]:
from transformers import Trainer

# Build the Trainer from the model, the training arguments and the train/validation
# datasets. No compute_metrics callback is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# Run fine-tuning; as the last expression of the cell, its return value is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.2365,0.165212
2,0.1225,0.174384
3,0.0732,0.190813




TrainOutput(global_step=1641, training_loss=0.13667174332901735, metrics={'train_runtime': 3145.7429, 'train_samples_per_second': 33.378, 'train_steps_per_second': 0.522, 'total_flos': 2.76266608128e+16, 'train_loss': 0.13667174332901735, 'epoch': 3.0})

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Run the fine-tuned model over the held-out test split
predictions = trainer.predict(test_dataset)

# Pick the highest-scoring class for every example
preds = predictions.predictions.argmax(axis=1)

# Per-class precision / recall / F1 against the true test labels
print(
    classification_report(
        y_true=test_labels,
        y_pred=preds,
        target_names=["Negative", "Positive"],
    )
)



              precision    recall  f1-score   support

    Negative       0.95      0.95      0.95      3750
    Positive       0.95      0.95      0.95      3750

    accuracy                           0.95      7500
   macro avg       0.95      0.95      0.95      7500
weighted avg       0.95      0.95      0.95      7500



In [None]:
# Define the function for predicting sentiment
def predict_sentiment(text):
    """Classify one review with the fine-tuned module-level ``model``.

    The text goes through the same ``clean_text`` normalisation that was applied
    to the training data, so inference inputs match what the model was
    fine-tuned on.

    Args:
        text: The raw review as a ``str``.

    Returns:
        ``"Positive"`` if the predicted class id is 1, otherwise ``"Negative"``.
    """
    model.eval()  # Disable dropout for inference

    encoded = tokenizer(
        clean_text(text),
        return_tensors="pt",
        padding=True,  # pad to the longest input only; a single review needs none
        truncation=True,
        max_length=512,
    )
    # Put the inputs on the device that holds the model weights, rather than
    # re-deriving it from CUDA availability, which can disagree with the model.
    inputs = encoded.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # argmax over the class dimension of the first (only) row
    prediction = torch.argmax(logits, dim=-1)[0].item()

    return "Positive" if prediction == 1 else "Negative"

# Smoke-test the classifier: the first review should print "Positive", the second "Negative"
for example_review in (
    "This movie was absolutely fantastic!",
    "I hated every second of this film.",
):
    print(predict_sentiment(example_review))

Positive
Negative
