In [2]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [3]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import torch

In [5]:
# Load IMDb dataset from CSV
df = pd.read_csv("/content/drive/MyDrive/INST 750/Assignment 1/IMDB Dataset.csv")

# Display first few rows
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
# Convert 'positive' to 1 and 'negative' to 0
df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})

In [7]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

In [8]:
# Apply cleaning
df["clean_text"] = df["review"].apply(clean_text)

In [9]:
# First split: 70% train, 30% temp (validation + test)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["clean_text"], df["sentiment"], test_size=0.30, random_state=42, stratify=df["sentiment"]
)

# Second split: 15% validation, 15% test
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.50, random_state=24, stratify=temp_labels
)

# Check dataset sizes
print(f"Training Size: {len(train_texts)}")
print(f"Validation Size: {len(val_texts)}")
print(f"Test Size: {len(test_texts)}")


Training Size: 35000
Validation Size: 7500
Test Size: 7500


In [10]:
vectorizer = TfidfVectorizer(max_features=20000, stop_words="english", norm="l2")
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [11]:
# Train Linear SVM
clf_linsvm = LinearSVC(C=0.1, class_weight="balanced", dual=False)
clf_linsvm.fit(X_train, train_labels)

# Predictions
preds_linsvm = clf_linsvm.predict(X_test)

In [12]:
# Evaluate Performance
print("Linear SVM Accuracy:", accuracy_score(test_labels, preds_linsvm))
print(classification_report(test_labels, preds_linsvm))


Linear SVM Accuracy: 0.8941333333333333
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3750
           1       0.89      0.90      0.90      3750

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500



In [13]:
# Train Logistic Regression model
clf = LogisticRegression(max_iter=1000, C=10)
clf.fit(X_train, train_labels)

# Predictions
preds = clf.predict(X_test)

# Evaluate Performance
print("Accuracy:", accuracy_score(test_labels, preds))
print(classification_report(test_labels, preds))


Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3750
           1       0.89      0.89      0.89      3750

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

