In [4]:
import pandas as pd

# NOTE(review): machine-specific absolute path; consider a path relative to the
# project directory so the notebook runs on other machines.
RAW_REVIEWS_CSV = "/Users/ajinkyamawal/Review-Sentiment-Classifier/capterra_reviews.csv"

reviews_raw = pd.read_csv(RAW_REVIEWS_CSV)

# Drop incomplete rows BEFORE labelling: `NaN >= 4` evaluates to False, so a
# review with a missing rating would otherwise be silently labelled negative.
reviews_rated = reviews_raw.dropna(subset=['overall_rating', 'overall_text'])

# Binary sentiment label: 1 if rating >= 4, else 0.
# The review body is exposed as `text`, the column name the tokenization step reads.
df = pd.DataFrame({
    'text': reviews_rated['overall_text'],
    'label': (reviews_rated['overall_rating'] >= 4).astype(int),
})

df.to_csv('capterra_reviews_binary.csv', index=False)

In [5]:
from sklearn.model_selection import train_test_split

# Reload the binarised reviews and carve out an 80/10/10 train/val/test split,
# stratified on the label so each split keeps the same class balance.
df = pd.read_csv('capterra_reviews_binary.csv')

train, temp = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
# Halve the 20% hold-out into validation and test.
val, test = train_test_split(
    temp,
    test_size=0.5,
    stratify=temp['label'],
    random_state=42,
)

# Persist every split next to the notebook.
for csv_path, split_frame in {'train.csv': train, 'val.csv': val, 'test.csv': test}.items():
    split_frame.to_csv(csv_path, index=False)


In [7]:
pip install datasets

Collecting datasets
  Using cached datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.20.3-py3-none-any.whl.metadata (2.1 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Using cached pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting httpx<1.0.0 (from datasets)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Using cached multiprocess-0.70.18-py313-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.10.0,>=2023.1.0 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.10.0-py3-none-any.whl.metad

In [8]:
from datasets import load_dataset, DatasetDict

# Map each split name to the CSV written by the splitting step.
data_files = dict(
    train="train.csv",
    validation="val.csv",
    test="test.csv",
)

# Load all three CSVs through the "csv" builder, keyed by split name.
datasets = load_dataset("csv", data_files=data_files)


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 3881 examples [00:00, 414411.76 examples/s]
Generating validation split: 485 examples [00:00, 206040.46 examples/s]
Generating test split: 486 examples [00:00, 163152.85 examples/s]


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classifier.
model_ckpt = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Two output labels: negative (0) and positive (1).
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)


In [None]:
# Freeze every parameter under `model.bert`; parameters outside it stay trainable.
for encoder_param in model.bert.parameters():
    encoder_param.requires_grad = False


In [None]:
MAX_LENGTH = 128  # longest token sequence kept per review


def preprocess(batch):
    """Tokenize a batch of reviews.

    Args:
        batch: mapping with a 'text' entry holding the review strings.

    Returns:
        The tokenizer output for the batch, truncated to MAX_LENGTH tokens.

    Padding is intentionally NOT applied here. Padding every example to the
    maximum length up front wastes compute on short reviews and makes the
    padding data collator passed to the Trainer redundant; the collator pads
    each batch to its own longest sequence instead.
    """
    return tokenizer(batch['text'], truncation=True, max_length=MAX_LENGTH)


tokenized_datasets = datasets.map(preprocess, batched=True)


In [None]:
from transformers import DataCollatorWithPadding

# Padding collator bound to the tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
import evaluate
import numpy as np

# Metric objects used by compute_metrics (loaded once, in this order).
accuracy, roc_auc = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "roc_auc")
)

def stable_softmax(logits):
    """Softmax over the last axis that cannot overflow.

    Subtracting each row's maximum before exponentiating leaves the result
    mathematically unchanged but keeps every exponent <= 0, so large logits
    no longer turn into inf / inf = NaN.
    """
    logits = np.asarray(logits)
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)


def compute_metrics(eval_pred):
    """Compute accuracy and ROC-AUC for the Trainer.

    Args:
        eval_pred: pair `(logits, labels)`; logits are indexed as
            (example, class) and column 1 is treated as the positive class.

    Returns:
        dict with keys "accuracy" and "roc_auc".
    """
    logits, labels = eval_pred
    probs = stable_softmax(logits)
    preds = np.argmax(logits, axis=1)
    # ROC-AUC is scored on the positive-class probability, accuracy on the argmax.
    auc = roc_auc.compute(prediction_scores=probs[:, 1], references=labels)["roc_auc"]
    acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    return {"accuracy": acc, "roc_auc": auc}


In [None]:
from transformers import TrainingArguments

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, and the old keyword is rejected by recent versions. Pick whichever
# name the installed TrainingArguments accepts so the cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="bert-review-sentiment",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    # Evaluate and checkpoint on the same schedule (once per epoch);
    # load_best_model_at_end needs the two strategies to match.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    report_to="none",
    **{_eval_strategy_kwarg: "epoch"},
)


In [None]:
from transformers import Trainer

import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword. Use whichever name the installed Trainer
# exposes so the cell works across versions without a deprecation warning.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()


In [None]:
# Score the trained model on the held-out test split and show the metrics.
test_split = tokenized_datasets["test"]
results = trainer.evaluate(eval_dataset=test_split)
print(results)
