In [1]:
!pip install transformers datasets torch tqdm


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm


In [28]:
df = pd.read_csv("/content/amazon_alexa.tsv", sep="\t")


In [29]:
df.rename(columns={"verified_reviews": "text", "feedback": "label"}, inplace=True)


In [30]:
df.dropna(inplace=True)


In [31]:
df["label"] = df["label"].astype(int)


In [32]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_texts)


In [35]:
adasyn = ADASYN(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train_tfidf, train_labels)


In [36]:
train_texts_resampled = [" ".join(vectorizer.inverse_transform(x)[0]) for x in X_resampled]
train_labels_resampled = y_resampled

In [37]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx], padding="max_length", truncation=True, max_length=256, return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [38]:
train_dataset = SentimentDataset(train_texts_resampled, train_labels_resampled)
test_dataset = SentimentDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [40]:
for param in model.bert.encoder.layer[:6].parameters():
    param.requires_grad = False


In [41]:
class_weights = compute_class_weight("balanced", classes=np.unique(train_labels_resampled), y=train_labels_resampled)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)


In [42]:
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss(weight=class_weights)


In [43]:
epochs = 5
model.train()

for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

print("✅ Training Complete!")

Epoch 1: 100%|██████████| 292/292 [02:43<00:00,  1.79it/s, loss=0.00167]
Epoch 2: 100%|██████████| 292/292 [02:41<00:00,  1.81it/s, loss=0.00331]
Epoch 3: 100%|██████████| 292/292 [02:41<00:00,  1.81it/s, loss=0.0821]
Epoch 4: 100%|██████████| 292/292 [02:41<00:00,  1.81it/s, loss=0.000288]
Epoch 5: 100%|██████████| 292/292 [02:41<00:00,  1.81it/s, loss=0.00848]

✅ Training Complete!





In [44]:
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f" Test Accuracy: {accuracy:.4f}")

✅ Test Accuracy: 0.9000


In [46]:
def predict_sentiment(review):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(review, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        output = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(output.logits, dim=1).item()

        return "Positive" if prediction == 1 else "Negative"


new_reviews = [
    "This product is amazing! Works perfectly.",
    "I didnt like this product"
]

for review in new_reviews:
    print(f"Review: {review} → Sentiment: {predict_sentiment(review)}")

Review: This product is amazing! Works perfectly. → Sentiment: Positive
Review: I didnt like this product → Sentiment: Negative


In [47]:
import pickle
import torch

with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model.state_dict(), f)

print("Model saved successfully using pickle!")


✅ Model saved successfully using pickle!


In [49]:
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
