In [1]:
import torch

# Report whether a CUDA GPU is visible, then pick the matching device.
print("CUDA available:", torch.cuda.is_available())
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

CUDA available: False
Using device: cpu


In [2]:
!pip3 install torch torchvision


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [None]:
# Step 0: Select the compute device (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Step 1: Load CSV files
import pandas as pd

# Each file must provide a 'text' column (the review) and a 'label' column
# (the sentiment); both are read by the preprocessing step below.
train_df = pd.read_csv("/content/Train.csv")
valid_df = pd.read_csv("/content/Valid.csv")
test_df  = pd.read_csv("/content/Test.csv")

# Fail early with a readable message instead of a bare KeyError later on.
for split_name, split_df in (("Train", train_df), ("Valid", valid_df), ("Test", test_df)):
    missing_cols = {"text", "label"} - set(split_df.columns)
    if missing_cols:
        raise ValueError(
            f"{split_name}.csv is missing required column(s): {sorted(missing_cols)}"
        )

print("Train size:", len(train_df))
print("Valid size:", len(valid_df))
print("Test size:", len(test_df))

# Step 2: Preprocess text
import re
def tokenize(text):
    """Split raw text into lowercase, letters-only word tokens.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens containing only the letters a-z. Digits and
        punctuation are deleted (so "don't" becomes "dont"); an input without
        any letters yields an empty list.
    """
    # HTML line breaks act as word separators; turn them into a space so the
    # words on either side are not glued together (and no "br" token appears).
    text = re.sub(r"<br\s*/?>", " ", text, flags=re.IGNORECASE)
    # Keep every whitespace character (not only " ") so that tabs and newlines
    # still separate words when the text is split below.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.lower().split()

# Tokenize the 'text' column of every split.
train_tokenized = [tokenize(t) for t in train_df['text']]
valid_tokenized = [tokenize(t) for t in valid_df['text']]
test_tokenized  = [tokenize(t) for t in test_df['text']]


def _binary_labels(label_column):
    """Convert a label column into a list of 0/1 integers.

    Args:
        label_column: A pandas Series holding the sentiment labels.

    Returns:
        A list of ints. Numeric (or boolean) columns are cast to int as-is;
        any other column maps the exact string 'positive' to 1 and everything
        else to 0.
    """
    # Comparing a numeric 0/1 column against the string 'positive' would be
    # False everywhere and silently label every sample as negative.
    if pd.api.types.is_numeric_dtype(label_column):
        return label_column.astype(int).tolist()
    return (label_column == 'positive').astype(int).tolist()


train_labels = _binary_labels(train_df['label'])
valid_labels = _binary_labels(valid_df['label'])
test_labels  = _binary_labels(test_df['label'])

# Step 3: Build vocabulary from training data
# Indices 0 and 1 are reserved for padding and unknown words; every other word
# receives the next free index in order of first appearance.
word2idx = {"<pad>":0, "<unk>":1}
for sent in train_tokenized:
    for word in sent:
        # setdefault only inserts when the word has not been seen before.
        word2idx.setdefault(word, len(word2idx))

def encode_sentences(tokenized_list, vocab=None):
    """Map tokenized sentences to lists of vocabulary indices.

    Args:
        tokenized_list: An iterable of token lists.
        vocab: Optional token -> index mapping. Defaults to the module-level
            ``word2idx`` built from the training data.

    Returns:
        A list with one list of ints per sentence. Tokens missing from the
        vocabulary are replaced by the index of "<unk>" (1 if the mapping has
        no such entry).
    """
    if vocab is None:
        vocab = word2idx
    unk_idx = vocab.get("<unk>", 1)
    return [[vocab.get(w, unk_idx) for w in sent] for sent in tokenized_list]

# Encode the three splits in order: train, validation, test.
train_encoded, valid_encoded, test_encoded = map(
    encode_sentences, (train_tokenized, valid_tokenized, test_tokenized)
)

# Step 4: Dataset & Dataloader
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class ReviewDataset(Dataset):
    """Dataset pairing encoded reviews with their sentiment labels.

    Args:
        sequences: A list of token-index lists, one per review.
        labels: A list of labels, aligned with ``sequences``.
    """

    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        """Return the number of reviews."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(token_indices, label)`` for the review at ``idx`` as tensors."""
        # Force an integer dtype: torch.tensor([]) would otherwise be float32,
        # and a float index tensor cannot be fed to nn.Embedding.
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        return sequence, torch.tensor(self.labels[idx])

def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into one padded batch.

    Args:
        batch: A list of ``(sequence_tensor, label_tensor)`` pairs.

    Returns:
        A tuple ``(padded, labels)``: the sequences right-padded with zeros to
        the longest one in the batch (batch-first), and a 1-D label tensor.
    """
    sequences, labels = [], []
    for sequence, label in batch:
        sequences.append(sequence)
        labels.append(label)
    return pad_sequence(sequences, batch_first=True), torch.stack(labels)

# One loader per split; only the training loader reshuffles its samples.
train_loader, valid_loader, test_loader = (
    DataLoader(
        ReviewDataset(split_sequences, split_labels),
        batch_size=32,
        shuffle=reshuffle,
        collate_fn=collate_fn,
    )
    for split_sequences, split_labels, reshuffle in (
        (train_encoded, train_labels, True),
        (valid_encoded, valid_labels, False),
        (test_encoded, test_labels, False),
    )
)

# Step 5: Build RNN Model
import torch.nn as nn
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

class SentimentRNN(nn.Module):
    """GRU-based binary sentiment classifier.

    Token indices are embedded, run through a single-layer GRU, and the final
    hidden state is projected to one sigmoid-activated score per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of index sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len), right-padded with the
                padding index (0).

        Returns:
            A tensor of shape (batch,) with one score in [0, 1] per sequence.
        """
        # Length of each sequence = number of non-pad tokens. This assumes pad
        # tokens only appear as trailing padding. Rows made purely of padding
        # are treated as length 1 because packing rejects zero lengths.
        lengths = (x != self.embed.padding_idx).sum(dim=1).clamp(min=1).cpu()
        embedded = self.embed(x)
        # Pack so the GRU stops at each sequence's last real token; otherwise
        # the final hidden state is computed after the trailing pad steps and
        # depends on how much padding the batch happened to need.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, h = self.rnn(packed)
        h = h.squeeze(0)
        out = self.fc(h)
        # Squeeze only the feature dimension: a bare squeeze() would also drop
        # the batch dimension for a batch of one and break the loss computation.
        return self.sigmoid(out).squeeze(-1)

# Instantiate the classifier on the selected device, with binary cross-entropy
# as the loss and Adam as the optimizer.
model = SentimentRNN(vocab_size=len(word2idx))
model = model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Step 6: Training
epochs = 3  # increase for better accuracy
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for seqs, labs in train_loader:
        # BCELoss needs float targets on the same device as the predictions.
        seqs = seqs.to(device)
        labs = labs.float().to(device)

        optimizer.zero_grad()
        preds = model(seqs)
        loss = criterion(preds, labs)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Steps 7 & 8: Validate and test
def evaluate_accuracy(model, loader, device):
    """Compute classification accuracy (in percent) of ``model`` on ``loader``.

    Args:
        model: Module returning one score per sequence; scores >= 0.5 count as
            class 1.
        loader: Iterable yielding ``(sequences, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The accuracy as a float between 0 and 100, or NaN when the loader
        yields no samples.
    """
    # Switch to eval mode here so each evaluation is self-contained rather
    # than relying on a previous step having done it.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for seqs, labs in loader:
            seqs, labs = seqs.to(device), labs.to(device)
            preds = (model(seqs) >= 0.5).long()
            correct += (preds == labs).sum().item()
            total += labs.size(0)
    # Guard against an empty split instead of dividing by zero.
    return 100 * correct / total if total else float("nan")


# Step 7: Validate
print("Validation Accuracy: {:.2f}%".format(evaluate_accuracy(model, valid_loader, device)))

# Step 8: Test
print("Test Accuracy: {:.2f}%".format(evaluate_accuracy(model, test_loader, device)))

# Step 9: Predict on new sentences
def predict_sentiment(text):
    """Score the sentiment of one raw sentence with the trained model.

    Args:
        text: The raw sentence to score.

    Returns:
        The model's output score as a Python float (0 = negative,
        1 = positive).
    """
    model.eval()
    tokens = [word2idx.get(w,1) for w in tokenize(text)]
    if not tokens:
        # Text without any alphabetic token would produce an empty sequence
        # that the model cannot process; score a single <unk> token instead.
        tokens = [1]
    # Explicit long dtype: the embedding layer requires integer indices.
    tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        score = model(tensor).item()
    return score

# Example sentences to score with the trained model.
new_sentences = [
    "I really loved this movie, it was amazing!",
    "This was the worst film I have ever seen.",
    "The acting was okay, but the plot was boring.",
]

# map() is lazy, so each sentence is scored right before it is printed.
for s, score in zip(new_sentences, map(predict_sentiment, new_sentences)):
    print(f"Sentence: {s}\nPredicted sentiment score (0=neg,1=pos): {score:.4f}\n")