In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [6]:
# Load the reviews and replace the string sentiment column with integer class ids.
df = pd.read_csv('IMDB Dataset.csv').assign(
    # Convert labels to 0 (negative), 1 (positive)
    sentiment=lambda frame: LabelEncoder().fit_transform(frame['sentiment'])
)

In [7]:
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is not stripped, so it stays attached to the neighbouring word.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lower-cased tokens (empty when ``text`` has no non-whitespace
        characters).
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Tokenize every review element-wise; each cell of the new column holds a list of tokens.
df['tokens'] = df['review'].map(tokenize)

In [8]:
# Build the vocabulary from every token that occurs in any review.
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab = set(all_tokens)
# Number the words in sorted order rather than set-iteration order: string
# hashing is randomized per interpreter session, so iterating the set directly
# would assign different indices on every kernel restart and a saved model's
# embedding rows would no longer line up with a rebuilt word2idx.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}  # start=1 reserves idx=0 for padding
vocab_size = len(word2idx) + 1  # +1 accounts for the reserved padding index


In [9]:
# Convert tokens to indices (tokens missing from word2idx are dropped)
df['indices'] = df['tokens'].map(
    lambda review_tokens: [word2idx[tok] for tok in review_tokens if tok in word2idx]
)

In [10]:
# Custom Dataset
class IMDBDataset(Dataset):
    """Map-style dataset pairing index-encoded reviews with their labels.

    Args:
        reviews: Indexable collection where ``reviews[i]`` is a sequence of
            integer token indices for sample ``i``.
        labels: Indexable collection where ``labels[i]`` is the integer class
            id for sample ``i``.
    """

    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        """Return the number of samples (the number of reviews)."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(review, label)`` for sample ``idx`` as int64 tensors.

        The dtype is given explicitly because ``torch.tensor([])`` would
        otherwise infer float32 for an empty review, which cannot be used as
        embedding indices.
        """
        review = torch.tensor(self.reviews[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return review, label

# Split dataset: 80% train / 20% test, with a fixed seed so the split is repeatable
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)
train_dataset = IMDBDataset(
    reviews=train_data['indices'].tolist(),
    labels=train_data['sentiment'].tolist(),
)
test_dataset = IMDBDataset(
    reviews=test_data['indices'].tolist(),
    labels=test_data['sentiment'].tolist(),
)

In [11]:
# Collate function for padding
def collate_fn(batch):
    """Merge ``(review, label)`` samples into one padded batch.

    Args:
        batch: Sequence of ``(review_tensor, label)`` pairs, where each review
            is a 1-D tensor of token indices.

    Returns:
        A ``(reviews, labels)`` tuple: ``reviews`` is a 2-D tensor with the
        batch dimension first, right-padded with 0 to the longest review in
        the batch; ``labels`` is a 1-D tensor with one entry per sample.
    """
    sequences = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)

# DataLoaders: both share the batch size and the padding collate function;
# only the training loader reshuffles its samples.
_loader_kwargs = dict(batch_size=64, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)


In [12]:
# CNN Model
class TextCNN(nn.Module):
    """Convolutional text classifier over sequences of token indices.

    Tokens are embedded, passed through one convolution branch per kernel
    size, max-pooled over the sequence dimension, concatenated, regularized
    with dropout and projected to ``num_classes`` raw scores (no softmax).

    Args:
        vocab_size: Number of rows in the embedding table (index 0 is the
            padding row).
        embed_dim: Size of each embedding vector.
        num_classes: Number of output scores per sample.
        kernel_sizes: Iterable of window heights (in tokens), one convolution
            branch per entry.
        num_filters: Output channels of every convolution branch.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes, num_filters):
        super(TextCNN, self).__init__()
        kernel_sizes = list(kernel_sizes)
        # Shortest sequence that every branch can convolve at least once.
        # Plain int attribute: it is not part of state_dict, so existing
        # checkpoints still load.
        self.min_seq_len = max(kernel_sizes, default=1)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``.

        Args:
            x: Integer tensor of token indices with shape ``(batch, seq_len)``.
                Sequences shorter than the largest kernel are right-padded
                with the padding index so the convolutions do not fail on
                very short inputs.
        """
        shortfall = self.min_seq_len - x.size(1)
        if shortfall > 0:
            # A kernel taller than the input makes Conv2d raise; padding rows
            # embed to zeros because of padding_idx.
            x = nn.functional.pad(x, (0, shortfall), value=self.embedding.padding_idx)
        x = self.embedding(x).unsqueeze(1)  # Add channel dimension
        # Each conv spans the full embedding width, leaving a size-1 last dim to squeeze.
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over all window positions -> one value per filter.
        x = [torch.max_pool1d(feature, feature.size(2)).squeeze(2) for feature in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

# Hyperparameters
# Size of each token's embedding vector (passed to nn.Embedding inside TextCNN).
embed_dim = 100
# Window heights, in tokens; TextCNN builds one convolution branch per entry.
kernel_sizes = [3, 4, 5]
# Output channels produced by every convolution branch.
num_filters = 100
# Width of the final linear layer's output; 2 matches the 0/1 sentiment labels.
num_classes = 2
# Step size handed to the Adam optimizer below.
learning_rate = 1e-3
# Number of full passes over train_loader in the training loop.
num_epochs = 5

# Model, loss, optimizer
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextCNN(vocab_size, embed_dim, num_classes, kernel_sizes, num_filters).to(device)
# The loss is fed the raw scores returned by TextCNN.forward (which applies no softmax).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
# Each epoch makes one pass over train_loader and prints the mean of the
# per-batch losses (sum of batch losses divided by the number of batches).
for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layer is active
    total_loss = 0
    for reviews, labels in tqdm(train_loader):
        # Move the batch to the same device as the model.
        reviews, labels = reviews.to(device), labels.to(device)
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(reviews)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # .item() yields a Python float, detached from the graph
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation: fraction of test reviews whose highest-scoring class equals the label
model.eval()  # inference mode, so dropout is disabled
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for scoring
    for reviews, labels in test_loader:
        reviews = reviews.to(device)
        labels = labels.to(device)
        outputs = model(reviews)
        preds = outputs.argmax(dim=1)
        correct += int((preds == labels).sum())
        total += len(labels)
print(f"Test Accuracy: {correct / total:.4f}")

100%|██████████| 625/625 [36:00<00:00,  3.46s/it]


Epoch 1/5, Loss: 0.6278


100%|██████████| 625/625 [34:50<00:00,  3.35s/it]


Epoch 2/5, Loss: 0.4760


100%|██████████| 625/625 [35:31<00:00,  3.41s/it]


Epoch 3/5, Loss: 0.3873


100%|██████████| 625/625 [36:02<00:00,  3.46s/it]


Epoch 4/5, Loss: 0.2935


100%|██████████| 625/625 [36:47<00:00,  3.53s/it]


Epoch 5/5, Loss: 0.2140
Test Accuracy: 0.8796


In [19]:
# Save the trained model
# Only the state_dict (the parameter tensors) is written; neither the TextCNN
# class nor the word2idx vocabulary is stored in this file.
# NOTE(review): reloading presumably needs a TextCNN built with the same
# hyperparameters and an identical word2idx mapping — confirm the vocabulary is
# persisted or reproducibly rebuilt elsewhere.
torch.save(model.state_dict(), "text_cnn_model.pth")
print("Model saved as text_cnn_model.pth")

Model saved as text_cnn_model.pth


In [18]:
# Preprocess and predict sentiment of input text
def predict_sentiment(model, text, word2idx, device):
    """Classify one raw review string as ``"Positive"`` or ``"Negative"``.

    Args:
        model: Trained classifier. It is called with a ``(1, seq_len)`` int64
            tensor of token indices and must return per-class scores with the
            class dimension at ``dim=1``.
        text: Raw review text; tokenized with ``tokenize``.
        word2idx: Mapping from token to embedding index. Tokens that are not
            in the mapping are encoded as 0, the padding index.
        device: Device the input tensor is moved to (should match the model).

    Returns:
        ``"Positive"`` when the highest-scoring class is 1, otherwise
        ``"Negative"``. Text with no tokens is scored as an all-padding
        sequence rather than raising.
    """
    model.eval()
    tokens = tokenize(text)  # Tokenize input
    token_ids = [word2idx.get(word, 0) for word in tokens]  # use 0 if word not in vocab

    # A convolution whose kernel is taller than the sequence raises, so pad
    # short inputs up to the tallest kernel the model exposes (at least 1 so
    # that empty text still produces a non-empty sequence).
    min_len = max(
        (conv.kernel_size[0] for conv in getattr(model, "convs", [])),
        default=1,
    )
    token_ids += [0] * (min_len - len(token_ids))  # no-op when already long enough

    # Explicit dtype: an empty list would otherwise become a float tensor,
    # which cannot index an embedding table.
    indices = torch.tensor(token_ids, dtype=torch.long)
    indices = indices.unsqueeze(0).to(device)  # Add batch dimension
    with torch.no_grad():
        output = model(indices)
        prediction = torch.argmax(output, dim=1).item()  # Get the predicted class (0 or 1)
    return "Positive" if prediction == 1 else "Negative"

# Test input text
# input() blocks until a review is typed, so this cell cannot run unattended
# and its printed result depends on what the user enters.
input_text = input("Enter a movie review: ")
result = predict_sentiment(model, input_text, word2idx, device)
print(f"Predicted Sentiment: {result}")


Enter a movie review: This movie is boring and violent!
Predicted Sentiment: Negative
