In [None]:
import ast
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Read the pre-normalized dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer="normalized_data.csv")

In [None]:
# Encode labels as integers
label_encoder = LabelEncoder()
# LabelEncoder numbers the distinct label values in sorted order
# (e.g. "negative" -> 0, "positive" -> 1 if those are the labels present).
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# Prepare the vocabulary
# Assumes each 'normalized_tokens' cell holds the repr of a list of strings -- TODO confirm.
# ast.literal_eval only parses Python literals, so a crafted CSV cell cannot run
# arbitrary code the way eval() would.
# NOTE(review): the vocabulary is built from every row, i.e. before the train/test split.
token_counts = Counter(
    token
    for tokens in data['normalized_tokens'].apply(ast.literal_eval)
    for token in tokens
)
# Most frequent token gets index 1; index 0 is reserved for padding
vocab = {word: idx for idx, (word, _) in enumerate(token_counts.most_common(), start=1)}
vocab_size = len(vocab) + 1  # Add one for padding index

# Convert normalized_tokens into sequences of indices
def tokens_to_indices(tokens, vocab, max_len=100):
    """Map a token list to a fixed-length list of vocabulary indices.

    Args:
        tokens: Sequence of tokens to encode.
        vocab: Mapping from token to integer index.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` integers: the indices of the first ``max_len``
        tokens (0 for tokens missing from ``vocab``), right-padded with 0.
    """
    clipped = tokens[:max_len]
    encoded = list(map(lambda tok: vocab.get(tok, 0), clipped))
    # Fill the remaining slots with the padding index
    shortfall = max_len - len(encoded)
    encoded.extend([0] * shortfall)
    return encoded

max_len = 60  # Set maximum sequence length
# Parse each stored token list with ast.literal_eval (literals only, unlike eval,
# which would execute whatever code a CSV cell contains), then encode and pad it.
data['token_indices'] = data['normalized_tokens'].apply(
    lambda cell: tokens_to_indices(ast.literal_eval(cell), vocab, max_len)
)

# Split into training and testing sets
# Every row was padded/truncated to max_len, so X is a 2-D integer array
X = np.array(data['token_indices'].tolist())
y = np.array(data['label_encoded'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define Dataset class
class SentimentDataset(Dataset):
    """Dataset pairing token-index sequences with their integer class labels.

    Both ``inputs`` and ``labels`` are copied into ``torch.long`` tensors when
    the dataset is constructed.
    """

    def __init__(self, inputs, labels):
        as_long = dict(dtype=torch.long)
        self.inputs = torch.tensor(inputs, **as_long)
        self.labels = torch.tensor(labels, **as_long)

    def __len__(self):
        # The label tensor defines how many samples there are
        return len(self.labels)

    def __getitem__(self, idx):
        # Return an (input sequence, label) pair for the requested position
        features = self.inputs[idx]
        target = self.labels[idx]
        return features, target

In [None]:

# Training split: batches of 32, reshuffled on every pass
train_dataset = SentimentDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Held-out split: batches of 32 in fixed order
test_dataset = SentimentDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
# Define CNN model
class CNNModel(nn.Module):
    """Text classifier that runs several convolution widths over token embeddings.

    Each kernel size ``k`` gets its own Conv2d -> BatchNorm2d -> ReLU branch whose
    kernel spans ``k`` consecutive tokens and the full embedding width. Every
    branch is max-pooled over the sequence axis; the pooled features are
    concatenated, passed through dropout and projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        num_classes: Number of output logits.
        kernel_sizes: Sequence of window heights (tokens per filter), one branch each.
        num_filters: Output channels of every branch.
        dropout: Dropout probability applied before the final linear layer.
        padding_idx: Token id treated as padding; its embedding row is initialised
            to zeros and receives no gradient. Pass ``None`` to train that row
            like any other token.
    """

    def __init__(self, vocab_size, embed_size, num_classes, kernel_sizes, num_filters,
                 dropout=0.6, padding_idx=0):
        super().__init__()
        # Index 0 is the padding id produced by the preprocessing step, so by
        # default it is excluded from learning.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, num_filters, (k, embed_size)),
                nn.BatchNorm2d(num_filters),
                nn.ReLU()
            ) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of shape [batch_size, seq_len]; ``seq_len`` must be
                at least the largest kernel size for the convolutions to apply.

        Returns:
            Tensor of shape [batch_size, num_classes] with unnormalised logits.
        """
        x = self.embedding(x)  # [batch_size, seq_len, embed_size]
        x = x.unsqueeze(1)  # Add channel dimension: [batch_size, 1, seq_len, embed_size]
        # Each branch already ends in ReLU, so no extra activation is applied here.
        # The kernel covers the whole embedding width, leaving a size-1 last axis to drop.
        branch_maps = [conv(x).squeeze(3) for conv in self.convs]
        # Max-over-time pooling: keep the strongest response of every filter
        pooled = [feature_map.max(dim=2).values for feature_map in branch_maps]
        x = torch.cat(pooled, 1)  # [batch_size, len(kernel_sizes) * num_filters]
        x = self.dropout(x)
        return self.fc(x)


In [None]:
# Hyperparameters
embed_size = 128
num_classes = len(label_encoder.classes_)  # one logit per distinct label
kernel_sizes = [2, 3, 4, 5, 6]
num_filters = 150

# Seed PyTorch's global RNG (same value as the split's random_state) so weight
# initialisation, dropout masks and DataLoader shuffling repeat across runs.
torch.manual_seed(42)

# Initialize model, loss, and optimizer
model = CNNModel(vocab_size, embed_size, num_classes, kernel_sizes, num_filters)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate after every 3 scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)


In [None]:
# Training loop
def train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, epochs=10):
    """Train ``model`` for ``epochs`` passes and evaluate after each one.

    Every epoch runs one optimisation step per batch of ``train_loader``, prints
    the mean batch loss and the training accuracy, advances ``scheduler`` once,
    and then calls ``evaluate_model(model, test_loader)``.

    Args:
        model: Network to optimise; switched to train mode at the start of each epoch.
        train_loader: Iterable of (inputs, labels) batches used for optimisation.
        test_loader: Iterable of (inputs, labels) batches passed to ``evaluate_model``.
        criterion: Loss callable applied to (outputs, labels).
        optimizer: Optimiser updating the model parameters.
        scheduler: Learning-rate scheduler stepped once per epoch.
        epochs: Number of passes over ``train_loader``.
    """
    for epoch_number in range(1, epochs + 1):
        model.train()
        running_loss = 0
        predicted, expected = [], []

        print(f"\nEpoch {epoch_number}/{epochs}")
        batches = tqdm(train_loader, total=len(train_loader), desc="Training")

        for inputs, labels in batches:
            # Standard optimisation step: reset grads, forward, backward, update
            optimizer.zero_grad()
            logits = model(inputs)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            loss_value = batch_loss.item()
            running_loss += loss_value

            # Keep predictions and targets for the epoch-level accuracy
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

            # Show the latest batch loss next to the progress bar
            batches.set_postfix({"Batch Loss": loss_value})

        train_accuracy = accuracy_score(expected, predicted)
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch_number} Summary: Loss = {mean_loss:.4f}, Train Accuracy = {train_accuracy:.4f}")

        # Advance the learning-rate schedule once per epoch
        scheduler.step()

        # Report metrics on the held-out loader (evaluate_model is looked up at call time)
        evaluate_model(model, test_loader)

In [None]:
def evaluate_model(model, test_loader, loss_fn=None):
    """Compute and print the mean batch loss and accuracy of ``model`` on ``test_loader``.

    The model is switched to eval mode and left in that mode on return.

    Args:
        model: Network to evaluate.
        test_loader: Sized iterable of (inputs, labels) batches.
        loss_fn: Loss callable applied to (outputs, labels). When omitted, the
            notebook-level ``criterion`` is used, which keeps existing
            two-argument calls working.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the loss averaged over batches and the
        fraction of correctly classified samples.

    Raises:
        ValueError: If ``test_loader`` contains no batches.
    """
    if loss_fn is None:
        # Fall back to the loss object defined at notebook level
        loss_fn = criterion

    num_batches = len(test_loader)
    if num_batches == 0:
        raise ValueError("test_loader is empty; nothing to evaluate")

    model.eval()
    total_loss = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            total_loss += loss_fn(outputs, labels).item()

            # Count hits with tensor ops instead of building Python lists
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            seen += labels.size(0)

    avg_loss = total_loss / num_batches
    val_accuracy = correct / seen
    print(f"Validation Loss: {avg_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    return avg_loss, val_accuracy

In [None]:
# Run 10 training epochs; validation metrics are printed after each one
train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=10,
)



Epoch 1/10


Training: 100%|██████████| 526/526 [04:09<00:00,  2.11it/s, Batch Loss=1.02]


Epoch 1 Summary: Loss = 0.8822, Train Accuracy = 0.5955
Validation Loss: 0.6269, Validation Accuracy: 0.6369

Epoch 2/10


Training: 100%|██████████| 526/526 [04:06<00:00,  2.13it/s, Batch Loss=0.628]


Epoch 2 Summary: Loss = 0.6469, Train Accuracy = 0.6622
Validation Loss: 0.5183, Validation Accuracy: 0.7361

Epoch 3/10


Training: 100%|██████████| 526/526 [04:12<00:00,  2.09it/s, Batch Loss=0.445]


Epoch 3 Summary: Loss = 0.5678, Train Accuracy = 0.7179
Validation Loss: 0.4966, Validation Accuracy: 0.7475

Epoch 4/10


Training: 100%|██████████| 526/526 [04:18<00:00,  2.03it/s, Batch Loss=0.262]


Epoch 4 Summary: Loss = 0.4434, Train Accuracy = 0.7934
Validation Loss: 0.4728, Validation Accuracy: 0.7803

Epoch 5/10


Training: 100%|██████████| 526/526 [04:17<00:00,  2.04it/s, Batch Loss=0.685]


Epoch 5 Summary: Loss = 0.3905, Train Accuracy = 0.8311
Validation Loss: 0.4512, Validation Accuracy: 0.7915

Epoch 6/10


Training: 100%|██████████| 526/526 [04:21<00:00,  2.01it/s, Batch Loss=0.0945]


Epoch 6 Summary: Loss = 0.3380, Train Accuracy = 0.8588
Validation Loss: 0.4512, Validation Accuracy: 0.7970

Epoch 7/10


Training: 100%|██████████| 526/526 [04:19<00:00,  2.03it/s, Batch Loss=0.277]


Epoch 7 Summary: Loss = 0.2614, Train Accuracy = 0.9002
Validation Loss: 0.4785, Validation Accuracy: 0.8019

Epoch 8/10


Training: 100%|██████████| 526/526 [04:21<00:00,  2.01it/s, Batch Loss=0.169]


Epoch 8 Summary: Loss = 0.2275, Train Accuracy = 0.9160
Validation Loss: 0.4907, Validation Accuracy: 0.8005

Epoch 9/10


Training: 100%|██████████| 526/526 [04:22<00:00,  2.01it/s, Batch Loss=0.206]


Epoch 9 Summary: Loss = 0.2054, Train Accuracy = 0.9245
Validation Loss: 0.5065, Validation Accuracy: 0.8036

Epoch 10/10


Training: 100%|██████████| 526/526 [04:23<00:00,  2.00it/s, Batch Loss=0.117]


Epoch 10 Summary: Loss = 0.1640, Train Accuracy = 0.9429
Validation Loss: 0.5480, Validation Accuracy: 0.8012
