In [1]:
def load_short_reviews(file_path, label):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read().strip()
    reviews = content.split()
    labels = [label] * len(reviews)
    lengths = ['short'] * len(reviews)
    return reviews, labels, lengths

# Load short positive and negative reviews
short_positive_reviews, short_positive_labels, short_positive_lengths = load_short_reviews('positive-words.txt', 1)
short_negative_reviews, short_negative_labels, short_negative_lengths = load_short_reviews('negative-words.txt', 0)

In [2]:
# Combine short reviews and labels
short_reviews = short_positive_reviews + short_negative_reviews
short_labels = short_positive_labels + short_negative_labels
short_lengths = short_positive_lengths + short_negative_lengths

In [3]:
def load_imdb_data(data_dir):
    import os
    texts = []
    labels = []
    for label in ['pos', 'neg']:
        dir_name = os.path.join(data_dir, label)
        for fname in os.listdir(dir_name):
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                    texts.append(f.read())
                    labels.append(1 if label == 'pos' else 0)
    return texts, labels

train_texts, train_labels = load_imdb_data('aclImdb/train')
test_texts, test_labels = load_imdb_data('aclImdb/test')

In [4]:
# Combine long reviews and labels
long_reviews = train_texts + test_texts
long_labels = train_labels + test_labels
long_lengths = ['long'] * len(long_reviews)

In [5]:
import pandas as pd

# Create DataFrame for long reviews
long_df = pd.DataFrame({
    'review': long_reviews,
    'label': long_labels,
    'length': long_lengths
})

# Create DataFrame for short reviews
short_df = pd.DataFrame({
    'review': short_reviews,
    'label': short_labels,
    'length': short_lengths
})

# Combine long and short reviews into one DataFrame
all_reviews_df = pd.concat([long_df, short_df], ignore_index=True)

In [6]:
from sklearn.model_selection import train_test_split

# Perform stratified train-test split
train_df, test_df = train_test_split(
    all_reviews_df,
    test_size=0.2,
    stratify=all_reviews_df[['label', 'length']],
    random_state=42
)

In [7]:
# Verify the distribution in training set
print("Training set distribution:")
print(train_df['label'].value_counts())
print(train_df['length'].value_counts())

# Verify the distribution in test set
print("\nTest set distribution:")
print(test_df['label'].value_counts())
print(test_df['length'].value_counts())

Training set distribution:
label
0    23826
1    21605
Name: count, dtype: int64
length
long     40000
short     5431
Name: count, dtype: int64

Test set distribution:
label
0    5957
1    5401
Name: count, dtype: int64
length
long     10000
short     1358
Name: count, dtype: int64


In [8]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [9]:
# Updated TransformerEncoder
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, dropout=0.1):
        super().__init__()
        self.model_dim = model_dim
        self.embedding = nn.Embedding(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim, dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(model_dim, 1)  # For binary classification

    def forward(self, src, src_mask=None):
        src = self.embedding(src) * math.sqrt(self.model_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_key_padding_mask=src_mask)
        output = output.mean(dim=1)  # Global average pooling (dim=1 for batch_first=True)
        output = self.fc_out(self.dropout(output))
        return output.squeeze()

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [12]:
# Updated Hyperparameters
INPUT_DIM = 10000  # Assuming this remains unchanged
MODEL_DIM = 256    # Increased from 128 to 256
NUM_HEADS = 8      # Increased from 4 to 8
NUM_LAYERS = 4     # Increased from 2 to 4
DROPOUT = 0.2      # Increased from 0.1 to 0.2
BATCH_SIZE = 32    # Increased from 16 to 32
MAX_SEQ_LEN = 256  # Increased from 128 to 256

In [13]:
# Initialize the updated model
model = TransformerEncoder(
    input_dim=INPUT_DIM,
    model_dim=MODEL_DIM,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
)

In [17]:
from collections import Counter
import re

def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.strip().split()

# Build vocabulary from training data
counter = Counter()
for text in train_df['review']:
    tokens = tokenize(text)
    counter.update(tokens)

# Keep most common words
vocab_size = 10000
most_common = counter.most_common(vocab_size - 2)  # Reserve 2 for PAD and UNK tokens
word2idx = {word: idx + 2 for idx, (word, _) in enumerate(most_common)}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

# Update INPUT_DIM
INPUT_DIM = vocab_size

In [18]:
# Update the encoding function to accommodate longer sequences
def encode(text):
    tokens = tokenize(text)
    indices = [word2idx.get(token, word2idx['<UNK>']) for token in tokens]
    if len(indices) > MAX_SEQ_LEN:
        indices = indices[:MAX_SEQ_LEN]
    else:
        indices += [word2idx['<PAD>']] * (MAX_SEQ_LEN - len(indices))
    return indices

train_sequences = [encode(text) for text in train_df['review']]
test_sequences = [encode(text) for text in test_df['review']]

In [19]:
import torch
from torch.utils.data import DataLoader, TensorDataset

train_inputs = torch.LongTensor(train_sequences)
train_labels_tensor = torch.FloatTensor(train_df['label'].tolist())
test_inputs = torch.LongTensor(test_sequences)
test_labels_tensor = torch.FloatTensor(test_df['label'].tolist())

train_dataset = TensorDataset(train_inputs, train_labels_tensor)
test_dataset = TensorDataset(test_inputs, test_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [20]:
# Update the training loop with gradient clipping and learning rate scheduler
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

TransformerEncoder(
  (embedding): Embedding(10000, 256)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc_out): Linear(in_features=256, out_features=1, bias=True)
)

In [21]:
num_epochs = 20  # Increased number of epochs to accommodate deeper model

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    scheduler.step()  # Update learning rate
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Evaluation remains similar, with detailed metrics as previously outlined

Epoch 1/20, Loss: 0.5651
Epoch 2/20, Loss: 0.4740
Epoch 3/20, Loss: 0.4438
Epoch 4/20, Loss: 0.4222
Epoch 5/20, Loss: 0.4066
Epoch 6/20, Loss: 0.3919
Epoch 7/20, Loss: 0.3838
Epoch 8/20, Loss: 0.3738
Epoch 9/20, Loss: 0.3655
Epoch 10/20, Loss: 0.3541
Epoch 11/20, Loss: 0.3369
Epoch 12/20, Loss: 0.3334
Epoch 13/20, Loss: 0.3323


KeyboardInterrupt: 

In [22]:
from torch.utils.data import Dataset

class IndexedTensorDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    
    def __getitem__(self, index):
        return self.inputs[index], self.labels[index], index
    
    def __len__(self):
        return len(self.inputs)

test_dataset = IndexedTensorDataset(test_inputs, test_labels_tensor)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [23]:
#Evaluation loop
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

model.eval()
all_preds = []
all_labels = []
all_lengths = []

with torch.no_grad():
    for inputs, labels, indices in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        preds = torch.round(torch.sigmoid(outputs))
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        
        # Get review lengths from test_df using indices
        batch_lengths = test_df.iloc[indices.numpy()]['length'].tolist()
        all_lengths.extend(batch_lengths)

# Convert to NumPy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)
all_lengths = np.array(all_lengths)

In [24]:
# Calculate overall metrices
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

target_names = ['Negative', 'Positive']
print("\nOverall Classification Report:")
print(classification_report(all_labels, all_preds, target_names=target_names))

Test Accuracy: 0.8366

Overall Classification Report:
              precision    recall  f1-score   support

    Negative       0.83      0.86      0.85      5957
    Positive       0.84      0.81      0.82      5401

    accuracy                           0.84     11358
   macro avg       0.84      0.84      0.84     11358
weighted avg       0.84      0.84      0.84     11358



In [25]:
# Calculate metrices broken down by review length
lengths = np.unique(all_lengths)

for length in lengths:
    indices = np.where(all_lengths == length)
    length_labels = all_labels[indices]
    length_preds = all_preds[indices]
    
    print(f"\nClassification Report for {length.capitalize()} Reviews:")
    print(classification_report(length_labels, length_preds, target_names=target_names))
    
    cm_length = confusion_matrix(length_labels, length_preds)
    print(f"Confusion Matrix for {length.capitalize()} Reviews:")
    print(cm_length)


Classification Report for Long Reviews:
              precision    recall  f1-score   support

    Negative       0.86      0.85      0.86      5000
    Positive       0.85      0.86      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix for Long Reviews:
[[4256  744]
 [ 686 4314]]

Classification Report for Short Reviews:
              precision    recall  f1-score   support

    Negative       0.71      0.93      0.81       957
    Positive       0.38      0.10      0.16       401

    accuracy                           0.69      1358
   macro avg       0.55      0.52      0.48      1358
weighted avg       0.62      0.69      0.62      1358

Confusion Matrix for Short Reviews:
[[891  66]
 [360  41]]


In [26]:
# Calculate metrices broken down by class
from sklearn.metrics import precision_score, recall_score, f1_score

for class_label, class_name in enumerate(target_names):
    indices = np.where(all_labels == class_label)
    class_labels = all_labels[indices]
    class_preds = all_preds[indices]
    
    precision = precision_score(class_labels, class_preds, pos_label=class_label)
    recall = recall_score(class_labels, class_preds, pos_label=class_label)
    f1 = f1_score(class_labels, class_preds, pos_label=class_label)
    
    print(f"\nMetrics for {class_name} Class:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")


Metrics for Negative Class:
Precision: 1.0000
Recall:    0.8640
F1-Score:  0.9271

Metrics for Positive Class:
Precision: 1.0000
Recall:    0.8063
F1-Score:  0.8928


In [27]:
'''
import pickle

# Save the trained model
torch.save(model.state_dict(), 'sentiment_model.pt')

# Save the word2idx mapping
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)
'''

"\nimport pickle\n\n# Save the trained model\ntorch.save(model.state_dict(), 'sentiment_model.pt')\n\n# Save the word2idx mapping\nwith open('word2idx.pkl', 'wb') as f:\n    pickle.dump(word2idx, f)\n"

In [28]:
# Save the model state after 12 epochs
torch.save(model.state_dict(), 'model_after_12_epochs.pt')

In [29]:
# Load the saved model state
model.load_state_dict(torch.load('model_after_12_epochs.pt'))

  model.load_state_dict(torch.load('model_after_12_epochs.pt'))


<All keys matched successfully>

In [30]:
start_epoch = 14
num_epochs = 20  # Increased number of epochs to accommodate deeper model

for epoch in range(start_epoch, num_epochs + 1):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    #scheduler.step()  # Update learning rate
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Evaluation remains similar, with detailed metrics as previously outlined

Epoch 15/20, Loss: 0.3297
Epoch 16/20, Loss: 0.3283
Epoch 17/20, Loss: 0.3262
Epoch 18/20, Loss: 0.3243
Epoch 19/20, Loss: 0.3245
Epoch 20/20, Loss: 0.3228
Epoch 21/20, Loss: 0.3190


In [31]:
from torch.utils.data import Dataset

class IndexedTensorDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels
    
    def __getitem__(self, index):
        return self.inputs[index], self.labels[index], index
    
    def __len__(self):
        return len(self.inputs)

test_dataset = IndexedTensorDataset(test_inputs, test_labels_tensor)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [32]:
#Evaluation loop
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

model.eval()
all_preds = []
all_labels = []
all_lengths = []

with torch.no_grad():
    for inputs, labels, indices in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        preds = torch.round(torch.sigmoid(outputs))
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        
        # Get review lengths from test_df using indices
        batch_lengths = test_df.iloc[indices.numpy()]['length'].tolist()
        all_lengths.extend(batch_lengths)

# Convert to NumPy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)
all_lengths = np.array(all_lengths)

In [33]:
# Calculate overall metrices
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

target_names = ['Negative', 'Positive']
print("\nOverall Classification Report:")
print(classification_report(all_labels, all_preds, target_names=target_names))

Test Accuracy: 0.8376

Overall Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.88      0.85      5957
    Positive       0.86      0.79      0.82      5401

    accuracy                           0.84     11358
   macro avg       0.84      0.84      0.84     11358
weighted avg       0.84      0.84      0.84     11358



In [34]:
# Calculate metrices broken down by review length
lengths = np.unique(all_lengths)

for length in lengths:
    indices = np.where(all_lengths == length)
    length_labels = all_labels[indices]
    length_preds = all_preds[indices]
    
    print(f"\nClassification Report for {length.capitalize()} Reviews:")
    print(classification_report(length_labels, length_preds, target_names=target_names))
    
    cm_length = confusion_matrix(length_labels, length_preds)
    print(f"Confusion Matrix for {length.capitalize()} Reviews:")
    print(cm_length)


Classification Report for Long Reviews:
              precision    recall  f1-score   support

    Negative       0.84      0.88      0.86      5000
    Positive       0.87      0.84      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix for Long Reviews:
[[4401  599]
 [ 815 4185]]

Classification Report for Short Reviews:
              precision    recall  f1-score   support

    Negative       0.72      0.90      0.80       957
    Positive       0.41      0.16      0.23       401

    accuracy                           0.68      1358
   macro avg       0.56      0.53      0.51      1358
weighted avg       0.63      0.68      0.63      1358

Confusion Matrix for Short Reviews:
[[865  92]
 [338  63]]


In [1]:
# Calculate metrices broken down by class
from sklearn.metrics import precision_score, recall_score, f1_score

for class_label, class_name in enumerate(target_names):
    indices = np.where(all_labels == class_label)
    class_labels = all_labels[indices]
    class_preds = all_preds[indices]
    
    precision = precision_score(class_labels, class_preds, pos_label=class_label)
    recall = recall_score(class_labels, class_preds, pos_label=class_label)
    f1 = f1_score(class_labels, class_preds, pos_label=class_label)
    
    print(f"\nMetrics for {class_name} Class:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")

NameError: name 'target_names' is not defined