In [1]:
!kaggle datasets download -d kazanova/sentiment140

# Unzip the downloaded file
!unzip sentiment140.zip

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:05<00:00, 21.8MB/s]
100% 80.9M/80.9M [00:05<00:00, 16.5MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load and preprocess the data
# The CSV has no header row, so column names are supplied here. Only the label
# and the tweet text are read: selecting them in read_csv avoids loading the
# four unused columns and yields a fresh frame, so the assignment below does
# not write into a slice of another DataFrame.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    usecols=['target', 'text'],
)
# Remap the raw labels to class indices: 0 stays 0, 4 becomes 1.
# Any other raw value would become NaN under Series.map.
df['target'] = df['target'].map({0: 0, 4: 1})

# Tokenization and vocabulary building
def build_vocab(texts, max_words=10000):
    """Build a word -> index mapping from an iterable of strings.

    Texts are lower-cased and split on whitespace. Index 0 is reserved for
    '<PAD>' and index 1 for '<UNK>'; the remaining slots are filled with the
    most frequent words, most frequent first (ties keep first-seen order).

    Args:
        texts: Iterable of strings to count words in.
        max_words: Target vocabulary size including the two special tokens.
            Values below 2 yield a vocabulary holding only the special tokens.

    Returns:
        dict mapping each vocabulary word to its integer index.
    """
    word_freq = {}
    for text in texts:
        for word in text.lower().split():
            word_freq[word] = word_freq.get(word, 0) + 1

    # Two slots go to the special tokens. Clamp at zero: a negative slice bound
    # would count from the END of the ranking and keep almost every word.
    n_regular = max(max_words - 2, 0)
    ranked = sorted(word_freq, key=word_freq.get, reverse=True)

    vocab = ['<PAD>', '<UNK>'] + ranked[:n_regular]
    return {word: idx for idx, word in enumerate(vocab)}

# Build the word -> index mapping with build_vocab's default size cap.
# NOTE(review): this counts words over every row of df, including rows that the
# later train_test_split call assigns to the test set, so the vocabulary is not
# derived from the training data alone.
word_to_idx = build_vocab(df['text'])

# Tokenize and pad sequences
def tokenize(text, word_to_idx, max_len=100):
    """Convert a string into a fixed-length list of vocabulary indices.

    The text is lower-cased and split on whitespace, cut to at most
    ``max_len`` words, and each word is replaced by its index (or the
    '<UNK>' index when the word is not in ``word_to_idx``). Shorter results
    are right-padded with the '<PAD>' index up to ``max_len``.
    """
    words = text.lower().split()[:max_len]

    ids = []
    for w in words:
        ids.append(word_to_idx.get(w, word_to_idx['<UNK>']))

    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids.extend([word_to_idx['<PAD>']] * shortfall)
    return ids

# Tokenize every text with tokenize's default max_len (100), giving one
# fixed-length row of vocabulary indices per text.
X = np.array([tokenize(text, word_to_idx) for text in df['text']])
# Labels as a NumPy array, aligned row-for-row with X.
y = df['target'].values

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PyTorch Dataset
class SentimentDataset(Dataset):
    """Dataset of pre-tokenized texts and their integer class labels.

    Args:
        texts: Sequence/array of equal-length token-index rows.
        labels: Sequence/array of integer labels, one per row of ``texts``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels):
        self.texts = torch.LongTensor(texts)
        self.labels = torch.LongTensor(labels)
        # Fail fast: __len__ is driven by labels, so a mismatch would otherwise
        # either drop trailing texts silently or raise IndexError mid-epoch.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Transformer Block
class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention then feed-forward.

    Each sub-layer's output passes through dropout, is added back to its input
    (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Size of the feature dimension of the input and output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the position-wise feed-forward network.
        dropout: Dropout probability applied to each sub-layer's output.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(TransformerBlock, self).__init__()
        # batch_first is left at its default, so inputs are (seq_len, batch, embed_dim).
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        """Apply the block.

        Args:
            x: Tensor of shape (seq_len, batch, embed_dim).
            key_padding_mask: Optional bool tensor of shape (batch, seq_len);
                True marks positions that attention must ignore as keys.
                Defaults to None (attend to every position).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.attention(
            x, x, x,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.layernorm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        x = self.layernorm2(x + self.dropout(ff_output))
        return x

# Sentiment Analysis Model
class SentimentModel(nn.Module):
    """Transformer-encoder text classifier over token-index sequences.

    Pipeline: token embedding + learned positional encoding -> ``num_layers``
    TransformerBlocks -> mean over all sequence positions (no padding mask is
    applied) -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per block.
        ff_dim: Feed-forward hidden width per block.
        num_layers: Number of stacked TransformerBlocks.
        max_len: Longest supported sequence length.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len, num_classes):
        super(SentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Learned positional encoding, one vector per position, broadcast over the batch.
        self.pos_encoding = nn.Parameter(torch.randn(1, max_len, embed_dim))
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)]
        )
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Long tensor of token indices, shape (batch, seq_len) with
                ``seq_len <= max_len``.

        Returns:
            Tensor of logits, shape (batch, num_classes).

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the model was built with.
        """
        seq_len = x.size(1)
        max_len = self.pos_encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model's max_len {max_len}"
            )

        # Slice the positional table so inputs shorter than max_len also work;
        # for seq_len == max_len this is the full table.
        x = self.embedding(x) + self.pos_encoding[:, :seq_len, :]
        x = x.permute(1, 0, 2)  # (seq_len, batch, embedding_dim)
        for block in self.transformer_blocks:
            x = block(x)
        x = x.permute(1, 2, 0)  # (batch, embedding_dim, seq_len)
        x = self.global_avg_pool(x).squeeze(-1)  # mean over seq_len -> (batch, embedding_dim)
        x = self.classifier(x)
        return x

# Hyperparameters
vocab_size = len(word_to_idx)
embed_dim = 64
num_heads = 8
ff_dim = 512
num_layers = 4
max_len = 100
num_classes = 2
batch_size = 64
num_epochs = 2
learning_rate = 0.001
seed = 42

# Seed PyTorch's global RNG before anything random is created, so weight
# initialisation and the training shuffle order start from the same state on
# every run (GPU kernels may still introduce small run-to-run differences).
torch.manual_seed(seed)

# The model's positional encoding is sized by max_len, so it must agree with
# the width the texts were actually tokenized to.
assert X_train.shape[1] == max_len, (
    f"tokenized width {X_train.shape[1]} does not match max_len {max_len}"
)

# Create datasets and dataloaders
train_dataset = SentimentDataset(X_train, y_train)
test_dataset = SentimentDataset(X_test, y_test)
# Only the training data is shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentModel(vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
def _run_epoch(loader, train, desc=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Args:
        loader: DataLoader yielding ``(token_ids, labels)`` batches.
        train: If True, put the model in training mode and take one optimizer
            step per batch; if False, evaluate with gradients disabled.
        desc: Optional tqdm description; a progress bar is shown only when given.

    Returns:
        Tuple of the per-sample mean loss and the fraction of correct
        predictions, both taken over ``len(loader.dataset)`` samples.
    """
    model.train(train)
    total_loss = 0.0
    total_correct = 0
    batches = tqdm(loader, desc=desc) if desc is not None else loader
    with torch.set_grad_enabled(train):
        for batch_x, batch_y in batches:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            if train:
                optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            if train:
                loss.backward()
                optimizer.step()
            # criterion returns the batch mean; weight it by the batch size so a
            # smaller final batch does not skew the epoch average.
            total_loss += loss.item() * batch_y.size(0)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == batch_y).sum().item()

    n_samples = len(loader.dataset)
    return total_loss / n_samples, total_correct / n_samples


for epoch in range(num_epochs):
    train_loss, train_acc = _run_epoch(
        train_loader, train=True, desc=f'Epoch {epoch+1}/{num_epochs}'
    )

    # Validation
    val_loss, val_acc = _run_epoch(test_loader, train=False)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

# Save the model
# Only the parameters (state_dict) are written, not the model class itself.
torch.save(model.state_dict(), 'sentiment_model.pth')

# Function to predict sentiment with confidence
def predict_sentiment(text):
    """Classify one string with the module-level ``model``.

    The text is tokenized with ``tokenize`` and ``word_to_idx``, run through
    the model in eval mode as a batch of one, and the logits are turned into
    probabilities with softmax.

    Returns:
        ``(predicted_class_index, probability_of_that_class)`` as plain
        Python numbers.
    """
    model.eval()
    encoded = tokenize(text, word_to_idx)
    batch = torch.LongTensor([encoded]).to(device)
    with torch.no_grad():
        logits = model(batch)
        probs = torch.softmax(logits, dim=1)
        top_prob, top_class = probs.max(dim=1)
    return top_class.item(), top_prob.item()



Epoch 1/2: 100%|██████████| 20000/20000 [06:32<00:00, 50.94it/s]


Epoch 1/2:
Train Loss: 0.4686, Train Acc: 0.7732
Val Loss: 0.4355, Val Acc: 0.7957


Epoch 2/2: 100%|██████████| 20000/20000 [06:26<00:00, 51.68it/s]


Epoch 2/2:
Train Loss: 0.4282, Train Acc: 0.8002
Val Loss: 0.4290, Val Acc: 0.7999


In [15]:
# Example usage: classify a few hand-written sentences and show the result
texts = [
    "Oh no! aliens are here",
    "Oh yes! santa is here",
    "Hitler isnt dead",
    "The food was dry. "
]

for text in texts:
    sentiment, confidence = predict_sentiment(text)
    # Class index 1 is reported as positive; anything else as negative.
    label = 'Positive' if sentiment == 1 else 'Negative'
    print(
        f"Text: {text}",
        f"Sentiment: {label}",
        f"Confidence: {confidence:.2f}",
        "",
        sep="\n",
    )

Text: Oh no! aliens are here
Sentiment: Negative
Confidence: 0.92

Text: Oh yes! santa is here
Sentiment: Positive
Confidence: 0.96

Text: Hitler isnt dead
Sentiment: Negative
Confidence: 0.89

Text: The food was dry. 
Sentiment: Negative
Confidence: 0.76



In [16]:
import pickle

# Persist the word -> index mapping to word_to_idx.pkl in binary pickle format.
# NOTE(review): presumably kept alongside sentiment_model.pth so that inference
# code can tokenize text with the same indices — confirm. Pickle files should
# only ever be loaded from a trusted source.
with open('word_to_idx.pkl', 'wb') as f:
    pickle.dump(word_to_idx, f)
