In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis-dataset/training.1600000.processed.noemoticon.csv
/kaggle/input/sentiment-analysis-dataset/train.csv
/kaggle/input/sentiment-analysis-dataset/testdata.manual.2009.06.14.csv
/kaggle/input/sentiment-analysis-dataset/test.csv


In [2]:
# [1] Import Libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [3]:
import os
import nltk

# Directory (inside the writable Kaggle working area) that holds the NLTK resources
custom_nltk_data_path = '/kaggle/working/nltk_data'

# Create the directory if it doesn't exist
os.makedirs(custom_nltk_data_path, exist_ok=True)

# Register the directory with NLTK only once: an unconditional append() adds a
# duplicate search-path entry every time this cell is re-run.
if custom_nltk_data_path not in nltk.data.path:
    nltk.data.path.append(custom_nltk_data_path)

# Download the tokenizer model and the corpora used by the preprocessing step
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, download_dir=custom_nltk_data_path)

# Report where the resources were written
print("NLTK resources downloaded to:", custom_nltk_data_path)


[nltk_data] Downloading package punkt to /kaggle/working/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
NLTK resources downloaded to: /kaggle/working/nltk_data


In [4]:
import os
import nltk

# Directory (inside the writable Kaggle working area) that holds the NLTK resources
custom_nltk_data_path = '/kaggle/working/nltk_data'

# Create the directory if it doesn't exist
os.makedirs(custom_nltk_data_path, exist_ok=True)

# Register the directory with NLTK only once: an unconditional append() adds a
# duplicate search-path entry every time this cell is re-run.
if custom_nltk_data_path not in nltk.data.path:
    nltk.data.path.append(custom_nltk_data_path)

# Manually download the required resource
nltk.download('wordnet', download_dir=custom_nltk_data_path)

# Verify that NLTK can actually locate the resource.
# Merely referencing nltk.corpus.wordnet does not search the data path (the corpus
# object is loaded lazily), so the lookup is done explicitly with nltk.data.find,
# which raises LookupError when the resource is missing.
try:
    nltk.data.find('corpora/wordnet')
    wordnet = nltk.corpus.wordnet
    print("WordNet resource found.")
except LookupError as e:
    print(f"Resource not found: {e}")

# Show the directories NLTK searches, in order
print("Paths searched by NLTK:")
print(nltk.data.path)


[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
WordNet resource found.
Paths searched by NLTK:
['/root/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data']


In [5]:
import zipfile
import os

# Root folder of the downloaded NLTK data
nltk_data_path = '/kaggle/working/nltk_data'

# Archives expected inside the 'corpora' sub-folder
zip_files = ['stopwords.zip', 'wordnet.zip', 'omw-1.4.zip']

# Extract each archive into the 'corpora' folder; report any that are missing
for archive_name in zip_files:
    archive_path = os.path.join(nltk_data_path, 'corpora', archive_name)
    if not os.path.exists(archive_path):
        print(f"{archive_name} not found.")
        continue
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(os.path.join(nltk_data_path, 'corpora'))
    print(f"Unzipped {archive_name}")


Unzipped stopwords.zip
Unzipped wordnet.zip
Unzipped omw-1.4.zip


In [6]:
# [3] Initialize preprocessing tools
# English stop words, stored as a set for membership tests in preprocess_text
stop_words = set(stopwords.words('english'))
# Lemmatizer applied to every token that survives stop-word filtering
lemmatizer = WordNetLemmatizer()


In [7]:
# [4] Enhanced preprocessing function
def preprocess_text(text):
    """
    Clean a single tweet and return it as a space-joined string of tokens.

    The steps run in this order: lowercase, strip URLs, strip <...> tags,
    replace @mentions with "@user", drop the '#' of hashtags, remove every
    character that is neither a word character nor whitespace (this also
    removes the '@' of "@user"), remove digits, tokenize, discard English
    stop words, and lemmatize the remaining tokens.
    """
    # (pattern, replacement) pairs, applied strictly in sequence
    substitutions = (
        (r'http\S+|www\.\S+', ''),
        (r'<.*?>', ''),
        (r'@\w+', '@user'),
        (r'#(\w+)', r'\1'),
        (r'[^\w\s]', ''),
        (r'\d+', ''),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


In [8]:
# [5] Model Architecture Classes
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    d_model must be divisible by num_heads; each head works on
    d_k = d_model // num_heads features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Input projections for queries, keys and values, then the output projection
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return (softmax(Q K^T / sqrt(d_k)) V, attention weights).

        When a mask is given, scores at positions where mask == 0 are set
        to -1e9 before the softmax.
        """
        scale = np.sqrt(self.d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)
        return torch.matmul(weights, V), weights

    def _split_heads(self, projected, batch_size):
        # (batch, seq, d_model) -> (batch, num_heads, seq, d_k)
        return projected.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, Q, K, V, mask=None):
        """Project Q/K/V, attend per head, merge the heads and project the result.

        Returns a tuple (output, attention_weights).
        """
        batch_size = Q.size(0)
        q_heads = self._split_heads(self.W_q(Q), batch_size)
        k_heads = self._split_heads(self.W_k(K), batch_size)
        v_heads = self._split_heads(self.W_v(V), batch_size)

        context, attention_weights = self.scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask
        )

        # (batch, num_heads, seq, d_k) -> (batch, seq, d_model)
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(merged), attention_weights


In [9]:
# [7] Load and preprocess data
def load_twitter_data(filepath):
    """Load the Twitter sentiment CSV, clean it and split it into train/validation sets.

    The file is read with the 'windows-1252' encoding and must contain a 'text'
    and a 'sentiment' column. Text is cleaned with preprocess_text and sentiment
    strings are mapped to integers (negative=0, neutral=1, positive=2).

    Rows are dropped when the text is missing, when the cleaned text is empty, or
    when the sentiment is missing or not one of the three known labels.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        (train_texts, val_texts, train_labels, val_labels) as NumPy arrays, from an
        80/20 split stratified on the sentiment label with random_state=42.
    """
    print("Loading and preprocessing data...")
    df = pd.read_csv(filepath, encoding='windows-1252')
    # .copy() so the column assignments below act on an independent frame
    df = df.dropna(subset=['text']).copy()
    df['text'] = df['text'].apply(preprocess_text)
    sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['sentiment'] = df['sentiment'].map(sentiment_map)
    # Series.map produces NaN for any label that is not a key of sentiment_map.
    # Such rows cannot be used as class targets, so they are removed together
    # with rows whose text became empty during preprocessing.
    usable = df['sentiment'].notna() & (df['text'].str.len() > 0)
    df = df[usable].reset_index(drop=True)
    # With the NaN rows gone the labels can be stored as integers again
    df['sentiment'] = df['sentiment'].astype('int64')
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'].values,
        df['sentiment'].values,
        test_size=0.2,
        random_state=42,
        stratify=df['sentiment']
    )
    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")
    return train_texts, val_texts, train_labels, val_labels


In [10]:
# [8] Training with learning rate scheduling and gradient clipping
def train_model(model, train_loader, val_loader, epochs=10, learning_rate=0.0001, device='cuda',
                checkpoint_path='best_model.pt'):
    """Train `model` with Adam and validate it after every epoch.

    Gradients are clipped to a maximum norm of 1.0, and the learning rate is
    halved by ReduceLROnPlateau when the validation loss has not improved for
    more than 2 epochs. Whenever the validation loss reaches a new minimum, a
    checkpoint (epoch, model and optimizer state, validation loss) is written
    to `checkpoint_path`.

    Args:
        model: Module called as model(input_ids) and returning class logits.
        train_loader: Iterable of batches; each batch is a mapping with
            'input_ids' and 'labels' tensors. Must support len().
        val_loader: Same batch format as train_loader, used for validation.
        epochs: Number of passes over train_loader.
        learning_rate: Initial learning rate for Adam.
        device: Device the batches are moved to; the model is expected to be
            on this device already.
        checkpoint_path: File the best checkpoint is saved to.

    Returns:
        dict with the per-epoch lists 'train_loss', 'val_loss' and 'val_accuracy'.

    Raises:
        ValueError: If either loader yields no batches (the average loss
            would otherwise be a division by zero).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    criterion = nn.CrossEntropyLoss()
    best_val_loss = float('inf')
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            # Clip before the optimizer step so a single large gradient cannot derail training
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
        history['train_loss'].append(total_loss / len(train_loader))

        # ---- validation pass ----
        avg_val_loss, accuracy = _evaluate(model, val_loader, criterion, device)
        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(accuracy)

        # The scheduler monitors the validation loss (mode='min')
        scheduler.step(avg_val_loss)

        # Keep only the checkpoint with the lowest validation loss seen so far
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': best_val_loss,
            }, checkpoint_path)
    return history


def _evaluate(model, data_loader, criterion, device):
    """Run one gradient-free pass over data_loader; return (mean batch loss, accuracy)."""
    model.eval()
    running_loss = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids)
            running_loss += criterion(outputs, labels).item()
            preds = torch.argmax(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = np.mean(np.array(predictions) == np.array(true_labels))
    return running_loss / len(data_loader), accuracy


In [11]:
# First, we need the positional encoding class
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings.

    A (1, max_seq_length, d_model) table is precomputed and stored as the
    non-trainable buffer `pe`: even feature indices hold sines, odd feature
    indices hold cosines.

    Args:
        d_model: Size of the feature dimension (may be even or odd).
        max_seq_length: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_seq_length=512):
        super().__init__()

        # Create a matrix of positional encodings
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term)
        # Apply cosine to odd indices. For an odd d_model there is one fewer odd
        # index than even index, so div_term is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Leading batch dimension so the table broadcasts over the batch
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return x plus the encodings of the first x.size(1) positions."""
        return x + self.pe[:, :x.size(1)]

# Then the transformer block
class TransformerBlock(nn.Module):
    """One encoder layer: self-attention followed by a position-wise feed-forward net.

    Each sub-layer is wrapped as LayerNorm(x + Dropout(sublayer(x))).
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Self-attention sub-layer
        self.attention = MultiHeadAttention(d_model, num_heads)

        # One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Feed-forward sub-layer: d_model -> d_ff -> d_model
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )

        # Dropout applied to each sub-layer output before the residual addition
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply both sub-layers to x; mask is passed through to the attention layer."""
        # Self-attention (query, key and value are all x); weights are discarded
        attended, _ = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward with its own residual connection
        return self.norm2(x + self.dropout(self.feed_forward(x)))

# Finally, the main transformer model
class TwitterSentimentTransformer(nn.Module):
    """Transformer encoder that classifies a token-id sequence into 3 sentiment classes.

    Pipeline: embedding -> positional encoding -> num_layers TransformerBlocks
    -> mean over the sequence dimension -> two-layer classification head.
    """

    def __init__(self, vocab_size, d_model=256, num_heads=8, num_layers=4, d_ff=1024, max_seq_length=128, dropout=0.1):
        super().__init__()

        # Token embeddings and fixed positional encodings
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Encoder stack
        encoder_layers = [
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ]
        self.transformer_blocks = nn.ModuleList(encoder_layers)

        # Classification head: d_model -> d_model // 2 -> 3 logits
        # (3 classes: negative, neutral, positive)
        self.fc1 = nn.Linear(d_model, d_model // 2)
        self.fc2 = nn.Linear(d_model // 2, 3)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return the class logits for token ids x; mask is handed to every block."""
        hidden = self.positional_encoding(self.embedding(x))

        for block in self.transformer_blocks:
            hidden = block(hidden, mask)

        # Average over every sequence position (the mask is not consulted here)
        pooled = hidden.mean(dim=1)

        features = self.dropout(F.relu(self.fc1(pooled)))
        return self.fc2(features)

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
# Dataset class definition
class TwitterDataset(Dataset):
    """Dataset that turns whitespace-separated texts into fixed-length id tensors.

    Words missing from `vocab` map to vocab['<UNK>']; sequences shorter than
    `max_length` are right-padded with vocab['<PAD>'] and longer ones are cut.
    """

    def __init__(self, texts, labels, vocab, max_length=128):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return {'input_ids': LongTensor of length max_length, 'labels': scalar LongTensor}."""
        # Map each word to its id, falling back to the unknown-word id
        ids = []
        for word in str(self.texts[idx]).split():
            ids.append(self.vocab.get(word, self.vocab['<UNK>']))

        # Bring the sequence to exactly max_length entries
        shortfall = self.max_length - len(ids)
        if shortfall > 0:
            ids.extend([self.vocab['<PAD>']] * shortfall)
        else:
            ids = ids[:self.max_length]

        sample = {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
        return sample

def build_vocabulary(texts, max_vocab_size=10000):
    """Build a word -> id vocabulary from whitespace-separated texts.

    Ids 0 and 1 are always reserved for '<PAD>' and '<UNK>'. The remaining ids
    go to the most frequent words (ties keep first-seen order) until the
    vocabulary holds `max_vocab_size` entries.

    Args:
        texts: Iterable of texts; each item is converted with str() and split
            on whitespace.
        max_vocab_size: Upper bound on the vocabulary size, special tokens
            included. Values below 2 yield only the two special tokens.

    Returns:
        dict mapping each token to its integer id.
    """
    word_freq = {}
    for text in texts:
        for word in str(text).split():
            word_freq[word] = word_freq.get(word, 0) + 1

    # sorted() is stable, so equally frequent words stay in first-seen order
    sorted_words = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)

    # Create vocabulary with special tokens
    vocab = {
        '<PAD>': 0,
        '<UNK>': 1,
    }

    for word, _ in sorted_words:
        # Stop at the size limit. Checking the length (instead of slicing with
        # max_vocab_size - 2) keeps a limit below 2 from turning into a negative
        # slice bound that would admit almost every word.
        if len(vocab) >= max_vocab_size:
            break
        # A corpus word spelled like a special token must not overwrite its id
        if word in vocab:
            continue
        vocab[word] = len(vocab)

    return vocab

# Wire the pieces together: device, data, vocabulary, loaders and model.

# Prefer the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Read, clean and split the training CSV
train_texts, val_texts, train_labels, val_labels = load_twitter_data(
    "/kaggle/input/sentiment-analysis-dataset/train.csv"
)

# The vocabulary is built from the training texts only
print("Building vocabulary...")
vocab = build_vocabulary(train_texts)
print(f"Vocabulary size: {len(vocab)}")

# Wrap both splits in datasets, then in loaders (only the training loader shuffles)
train_dataset = TwitterDataset(train_texts, train_labels, vocab)
val_dataset = TwitterDataset(val_texts, val_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Build the classifier and move it to the selected device
model = TwitterSentimentTransformer(vocab_size=len(vocab), d_model=256, num_heads=8, num_layers=4)
model = model.to(device)


Using device: cuda
Loading and preprocessing data...
Training samples: 21933
Validation samples: 5484
Building vocabulary...
Vocabulary size: 10000


In [13]:
# [9] Main execution
# Seed torch and NumPy before the model and the loaders are created
torch.manual_seed(42)
np.random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

train_texts, val_texts, train_labels, val_labels = load_twitter_data(
    "/kaggle/input/sentiment-analysis-dataset/train.csv"
)

# `vocab` is the vocabulary produced by the earlier build_vocabulary call
train_dataset = TwitterDataset(train_texts, train_labels, vocab)
val_dataset = TwitterDataset(val_texts, val_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

model = TwitterSentimentTransformer(vocab_size=len(vocab), d_model=256, num_heads=8, num_layers=4)
model = model.to(device)

# Train for 10 epochs and keep the per-epoch loss/accuracy history
history = train_model(model, train_loader, val_loader, epochs=10, device=device)





Using device: cuda
Loading and preprocessing data...
Training samples: 21933
Validation samples: 5484


Epoch 1/10: 100%|██████████| 686/686 [00:25<00:00, 26.69it/s]
Epoch 2/10: 100%|██████████| 686/686 [00:25<00:00, 26.97it/s]
Epoch 3/10: 100%|██████████| 686/686 [00:26<00:00, 25.96it/s]
Epoch 4/10: 100%|██████████| 686/686 [00:27<00:00, 24.77it/s]
Epoch 5/10: 100%|██████████| 686/686 [00:28<00:00, 23.70it/s]
Epoch 6/10: 100%|██████████| 686/686 [00:27<00:00, 24.68it/s]
Epoch 7/10: 100%|██████████| 686/686 [00:27<00:00, 24.66it/s]
Epoch 8/10: 100%|██████████| 686/686 [00:28<00:00, 24.37it/s]
Epoch 9/10: 100%|██████████| 686/686 [00:28<00:00, 24.48it/s]
Epoch 10/10: 100%|██████████| 686/686 [00:27<00:00, 24.65it/s]


In [16]:
import torch

# Assuming `model` is your trained PyTorch model
# Saves only the parameters (state_dict) of `model` as they are when this cell runs.
# This file is separate from the 'best_model.pt' checkpoint that train_model writes
# each time the validation loss improves.
torch.save(model.state_dict(), 'model.pth')
