In [76]:
import pandas as pd
import torch
import os

def is_running_on_kaggle():
    return 'KAGGLE_URL_BASE' in os.environ

print('Running on Kaggle' if is_running_on_kaggle() else 'Running on local machine')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Running on local machine
Using cuda device


In [77]:
DATA_PATH = '/kaggle/input/sentiment-analysis-dataset/' if is_running_on_kaggle() else 'data/'
TRAIN_FILE = DATA_PATH + 'train.csv'
TEST_FILE = DATA_PATH + 'test.csv'
TRAIN_SPLIT = 0.8
BLOCK_SIZE = 32

In [78]:
torch.manual_seed(0)

<torch._C.Generator at 0x778302970df0>

In [79]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
for word in ['not', 'no', 'nor']:
    stop_words.remove(word)

In [80]:
from nltk.stem import PorterStemmer
import nltk
import preprocessor as p
import string
import re
import contractions

def clean_tweet(row):
    text = row['text']
    p.set_options(p.OPT.URL, p.OPT.EMOJI)
    clean_text = p.clean(text)
    lower_text = clean_text.lower()
    
    expanded_text = contractions.fix(lower_text)
    fixed_text = expanded_text.replace('[^\w\s]',' ').replace('\s\s+', ' ').translate(str.maketrans('', '', string.punctuation))
    return fixed_text

def tokenize_tweet(row):
    text = row['cleaned_text']
    tokens = p.tokenize(text).split()
    st = PorterStemmer()
    tokens = [st.stem(word) for word in tokens]
    filtered_tokens = [w for w in tokens if not w in stop_words]
    return filtered_tokens

In [81]:
# df.head(10)

In [82]:
# df['cleaned_text']= df.apply(clean_tweet, axis=1)
# df.head(10)

In [83]:
# df['tokens'] = df.apply(tokenize_tweet, axis=1)
# df.head(10)

In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_csv(TRAIN_FILE)

# Define the sizes of training and validation sets
train_df, val_df = train_test_split(df, train_size=TRAIN_SPLIT, random_state=42)

In [85]:
import torch
import random
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, df, vocab=None):
        self.data = df
        self.data['cleaned_text'] = self.data.apply(clean_tweet, axis=1)
        self.data['tokens'] = self.data.apply(tokenize_tweet, axis=1)
        if 'sentiment' in self.data.columns:
            label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
            df['label'] = df['sentiment'].map(label_mapping)
    
    def build_vocab(self):
        all_tokens = set([token for tokens in self.data['tokens'] for token in tokens])
        vocab = {token: i for i, token in enumerate(all_tokens)}
        vocab['<UNK>'] = len(vocab)
        self.vocab = vocab

    def encode_sentence(self, tokens):
        encoded = torch.tensor([self.vocab[token] if token in self.vocab else self.vocab['<UNK>'] for token in tokens], dtype=torch.long)
        pos = [True] * len(encoded) + [False] * (BLOCK_SIZE-len(encoded))
        random.shuffle(pos)
        padded = torch.ones(BLOCK_SIZE, dtype=torch.long) * self.vocab['<UNK>']
        i = 0
        for j, p in enumerate(pos):
            if p:
                padded[j] = encoded[i]
                i += 1
        return padded
    
    def set_vocab(self, vocab):
        self.vocab = vocab

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        if 'encoding' not in self.data.columns:
            self.data['encoding'] = self.data['tokens'].apply(self.encode_sentence)
        encoding = self.data.iloc[idx]['encoding']
        label = self.data.iloc[idx]['label'] if 'label' in self.data.columns else None
        return encoding, label

In [98]:
# Create an instance of the TrainingDataset
train_data = SentimentDataset(train_df)
val_data = SentimentDataset(val_df)
train_data.build_vocab()
val_data.set_vocab(train_data.vocab)

In [99]:
# Print the sizes of the training and validation sets
print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))

Training set size: 19785
Validation set size: 4947


In [100]:
display(train_data[:10])

(8570     [tensor(18258), tensor(18258), tensor(18258), ...
 24337    [tensor(18258), tensor(18258), tensor(18258), ...
 21593    [tensor(1545), tensor(18258), tensor(18258), t...
 19424    [tensor(4392), tensor(18258), tensor(18258), t...
 11547    [tensor(18258), tensor(2669), tensor(18258), t...
 22493    [tensor(18258), tensor(18258), tensor(18258), ...
 20805    [tensor(18258), tensor(18258), tensor(18258), ...
 9419     [tensor(18258), tensor(18258), tensor(18258), ...
 14627    [tensor(18258), tensor(18258), tensor(18258), ...
 1608     [tensor(18258), tensor(18258), tensor(18258), ...
 Name: encoding, dtype: object,
 8570     1
 24337    1
 21593    0
 19424    2
 11547    1
 22493    1
 20805    2
 9419     2
 14627    0
 1608     1
 Name: label, dtype: int64)

In [89]:
import torch.nn as nn

class TransformerSentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes, num_heads, hidden_dim, num_layers, max_seq_len):
        super(TransformerSentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, embed_size))
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size,
                                                   nhead=num_heads,
                                                   dim_feedforward=hidden_dim,
                                                   dropout=0.2)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, src, src_key_padding_mask=None):
        src = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        src = src.permute(1, 0, 2)  # (S, N, E) for transformer
        output = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)
        output = output.mean(dim=0)  # Global average pooling
        output = self.fc(output)
        return output

In [102]:
from sklearn.metrics import f1_score, confusion_matrix

torch.manual_seed(0)

# model = SentimentClassifier(len(dataset.vocab), 128)
model = TransformerSentimentModel(vocab_size=len(train_data.vocab),
                                  embed_size=48,
                                  num_classes=3,
                                  num_heads=8,
                                  hidden_dim=2048,
                                  num_layers=6,
                                  max_seq_len=BLOCK_SIZE)
model.to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=128, shuffle=True)

epochs = 100
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_dataloader)
    
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    val_loss /= len(val_dataloader)
    
    f1 = f1_score(all_labels, all_preds, average='macro')
    cm = confusion_matrix(all_labels, all_preds)
    acc = sum([1 if p == l else 0 for p, l in zip(all_preds, all_labels)]) / len(all_preds)
    
    print(f"Epoch {epoch+1}/{epochs} => Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, F1: {f1:.4f}")
    print(f"Accuracy: {acc:.4f}")
    display(cm)



Epoch 1/100 => Train Loss: 1.1026, Validation Loss: 1.0890, F1: 0.1892
Accuracy: 0.3964


array([[   0, 1406,    0],
       [   0, 1961,    0],
       [   0, 1580,    0]])

Epoch 2/100 => Train Loss: 1.0885, Validation Loss: 1.0907, F1: 0.1892
Accuracy: 0.3964


array([[   0, 1406,    0],
       [   0, 1961,    0],
       [   0, 1580,    0]])

Epoch 3/100 => Train Loss: 1.0887, Validation Loss: 1.0915, F1: 0.1892
Accuracy: 0.3964


array([[   0, 1406,    0],
       [   0, 1961,    0],
       [   0, 1580,    0]])

KeyboardInterrupt: 

In [101]:
train_vocab = set(train_data.vocab)
val_vocab = set(val_data.vocab)
missing_vocab = val_vocab - train_vocab
print(len(train_vocab))
print(len(val_vocab))
print(len(missing_vocab))
display(missing_vocab)

18259
18259
0


set()

In [97]:
df_test = pd.read_csv(TEST_FILE)
dataset_test = SentimentDataset(df_test)
dataset_test.build_vocab()
print(len(dataset_test.vocab))

5206
