### TODO
- Replace tfidf input with word embedding + positional embedding
- Fix validation tests
- Look at output from model

### Setup

In [None]:
# RUN SETUP.SH BEFORE RUNNING THIS IPYNB

import pandas as pd
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression # Logistic Regression Classifier
from sklearn.neural_network import MLPClassifier # Multi Layer Perceptron, simple Neural Network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import ADASYN, RandomOverSampler
from scipy.sparse import hstack, csr_matrix
import spacy
import re
import numpy as np
from spacytextblob.spacytextblob import SpacyTextBlob

In [None]:
SEED = 42 # seed random state for comparison, testing
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)

### Pre-processing

In [None]:
df = pd.read_csv('raw_data/fulltrain.csv', header=None, index_col = False)
df.head()

In [None]:
df = df if enable_all_data else df.sample(n=PARTITION_SIZE, random_state=SEED)

X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0]

print(X_train)
print(y_train)

print(len(X_train))
print(len(y_train))

y_train.value_counts()

In [None]:
# Processing data: tokenize the text for NLP Machine Learning
# Lemmatization, Case-folding (lowercase), Stopword removal, Punctuation removal
# Eric
personal_pronouns = ["i", "me", "mine", "my", "myself", "our", "ours", "we", \
                     "their", "you", "your", "he", "she", "it", "its", "we", "they", "me", \
                     "him", "her", "us", "them", "his", "hers", "herself", \
                        "himself", "itself", "themselves", "ourselves", "yourself", "yourselves"]
spacy_preprocess_model = spacy.load("en_core_web_lg")
spacy_preprocess_model.Defaults.stop_words -= set(personal_pronouns)

In [None]:
# Processing data: tokenize the text for NLP Machine Learning
# Case-folding (lowercase), Stopword removal, Punctuation removal

def preprocess(sentence):
    '''
    Preprocessing strategies:
    1) Tokenization
    2) Punctuation removal
    3) Stopword removal
    4) Lowercase
    '''
    tokens = spacy_preprocess_model(sentence)
    ls_sentence = [token.text.lower() for token in tokens if not (token.is_punct and token not in ["!", "?"]) and not token.is_stop]
    return ls_sentence

#### Save and load preprocessed data

In [None]:
# quickload pre-processed data
# replace 'raw_data/*.csv' with .csv file containing preprocessed data
X_train = pd.read_csv('strip_punct_stop_lower.csv', index_col=False).iloc[:, 0]
X_train.head()

# Reload constants if preprocessing cells are not executed
personal_pronouns = ["i", "me", "mine", "my", "myself", "our", "ours", "we", \
                     "their", "you", "your", "he", "she", "it", "its", "we", "they", "me", \
                     "him", "her", "us", "them", "his", "hers", "herself", \
                        "himself", "itself", "themselves", "ourselves", "yourself", "yourselves"]
spacy_model = spacy.load("en_core_web_lg")
spacy_model.Defaults.stop_words -= set(personal_pronouns)
spacy_model.add_pipe('spacytextblob')

#### Train-Validation Split

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

#### Feature Engineering

In [None]:
# Feature set:
# 1) TF-IDF
tfidf_vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Consolidation of feature sets into single vector:
X_train = hstack([X_train_tfidf])

#### Oversampling

In [None]:
# RandomOverSampler
ros = RandomOverSampler(random_state=SEED)
X_train, y_train = ros.fit_resample(X_train, y_train)

### Model Architecture

In [None]:
import torch
import torch.nn as nn

#### Modules

In [159]:
class Attention(nn.Module):
    ### Implements Scaled Dot Product Attention
    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, mask=None, dropout=None):
        # All inputshapes: (batch_size B, seq_len L, model_size D)

        # Perform Q*K^T (* is the dot product here)
        # We have to use torch.matmul since we work with batches!
        out = torch.matmul(Q, K.transpose(1, 2)) # => shape (B, L, D)

        # scale alignment scores
        out = out / (Q.shape[-1] **0.5)

        # Push through softmax layer
        out = nn.functional.softmax(out, dim=-1)

        # Multiply scaled alignment scores with values V
        return torch.matmul(out, V)


class AttentionHead(nn.Module):
    def __init__(self, model_size, qkv_size):
        super().__init__()
        self.Wq = nn.Linear(model_size, qkv_size)
        self.Wk = nn.Linear(model_size, qkv_size)
        self.Wv = nn.Linear(model_size, qkv_size)
        self.attention = Attention()

    def forward(self, queries, keys, values):
        # Computes scaled dot-product attention
        return self.attention(self.Wq(queries),
                              self.Wk(keys),
                              self.Wv(values))
 
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, model_size, qkv_size):
        super().__init__()

        # Define num_heads attention heads
        self.heads = nn.ModuleList(
            [ AttentionHead(model_size, qkv_size) for _ in range(num_heads) ]
        )

        # Linear layer to "unify" all heads into one
        self.Wo = nn.Linear(num_heads * qkv_size, model_size)

    def forward(self, query, key, value):
        # Compute the outputs for all attention heads
        out_heads = [ head(query, key, value) for head in self.heads ]

        # Concatenate output of all attention heads
        out = torch.cat(out_heads, dim=-1)

        # Unify concatenated output to the model size
        return self.Wo(out)

class FeedForward(nn.Module):
    def __init__(self, model_size, hidden_size=2048):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(model_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, model_size),
        )
    
    def forward(self, X):
        return self.net(X)

class TransformerEncoderLayer(nn.Module):
    def __init__(self, model_size, num_heads, ff_hidden_size, dropout):
        super().__init__()

        # Define sizes of Q/K/V based on model size and number of heads
        qkv_size = max(model_size // num_heads, 1)

        # MultiHeadAttention block
        self.mhal = MultiHeadAttention(num_heads, model_size, qkv_size)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(model_size)

        # FeedForward block
        self.ff = FeedForward(model_size, ff_hidden_size)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(model_size)

    def forward(self, source):
        # MultiHeadAttention block
        out1 = self.mhal(source, source, source)
        out1 = self.dropout1(out1)
        out1 = self.norm1(out1 + source)

        # FeedForward block
        out2 = self.ff(out1)
        out2 = self.dropout2(out2)
        out2 = self.norm2(out2)

        return out2

class TransformerEncoder(nn.Module):
    def __init__(self,
                 num_layers=6,
                 model_size=512,
                 num_heads=8,
                 ff_hidden_size=2048,
                 dropout=0.1):
        super().__init__()

        #define num_layers (N) encoder layers
        self.layers = nn.ModuleList(
            [ TransformerEncoderLayer(model_size,
                                    num_heads,
                                    ff_hidden_size,
                                    dropout)
              for _ in range(num_layers)
            ]
        )
    
    def forward(self, source):
        # Push through each encoder layer
        for l in self.layers:
            source = l(source)
        return source


#### Classifier

In [160]:
class EncoderOnlyClassifier(nn.Module):
    def __init__(self,
                 input_size,
                 num_layers=6,
                 model_size=512,
                 num_heads=8,
                 ff_hidden_size=2048,
                 dropout=0.1,
                 num_classes=4):
        super().__init__()

        self.encoder = TransformerEncoder(num_layers, model_size, num_heads, ff_hidden_size, dropout)
        self.linear = nn.Linear(model_size, num_classes)
    
    def forward(self, source):
        output = self.encoder(output)
        output = self.linear(output)
        return output

### Training

In [161]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Convert sparse matrix to dense tensor
        X_tensor = torch.tensor(self.X[idx].todense()).float()
        y_tensor = torch.tensor(self.y[idx]).long()
        return X_tensor, y_tensor

In [162]:
model = EncoderOnlyClassifier(input_size=X_train.shape[1])

In [163]:
dataset = CustomDataset(X_train, y_train)

In [164]:
loss_fn = nn.CrossEntropyLoss()

optimiser = torch.optim.Adam(params=model.parameters(), lr=0.005)

loader = torch.utils.data.DataLoader(dataset, batch_size=256)

num_epochs = 1

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0.0

    for batch_idx, (inputs, labels) in enumerate(loader):
        optimiser.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass
        loss = loss_fn(outputs, labels - 1)  # Compute the loss
        loss.backward()  # Backward pass
        optimiser.step()  # Update the parameters

        total_loss += loss.item()

    # Print average loss for the epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(loader)}')


torch.Size([256, 64]) torch.Size([256, 64])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

### Validation

In [None]:
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [None]:
# Consolidation of feature sets:
X_val_final = hstack([X_val_tfidf])

In [None]:
val_dataset = CustomDataset(X_val_final, y_val)

In [None]:
# obtain predictions on test data
model.eval()

y_pred_val = []

val_loader = torch.utils.data.DataLoader(val_dataset)

with torch.no_grad():
    for batch_idx, (inputs, labels) in enumerate(val_loader):  # Assuming test_loader is your DataLoader for test data
        outputs = torch.argmax(model(inputs))
        y_pred_val.extend(outputs)

In [None]:
# evaluate model training metrics with macro f1 score
f1_score(y_val, y_pred_val, average='macro')

### Test Data

In [None]:
# TEST DATA 
test_df = pd.read_csv('raw_data/balancedtest.csv', index_col = False)
test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE)

In [None]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0]

# print(X_test.head())
# print(y_test.head())

In [None]:
# Preprocess test data to match steps on training data
X_test_ls = X_test.apply(preprocess)
X_test_sentence = X_test_ls.apply(lambda sentence: ' '.join(sentence))

X_test = X_test_sentence

#### Feature Engineering (Test Data)

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Consolidation of feature transformations into single vector
# Eric
X_test_final = hstack([X_test_tfidf])

In [None]:
test_dataset = CustomDataset(X_test_final, y_test)

In [None]:
# obtain predictions on test data
model.eval()

y_pred = []

test_loader = torch.utils.data.DataLoader(test_dataset)

for inputs, labels in test_loader:  # Assuming test_loader is your DataLoader for test data
    with torch.no_grad():
        outputs = model(inputs)
        outputs = torch.argmax(outputs)
        y_pred.append(outputs.item())

In [None]:
# evaluate model training metrics with macro f1 score
f1_score(y_test, y_pred, average='macro')