### Setup

The following directories/files are not included in the repository, to be extracted from `.zip` files in the following structure
```md
├── models
│   └── BiLSTM
│       ├── config.json
│       └── model.safetensors
└── raw_data
    ├── balancedtest.csv
    └── fulltrain.csv
```

In [9]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import torchtext

from sklearn.metrics import accuracy_score, precision_recall_fscore_support 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader

In [10]:
print(torch.cuda.is_available())
torch.cuda.device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

True


In [11]:
SEED = 42 # seed random state for comparison, testing
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)

### Pre-processing

In [12]:
df = pd.read_csv('../input/lun-raw/fulltrain.csv', header=None, index_col = False)
df.head()

Unnamed: 0,0,1
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [13]:
df = df if enable_all_data else df.sample(n=PARTITION_SIZE, random_state=SEED)

X = df.iloc[:, 1] 
y = df.iloc[:, 0] - 1

y.value_counts()

0
2    17870
0    14047
3     9995
1     6942
Name: count, dtype: int64

### Dataset

In [14]:
class CustomDataset(Dataset):
    def __init__(self, X, y, vectorizer):
        self.X = X
        self.y = y
        self.vectorizer = vectorizer

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        text = self.X.iloc[index]
        label = self.y.iloc[index]

        tfidf_tensor = torch.tensor(self.vectorizer.transform([text]).todense()).float().to(device)
        
        return tfidf_tensor, label


In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [16]:
# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

In [17]:
train_dataset = CustomDataset(X_train, y_train, vectorizer)
val_dataset = CustomDataset(X_val, y_val, vectorizer)
# train_dataset = CustomDataset(X, y, vectorizer)

In [18]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [22]:
print(X_train_vectorized.shape)
# print(X_train_vectorized[0])
# print(X_train_vectorized[0].shape)
print(X_train.shape)
print(y.shape)
# print(X_train[34996])
# print(y_train[34996])

(39083, 202377)
(39083,)
(48854,)


### Model

In [24]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTM, self).__init__()
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 4)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)

        # Forward pass through LSTM
        out, _ = self.lstm(x, (h0, c0))

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

### Training

In [28]:
model = BiLSTM(202377, 128, 2).to(device)

In [31]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

In [32]:
EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(texts)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for texts, labels in val_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Epoch [{epoch+1}/{EPOCHS}], Val Accuracy: {100 * correct / total:.2f}%')

Epoch [1/5], Val Accuracy: 52.88%
Epoch [2/5], Val Accuracy: 69.67%
Epoch [3/5], Val Accuracy: 89.95%
Epoch [4/5], Val Accuracy: 93.71%
Epoch [5/5], Val Accuracy: 95.11%


### Save model

In [38]:
torch.save(model.state_dict(), "/kaggle/working/BiLSTM")

### Evaluation

In [40]:
# TEST DATA 
test_df = pd.read_csv('../input/lun-raw/balancedtest.csv', index_col = False)
test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE)

In [42]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0] - 1

In [43]:
test_dataset = CustomDataset(X_test, y_test, vectorizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [47]:
model.eval()
correct = 0
total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

In [50]:
test_accuracy = accuracy_score(y_true, y_pred)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

In [51]:
print(f'Test Accuracy: {test_accuracy:.8f},\tTest Precision: {test_precision:.8f},\tTest Recall: {test_recall:.8f},\tTest f1: {test_f1:.8f}')

Test Accuracy: 0.74458153,	Test Precision: 0.75284128,	Test Recall: 0.74459012,	Test f1: 0.73768862


In [52]:
print(pd.Series(y_pred).value_counts())

2    975
3    837
0    702
1    485
Name: count, dtype: int64


In [54]:
class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(y_true, y_pred)
for i in range(4):
    print(f'Class {i}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

Class 0:	Test Precision: 0.82193732,	Test Recall: 0.77036048,	Test f1: 0.79531358
Class 1:	Test Precision: 0.74845361,	Test Recall: 0.48400000,	Test f1: 0.58785425
Class 2:	Test Precision: 0.62974359,	Test Recall: 0.81866667,	Test f1: 0.71188406
Class 3:	Test Precision: 0.81123059,	Test Recall: 0.90533333,	Test f1: 0.85570258
