### Setup

The following directories/files are not included in the repository, to be extracted from `.zip` files in the following structure
```md
├── models
│   └── distilbert_model_no_pretrain
│       ├── config.json
│       └── model.safetensors
└── raw_data
    ├── balancedtest.csv
    └── fulltrain.csv
```

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

from sklearn.metrics import accuracy_score, precision_recall_fscore_support 
from sklearn.model_selection import train_test_split
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
from typing import Optional

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())
torch.cuda.device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

True


In [3]:
SEED = 42 # seed random state for comparison, testing
PARTITION_SIZE = 500 # Adjust lower if potato PC and higher if gaming rig or want results closer to actual
enable_all_data = True # SET TO FALSE IF PREPROCESSING TAKES A LONG TIME (True = test on PARTITION_SIZE training and PARTITION_SIZE testing samples)

### Pre-processing

In [4]:
df = pd.read_csv('raw_data/fulltrain.csv', header=None, index_col = False)
df.head()

Unnamed: 0,0,1
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [5]:
df = df if enable_all_data else df.sample(n=PARTITION_SIZE, random_state=SEED)

X_train = df.iloc[:, 1] 
y_train = df.iloc[:, 0] - 1

# print(X_train)
# print(y_train)

# print(len(X_train))
# print(len(y_train))

y_train.value_counts()

0
2    17870
0    14047
3     9995
1     6942
Name: count, dtype: int64

### Dataset

In [6]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X, y, tokenizer):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            str(self.X[idx]),
            add_special_tokens=True,
            max_length=512,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        y_tensor = torch.tensor(self.y[idx]).long()
        return encoding['input_ids'].flatten().to(device), encoding['attention_mask'].flatten().to(device), y_tensor.to(device)

### Model

In [7]:
# config = DistilBertConfig(num_labels=4)
# model = DistilBertForSequenceClassification(config=config).to(device)

In [8]:
model = DistilBertForSequenceClassification.from_pretrained('models/distilbert_model_no_pretrain').to(device)

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

### Training

In [9]:
train_dataset = CustomDataset(X_train, y_train, tokenizer)

In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [11]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()  # Set the model to training mode
    total_loss = 0.0

    for batch_idx, (input_ids, attention_mask, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Print average loss for the epoch
    print(f'Epoch {epoch + 1}/{EPOCHS},\nLoss: {total_loss / len(train_loader)}')

Epoch 1/5,
Loss: 0.24656167650380623
Epoch 2/5,
Loss: 0.06251932591843018
Epoch 3/5,
Loss: 0.030941288717102975
Epoch 4/5,
Loss: 0.018407969535647883
Epoch 5/5,
Loss: 0.014596801375869687


### Save Model

In [12]:
# model.save_pretrained("models/distilbert_model_no_pretrain")

### Evaluation

In [10]:
# TEST DATA 
test_df = pd.read_csv('raw_data/balancedtest.csv', index_col = False)
test_df = test_df if enable_all_data else test_df.sample(PARTITION_SIZE)

In [11]:
X_test = test_df.iloc[:, 1]
y_test = test_df.iloc[:, 0] - 1

In [12]:
test_dataset = CustomDataset(X_test, y_test, tokenizer)

In [13]:
# obtain predictions on test data
model.eval()

y_pred = []
y_true_test = []

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

with torch.no_grad():
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(test_loader):
        outputs = model(input_ids, attention_mask)
        y_pred.extend(torch.argmax(outputs.logits.cpu(), dim=1).numpy())
        y_true_test.extend(labels.cpu().numpy())

In [14]:
test_accuracy = accuracy_score(y_true_test, y_pred)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_true_test, y_pred, average='macro')

In [17]:
print(f'Test Accuracy: {test_accuracy:.8f},\tTest Precision: {test_precision:.8f},\tTest Recall: {test_recall:.8f},\tTest f1: {test_f1:.8f}')

Test Accuracy: 0.54618206,	Test Precision: 0.55088861,	Test Recall: 0.54614286,	Test f1: 0.49958813


In [15]:
print(pd.Series(y_pred).value_counts())

3    1447
1     908
0     354
2     290
Name: count, dtype: int64


In [16]:
class_test_precision, class_test_recall, class_test_f1, class_ = precision_recall_fscore_support(y_true_test, y_pred)
for i in range(4):
    print(f'Class {i}:\tTest Precision: {class_test_precision[i]:.8f},\tTest Recall: {class_test_recall[i]:.8f},\tTest f1: {class_test_f1[i]:.8f}')

Class 0:	Test Precision: 0.90677966,	Test Recall: 0.42857143,	Test f1: 0.58204896
Class 1:	Test Precision: 0.57158590,	Test Recall: 0.69200000,	Test f1: 0.62605549
Class 2:	Test Precision: 0.21724138,	Test Recall: 0.08400000,	Test f1: 0.12115385
Class 3:	Test Precision: 0.50794748,	Test Recall: 0.98000000,	Test f1: 0.66909422
