### Setup

In [None]:
import pandas as pd
import torch
import torch.nn as nn

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
from typing import Optional

In [None]:
# Report whether a CUDA device is visible, then pick the device every tensor
# and the model will be moved to. The bare `torch.cuda.device(0)` call that
# used to sit here only constructed a context manager without entering it, so
# it had no effect and has been dropped.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device('cuda' if cuda_available else 'cpu')

In [None]:
# Random seed reused for row sampling, the train/validation split and torch,
# so that runs can be compared.
SEED = 42
# Number of rows sampled when enable_all_data is False. Lower it on a slow
# machine, raise it for results closer to a full run.
PARTITION_SIZE = 500
# True  = use every row of the training and test CSVs.
# False = down-sample each of them to PARTITION_SIZE rows (use this when
#         preprocessing takes a long time).
enable_all_data = True

### Pre-processing

In [None]:
# Load the training split. The file is read without a header row, so the
# columns are addressed by position (0 and 1) in the cells below.
df = pd.read_csv(
    '/kaggle/input/labeled-unreliable-news-lun/fulltrain.csv',
    header=None,
    index_col=False,
)
df.head()

In [None]:
# Optionally shrink the training frame to a reproducible random subset.
if not enable_all_data:
    df = df.sample(n=PARTITION_SIZE, random_state=SEED)

# Column 0 is used as the label, column 1 as the input text.
y_train = df.iloc[:, 0]
X_train = df.iloc[:, 1]

# Handy sanity checks while debugging:
# print(X_train); print(y_train)
# print(len(X_train), len(y_train))
# y_train.value_counts()

#### Train-Validation Split

In [None]:
# Hold out 20% of the rows for validation. `.values` hands plain arrays to the
# splitter, so the resulting pieces are indexed by position.
X_train, X_val, y_train, y_val = train_test_split(
    X_train.values,
    y_train.values,
    test_size=0.2,
    random_state=SEED,
)

#### Feature Engineering

#### Oversampling

### Model Architecture

#### Modules

In [None]:
class WordEmbeddings(nn.Module):
    """
    Token embeddings plus learned absolute position embeddings, followed by
    LayerNorm and dropout.

    Adapted from: 
    - (Embedding layer) https://github.com/huggingface/transformers/blob/main/src/transformers/models/distilbert/modeling_distilbert.py
    - (DistilBertConfig) https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/distilbert/configuration_distilbert.py#L45
    """

    def __init__(self,
                 vocab_size=30522,
                 max_position_embeddings=1024,
                 dim=768,
                 dropout=0.1,
                 pad_token_id=0):
        """
        Parameters:
            vocab_size: number of rows in the token embedding table.
            max_position_embeddings: longest sequence length that can be embedded.
            dim: embedding width.
            dropout: dropout probability applied to the final embeddings.
            pad_token_id: token id whose embedding is treated as padding.
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, dim, padding_idx=pad_token_id)
        self.position_embeddings = nn.Embedding(max_position_embeddings, dim)

        self.LayerNorm = nn.LayerNorm(dim, eps=1e-12)
        self.dropout = nn.Dropout(dropout)
        # Non-persistent buffer: it follows the module across devices but is
        # not written to the state dict.
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(self, input_ids: Optional[torch.Tensor], input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Parameters:
            input_ids (torch.Tensor or None):
                torch.tensor(bs, max_seq_length) The token ids to embed. When given, they take
                precedence and any `input_embeds` argument is ignored.
            input_embeds (*optional*, torch.Tensor):
                torch.tensor(bs, max_seq_length, dim) Pre-computed word embeddings, used only when
                `input_ids` is `None`.

        Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type
        embeddings)

        Raises:
            ValueError: if both `input_ids` and `input_embeds` are `None`.
        """
        if input_ids is not None:
            input_embeds = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
        elif input_embeds is None:
            raise ValueError("Either input_ids or input_embeds must be provided.")

        seq_length = input_embeds.size(1)

        if hasattr(self, "position_ids"):
            # Normal path: slice the buffer registered in the constructor.
            position_ids = self.position_ids[:, :seq_length]  # (1, max_seq_length)
        else:
            # Fallback when the buffer is missing. Device and length come from
            # input_embeds because input_ids may legitimately be None here.
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_embeds.device)
            position_ids = position_ids.unsqueeze(0)  # (1, max_seq_length), broadcast over the batch

        position_embeddings = self.position_embeddings(position_ids)  # (1, max_seq_length, dim)

        embeddings = input_embeds + position_embeddings  # (bs, max_seq_length, dim)
        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
        embeddings = self.dropout(embeddings)  # (bs, max_seq_length, dim)
        return embeddings

class Attention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d)) V."""

    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, mask=None, dropout=None):
        """
        Parameters:
            Q, K, V: tensors of shape (batch_size B, seq_len L, size D).
            mask (*optional*): tensor broadcastable to the (B, L, L) score matrix, or a
                (B, L) key-padding mask. Positions where the mask equals 0 receive no attention.
            dropout (*optional*): callable (e.g. an `nn.Dropout` instance) applied to the
                attention weights.

        Returns: tensor of shape (B, L, D_v), the attention-weighted sum of V.
        """
        # Q * K^T over the last two dimensions; matmul handles the batch.
        scores = torch.matmul(Q, K.transpose(-2, -1))  # => shape (B, L, L)

        # Scale alignment scores by sqrt of the query/key size.
        scores = scores / (Q.shape[-1] ** 0.5)

        if mask is not None:
            if mask.dim() == scores.dim() - 1:
                # (B, L) key mask -> (B, 1, L) so it broadcasts over query positions.
                mask = mask.unsqueeze(-2)
            # Use the most negative finite value rather than -inf so that a
            # fully masked row does not turn into NaN after the softmax.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Normalise scores into attention weights.
        weights = nn.functional.softmax(scores, dim=-1)

        if dropout is not None:
            weights = dropout(weights)

        # Weight the values V by the attention weights.
        return torch.matmul(weights, V)

class AttentionHead(nn.Module):
    """A single attention head: linear Q/K/V projections feeding the attention module."""

    def __init__(self, model_size, qkv_size):
        super().__init__()
        # Created in the order Wq, Wk, Wv so seeded initialisation is unchanged.
        for projection_name in ("Wq", "Wk", "Wv"):
            setattr(self, projection_name, nn.Linear(model_size, qkv_size))
        self.attention = Attention()

    def forward(self, queries, keys, values):
        # Project each input into the head's own subspace, then attend.
        projected_q = self.Wq(queries)
        projected_k = self.Wk(keys)
        projected_v = self.Wv(values)
        return self.attention(projected_q, projected_k, projected_v)
 
class MultiHeadAttention(nn.Module):
    """Runs several attention heads side by side and merges them with a linear layer."""

    def __init__(self, num_heads, model_size, qkv_size):
        super().__init__()

        # One independent AttentionHead per head.
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(AttentionHead(model_size, qkv_size))

        # Projects the concatenated head outputs back to the model size.
        self.Wo = nn.Linear(num_heads * qkv_size, model_size)

    def forward(self, query, key, value):
        # Every head sees the same inputs.
        per_head = tuple(head(query, key, value) for head in self.heads)

        # Stack the head outputs along the feature dimension.
        merged = torch.cat(per_head, dim=-1)

        # Bring the merged features back to the model size.
        return self.Wo(merged)

class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear."""

    def __init__(self, model_size, hidden_size=2048):
        super().__init__()

        widen = nn.Linear(model_size, hidden_size)
        narrow = nn.Linear(hidden_size, model_size)
        # Kept as an nn.Sequential called `net` so parameter names stay
        # net.0.* and net.2.* in the state dict.
        self.net = nn.Sequential(widen, nn.ReLU(), narrow)

    def forward(self, X):
        transformed = self.net(X)
        return transformed

class TransformerEncoderLayer(nn.Module):
    """
    One encoder layer: self-attention followed by a feed-forward network, each
    wrapped as dropout -> residual add -> LayerNorm.
    """

    def __init__(self, model_size, num_heads, ff_hidden_size, dropout):
        """
        Parameters:
            model_size: width of the layer's input and output.
            num_heads: number of attention heads.
            ff_hidden_size: hidden width of the feed-forward block.
            dropout: dropout probability applied after each sub-block.
        """
        super().__init__()

        # Define sizes of Q/K/V based on model size and number of heads
        qkv_size = max(model_size // num_heads, 1)

        # MultiHeadAttention block
        self.mhal = MultiHeadAttention(num_heads, model_size, qkv_size)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(model_size)

        # FeedForward block
        self.ff = FeedForward(model_size, ff_hidden_size)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(model_size)

    def forward(self, source):
        """
        Parameters:
            source: input tensor; it is used as query, key and value of the self-attention.

        Returns: tensor with the same shape as `source`.
        """
        # MultiHeadAttention block (self-attention + residual + norm)
        out1 = self.mhal(source, source, source)
        out1 = self.dropout1(out1)
        out1 = self.norm1(out1 + source)

        # FeedForward block (feed-forward + residual + norm)
        out2 = self.ff(out1)
        out2 = self.dropout2(out2)
        # The residual connection around the feed-forward block was missing,
        # unlike the attention block above; add it back before normalising.
        out2 = self.norm2(out2 + out1)

        return out2

class TransformerEncoder(nn.Module):
    """A stack of `num_layers` TransformerEncoderLayer modules applied one after another."""

    def __init__(self,
                 num_layers=6,
                 model_size=768,
                 num_heads=8,
                 ff_hidden_size=2048,
                 dropout=0.1):
        super().__init__()

        # Build the N identical (but independently initialised) encoder layers.
        stack = []
        for _ in range(num_layers):
            stack.append(
                TransformerEncoderLayer(model_size, num_heads, ff_hidden_size, dropout)
            )
        self.layers = nn.ModuleList(stack)

    def forward(self, source):
        # Each layer consumes the previous layer's output.
        hidden = source
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        return hidden


#### Classifier

In [None]:
class EncoderOnlyClassificationModel(nn.Module):
    """
    Encoder-only sequence classifier: embeddings -> transformer encoder ->
    first-token pooling -> two-layer classification head.

    References: 
    - (DistilBertForSequenceClassification) https://github.com/huggingface/transformers/blob/main/src/transformers/models/distilbert/modeling_distilbert.py
    - (DistilBertConfig) https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/distilbert/configuration_distilbert.py#L45
    """
    def __init__(self,
                 num_layers=6,
                 model_size=768,
                 num_heads=8,
                 ff_hidden_size=2048,
                 encoder_dropout=0.1,
                 classifier_dropout=0.2,
                 num_classes=4):
        """
        Parameters:
            num_layers: number of encoder layers.
            model_size: embedding / hidden width used throughout the model.
            num_heads: attention heads per encoder layer.
            ff_hidden_size: hidden width of each encoder feed-forward block.
            encoder_dropout: dropout probability inside the encoder layers.
            classifier_dropout: dropout probability in the classification head.
            num_classes: number of output logits.
        """
        super().__init__()

        self.embeddings = WordEmbeddings(dim=model_size)
        self.encoder = TransformerEncoder(num_layers, model_size, num_heads, ff_hidden_size, encoder_dropout)
        self.pre_classifier = nn.Linear(model_size, model_size)
        self.classifier = nn.Linear(model_size, num_classes)
        self.dropout = nn.Dropout(classifier_dropout)
    
    def forward(self, input_ids, attention_mask=None):
        """
        Parameters:
            input_ids: torch.tensor(bs, seq_length) of token ids.
            attention_mask (*optional*): accepted so batches of
                (input_ids, attention_mask) can be passed straight through.
                NOTE(review): it is not consumed yet because the encoder has no
                mask argument, so padding positions still take part in attention.

        Returns: torch.tensor(bs, num_classes) of unnormalised class logits.
        """
        # Only the ids go to the embedding layer. The mask used to be passed as
        # its second positional argument (`input_embeds`), where it was
        # silently discarded.
        embeddings = self.embeddings(input_ids)
        # Pool by taking the hidden state of the first token of every sequence.
        pooled = self.encoder(embeddings)[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that tokenises one text per item and returns
    (input_ids, attention_mask, label) tensors on the global `device`.
    """

    # Lazily created tokenizer shared by every instance that does not bring its own.
    _default_tokenizer = None

    def __init__(self, X, y, tokenizer=None, max_length=1024):
        """
        Parameters:
            X: sequence of texts (list, NumPy array or pandas Series).
            y: sequence of integer labels aligned with `X`.
            tokenizer (*optional*): Hugging Face style tokenizer; defaults to the
                'distilbert-base-uncased' DistilBertTokenizer, loaded on first use
                instead of at class-definition time.
            max_length: length every sample is padded / truncated to.
        """
        # pandas objects index by label, which breaks positional access once the
        # rows have been sampled or filtered; fall back to plain arrays.
        self.X = X.to_numpy() if hasattr(X, "to_numpy") else X
        self.y = y.to_numpy() if hasattr(y, "to_numpy") else y
        self.tokenizer = tokenizer if tokenizer is not None else self._get_default_tokenizer()
        self.max_length = max_length

    @classmethod
    def _get_default_tokenizer(cls):
        # Load the default tokenizer once and reuse it for later datasets.
        if cls._default_tokenizer is None:
            cls._default_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        return cls._default_tokenizer

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, label) for the sample at position `idx`."""
        # Calling the tokenizer directly is the documented replacement for the
        # deprecated `encode_plus`; the arguments are unchanged.
        encoding = self.tokenizer(
            str(self.X[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        y_tensor = torch.tensor(self.y[idx]).long()
        # return_tensors='pt' yields a leading batch dimension of 1; flatten drops it.
        return (
            encoding['input_ids'].flatten().to(device),
            encoding['attention_mask'].flatten().to(device),
            y_tensor.to(device),
        )

In [None]:
# Build the classifier with its default hyper-parameters, then move it to the
# selected device.
model = EncoderOnlyClassificationModel()
model = model.to(device)

In [None]:
# Wrap both splits so that a DataLoader can tokenise them sample by sample.
train_dataset, val_dataset = (
    CustomDataset(X_train, y_train),
    CustomDataset(X_val, y_val),
)

In [None]:
loss_fn = nn.CrossEntropyLoss()

optimiser = torch.optim.AdamW(model.parameters(), lr=2e-5)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# Built once up front: the validation loader does not change between epochs.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

# Seeds the shuffling of train_loader. The model weights were already
# initialised when `model` was created, so they are not affected by this call.
torch.manual_seed(SEED)

EPOCHS = 2

for epoch in range(EPOCHS):
    model.train()  # Set the model to training mode
    total_loss = 0.0

    for input_ids, attention_mask, labels in train_loader:
        optimiser.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Shift labels down by one so they line up with the 0-based class
        # indices CrossEntropyLoss expects (assumes the data labels start at 1).
        loss = loss_fn(outputs, labels - 1)
        loss.backward()
        optimiser.step()

        total_loss += loss.item()

    # obtain predictions on val data
    model.eval()
    y_pred_val = []
    y_true_val = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            outputs = model(input_ids, attention_mask).cpu()
            y_pred_val.extend(torch.argmax(outputs, dim=1).tolist())
            # Store the ground truth already shifted to 0-based indices: the
            # previous `y_true_val - 1` on a Python list raised a TypeError.
            y_true_val.extend((labels - 1).cpu().tolist())

    val_accuracy = accuracy_score(y_true_val, y_pred_val)
    val_f1 = f1_score(y_true_val, y_pred_val, average='macro')
    # Print average loss for the epoch (averaged over the training batches;
    # the undefined name `loader` used here before raised a NameError).
    print(f'Epoch {epoch + 1}/{EPOCHS},\nLoss: {total_loss / len(train_loader)},\nValidation Accuracy: {val_accuracy:.4f},\nValidation f1: {val_f1:.4f}')

### Save model

In [None]:
# Bundle everything needed to resume or reload: a model object, the trained
# weights and the optimiser state.
# NOTE(review): the 'model' entry is a freshly constructed, untrained instance
# that gets pickled alongside the trained weights. Confirm that whatever loads
# this checkpoint really needs it, since it adds a full extra copy of the
# parameters to the file.
checkpoint = dict(
    model=EncoderOnlyClassificationModel(),
    state_dict=model.state_dict(),
    optimiser=optimiser.state_dict(),
)

torch.save(checkpoint, 'checkpoint.pth')

### Test Data

In [None]:
# TEST DATA
# Read with header=None, exactly like the training file: the columns are
# addressed by position below, and without it pandas would consume the first
# data row as column names.
# NOTE(review): assumes balancedtest.csv has the same header-less layout as
# fulltrain.csv; confirm against the file.
test_df = pd.read_csv(
    '/kaggle/input/labeled-unreliable-news-lun/balancedtest.csv',
    header=None,
    index_col=False,
)
if not enable_all_data:
    # Seeded like the training sample so runs are comparable, and capped at the
    # frame length so a large PARTITION_SIZE cannot make sample() raise.
    test_df = test_df.sample(n=min(PARTITION_SIZE, len(test_df)), random_state=SEED)

In [None]:
# Hand plain arrays to the dataset, as is done for the training split. A
# pandas Series is indexed by label, so after `sample()` has shuffled the index
# a positional lookup such as X_test[0] would fail or return the wrong row.
X_test = test_df.iloc[:, 1].values
y_test = test_df.iloc[:, 0].values

#### Feature Engineering (Test Data)

In [None]:
# Wrap the test split in the same dataset class used for training/validation.
test_dataset = CustomDataset(X=X_test, y=y_test)

In [None]:
# Run the trained model over the test split and collect the predicted class
# indices together with the raw labels.
model.eval()

y_pred, y_true_test = [], []

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

with torch.no_grad():
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(test_loader):
        outputs = model(input_ids, attention_mask).cpu()
        # Highest-scoring class per sample.
        y_pred += list(outputs.argmax(dim=1).numpy())
        # Labels are stored unshifted; the metrics cell aligns them.
        y_true_test += list(labels.cpu().numpy())

In [None]:
# y_true_test is a plain Python list, so `y_true_test - 1` raised a TypeError.
# Shift each label individually to line it up with the 0-based predictions.
y_true_zero_based = [label - 1 for label in y_true_test]

test_accuracy = accuracy_score(y_true_zero_based, y_pred)
test_f1 = f1_score(y_true_zero_based, y_pred, average='macro')

print(f'Test Accuracy: {test_accuracy:.4f},\nTest f1: {test_f1:.4f}')