In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
import torch.nn as nn
import numpy as np
import json
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import torch.nn.functional as F
import warnings
from torch import optim
# Report every warning each time it is triggered rather than once per location.
warnings.filterwarnings(action='always')
# When True, the model-setup cell restores weights from `model_path` before training.
load = False

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ' + str(device))

In [None]:
class Tokenizer:
    """Whitespace tokenizer backed by a word -> id vocabulary.

    `word2id` is stored as given and its inverse is exposed as `id2word`.
    The vocabulary is expected to contain the special entries "<oov>" and
    "<pad>", which `tokenize` looks up on every call.
    """

    def __init__(self, word2id):
        self.word2id = word2id
        self.id2word = dict((idx, token) for token, idx in word2id.items())

    def tokenize(self, text, max_length=512):
        """Encode `text` into a fixed-length id tensor and a known-word mask.

        `text` is converted with `str`, stripped of leading/trailing "." and
        split on whitespace; only the first `max_length` words are kept.

        Returns:
            (ids, mask): two int64 tensors of shape `(max_length,)`.
            `ids` holds the vocabulary id of each word ("<oov>" id for unknown
            words) and the "<pad>" id in every unused slot. `mask` is 1 where
            the word was found in the vocabulary and 0 for out-of-vocabulary
            words and padding.
        """
        vocab = self.word2id
        oov_id = vocab.get("<oov>")
        pad_id = vocab.get("<pad>")

        ids = torch.full((max_length,), pad_id, dtype=torch.long)
        mask = torch.zeros(max_length, dtype=torch.long)

        tokens = str(text).strip(".").split()[:max_length]
        for pos, token in enumerate(tokens):
            token_id = vocab.get(token, oov_id)
            ids[pos] = token_id
            if token_id != oov_id:
                mask[pos] = 1
        return ids, mask

In [None]:
class ClickBaitDataSet(Dataset):
    """Map-style dataset yielding `(token_ids, label)` pairs from a DataFrame.

    Args:
        df: frame with a "body" column (text) and a "label" column.
        tokenizer: object exposing `tokenize(text, max_length=...)` that
            returns a `(ids, mask)` pair; only `ids` is used here.
        max_length: sequence length passed to the tokenizer (default 512).
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenized body and the label of the row at position `idx`."""
        # Positional access: a DataLoader sampler hands out positions in
        # range(len(self)), which only coincide with index labels when the
        # frame has a pristine 0..n-1 index. `.iloc` works for any index.
        text = self.df["body"].iloc[idx]
        body, _mask = self.tokenizer.tokenize(text, max_length=self.max_length)
        label = self.df["label"].iloc[idx]
        return body, label

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

In [None]:
class Model(nn.Module):
    """GRU classifier on top of frozen pre-trained word2vec embeddings.

    The constructor loads binary word2vec vectors, appends two all-zero rows
    for the special tokens "<oov>" and "<pad>", and builds:

    * `tokenizer` - a `Tokenizer` over the extended vocabulary,
    * `embedding` - a frozen `nn.Embedding` with "<pad>" as `padding_idx`,
    * `biGRU`     - a single-direction, batch-first `nn.GRU`
                    (the attribute name is kept so saved state dicts still load),
    * `pred`      - a linear layer producing one logit per class.

    Args:
        w2v_path: path of the binary word2vec file.
        hidden_size: GRU hidden size.
        num_classes: number of output logits.
    """

    def __init__(self, w2v_path="./models/word2vec.bin", hidden_size=64, num_classes=2):
        super().__init__()
        w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
        index_to_key = w2v.index_to_key
        key_to_index = w2v.key_to_index
        index_to_key.append("<oov>")
        index_to_key.append("<pad>")
        key_to_index["<oov>"] = index_to_key.index("<oov>")
        key_to_index["<pad>"] = index_to_key.index("<pad>")

        vectors = w2v.vectors
        # Read the width from the loaded vectors instead of assuming 50, and
        # keep the zero rows in the same dtype so concatenation does not
        # upcast the whole matrix.
        embedding_dim = vectors.shape[1]
        special_rows = np.zeros((2, embedding_dim), dtype=vectors.dtype)
        weights = np.concatenate([vectors, special_rows], axis=0)

        self.tokenizer = Tokenizer(key_to_index)
        self.embedding = nn.Embedding.from_pretrained(
            torch.as_tensor(weights, dtype=torch.float32),
            padding_idx=key_to_index["<pad>"],
            freeze=True,
        )
        self.biGRU = nn.GRU(embedding_dim, hidden_size, bidirectional=False, batch_first=True)
        self.pred = nn.Linear(hidden_size, num_classes)

    def forward(self, b):
        """Return class logits for a batch of token-id sequences.

        `b` is passed through the embedding and the batch-first GRU; the GRU
        output at the last time step is fed to the linear head.
        """
        bemb = self.embedding(b)
        gru_out, _ = self.biGRU(bemb)
        # NOTE(review): index -1 is the final position of the sequence. For
        # inputs right-padded to a fixed length (as Tokenizer.tokenize does)
        # that position is a "<pad>" slot whenever the text is shorter than
        # the maximum length - confirm this is the intended read-out.
        prediction_logits = self.pred(gru_out[:, -1, :])
        return prediction_logits

In [None]:
def train_func_epoch(epoch, model, dataloader, device, optimizer):
    """Train `model` for one pass over `dataloader`.

    Each batch is an `(inputs, labels)` pair that is moved to `device`; the
    loss is the cross-entropy between the model's logits and the labels.
    `epoch` is only used to label the progress bar.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    n_batches = len(dataloader)

    with tqdm(dataloader, unit="batch", total=n_batches) as progress:
        for batch_idx, batch in enumerate(progress):
            progress.set_description(f"Training- Epoch {epoch}")

            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            # Clear gradients, run the forward/backward pass, then update.
            model.zero_grad()
            logits = model(inputs)
            batch_loss = F.cross_entropy(logits, labels)
            running_loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()
            model.zero_grad()

            # Show the running mean loss over the batches seen so far.
            progress.set_postfix(train_loss=running_loss / (batch_idx + 1))

    return running_loss / n_batches

In [None]:
def eval_func_epoch(model, dataloader, device, epoch):
    """Evaluate `model` on `dataloader` without gradient tracking.

    Args:
        model: module mapping a batch of inputs to per-class logits.
        dataloader: iterable of `(inputs, labels)` batches.
        device: device the batches are moved to.
        epoch: label shown in the progress bar. When it equals the string
            "TESTING", a confusion-matrix plot is also saved to
            "confusion.png".

    Returns:
        `(mean_loss, report, tn, fp, fn, tp)` where `mean_loss` is the sum of
        batch losses divided by the number of batches, `report` is the dict
        from `classification_report(..., output_dict=True)` and the last four
        values are the entries of the 2x2 confusion matrix.
    """
    # Fixed label set shared by the report and the confusion matrix, so the
    # matrix is always 2x2 even when one class is absent from the data.
    class_labels = [0, 1]

    model.eval()
    total_loss = 0
    targets = []
    predictions = []
    with tqdm(dataloader, unit="batch", total=len(dataloader)) as single_epoch:
        for step, batch in enumerate(single_epoch):
            single_epoch.set_description(f"Evaluating- Epoch {epoch}")
            b, l = batch
            b = b.to(device)
            l = l.to(device)
            with torch.no_grad():
                ll = model(b)
                loss = F.cross_entropy(ll, l)
            total_loss += loss.item()
            # Running mean of the evaluation loss over the batches seen so far.
            single_epoch.set_postfix(eval_loss=total_loss / (step + 1))
            predictions.append(torch.argmax(ll, dim=1).flatten().cpu().numpy())
            targets.append(l.cpu().numpy())
    targets = np.concatenate(targets, axis=0)
    predictions = np.concatenate(predictions, axis=0)
    epoch_validation_loss = total_loss / len(dataloader)
    report = classification_report(targets, predictions, output_dict=True, labels=class_labels)
    tn, fp, fn, tp = confusion_matrix(targets, predictions, labels=class_labels).ravel()
    if epoch == "TESTING":
        ConfusionMatrixDisplay.from_predictions(targets, predictions, labels=class_labels)
        plt.savefig("confusion.png", dpi=300)
    return epoch_validation_loss, report, tn, fp, fn, tp

In [None]:
# Checkpoint location and training hyper-parameters used by the cells below.
model_path = "models/model.pt"
batch_size = 16
epochs = 30

model = Model()
if load:
    # Resume from a previously saved state dict, remapped onto the active device.
    loaded_state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(loaded_state_dict)

opt = optim.Adam(model.parameters(), lr=1e-5)
model = model.to(device)

In [None]:
# Load the three splits. Append `.head(500)` to a read for a quick smoke run.
train_df = pd.read_csv("../../data/clean/train.csv")
# The test split previously pointed at train.csv, so the final "test" metrics
# were computed on training data. NOTE(review): confirm that test.csv exists
# next to train.csv / val.csv.
test_df = pd.read_csv("../../data/clean/test.csv")
val_df = pd.read_csv("../../data/clean/val.csv")

train_data = ClickBaitDataSet(train_df, model.tokenizer)
test_data = ClickBaitDataSet(test_df, model.tokenizer)
val_data = ClickBaitDataSet(val_df, model.tokenizer)

# Shuffle only the training batches; evaluation order is kept fixed.
train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_data, batch_size=batch_size)
test_data_loader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# Track the lowest validation loss seen so far and the (1-based) epoch it came from.
best_loss = np.inf
best_epoch = 0
for epoch in range(epochs):
    epoch_num = epoch + 1
    print(f"\n---------------------- Epoch: {epoch_num} ---------------------------------- \n")

    # One pass over the training data, then one over the validation data.
    train_loss = train_func_epoch(epoch_num, model, train_data_loader, device, opt)
    val_loss, report, tn, fp, fn, tp = eval_func_epoch(model, val_data_loader, device, epoch_num)

    print(f"\nEpoch: {epoch_num} | Training loss: {train_loss} | Validation Loss: {val_loss}")
    print("", report, "", sep="\n")
    print(f"TP: {tp} | FP: {fp} | TN: {tn}, FN: {fn} ")
    print("\n----------------------------------------------------------------------------")

    # Checkpoint only when the validation loss improves on the best so far.
    improved = val_loss < best_loss
    if improved:
        torch.save(model.state_dict(), model_path)
        best_loss = val_loss
        best_epoch = epoch_num

In [None]:
# Restore the checkpoint written by the training loop before the final evaluation.
loaded_state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(loaded_state_dict)

print(f"\n---------------------- Testing best model (at epoch: {best_epoch} )---------------------------------- \n")
# Passing "TESTING" as the epoch label makes eval_func_epoch also save confusion.png.
test_loss, report, tn, fp, fn, tp = eval_func_epoch(model, test_data_loader, device, "TESTING")

print(
    f"\nTest loss: {test_loss}",
    "",
    report,
    "",
    f"TP: {tp} | FP: {fp} | TN: {tn}, FN: {fn} ",
    sep="\n",
)

# Persist the classification report as pretty-printed JSON.
with open("./report.json", "w") as report_file:
    report_file.write(json.dumps(report, indent=4))