In [None]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from torch import optim
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
tqdm.pandas()
warnings.filterwarnings('always')

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ' + str(device))

In [None]:
def split_to_trigrams(text):
    """Convert text into space-separated letter trigrams (word hashing).

    Each space-delimited token is wrapped in ``#`` boundary markers and broken
    into overlapping 3-character windows, e.g. ``"hi"`` -> ``"#hi hi#"``.

    Args:
        text: Raw text. ``None``/NaN yields ``''``; any other non-string value
            is converted with ``str()`` instead of raising ``AttributeError``.

    Returns:
        str: The trigrams of every token, joined by single spaces. Empty
        tokens (produced by consecutive spaces) contribute nothing.
    """
    if text is None or pd.isnull(text):
        return ''
    trigrams = []
    for token in str(text).split(" "):
        marked = '#' + token + '#'
        # Sliding 3-character window; a bare '##' (empty token) is too short
        # to produce any window.
        trigrams.extend(marked[start:start + 3] for start in range(len(marked) - 2))
    return ' '.join(trigrams)

In [None]:
# Vectorizing with word hashing: fit the TF-IDF vocabulary on the letter-trigram
# form of the training headings and bodies.
# The CSV is read by path so it is decoded with pandas' default encoding, the
# same way the later pd.read_csv("../data/clean/train.csv") call reads it,
# rather than with the locale-dependent default encoding of a bare open().
data = pd.read_csv('../data/clean/train.csv')
x1 = data['heading'].progress_apply(split_to_trigrams).tolist()
x2 = data['body'].progress_apply(split_to_trigrams).tolist()
# NOTE(review): TfidfVectorizer's default token_pattern keeps only runs of two
# or more word characters, so the '#' boundary markers (and one-letter tokens
# such as "#a#") appear to be dropped from the vocabulary — confirm this is intended.
vectorizer = TfidfVectorizer()
vectorizer.fit(x1 + x2)

In [None]:
# Model dimensions. The vocabulary size is read off the width of a transformed
# probe sentence (only the column count matters, not the text itself).
VOCAB_SIZE = vectorizer.transform(["Hello there"]).shape[1]
WORD_DIM = 300
FINAL_DIM = 128
dropout = 0.2

In [None]:
class ClickBaitDataSet(Dataset):
    """Map-style dataset yielding vectorized heading/body pairs and their label.

    Args:
        df: DataFrame with "heading", "body" and "label" columns.
        vectorizer: Fitted vectorizer whose ``transform([text])`` result
            supports ``[0].toarray()``.
        preprocess: Callable applied to each text before it is vectorized.
            Defaults to ``split_to_trigrams`` so samples go through the same
            letter-trigram preprocessing the vectorizer was fitted on; without
            it the raw words miss almost all of the trigram vocabulary.
    """

    # Substituted for missing or blank text so every sample has some content.
    PLACEHOLDER_TEXT = "something"

    def __init__(self, df, vectorizer, preprocess=None):
        self.df = df
        self.vectorizer = vectorizer
        self.preprocess = split_to_trigrams if preprocess is None else preprocess

    def _clean_text(self, value):
        """Return ``value`` as a string, or the placeholder when missing/blank."""
        # Check for nulls BEFORE str(): str(NaN) is the non-empty string "nan",
        # which would slip past any emptiness check.
        if value is None or pd.isnull(value):
            return self.PLACEHOLDER_TEXT
        text = str(value)
        return text if text.strip() else self.PLACEHOLDER_TEXT

    def _vectorize(self, value):
        """Encode one text cell as a float32 tensor with a single row."""
        text = self.preprocess(self._clean_text(value))
        dense = self.vectorizer.transform([text])[0].toarray().astype(np.float32)
        return torch.tensor(dense)

    def __getitem__(self, idx):
        """Return ``(heading_tensor, body_tensor, label)`` for position ``idx``."""
        # Positional access: the sampler hands out 0..len-1, which only equals
        # the index labels when the frame has a default RangeIndex.
        heading = self._vectorize(self.df["heading"].iloc[idx])
        body = self._vectorize(self.df["body"].iloc[idx])
        label = self.df["label"].iloc[idx]
        return heading, body, label

    def __len__(self):
        return len(self.df)

In [None]:
class Model(nn.Module):
    """Two-input similarity classifier with a shared encoder.

    Heading and body vectors pass through the same three Linear+Tanh layers
    (shared weights) followed by dropout; the cosine similarity of the two
    resulting embeddings is mapped to two logits by a final Linear layer.

    Args:
        vocab_size: Input feature width. Defaults to the notebook's VOCAB_SIZE.
        word_dim: Hidden width of the first two layers. Defaults to WORD_DIM.
        final_dim: Embedding width. Defaults to FINAL_DIM.
        dropout_p: Dropout probability applied to each embedding.
    """

    def __init__(self, vocab_size=None, word_dim=None, final_dim=None, dropout_p=0.2):
        super().__init__()
        # Fall back to the notebook-level constants only when a size is not
        # given, so Model() keeps building exactly the original network.
        vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        word_dim = WORD_DIM if word_dim is None else word_dim
        final_dim = FINAL_DIM if final_dim is None else final_dim
        # Attribute names are kept as-is so existing state_dict checkpoints load.
        self.l1 = nn.Linear(vocab_size, word_dim)
        self.l2 = nn.Linear(word_dim, word_dim)
        self.l3 = nn.Linear(word_dim, final_dim)
        self.tanh = nn.Tanh()
        self.do = nn.Dropout(p=dropout_p)
        self.cos = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
        self.l4 = nn.Linear(1, 2)

    def _embed(self, x):
        """Run one input through the shared encoder and dropout."""
        # Drop the singleton dimension at position 1 (if present) before the
        # linear layers.
        x = x.squeeze(1)
        for layer in (self.l1, self.l2, self.l3):
            x = self.tanh(layer(x))
        return self.do(x)

    def forward(self, h, b):
        """Return logits of shape (batch, 2) for heading ``h`` and body ``b``."""
        h = self._embed(h)
        b = self._embed(b)
        # One similarity score per pair, reshaped to (batch, 1) for the head.
        c = self.cos(h, b).unsqueeze(-1)
        return self.l4(c)

In [None]:
def train_func_epoch(epoch, model, dataloader, device, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        epoch: Epoch identifier, used only in the progress-bar caption.
        model: Callable as ``model(heading, body)`` returning logits.
        dataloader: Iterable of ``(heading, body, label)`` batches.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The cross-entropy loss averaged over the number of batches.
    """
    model.train()
    running_loss = 0
    with tqdm(dataloader, unit="batch", total=len(dataloader)) as progress_bar:
        for batch_idx, (headings, bodies, labels) in enumerate(progress_bar):
            progress_bar.set_description(f"Training- Epoch {epoch}")

            headings = headings.to(device)
            bodies = bodies.to(device)
            labels = labels.to(device)

            # Clear stale gradients, then forward / backward / update.
            model.zero_grad()
            logits = model(headings, bodies)
            batch_loss = F.cross_entropy(logits, labels)
            running_loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()
            model.zero_grad()

            # Show the running mean loss over the batches seen so far.
            progress_bar.set_postfix(train_loss=running_loss/(batch_idx+1))
    return running_loss / len(dataloader)

In [None]:
def eval_func_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Callable as ``model(heading, body)`` returning logits.
        dataloader: Iterable of ``(heading, body, label)`` batches.
        device: Device the batch tensors are moved to.
        epoch: Epoch identifier for the progress-bar caption. The special
            value ``"TESTING"`` additionally plots the confusion matrix and
            saves it to ``confusion.png``.

    Returns:
        Tuple ``(mean_loss, report, tn, fp, fn, tp)`` where ``mean_loss`` is the
        cross-entropy averaged over batches, ``report`` is the
        ``classification_report`` dict for labels 0 and 1, and the remaining
        values are the binary confusion-matrix counts.
    """
    class_labels = [0, 1]
    model.eval()
    total_loss = 0.0
    targets = []
    predictions = []
    with tqdm(dataloader, unit="batch", total=len(dataloader)) as single_epoch:
        single_epoch.set_description(f"Evaluating- Epoch {epoch}")
        for step, (h, b, l) in enumerate(single_epoch):
            h = h.to(device)
            b = b.to(device)
            l = l.to(device)
            with torch.no_grad():
                out = model(h, b)
                loss = F.cross_entropy(out, l)
            total_loss += loss.item()
            # Labelled eval_loss: this is not a training loss.
            single_epoch.set_postfix(eval_loss=total_loss/(step+1))
            predictions.append(torch.argmax(out, dim=1).flatten().cpu().numpy())
            targets.append(l.cpu().numpy())
    targets = np.concatenate(targets, axis=0)
    predictions = np.concatenate(predictions, axis=0)
    epoch_validation_loss = total_loss/len(dataloader)
    report = classification_report(targets, predictions, output_dict=True, labels=class_labels)
    # Pin the labels so the matrix is always 2x2; without them a split where
    # only one class occurs yields a 1x1 matrix and the unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(targets, predictions, labels=class_labels).ravel()
    if epoch == "TESTING":
        ConfusionMatrixDisplay.from_predictions(targets, predictions, labels=class_labels)
        plt.savefig("confusion.png",dpi=300)
    return epoch_validation_loss, report, tn, fp, fn, tp

In [None]:
# Training configuration.
# The model is moved to the target device BEFORE the optimizer is created, so
# the optimizer is built over the parameters as they live on that device (the
# order recommended by the torch.optim documentation).
model = Model()
model.to(device)
opt = optim.Adam(model.parameters(), lr=1e-5)
batch_size = 32
epochs = 8
# Path the best-performing weights are checkpointed to.
model_path = "models/model.pt"

In [None]:
# Load the three splits.
train_df = pd.read_csv("../data/clean/train.csv")
# The test frame used to be read from train.csv, which made the final "test"
# metrics a measurement on the training data.
# NOTE(review): assumes the held-out split sits next to train/val as test.csv — confirm the filename.
test_df = pd.read_csv("../data/clean/test.csv")
val_df = pd.read_csv("../data/clean/val.csv")

train_data = ClickBaitDataSet(train_df, vectorizer)
test_data = ClickBaitDataSet(test_df, vectorizer)
val_data = ClickBaitDataSet(val_df, vectorizer)

# NOTE(review): the configuration cell defines batch_size=32, but the loaders
# have always used 16; 16 is kept here to preserve training behaviour —
# confirm which value is intended.
# Shuffle only the training data so batches are not tied to file order;
# evaluation order does not affect the metrics.
train_data_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_data_loader = DataLoader(val_data, batch_size=16)
test_data_loader = DataLoader(test_data, batch_size=16)

In [None]:
# Create the checkpoint directory up front: otherwise torch.save would fail
# on a missing folder only after a full epoch of training has been spent.
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)

# Track the lowest validation loss seen so far and the epoch it occurred in.
best_loss = np.inf
best_epoch = 0
for epoch in range(epochs):
    print(f"\n---------------------- Epoch: {epoch+1} ---------------------------------- \n")
    ## Training Loop
    train_loss = train_func_epoch(epoch+1, model, train_data_loader, device, opt)
    ## Validation loop
    val_loss, report, tn, fp, fn, tp = eval_func_epoch(model, val_data_loader, device, epoch+1)
    print(f"\nEpoch: {epoch+1} | Training loss: {train_loss} | Validation Loss: {val_loss}")
    print()
    print(report)
    print()
    print(f"TP: {tp} | FP: {fp} | TN: {tn}, FN: {fn} ")
    print(f"\n----------------------------------------------------------------------------")
    ## Save the model only when validation loss improves, so model_path
    ## always holds the best checkpoint.
    if (val_loss < best_loss):
        torch.save(model.state_dict(), model_path)
        best_loss = val_loss
        best_epoch = epoch+1

In [None]:
# Restore the checkpoint written during training and score it on the test loader.
loaded_state_dict = torch.load(model_path,  map_location=device)
model.load_state_dict(loaded_state_dict)
print(f"\n---------------------- Testing best model (at epoch: {best_epoch} )---------------------------------- \n")

# Passing "TESTING" as the epoch also makes the evaluation save confusion.png.
test_results = eval_func_epoch(model, test_data_loader, device, "TESTING")
test_loss, report, tn, fp, fn, tp = test_results

# Each section is followed by one blank line.
print(f"\nTest loss: {test_loss}", end="\n\n")
print(report, end="\n\n")
print(f"TP: {tp} | FP: {fp} | TN: {tn}, FN: {fn} ")

# Persist the classification report next to the notebook.
with open("./report.json","w") as report_file:
    json.dump(report, report_file, indent=4)