In [1]:
# import standard libraries
import os
import numpy as np

import warnings
import pandas as pd

from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [2]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
import openai

# As dataset is imbalanced define a weighted sampler
from torch.utils.data import WeightedRandomSampler

In [None]:
# define preprocessing helpers
# English stop-word set taken from the NLTK corpus, built once when this cell runs.
# NOTE(review): no active code in this section reads it -- the stop-word
# filter inside preproccess() is commented out.
stop_words = set(stopwords.words('english'))

def preproccess(text):
    """Turn raw post text into a list of cleaned, lowercase word tokens.

    Steps, applied per whitespace-separated token and in this order: strip
    ASCII punctuation, strip digits, lowercase, keep only tokens of 3-14
    characters, then drop tokens that look like link remnants.

    Args:
        text: The raw string. Missing values (``None``, a float such as NaN, or
            any other non-string) are accepted.

    Returns:
        list[str]: The cleaned tokens. Non-string input yields ``[""]`` (a
        single empty token), which is what float/NaN input has always produced.
    """
    # Missing text usually arrives as NaN (a float); None and other non-string
    # values are handled the same way instead of crashing inside re.split.
    if not isinstance(text, str):
        return [""]

    strip_punctuation = str.maketrans("", "", string.punctuation)

    tokens = []
    # split into tokens
    for raw_token in re.split(r"\s+", text):
        # remove punctuation, then numbers, then make the token lowercase
        token = re.sub(r"\d+", "", raw_token.translate(strip_punctuation)).lower()

        # remove tokens which are too short or too long
        if not 2 < len(token) < 15:
            continue

        # remove hyperlinks; punctuation is already gone at this point, so a
        # link shows up as e.g. "httpsexamplecom". Note that ordinary words
        # ending in "com" are dropped by this rule as well.
        if token.startswith(("http", "www")) or token.endswith("com"):
            continue

        tokens.append(token)

    # stop-word removal is currently disabled; to enable it, filter with:
    #     tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [None]:
# define basic Bag of Words function
def BOW_bin(words, vocab):
    """Build a binary bag-of-words vector.

    Args:
        words: Tokens of one document (normally a list of strings).
        vocab: Ordered vocabulary; the output has one entry per vocabulary word.

    Returns:
        list[int]: ``1`` where the vocabulary word occurs in ``words``, else ``0``.
    """
    # A set makes every membership test O(1) instead of scanning the token list
    # once per vocabulary word. A plain string is left untouched so that `in`
    # keeps its substring meaning for such callers.
    present = words if isinstance(words, str) else set(words)
    return [1 if word in present else 0 for word in vocab]

def BOW_freq(words, vocab):
    """Build a term-frequency bag-of-words vector.

    Args:
        words: Tokens of one document (normally a list of strings).
        vocab: Ordered vocabulary; the output has one entry per vocabulary word.

    Returns:
        list[int]: How many times each vocabulary word occurs in ``words``.
    """
    # A plain string keeps the original substring-counting behaviour.
    if isinstance(words, str):
        return [words.count(word) for word in vocab]

    from collections import Counter

    # Count every token once up front instead of rescanning the token list
    # for each vocabulary word.
    occurrences = Counter(words)
    return [occurrences[word] for word in vocab]

In [None]:
# Function for getting the domain name from a URL
def extract_domain(url):
    """Return the network-location part (host, plus port if given) of a URL.

    Args:
        url: The URL string. Non-string values (e.g. NaN for posts without a
            link) are accepted.

    Returns:
        str: ``urlparse(url).netloc``, or ``""`` when ``url`` is not a string
        or cannot be parsed. A URL without a scheme also yields ``""`` because
        ``urlparse`` then reports no netloc.
    """
    if not isinstance(url, str):
        return ""
    try:
        return urlparse(url).netloc
    except ValueError:
        # urlparse rejects malformed bracketed hosts ("Invalid IPv6 URL");
        # one bad link should not abort the whole transform.
        return ""

In [None]:
# Function for getting the embedding of a text
def get_embeddings(text, model = "text-similarity-ada-001"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Input forwarded unchanged as the ``input`` argument.
        model: Name of the embedding model to query.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"`` list.
    """
    response = openai.Embedding.create(input = text, model = model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [None]:
# define transformations of the data
class TextualTransform1(object):
    """Replace a post's title and text with binary bag-of-words vectors.

    Uses the module-level ``titles_vocab`` / ``text_vocab`` vocabularies and the
    ``preproccess`` and ``BOW_bin`` helpers.
    """

    def __call__(self, sample):
        """Vectorise one sample.

        Args:
            sample: Mapping with a ``"post"`` record (supporting ``.copy()`` and
                item access for ``"title"`` and ``"text"``) and a ``"score"``.

        Returns:
            dict: ``{'post': <vectorised copy of the post>, 'score': score}``.
        """
        # Work on a copy: writing into the caller's record would overwrite the
        # raw strings, so reading the same sample again would try to
        # preprocess an already-vectorised title/text.
        post = sample["post"].copy()
        score = sample["score"]

        post["title"] = BOW_bin(preproccess(post["title"]), titles_vocab)
        post["text"] = BOW_bin(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class TextualTransform2(object):
    """Replace a post's title and text with term-frequency bag-of-words vectors.

    Uses the module-level ``titles_vocab`` / ``text_vocab`` vocabularies and the
    ``preproccess`` and ``BOW_freq`` helpers.
    """

    def __call__(self, sample):
        """Vectorise one sample.

        Args:
            sample: Mapping with a ``"post"`` record (supporting ``.copy()`` and
                item access for ``"title"`` and ``"text"``) and a ``"score"``.

        Returns:
            dict: ``{'post': <vectorised copy of the post>, 'score': score}``.
        """
        # Work on a copy: writing into the caller's record would overwrite the
        # raw strings, so reading the same sample again would try to
        # preprocess an already-vectorised title/text.
        post = sample["post"].copy()
        score = sample["score"]

        post["title"] = BOW_freq(preproccess(post["title"]), titles_vocab)
        post["text"] = BOW_freq(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class URLTransform(object):
    """Reduce a post's ``"url"`` field to its domain via ``extract_domain``."""

    def __call__(self, sample):
        """Transform one sample.

        Args:
            sample: Mapping with a ``"post"`` record (supporting ``.copy()`` and
                item access for ``"url"``) and a ``"score"``.

        Returns:
            dict: ``{'post': <copy of the post with "url" replaced>, 'score': score}``.
        """
        # Work on a copy so the caller's record keeps the full URL.
        post = sample["post"].copy()
        score = sample["score"]

        post["url"] = extract_domain(post["url"])

        return {'post': post, 'score': score}

class TensorTransform(object):
    """Flatten a vectorised post into one feature tensor and tensorise the score."""

    def __call__(self, sample):
        """Convert one sample to tensors.

        Args:
            sample: Mapping with ``"post"`` (providing ``"title"`` and ``"text"``
                as numeric sequences and ``"time"`` as a number) and ``"score"``
                (a number or a sequence of numbers).

        Returns:
            dict: ``"post"`` is a 1-D float32 tensor laid out as title features,
            then text features, then time; ``"score"`` is a 1-D float32 tensor
            holding the score value(s).
        """
        post, score = sample["post"], sample["score"]

        # Build a fresh list so the sequences stored in `post` are not modified.
        features = [*post["title"], *post["text"], post["time"]]
        output = torch.tensor(features, dtype=torch.float32)

        # torch.FloatTensor(n) interprets an int as a *size* and returns n
        # uninitialised values (and rejects a plain float). Convert by value
        # instead, and keep the result 1-D as it is for list input.
        score = torch.as_tensor(score, dtype=torch.float32).reshape(-1)

        return {"post": output, "score": score}

In [None]:
# Index dictionaries for embedding look-ups. Title words are numbered from 0;
# text words continue after the title entries, so both vocabularies can share
# one embedding table without index collisions.
titles_to_index = {token: position for position, token in enumerate(titles_vocab)}
text_to_index = {
    token: position
    for position, token in enumerate(text_vocab, start=len(titles_to_index))
}

In [None]:
# import datasets
from ipynb.fs.defs.datasets import HackerNewsPostDataset
from ipynb.fs.defs.datasets import EmbeddingsDataset
from ipynb.fs.defs.datasets import BertTitleEmbeddingDataset
from ipynb.fs.defs.datasets import BertProcessedTitleEmbeddingDataset
from ipynb.fs.defs.datasets import BertProcessedTitleEmbeddingTitleAndTextDataset
from ipynb.fs.defs.datasets import BertProcessedTitleEmbeddingTitleAndTimeDataset
from ipynb.fs.defs.datasets import BertProcessedTitleEmbeddingMulticlass
from ipynb.fs.defs.datasets import OpenAIEmbeddingDataset

In [None]:
# non-URL datasets for the BoW model
# Transform pipeline: binary bag-of-words for title/text, then conversion to tensors.
# The same two transform instances are shared by all three datasets below.
transforms = [TextualTransform1(), TensorTransform()]

# Training / validation / testing datasets; arguments are passed positionally
# as (data, scores, cutoff, transforms).
# NOTE(review): the *_data_indexed frames, the score collections and `cutoff`
# are not defined in the visible part of this notebook -- confirm they exist
# before this cell on a fresh "Restart & Run All".
# `cutoff` is presumably the score threshold separating popular from unpopular
# posts (as in define_sampler) -- TODO confirm against HackerNewsPostDataset.
post_training_dataset = HackerNewsPostDataset(training_data_indexed, scores, cutoff, transforms)
post_validation_dataset = HackerNewsPostDataset(validation_data_indexed, validation_scores, cutoff, transforms)
post_testing_dataset = HackerNewsPostDataset(testing_data_indexed, testing_scores, cutoff, transforms)

In [None]:
def define_sampler(sample_scores, cutoff):
    """Create a class-balancing ``WeightedRandomSampler`` for a popular/unpopular split.

    A sample is "popular" when its score is strictly greater than ``cutoff``.
    Popular samples get weight 1 and unpopular samples get
    ``num_popular / num_unpopular``, so both classes carry the same total weight.

    Args:
        sample_scores: 1-D sequence of numeric scores, one per sample.
        cutoff: Popularity threshold.

    Returns:
        WeightedRandomSampler: Draws ``len(sample_scores)`` indices with
        replacement. When one of the two classes is empty there is nothing to
        balance and every sample gets weight 1.
    """
    is_popular = np.asarray(sample_scores) > cutoff
    num_popular = int(is_popular.sum())
    num_unpopular = int(is_popular.size) - num_popular

    if num_popular == 0 or num_unpopular == 0:
        # Without this guard the "no popular samples" case divides by zero and
        # ends up with all-zero weights, which the sampler cannot draw from.
        weights = np.ones(is_popular.size, dtype=np.float64)
    else:
        unpopular_weight = num_popular / num_unpopular
        weights = np.where(is_popular, 1.0, unpopular_weight)

    return WeightedRandomSampler(weights.tolist(), num_samples = len(weights), replacement = True)

In [None]:
# define function for collating batches
def collate_batch_embed(batch):
    """Collate ``(post, score)`` samples into flat index tensors with bag offsets.

    Relies on the module-level ``titles_to_index`` / ``text_to_index`` look-up
    tables and on the module-level ``device``.

    Args:
        batch: Iterable of ``(post, score)`` pairs, where ``post[0]`` holds the
            title tokens and ``post[1]`` the text tokens of one sample.

    Returns:
        tuple: ``(lables, texts, offsets)``, all int64 tensors on ``device``:
        ``lables`` has one ``[score]`` row per sample, ``texts`` is the
        concatenation of all title+text indices, and ``offsets[i]`` is the
        position in ``texts`` where sample ``i`` starts.
    """
    lables, flat_indices, bag_starts = [], [], []

    for post, score in batch:
        # Tokens missing from the look-up tables are skipped rather than
        # raising KeyError; this includes the "" token that preproccess emits
        # for missing text when it is not part of the vocabulary.
        title_indexs = [titles_to_index[word] for word in post[0] if word in titles_to_index]
        text_indexs = [text_to_index[word] for word in post[1] if word in text_to_index]

        bag_starts.append(len(flat_indices))
        flat_indices.extend(title_indexs)
        flat_indices.extend(text_indexs)
        lables.append([score])

    # Build each tensor once and move it to the device in a single transfer.
    lables = torch.tensor(lables, dtype = torch.int64).to(device)
    offsets = torch.tensor(bag_starts, dtype = torch.int64).to(device)
    texts = torch.tensor(flat_indices, dtype = torch.int64).to(device)
    return lables, texts, offsets



In [1]:
def print_weights_and_gradients(model):
    """Print the ``.grad`` attribute of every parameter of ``model``.

    Despite the name, only gradients are printed (one line per parameter);
    parameters that have not received a gradient show ``None``.
    """
    gradients = (parameter.grad for parameter in model.parameters())
    for gradient in gradients:
        print(f"Gradient = {gradient}")

In [None]:
# define function for training an embedding-bag model
def train_model_embed_bag(model, train_loader, test_loader, loss_func, optimizer, device, num_epochs):
    """Train a model that consumes flat token indices plus bag offsets.

    Args:
        model: Module called as ``model(texts, offsets)`` and returning
            ``(predictions, embeddings)``.
        train_loader: Iterable of ``(lables, texts, offsets)`` batches. Batches
            are used as delivered; this function does not move them to ``device``.
        test_loader: Currently unused; kept so existing callers keep working.
        loss_func: Called as ``loss_func(predictions, lables.float())``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the model is moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        tuple: ``(model, embeddings_list, loss_list)`` with one entry per
        training batch in both lists. Embeddings are detached from the autograd
        graph. The model is left in evaluation mode.
    """
    report_every = 100
    iterations = 1
    loss_list = []
    embeddings_list = []

    # make sure model is being trained on the requested device
    model.to(device)

    for epoch in range(num_epochs):
        print("Epoch {}.".format(epoch))
        running_loss = 0
        batches_in_window = 0

        # enable training-mode behaviour (dropout, batch-norm updates, ...)
        model.train()

        for lables, texts, offsets in train_loader:
            # reset optimizer grads to zero
            optimizer.zero_grad()

            # forward pass of the model
            predictions, embeddings = model(texts, offsets)

            # calculate loss
            loss = loss_func(predictions, lables.float())

            # perform backwards pass
            loss.backward()

            # tune weights with optimizer
            optimizer.step()

            # track the running loss and keep the per-batch history
            batch_loss = loss.item()
            running_loss += batch_loss
            batches_in_window += 1
            loss_list.append(batch_loss)

            # Detach before storing: keeping the graph-attached tensor would
            # hold every batch's autograd graph in memory for the whole run.
            embeddings_list.append(embeddings.detach())

            if iterations % report_every == 0:
                # Average over the batches actually accumulated since the last
                # reset (the window is also reset at every epoch start).
                print(" Iteration {}. Running Loss {}.".format( iterations, running_loss / batches_in_window))

                # reset running loss
                running_loss = 0
                batches_in_window = 0

            iterations = iterations + 1

        # leave the model in evaluation mode between epochs and on return
        model.eval()

    return model, embeddings_list, loss_list

In [None]:
# define function for training a binary classifier on precomputed embeddings
def train_model_bert(model, train_loader, valid_loader, loss_func, optimizer, device, num_epochs, model_path):
    """Train a binary classifier on embedding batches and checkpoint the best epoch.

    Args:
        model: Module called as ``model(embeddings)``; its output is treated as
            logits (a sigmoid and a 0.5 threshold are applied for the F1 score).
        train_loader: Iterable of ``(embeddings, score)`` tensor batches.
        valid_loader: Iterable of ``(embeddings, score)`` tensor batches used
            after every epoch to compute the validation loss.
        loss_func: Called as ``loss_func(predictions, score.float().unsqueeze(1))``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        model_path: Where ``model.state_dict()`` is saved whenever the summed
            validation loss improves on the best value seen so far.

    Returns:
        tuple: ``(model, loss_list, valid_losses)``. ``loss_list`` holds the mean
        training loss of each 100-batch window; ``valid_losses`` holds the mean
        validation loss of each epoch. The returned model carries the weights
        of the last epoch (the best ones are on disk) and is in eval mode.
    """
    report_every = 100
    min_vloss = np.inf
    loss_list = []
    valid_losses = []

    # make sure model is being trained on the requested device
    model = model.to(device)

    for epoch in range(num_epochs):
        print("Epoch {}.".format(epoch))
        running_loss = 0
        running_f1 = 0
        iterations = 1

        # back to training mode (the validation pass below switches to eval)
        model.train()

        for embeddings, score in train_loader:
            embeddings = embeddings.to(device)
            score = score.to(device)

            # reset optimizer grads to zero
            optimizer.zero_grad()

            # forward pass of the model
            predictions = model(embeddings)

            # calculate loss
            loss = loss_func(predictions, score.float().unsqueeze(1))

            # perform backwards pass
            loss.backward()

            # tune weights with optimizer
            optimizer.step()

            # track the running loss
            running_loss += loss.item()

            # track the running f1-score; tensors are moved to the CPU first
            # because scikit-learn/NumPy cannot read device (e.g. CUDA) tensors
            predicted_labels = (torch.sigmoid(predictions.detach()) > 0.5).int().reshape(-1).cpu().numpy()
            true_labels = score.detach().reshape(-1).cpu().numpy()
            running_f1 += f1_score(true_labels, predicted_labels, zero_division=1)

            if iterations % report_every == 0:
                # the value printed is the mean loss of the last window
                print("Iteration {}. Running Loss {}. Running F1 {}.".format(iterations, running_loss / report_every, running_f1 / report_every))

                # add running loss to list
                loss_list.append(running_loss / report_every)

                # reset running loss and running f1-score
                running_loss = 0
                running_f1 = 0

            iterations = iterations + 1

        # switch layers such as dropout/batch-norm to inference behaviour
        model.eval()

        # run current model on the validation dataset to find the best model;
        # no_grad is what actually disables gradient tracking here
        valid_loss_total = 0
        num_valid_batches = 0
        with torch.no_grad():
            for valid_e, valid_s in valid_loader:
                valid_e = valid_e.to(device)
                valid_s = valid_s.to(device)

                valid_pred = model(valid_e)

                valid_loss = loss_func(valid_pred, valid_s.float().unsqueeze(1))
                valid_loss_total += valid_loss.item()
                num_valid_batches += 1

        # max(..., 1) avoids a ZeroDivisionError for an empty validation loader
        valid_losses.append(valid_loss_total / max(num_valid_batches, 1))
        print("vloss -", valid_loss_total)

        if valid_loss_total < min_vloss:
            torch.save(model.state_dict(), model_path)
            min_vloss = valid_loss_total

    return model, loss_list, valid_losses

In [None]:
# define function for training a multi-output classifier on precomputed embeddings
def _stack_class_scores(score, device):
    """Stack per-class score rows into a float32 ``(batch, num_classes)`` tensor on ``device``."""
    stacked = torch.stack([torch.as_tensor(row) for row in score])
    return stacked.transpose(0, 1).to(device=device, dtype=torch.float32)


def train_model_bert_multi(model, train_loader, valid_loader, loss_func, optimizer, device, num_epochs, model_path="bert_model_best_3.pth"):
    """Train a multi-output classifier on embedding batches and checkpoint the best epoch.

    Args:
        model: Module called as ``model(embeddings)``; its output is treated as
            per-class logits.
        train_loader: Iterable of ``(embeddings, score)`` batches where ``score``
            is a sequence with one row per class, each row holding the values
            for the whole batch.
        valid_loader: Iterable of batches in the same format, evaluated after
            every epoch.
        loss_func: Called as ``loss_func(predictions, targets)`` with ``targets``
            a float32 ``(batch, num_classes)`` tensor.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        model_path: Where ``model.state_dict()`` is saved whenever the summed
            validation loss improves. Defaults to the previously hard-coded path.

    Returns:
        tuple: ``(model, loss_list, valid_losses)``. ``loss_list`` holds the mean
        training loss of each 10-batch window; ``valid_losses`` holds the mean
        validation loss of each epoch. The model is left in eval mode.
    """
    report_every = 10
    min_vloss = np.inf
    loss_list = []
    valid_losses = []

    # make sure model is being trained on the requested device
    model = model.to(device)

    for epoch in range(num_epochs):
        print("Epoch {}.".format(epoch))
        running_loss = 0
        running_f1 = 0
        iterations = 1

        # back to training mode (the validation pass below switches to eval)
        model.train()

        for embeddings, score in train_loader:
            embeddings = embeddings.to(device)
            score = _stack_class_scores(score, device)

            # reset optimizer grads to zero
            optimizer.zero_grad()

            # forward pass of the model
            predictions = model(embeddings)

            # calculate loss
            loss = loss_func(predictions, score)

            # perform backwards pass
            loss.backward()

            # tune weights with optimizer
            optimizer.step()

            # track the running loss
            running_loss += loss.item()

            # track the running f1-score. Targets and predictions are 2-D here,
            # so they are compared as label-indicator matrices with a macro
            # average (the default binary F1 rejects this format).
            # NOTE(review): assumes targets are 0/1 per class -- confirm against
            # the dataset; switch to an argmax-based F1 if exactly one class is
            # active per sample and that is the metric wanted.
            predicted_labels = (torch.sigmoid(predictions.detach()) > 0.5).int().cpu().numpy()
            true_labels = (score > 0.5).int().cpu().numpy()
            running_f1 += f1_score(true_labels, predicted_labels, average="macro", zero_division=1)

            if iterations % report_every == 0:
                print("Iteration {}. Running Loss {}. Running F1 {}".format(iterations, running_loss / report_every, running_f1 / report_every))

                # add running loss to list
                loss_list.append(running_loss / report_every)

                # reset running loss and running f1-score
                running_loss = 0
                running_f1 = 0

            iterations = iterations + 1

        # switch layers such as dropout/batch-norm to inference behaviour
        model.eval()

        # run current model on the validation dataset to find the best model;
        # targets go through the same conversion as the training targets
        valid_loss_total = 0
        num_valid_batches = 0
        with torch.no_grad():
            for valid_e, valid_s in valid_loader:
                valid_e = valid_e.to(device)
                valid_s = _stack_class_scores(valid_s, device)

                valid_pred = model(valid_e)

                valid_loss = loss_func(valid_pred, valid_s)
                valid_loss_total += valid_loss.item()
                num_valid_batches += 1

        # max(..., 1) avoids a ZeroDivisionError for an empty validation loader
        valid_losses.append(valid_loss_total / max(num_valid_batches, 1))
        print("vloss -", valid_loss_total)

        if valid_loss_total < min_vloss:
            torch.save(model.state_dict(), model_path)
            min_vloss = valid_loss_total

    return model, loss_list, valid_losses