In [None]:
# import standard libraries
import numpy as np

import warnings
import pandas as pd

from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [None]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset

In [None]:
# Load the pickled training set from disk.
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.
with open("../data/training_data", mode="rb") as training_file:
    training_data = pickle.load(training_file)

In [None]:
# Load the pickled testing set from disk.
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.
with open("../data/testing_data", mode="rb") as testing_file:
    testing_data = pickle.load(testing_file)

In [None]:
# Pick the compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# preprocessing: keep only the rows whose `type` column equals "story"
training_data = training_data[training_data["type"] == "story"]
testing_data = testing_data[testing_data["type"] == "story"]

In [None]:
# Renumber the rows of both filtered frames from 0 so integer positions work as labels.
training_data_indexed, testing_data_indexed = (
    frame.reset_index(drop=True) for frame in (training_data, testing_data)
)
print(training_data_indexed)

In [None]:
# English stop-word set; preproccess() drops any token found in it.
# NOTE(review): presumably needs the NLTK stopwords corpus to be downloaded already - confirm.
stop_words = set(stopwords.words('english'))

def preproccess(text, stopword_set=None):
    """Tokenise a raw title/body string into lower-case content words.

    Steps, in order: split on whitespace, delete every character found in
    ``string.punctuation``, lower-case, drop tokens of two characters or fewer,
    drop stop words.

    Args:
        text: The raw string. Any non-string value (missing fields arrive as
            float NaN, or ``None``) yields ``[""]``.
        stopword_set: Optional container of words to discard. Defaults to the
            module-level ``stop_words``.

    Returns:
        A list of tokens (possibly empty), or ``[""]`` for non-string input.
    """
    if not isinstance(text, str):
        return [""]

    ignored = stop_words if stopword_set is None else stopword_set
    strip_punctuation = str.maketrans("", "", string.punctuation)

    # The earlier version also ran re.sub("\d-", "") on each token *after* removing
    # punctuation; "-" is itself in string.punctuation, so that pattern could never
    # match and the step is omitted here.
    tokens = []
    for raw_token in re.split(r"\s+", text):
        token = raw_token.translate(strip_punctuation).lower()
        if len(token) > 2 and token not in ignored:
            tokens.append(token)
    return tokens

In [None]:
#training_titles_processed = [preproccess(x) for x in training_data_indexed["title"]]
#training_text_processed = [preproccess(x) for x in training_data_indexed["text"]]
# Tokenise the title and text columns of both splits with preproccess().
# NOTE(review): in the cells visible here these four Series are only referenced by
# commented-out vocabulary-building code.
training_titles_processed = training_data_indexed["title"].apply(preproccess)
training_text_processed = training_data_indexed["text"].apply(preproccess)
testing_titles_processed = testing_data_indexed["title"].apply(preproccess)
testing_text_processed = testing_data_indexed["text"].apply(preproccess)

In [None]:
#titles_vocab_train = list(dict.fromkeys(training_titles_processed.apply(pd.Series).stack().reset_index(drop = True)))
#titles_vocab_test = list(dict.fromkeys(testing_titles_processed.apply(pd.Series).stack().reset_index(drop = True)))
#titles_vocab = list(dict.fromkeys(titles_vocab_train + titles_vocab_test))

In [None]:
#text_vocab_train = list(dict.fromkeys(training_text_processed.apply(pd.Series).stack().reset_index(drop = True)))
#text_vocab_test = list(dict.fromkeys(testing_text_processed.apply(pd.Series).stack().reset_index(drop = True)))
#text_vocab = list(dict.fromkeys(text_vocab_train + text_vocab_test))

In [None]:
#with open("titles_vocab", "wb") as fb:
#    pickle.dump(titles_vocab, fb)

#with open("text_vocab", "wb") as fb:
#    pickle.dump(text_vocab, fb)

In [None]:
# Load the previously pickled vocabularies (one list of words each).
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.
with open("titles_vocab", mode="rb") as titles_file:
    titles_vocab = pickle.load(titles_file)

with open("text_vocab", mode="rb") as text_file:
    text_vocab = pickle.load(text_file)

In [None]:
# add a special empty token for posts with empty titles or text
# (guarded so that re-running this cell does not append the token a second time)
if "<|empty|>" not in titles_vocab:
    titles_vocab.append("<|empty|>")
if "<|empty|>" not in text_vocab:
    text_vocab.append("<|empty|>")

In [None]:
# Show the row labelled 0 to inspect the columns before they are trimmed below.
print(training_data_indexed.loc[0])

In [None]:
# Column subsets kept from the raw frames, with and without the URL column.
req_cols_with_url = ["title", "url", "text", "time"]
req_cols_without_url = [column for column in req_cols_with_url if column != "url"]

In [None]:
# Split labels from features. Both are rebuilt from the filtered `training_data` /
# `testing_data` frames, so the cell can be re-run: reading `.score` from
# `*_data_indexed` fails on a second run because this cell drops that column.
scores = training_data["score"].reset_index(drop=True)
training_data_indexed = training_data.reset_index(drop=True)[req_cols_without_url]

testing_scores = testing_data["score"].reset_index(drop=True)
testing_data_indexed = testing_data.reset_index(drop=True)[req_cols_without_url]

In [None]:
# Show the row labelled 0 again, now restricted to the required columns.
print(training_data_indexed.loc[0])

In [None]:
def BOW_bin(words, vocab):
    """Binary bag-of-words vector.

    Args:
        words: Iterable of tokens (e.g. the list returned by ``preproccess``).
        vocab: Ordered sequence of vocabulary words.

    Returns:
        A list as long as ``vocab`` holding 1 where the vocabulary word occurs
        in ``words`` and 0 elsewhere.
    """
    # One set build replaces a linear scan of `words` per vocabulary entry.
    present = set(words)
    return [1 if word in present else 0 for word in vocab]

def BOW_freq(words, vocab):
    """Term-frequency bag-of-words vector.

    Args:
        words: Iterable of tokens (e.g. the list returned by ``preproccess``).
        vocab: Ordered sequence of vocabulary words.

    Returns:
        A list as long as ``vocab`` holding how many times each vocabulary
        word occurs in ``words``.
    """
    from collections import Counter  # local import keeps the cell self-contained

    # Count once instead of calling list.count() for every vocabulary entry.
    counts = Counter(words)
    return [counts[word] for word in vocab]

In [None]:
def extract_domain(url):
    """Return the network-location part (host[:port]) of ``url``.

    Args:
        url: The URL string. Non-string values (e.g. NaN for posts without a
            link) are accepted.

    Returns:
        The ``netloc`` component, or ``""`` when ``url`` is not a string or
        cannot be parsed.
    """
    if not isinstance(url, str):
        return ""
    try:
        return urlparse(url).netloc
    except ValueError:
        # urlparse rejects malformed bracketed hosts ("Invalid IPv6 URL");
        # treat those like a missing URL instead of aborting the pipeline.
        return ""

In [None]:
# define the cutoff point: the score above which a post is labelled as the positive class

In [None]:
# define transformations of the data

class TextualTransform1(object):
    """Replace a post's title and text with binary bag-of-words vectors.

    The post object is modified in place and handed back inside a new sample dict.
    """

    def __call__(self, sample):
        post = sample["post"]
        score = sample["score"]

        # title is encoded against titles_vocab, text against text_vocab
        for field, vocabulary in (("title", titles_vocab), ("text", text_vocab)):
            post[field] = BOW_bin(preproccess(post[field]), vocabulary)

        return {"post": post, "score": score}

class TextualTransform2(object):
    """Replace a post's title and text with term-frequency bag-of-words vectors.

    The post object is modified in place and handed back inside a new sample dict.
    """

    def __call__(self, sample):
        post = sample["post"]
        score = sample["score"]

        # title is encoded against titles_vocab, text against text_vocab
        for field, vocabulary in (("title", titles_vocab), ("text", text_vocab)):
            post[field] = BOW_freq(preproccess(post[field]), vocabulary)

        return {"post": post, "score": score}

class URLTransform(object):
    """Replace a post's ``url`` field with whatever ``extract_domain`` returns for it.

    The post object is modified in place and handed back inside a new sample dict.
    """

    def __call__(self, sample):
        post = sample["post"]
        score = sample["score"]

        domain = extract_domain(post["url"])
        post["url"] = domain

        return {"post": post, "score": score}

class TensorTransform(object):
    """Flatten an already-vectorised post into float tensors.

    Expects ``sample["post"]`` to expose ``"title"`` and ``"text"`` as lists of
    numbers and ``"time"`` as a single number, and ``sample["score"]`` to be a
    number or a list of numbers.

    Returns a dict with:
        ``"post"``:  1-D float32 tensor ``title + text + [time]``.
        ``"score"``: 1-D float32 tensor holding the label value(s).
    """

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]

        # `+` builds a fresh list, so the lists stored on the post are left untouched.
        features = post["title"] + post["text"]
        features.append(post["time"])

        features = torch.tensor(features, dtype=torch.float32)
        # torch.FloatTensor(n) with an integer n allocates n uninitialised values
        # rather than storing n; torch.tensor keeps the value, and reshape(-1)
        # gives scalar and list labels the same 1-D shape.
        score = torch.tensor(score, dtype=torch.float32).reshape(-1)

        return {"post": features, "score": score}
        


        


In [None]:
class HackerNewsPostDataset(Dataset):
    """Map-style dataset over a frame of posts and a parallel series of scores.

    Args:
        data: Frame of posts, indexed by label with ``.loc``.
        labels: Scores, indexed with ``labels[index]``.
        cutoff: When not ``None``, scores are binarised: ``[1]`` if the score is
            a number strictly greater than ``cutoff``, else ``[0]``. When
            ``None`` the raw score is passed through.
        transforms: Optional iterable of callables applied in order to the
            ``{"post", "score"}`` sample dict.
    """

    def __init__(self, data, labels, cutoff = None, transforms = None):
        self.posts = data
        self.scores = labels
        self.transforms = transforms
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        # Copy so transforms that assign into the post never touch the backing frame.
        post = self.posts.loc[index].copy()
        score = self.scores[index]

        # `is not None` so that a cutoff of 0 still binarises.
        if self.cutoff is not None:
            # numpy integer scalars are not `float` instances; accept every real
            # number type so integer-typed score columns are not all labelled 0.
            is_number = isinstance(score, (int, float, np.integer, np.floating))
            score = [1 if is_number and score > self.cutoff else 0]

        sample = {'post': post, 'score': score}

        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)

        return sample

In [None]:
# Word -> position lookups for each vocabulary (a repeated word keeps its last position).
titles_to_index = dict(zip(titles_vocab, range(len(titles_vocab))))
text_to_index = dict(zip(text_vocab, range(len(text_vocab))))

In [None]:
class EmbeddingsDataset(Dataset):
    """Map-style dataset yielding token lists for the embedding model.

    Args:
        data: Frame of posts, indexed by label with ``.loc``.
        lables: Scores, indexed with ``lables[index]``.
        cutoff: Score threshold; a numeric score strictly above it is class 1.

    Each item is ``((title_tokens, text_tokens, time), label)`` where the token
    lists are never empty: a field without usable tokens becomes
    ``["<|empty|>"]``.
    """

    def __init__(self, data, lables, cutoff):
        self.posts = data
        self.scores = lables
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        post = self.posts.loc[index]
        score = self.scores[index]

        # preproccess() returns [""] for missing (NaN) fields; drop blank tokens so
        # those posts fall back to the dedicated empty token as well.
        title_indexs = [token for token in preproccess(post["title"]) if token] or ["<|empty|>"]
        text_indexs = [token for token in preproccess(post["text"]) if token] or ["<|empty|>"]

        time = post["time"]

        # numpy integer scalars are not `float` instances; accept every real number
        # type so integer-typed score columns are not all labelled 0.
        is_number = isinstance(score, (int, float, np.integer, np.floating))
        score = 1 if (is_number and score > self.cutoff) else 0

        return ((title_indexs, text_indexs, time), score)


In [None]:
# non url training dataset
# Pipeline per sample: binary bag-of-words for title/text, then flatten to tensors.
cutoff = 20
transforms = [TextualTransform1(), TensorTransform()]

post_training_dataset = HackerNewsPostDataset(
    training_data_indexed, scores, cutoff=cutoff, transforms=transforms
)
post_testing_dataset = HackerNewsPostDataset(
    testing_data_indexed, testing_scores, cutoff=cutoff, transforms=transforms
)

In [None]:
# Token-list datasets for the embedding model, one per split, sharing the same cutoff.
embedding_dataset_train, embedding_dataset_test = (
    EmbeddingsDataset(frame, frame_scores, cutoff)
    for frame, frame_scores in (
        (training_data_indexed, scores),
        (testing_data_indexed, testing_scores),
    )
)

In [None]:
# Display the first item: ((title tokens, text tokens, time), label).
embedding_dataset_train[0]

In [None]:
def collate_batch_embed(batch):
    """Collate ``((title_tokens, text_tokens, time), score)`` items for an EmbeddingBag.

    Each item's title tokens are looked up in ``titles_to_index`` and its text
    tokens in ``text_to_index``; the two id lists are joined into one bag.

    Returns:
        lables:  int64 tensor with one ``[score]`` row per item.
        texts:   int64 tensor of all bags' ids laid end to end.
        offsets: tensor with the start position of every bag inside ``texts``.

    NOTE(review): ids from the two vocabularies are mixed in one sequence, so a
    title word and a text word with the same id address the same embedding row.
    """
    label_rows = []
    id_chunks = []
    bag_lengths = [0]

    for post, score in batch:
        title_ids = [titles_to_index[token] for token in post[0]]
        text_ids = [text_to_index[token] for token in post[1]]
        chunk = torch.tensor(title_ids + text_ids, dtype = torch.int64)

        id_chunks.append(chunk)
        label_rows.append([score])
        bag_lengths.append(chunk.size(0))

    lables = torch.tensor(label_rows, dtype = torch.int64)
    # Leading 0 plus every length but the last, prefix-summed, gives the bag starts.
    offsets = torch.tensor(bag_lengths[:-1]).cumsum(dim = 0)
    texts = torch.cat(id_chunks)
    return lables, texts, offsets



In [None]:
# Print the first transformed test sample (runs the full transform pipeline once).
print(post_testing_dataset[0])

In [None]:
# Scratch arithmetic: the product, 9800, is the value assigned to num_iterations below.
1960 * 5

In [None]:
# Loader settings and the shuffled training loader for the bag-of-words dataset.
batch_size = 100
num_epochs = 5
num_iterations = 9800
train_loader = DataLoader(post_training_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Test loader for the bag-of-words dataset; uses the shared `batch_size` setting
# instead of a second hard-coded 100 so the two cannot drift apart.
test_loader = torch.utils.data.DataLoader(dataset=post_testing_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Loaders for the embedding datasets; collate_batch_embed turns token lists into
# (labels, flat ids, offsets) batches.
train_loader_embed = DataLoader(
    embedding_dataset_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch_embed,
)
test_loader_embed = DataLoader(
    embedding_dataset_test,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch_embed,
)

In [None]:
# Pull one collated batch and print the size (and contents) of each component:
# first[0] = labels, first[1] = flat token ids, first[2] = bag offsets.
it = iter(train_loader_embed)
first = next(it)
print(len(first[0]), first[0])
print(len(first[1]))
print(len(first[2]), first[2])

In [None]:
# basic Feed Forward Neural Network

class FFNetwork(nn.Module):
    """One-hidden-layer feed-forward classifier: Linear -> ReLU -> Linear.

    ``forward`` returns raw logits. The notebook trains with
    ``nn.BCEWithLogitsLoss``, which applies the sigmoid itself; applying one
    here as well would squash the output twice. Use ``self.sigmoid(logits)``
    when probabilities are needed.

    Args:
        input_dimensions: Size of each input vector.
        hidden_dimensions: Width of the hidden layer.
        output_dimensions: Number of output logits.
    """

    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super(FFNetwork, self).__init__()

        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)

        self.nonlinear = nn.ReLU()

        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)

        # Not used by forward(); kept so callers can turn logits into probabilities.
        self.sigmoid = torch.sigmoid

    def forward(self, x):
        hidden = self.nonlinear(self.linear1(x))
        return self.linear2(hidden)
        

In [None]:
# basic Feed Forward Neural Network (Regression)

class FFNetworkReg(nn.Module):
    """One-hidden-layer feed-forward regressor: Linear -> ReLU -> Linear, no output activation.

    Args:
        input_dimensions: Size of each input vector.
        hidden_dimensions: Width of the hidden layer.
        output_dimensions: Number of output values.
    """

    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super().__init__()
        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)
        self.nonlinear = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)

    def forward(self, x):
        # project, rectify, project again - the result is returned unbounded
        return self.linear2(self.nonlinear(self.linear1(x)))

In [None]:
class FFNetworkEmbedding(nn.Module):
    """EmbeddingBag followed by a five-layer MLP that emits raw logits.

    Args:
        input_dim: Number of rows in the embedding table (largest token id + 1).
        embedding_dim: Width of each embedding vector.
        output_dim: Number of output logits.

    ``forward(x, offsets)`` takes the flat id tensor and bag start offsets
    produced by the collate function and returns one row of logits per bag.
    """

    def __init__(self, input_dim, embedding_dim, output_dim):
        super(FFNetworkEmbedding, self).__init__()

        self.embed = nn.EmbeddingBag(input_dim, embedding_dim)

        # NOTE(review): 2056 may have been meant as 2048 - left as is because
        # changing it alters the parameter shapes.
        self.linear1 = nn.Linear(embedding_dim, 2056)
        self.linear2 = nn.Linear(2056, 256)
        self.linear3 = nn.Linear(256, 64)
        self.linear4 = nn.Linear(64, 16)
        self.linear5 = nn.Linear(16, output_dim)

        self.relu = nn.ReLU()

    def forward(self, x, offsets):
        # EmbeddingBag pools every bag's embeddings into one vector (mean by default)
        x = self.embed(x, offsets)

        # hidden layers, each followed by ReLU
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            x = self.relu(layer(x))

        # No ReLU on the output layer: it would clamp every logit to >= 0, so the
        # predicted probability could never drop below 0.5.
        return self.linear5(x)
        

In [None]:
# Dimensions of the basic model: one input per title-vocabulary word, one per
# text-vocabulary word, plus one for the time value appended by TensorTransform.
input_dimensions = len(titles_vocab) + len(text_vocab) + 1
hidden_dimensions, output_dimensions = 1000, 1

# Build the bag-of-words classifier.
model = FFNetwork(
    input_dimensions=input_dimensions,
    hidden_dimensions=hidden_dimensions,
    output_dimensions=output_dimensions,
)

In [None]:
# Dimensions of the regression variant.
# NOTE(review): the fixed input size of 100 is not derived from the vocabularies, and
# model_reg is only referenced from commented-out code in the cells visible here.
input_dimensions_reg, hidden_dimensions_reg, output_dimensions_reg = 100, 1000, 1

# Build the regression network.
model_reg = FFNetworkReg(
    input_dimensions=input_dimensions_reg,
    hidden_dimensions=hidden_dimensions_reg,
    output_dimensions=output_dimensions_reg,
)

In [None]:
# Embedding table sized to the larger of the two vocabularies.
# NOTE(review): collate_batch_embed feeds title ids and text ids into this one table,
# so equal ids from the two vocabularies share an embedding row.
input_dim_embed = max(map(len, (text_vocab, titles_vocab)))
embedding_dim, output_dimensions_embed = 256, 1

model_embed = FFNetworkEmbedding(
    input_dim=input_dim_embed,
    embedding_dim=embedding_dim,
    output_dim=output_dimensions_embed,
)

In [None]:
#it = iter(train_loader)
#print(model(torch.stack(next(it)["post"])))

In [None]:
# define the loss function
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()

In [None]:
# define the optimizer
# NOTE: this SGD instance holds the parameters of `model` only.
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [None]:

#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model(batch["post"]))

In [None]:
#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model_reg(batch["post"]))

In [None]:
# define function for calculating the accuracy of the model
def get_model_accuracy(model, loader, threshold=0.0):
    """Percentage of samples in ``loader`` that ``model`` classifies correctly.

    Args:
        model: ``nn.Module`` called as ``model(batch["post"])`` and returning one
            value per sample.
        loader: Iterable of dict batches with ``"post"`` and ``"score"`` tensors;
            ``"score"`` holds 0/1 labels.
        threshold: A sample is predicted positive when the raw model output is
            strictly greater than this. ``0.0`` is the decision boundary for
            logits (what ``nn.BCEWithLogitsLoss`` trains); pass ``0.5`` for a
            model that already outputs probabilities.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when the loader is empty.
    """
    was_training = model.training
    model.float()
    model.eval()

    correct = 0
    total = 0
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for batch in loader:
                outputs = model(batch["post"])

                # Flatten both sides so an (N, 1) output is compared element-wise
                # with the labels instead of broadcasting to N x N.
                predicted = (outputs > threshold).reshape(-1).long()
                expected = batch["score"].reshape(-1).long()

                total += expected.size(0)
                correct += (predicted == expected).sum().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if total == 0:
        return 0.0
    return 100 * correct / total

In [None]:
# define function for training model
def train_model(model, train_loader, test_loader, loss, optimizer, epochs=None, log_every=50):
    """Train ``model`` on dict batches and periodically report test accuracy.

    Args:
        model: ``nn.Module`` called as ``model(batch["post"])``.
        train_loader: Iterable of dict batches with ``"post"`` and ``"score"``.
        test_loader: Loader handed to ``get_model_accuracy`` when logging.
        loss: Callable ``loss(predictions, scores)`` returning a scalar tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``; defaults to the
            module-level ``num_epochs``.
        log_every: Evaluate and print every this many iterations; a falsy value
            disables logging.

    Returns:
        The same ``model`` object, trained in place.
    """
    criterion = loss  # honour the argument rather than the global loss_func
    total_epochs = num_epochs if epochs is None else epochs

    model.float()  # once is enough; the dtype does not change between batches
    iteration = 0

    for epoch in range(total_epochs):
        print("Starting Epoch: " + str(epoch))
        for batch in train_loader:
            posts = batch["post"]
            scores = batch["score"]

            # set grads to 0
            optimizer.zero_grad()

            predictions = model(posts)

            # calculate loss
            batch_loss = criterion(predictions, scores)

            # backwards pass to calculate gradients
            batch_loss.backward()

            # update parameters
            optimizer.step()

            if log_every and iteration % log_every == 0:
                print("\n")
                accuracy = get_model_accuracy(model, test_loader)
                print("Iteration {}. Loss {}. Accuracy {}".format(iteration, batch_loss.item(), accuracy))
                print("\n")

            iteration += 1

    return model

In [None]:
def _embed_accuracy(model, loader):
    """Fraction (0-1) of samples in ``loader`` whose logit sign matches the label."""
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for test_lables, test_texts, test_offsets in loader:
            logits = model(test_texts, test_offsets)
            predicted = (logits > 0.0).reshape(-1).long()
            expected = test_lables.reshape(-1).long()
            total += expected.size(0)
            correct += (predicted == expected).sum().item()
    return correct / total if total else 0.0


def train_model_embed(model, train_loader, test_loader, loss, optimizer, epochs=None, eval_every=100):
    """Train an embedding-bag model and record periodic test accuracy and loss.

    Args:
        model: ``nn.Module`` called as ``model(texts, offsets)``.
        train_loader: Iterable of ``(lables, texts, offsets)`` batches.
        test_loader: Iterable of the same batch shape, used for evaluation.
        loss: Callable ``loss(predictions, lables.float())`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``; defaults to the
            module-level ``num_epochs``.
        eval_every: Evaluate every this many training iterations (batches); a
            falsy value disables evaluation.

    Returns:
        ``(model, accuracy_list, loss_list)`` - the trained model plus the test
        accuracy (0-1) and training-batch loss recorded at each evaluation.
    """
    criterion = loss  # honour the argument rather than the global loss_func
    total_epochs = num_epochs if epochs is None else epochs

    iterations = 0
    accuracy_list = []
    loss_list = []

    for epoch in range(total_epochs):
        for lables, texts, offsets in train_loader:
            # evaluation switches to eval mode, so re-enter train mode every step
            model.train(True)

            optimizer.zero_grad()
            predictions = model(texts, offsets)
            batch_loss = criterion(predictions, lables.float())
            batch_loss.backward()
            optimizer.step()

            if eval_every and iterations % eval_every == 0:
                accuracy = _embed_accuracy(model, test_loader)
                print("Iteration {}. Loss {}. Accuracy {}".format(iterations, batch_loss.item(), accuracy))
                accuracy_list.append(accuracy)
                loss_list.append(batch_loss.item())

            # Count batches, not epochs: with a per-epoch increment the counter sits
            # at 0 for the whole first epoch, so every batch triggers a full test pass.
            iterations += 1

    return model, accuracy_list, loss_list

In [None]:
# `optimizer` was built from `model.parameters()`, so stepping it would never update
# model_embed. Give the embedding model its own optimizer.
optimizer_embed = torch.optim.SGD(model_embed.parameters(), lr = learning_rate)
model_embed_final, accs, losses = train_model_embed(model_embed, train_loader_embed, test_loader_embed, loss_func, optimizer_embed)

In [None]:
# Train the bag-of-words classifier; `optimizer` was built from this model's parameters.
model_final = train_model(model, train_loader, test_loader, loss_func, optimizer)