In [None]:
# import standard libraries
import numpy as np

import warnings
import pandas as pd

# SettingWithCopyWarning is exposed from pandas.errors on pandas >= 1.5; older
# releases only expose it from pandas.core.common, so try both locations.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build has no such warning class, so there is nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [None]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset

In [None]:
# Load the pickled training data from ../data/training_data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open files that come from a trusted source.
with open("../data/training_data", "rb") as fb:
    training_data = pickle.load(fb)

In [None]:
# Load the pickled testing data from ../data/testing_data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open files that come from a trusted source.
with open("../data/testing_data", "rb") as fb:
    testing_data = pickle.load(fb)

In [None]:
# Pick the training device: "cuda" when torch reports a usable GPU, otherwise "cpu".
# NOTE(review): nothing else visible in this notebook moves the models or the
# batches onto `device`, so as written the computation stays on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# preprocessing: keep only the rows whose `type` column equals "story" in both splits
training_data = training_data[training_data["type"] == "story"]
testing_data = testing_data[testing_data["type"] == "story"]

In [None]:
# Renumber both filtered splits from 0 (dropping the old index) so rows can be
# addressed by consecutive integers, then show the training frame.
training_data_indexed, testing_data_indexed = (
    frame.reset_index(drop=True) for frame in (training_data, testing_data)
)
print(training_data_indexed)

In [None]:
# English stop words as a set, used by preproccess() to drop tokens.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be installed
# locally (nltk.download("stopwords")) - confirm for a fresh environment.
stop_words = set(stopwords.words('english'))

def preproccess(text, stopword_set=None):
    """Turn raw text into a list of lower-cased, filtered tokens.

    Steps: strip every character in ``string.punctuation``, lower-case, split
    on whitespace, drop tokens of two characters or fewer, drop stop words.

    Args:
        text: The raw text. Any non-``str`` value (e.g. a missing field) is
            treated as the empty string.
        stopword_set: Optional collection of tokens to discard. Defaults to
            the notebook-level ``stop_words`` set.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    if not isinstance(text, str):
        text = ""

    # Delete all punctuation characters in a single pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # NOTE(review): "-" is part of string.punctuation and has already been
    # removed above, so this substitution can never match. It is kept (as a
    # raw string, avoiding the invalid "\d" escape) so tokenisation stays
    # exactly as before; decide whether it was meant to run before the
    # punctuation strip.
    text = re.sub(r"\d-", "", text)

    tokens = re.split(r"\s+", text.lower())
    return [
        token
        for token in tokens
        if len(token) > 2 and token not in stopword_set
    ]

In [None]:
#training_data_indexed["titles_processed"] = training_data_indexed["title"].apply(lambda x: preproccess(x))
#training_data_indexed["text_processed"] = training_data_indexed["text"].apply(lambda x: preproccess(x))

In [None]:
#titles_vocab = list(dict.fromkeys(sum(list(training_data_indexed.titles_processed), [])))

In [None]:
#text_vocab = list(dict.fromkeys(sum(list(training_data_indexed.text_processed), [])))

In [None]:
# Load the precomputed title and text vocabularies from pickle files in the
# notebook's working directory. The commented-out cells above appear to show
# how these vocabularies were originally derived from the processed training data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open files that come from a trusted source.
with open("titles_vocab", "rb") as fb:
    titles_vocab = pickle.load(fb)

with open("text_vocab", "rb") as fb:
    text_vocab = pickle.load(fb)

In [None]:
# Inspect the first training row (index label 0 after the index reset).
print(training_data_indexed.loc[0])

In [None]:
# Feature columns to keep from each post, with and without the `url` column.
req_cols_with_url = ["title", "url", "text", "time"]
req_cols_without_url = ["title", "text", "time"]

In [None]:
# Pull the `score` column out as the labels, then reduce each frame to the
# url-free feature columns.
scores = training_data_indexed["score"]
training_data_indexed = training_data_indexed.loc[:, req_cols_without_url]

testing_scores = testing_data_indexed["score"]
testing_data_indexed = testing_data_indexed.loc[:, req_cols_without_url]

In [None]:
# Inspect the first training row again, after the column selection above.
print(training_data_indexed.loc[0])

In [None]:
def BOW_bin(words, vocab):
    """Build a binary bag-of-words vector.

    Args:
        words: Iterable of (hashable) tokens found in one document.
        vocab: Ordered sequence of vocabulary tokens.

    Returns:
        list[int]: One entry per vocabulary token, in ``vocab`` order: 1 if the
        token occurs in ``words``, otherwise 0.
    """
    # A set gives O(1) membership tests instead of rescanning the token list
    # once for every vocabulary entry.
    present = set(words)
    return [1 if word in present else 0 for word in vocab]

def BOW_freq(words, vocab):
    """Build a term-frequency bag-of-words vector.

    Args:
        words: Iterable of (hashable) tokens found in one document.
        vocab: Ordered sequence of vocabulary tokens.

    Returns:
        list[int]: One entry per vocabulary token, in ``vocab`` order, holding
        the number of times that token occurs in ``words``.
    """
    # Count every token once up front rather than calling list.count() for
    # each vocabulary entry.
    counts = {}
    for token in words:
        counts[token] = counts.get(token, 0) + 1
    return [counts.get(word, 0) for word in vocab]

In [None]:
def extract_domain(url):
    """Return the network-location part of ``url``.

    Any value that is not a ``str`` (for example a missing URL) yields the
    empty string.
    """
    return urlparse(url).netloc if isinstance(url, str) else ""

In [None]:
# define transformations of the data

class TextualTransform1(object):
    """Encode a post's title and text as binary bag-of-words vectors.

    Called with a sample ``{"post": ..., "score": ...}``; returns a sample of
    the same shape whose post has ``title`` encoded against ``titles_vocab``
    and ``text`` encoded against ``text_vocab``.
    """

    def __call__(self, sample):
        score = sample["score"]
        # Work on a copy so the caller's post object is left untouched.
        post = sample["post"].copy()

        post["title"] = BOW_bin(preproccess(post["title"]), titles_vocab)
        # The text vector must come from the post body, not from the title
        # (which by this point has already been replaced by its vector).
        post["text"] = BOW_bin(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class TextualTransform2(object):
    """Encode a post's title and text as term-frequency bag-of-words vectors.

    Called with a sample ``{"post": ..., "score": ...}``; returns a sample of
    the same shape whose post has ``title`` encoded against ``titles_vocab``
    and ``text`` encoded against ``text_vocab``.
    """

    def __call__(self, sample):
        score = sample["score"]
        # Work on a copy so the caller's post object is left untouched.
        post = sample["post"].copy()

        post["title"] = BOW_freq(preproccess(post["title"]), titles_vocab)
        # The text vector must come from the post body, not from the title
        # (which by this point has already been replaced by its vector).
        post["text"] = BOW_freq(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class URLTransform(object):
    """Replace a post's ``url`` field with the value ``extract_domain`` returns for it."""

    def __call__(self, sample):
        entry = sample["post"]
        label = sample["score"]

        # Overwrites the field on the post object that was passed in.
        entry["url"] = extract_domain(entry["url"])

        return {'post': entry, 'score': label}

class TensorTransform(object):
    """Flatten an encoded post into a single 1-D float32 feature tensor.

    Expects ``sample["post"]`` to provide ``title`` and ``text`` as sequences
    of numbers plus a numeric ``time`` value. The resulting features are laid
    out as ``[*title, *text, time]``. The score is passed through unchanged.
    """

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]

        # Build a fresh list so the vectors stored on the post are not modified.
        features = list(post["title"]) + list(post["text"])
        # NOTE(review): `time` is appended as-is with no scaling - confirm its
        # magnitude is suitable next to the 0/1 (or count) vocabulary features.
        features.append(post["time"])

        # Return one tensor. Splitting it into a Python list of 0-d tensors
        # would make the DataLoader collate per feature (a list of
        # batch-sized tensors), which the models cannot consume.
        return {"post": torch.tensor(features, dtype=torch.float32), "score": score}
        


        


In [None]:
class HackerNewsPostDataset(Dataset):
    """Dataset pairing each post row with its score.

    Args:
        data: Posts, one per row (a pandas DataFrame in this notebook).
        labels: Scores aligned row-for-row with ``data``.
        transforms: Optional iterable of callables. Each takes and returns a
            ``{"post": ..., "score": ...}`` sample; they are applied in order.
    """

    def __init__(self, data, labels, transforms = None):
        self.posts = data
        self.scores = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.posts)

    @staticmethod
    def _take(container, index):
        """Fetch by position: ``.iloc`` for pandas objects, plain indexing otherwise."""
        positional = getattr(container, "iloc", None)
        return container[index] if positional is None else positional[index]

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        # Samplers hand out positions 0..len-1, so look rows up by position;
        # label-based lookup only works when the index is exactly 0..n-1.
        post = self._take(self.posts, index)
        if hasattr(post, "copy"):
            # Transforms overwrite fields on the post; keep the source data intact.
            post = post.copy()
        score = self._take(self.scores, index)

        sample = {'post': post, 'score': score}

        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)

        return sample

In [None]:
# Datasets without the url feature: each sample is encoded with binary
# bag-of-words vectors (TextualTransform1) and then flattened by TensorTransform.
# The same transform list is shared by the training and the testing dataset.
transforms = [TextualTransform1(), TensorTransform()]

post_training_dataset = HackerNewsPostDataset(training_data_indexed, scores, transforms)
post_testing_dataset = HackerNewsPostDataset(testing_data_indexed, testing_scores, transforms)

In [None]:
# Scratch arithmetic: evaluates to 9800, the value assigned to `num_iterations`
# in the dataloader cell (presumably batches per epoch x number of epochs - confirm).
1960 * 5

In [None]:
# Training configuration and the shuffled loader over the training dataset.
batch_size = 100
num_iterations = 9800
num_epochs = 5
train_loader = DataLoader(post_training_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Shuffled loader over the testing dataset, using the same batch size as training.
test_loader = DataLoader(post_testing_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# basic Feed Forward Neural Network

class FFNetwork(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear -> Softmax.

    Args:
        input_dimensions: Size of each input feature vector.
        hidden_dimensions: Width of the hidden layer.
        output_dimensions: Number of outputs the softmax is taken over.
    """

    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super(FFNetwork, self).__init__()

        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)

        self.nonlinear = nn.ReLU()

        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)

        # Normalise over the output dimension explicitly. Omitting `dim`
        # relies on a deprecated implicit choice and emits a warning.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return softmax-normalised outputs for ``x`` (last dimension sums to 1)."""
        hidden = self.nonlinear(self.linear1(x))
        output = self.linear2(hidden)
        return self.softmax(output)
        

In [None]:
# basic Feed Forward Neural Network (Regression)

class FFNetworkReg(nn.Module):
    """Feed-forward regressor: Linear -> ReLU -> Linear, with no output activation."""

    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super().__init__()
        # Input -> hidden projection, the non-linearity, then hidden -> output.
        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)
        self.nonlinear = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)

    def forward(self, x):
        """Return the raw output of the second linear layer for ``x``."""
        hidden = self.nonlinear(self.linear1(x))
        return self.linear2(hidden)

In [None]:
# Dimensions of the softmax network (FFNetwork).
# The input width is the title vocabulary size plus the text vocabulary size
# plus 1, matching the title vector, text vector and single `time` value that
# TensorTransform concatenates.
input_dimensions = len(titles_vocab) + len(text_vocab) + 1 
hidden_dimensions = 1000
output_dimensions = 2

# instantiate the class we are using for this model
model = FFNetwork(input_dimensions, hidden_dimensions, output_dimensions)

In [None]:
# Dimensions of the regression network (FFNetworkReg).
# Same input width as the softmax network; a single output value per post.
input_dimensions_reg = len(titles_vocab) + len(text_vocab) + 1 
hidden_dimensions_reg = 1000
output_dimensions_reg = 1

# instantiate the class we are using for this model
model_reg = FFNetworkReg(input_dimensions_reg, hidden_dimensions_reg, output_dimensions_reg)

In [None]:
# Smoke test: push the first training batch through the untrained regression model.
# Samples are keyed "post" (see HackerNewsPostDataset), not "posts".
for batch in train_loader:
    print(model_reg(batch["post"]))
    break

In [None]:
# Loss function: mean squared error between predictions and targets.
loss_func = nn.MSELoss()

In [None]:
# Optimizer: plain SGD over the parameters of `model_reg`, the network that is
# passed to train_model. Building it over `model.parameters()` would leave the
# trained network's weights untouched by optimizer.step().
learning_rate = 0.2
optimizer = torch.optim.SGD(model_reg.parameters(), lr = learning_rate)

In [None]:

#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model(batch["post"]))

In [None]:
#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model_reg(batch["post"]))

In [None]:
# define function for calculating the accuracy of the model
def get_model_accuracy(model, loader):
    """Compute the percentage of samples whose predicted class equals the score.

    The predicted class is the index of the largest model output along
    dimension 1.

    NOTE(review): for a model with a single output column that index is
    always 0, so the result only counts scores equal to 0 - confirm this is
    the intended metric for the regression model.

    Args:
        model: Callable mapping a batch of posts to a 2-D tensor of outputs.
        loader: Iterable of batches, each a mapping with "post" and "score".

    Returns:
        float: Accuracy in percent; 0.0 when the loader yields no samples.
    """
    correct = 0
    total = 0

    # Evaluation only: skip building the autograd graph.
    with torch.no_grad():
        for batch in loader:
            posts = batch["post"]
            scores = batch["score"]

            # Index of the largest output per sample is the predicted class.
            outputs = model(posts)
            _, predicted = torch.max(outputs, 1)

            total += scores.size(0)
            correct += (predicted == scores).sum().item()

    if total == 0:
        return 0.0
    # Return a plain number (the stray trailing comma used to wrap it in a tuple).
    return 100 * correct / total

In [None]:
# define function for training model
def train_model(model, train_loader, test_loader, loss, optimizer, epochs=None, eval_every=10):
    """Train ``model`` with mini-batch gradient steps and periodic evaluation.

    Args:
        model: Network to train; called on each batch's "post" tensor.
        train_loader: Iterable of training batches with "post" and "score".
        test_loader: Iterable of evaluation batches, passed to
            ``get_model_accuracy``.
        loss: Callable ``loss(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer that owns ``model``'s parameters.
        epochs: Number of passes over ``train_loader``. Defaults to the
            notebook-level ``num_epochs``.
        eval_every: Evaluate and report every this many iterations; 0 or
            ``None`` disables evaluation.

    Returns:
        The same ``model`` object, after training.
    """
    if epochs is None:
        epochs = num_epochs

    iteration = 0
    for epoch in range(epochs):
        print("Starting Epoch: " + str(epoch))
        for batch in train_loader:
            print("Iteration " + str(iteration))

            posts = batch["post"]
            scores = batch["score"]

            # set grads to 0
            optimizer.zero_grad()

            # forward pass
            predictions = model(posts)

            # Single-output regression: predictions arrive as (batch, 1) while
            # the targets are (batch,), often integer-typed. Align shape and
            # dtype so the loss compares element-wise instead of broadcasting.
            if predictions.dim() == scores.dim() + 1 and predictions.size(-1) == 1:
                predictions = predictions.squeeze(-1)
                scores = scores.to(predictions.dtype)

            # Use the loss callable that was passed in (not a notebook global),
            # under a separate name so the parameter is not shadowed.
            batch_loss = loss(predictions, scores)

            # backwards pass to calculate gradients
            batch_loss.backward()

            # update parameters
            optimizer.step()

            if eval_every and iteration % eval_every == 0:
                print("\n")
                accuracy = get_model_accuracy(model, test_loader)
                print("Iteration {}. Loss {}. Accuracy {}".format(iteration, batch_loss.item(), accuracy))
                print("\n")

            iteration += 1

    return model

In [None]:
# Train the regression network. train_model returns the model object it was
# given, so `model_final` and `model_reg` refer to the same network.
model_final = train_model(model_reg, train_loader, test_loader, loss_func, optimizer)