In [56]:
# import standard libraries
import numpy as np

import warnings
import pandas as pd

from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [57]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset

In [58]:
# import training data
with open("../data/training_data", "rb") as fb:
    training_data = pickle.load(fb)

In [59]:
# import testing data
with open("../data/testing_data", "rb") as fb:
    testing_data = pickle.load(fb)

In [60]:
# try set gpu as training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [61]:
# preprocessing
training_data = training_data.loc[training_data.type == "story"]
testing_data = testing_data.loc[testing_data.type == "story"]

In [62]:
training_data_indexed = training_data.reset_index(drop=True)
testing_data_indexed = testing_data.reset_index(drop=True)
print(training_data_indexed)

                                                    title  \
0       Not sure if this is legal for US citizens but ...   
1       What happens to Americans who film police viol...   
2                                                     NaN   
3       Big data startup Gainsight uses big data to im...   
4                                                     NaN   
...                                                   ...   
457818  US lawmaker injects ISP throttle into Obama re...   
457819           Failure-Oblivious Computing (2004) [pdf]   
457820  Everything I Needed to Know I Learned in Math ...   
457821   Twitter employee live tweets WSOP from the table   
457822  Deep-learning vision system anticipates human ...   

                                                      url text  dead  \
0                                        https://betco.in  NaN   NaN   
1       http://www.theguardian.com/us-news/2015/aug/15...  NaN   NaN   
2                                                  

In [63]:
# define preproccessing function
stop_words = set(stopwords.words('english'))

def preproccess(text):
    if isinstance(text, float):
        return [""]
    
    # split into tokens
    tokens = re.split('\s+', text)

    # remove punctuation
    tokens = ["".join([i for i in x if i not in string.punctuation]) for x in tokens]

    # remove numbers
    tokens = [re.sub("\d+", "", x) for x in tokens]

    # make all tokens lowercase
    tokens = [x.lower() for x in tokens]

    # remove tokens which are too short or too long
    tokens = [token for token in tokens if len(token) > 2 and len(token) < 15]

    # remove hyperlinks
    tokens = [token for token in tokens if not (token.startswith("http") or token.startswith("www") or token.endswith("com"))]

    # remove stop words
    final = [word for word in tokens if word not in stop_words]

    if isinstance(final, float):
        return [""]

    return final

In [64]:
# apply preproccessing 

#training_titles_processed = [preproccess(x) for x in training_data_indexed["title"]]
#training_text_processed = [preproccess(x) for x in training_data_indexed["text"]]
training_titles_processed = training_data_indexed["title"].apply(lambda x: preproccess(x))
training_text_processed = training_data_indexed["text"].apply(lambda x: preproccess(x))
testing_titles_processed = testing_data_indexed["title"].apply(lambda x: preproccess(x))
testing_text_processed = testing_data_indexed["text"].apply(lambda x: preproccess(x))

In [65]:
# get unique words for titles
titles_vocab_train = list(dict.fromkeys(training_titles_processed.apply(pd.Series).stack().reset_index(drop = True)))
titles_vocab_test = list(dict.fromkeys(testing_titles_processed.apply(pd.Series).stack().reset_index(drop = True)))
titles_vocab = list(dict.fromkeys(titles_vocab_train + titles_vocab_test))


In [66]:
# get unique words for texts
text_vocab_train = list(dict.fromkeys(training_text_processed.apply(pd.Series).stack().reset_index(drop = True)))
text_vocab_test = list(dict.fromkeys(testing_text_processed.apply(pd.Series).stack().reset_index(drop = True)))
text_vocab = list(dict.fromkeys(text_vocab_train + text_vocab_test))


In [67]:
# save created vocabs
with open("titles_vocab", "wb") as fb:
    pickle.dump(titles_vocab, fb)

with open("text_vocab", "wb") as fb:
    pickle.dump(text_vocab, fb)


In [68]:
# load created vocabs
with open("titles_vocab", "rb") as fb:
    titles_vocab = pickle.load(fb)

with open("text_vocab", "rb") as fb:
    text_vocab = pickle.load(fb)

In [69]:
print(titles_vocab)



In [70]:
# add a special empty token for posts with empty titles or text
titles_vocab.append("<|empty|>")
text_vocab.append("<|empty|>")

In [71]:
print(titles_vocab)



In [72]:
print(training_data_indexed.loc[0])

title          Not sure if this is legal for US citizens but ...
url                                             https://betco.in
text                                                         NaN
dead                                                         NaN
by                                                  dickchaninin
score                                                        3.0
time                                                1304882921.0
type                                                       story
id                                                       2526540
parent                                                       NaN
descendants                                                  0.0
ranking                                                      NaN
deleted                                                      NaN
timestamp                                   2011-05-08T19:28:41Z
dates                                        2011-05-08 20:28:41
Name: 0, dtype: object


In [73]:
# define cols which are used in model
req_cols_with_url = ["title", "url", "text", "time"]
req_cols_without_url = ["title", "text", "time"]

In [74]:
# extract specific cols needed
scores = training_data_indexed.score
training_data_indexed = training_data_indexed[req_cols_without_url]

testing_scores = testing_data_indexed.score
testing_data_indexed = testing_data_indexed[req_cols_without_url]

In [75]:
print(training_data_indexed.loc[0])

title    Not sure if this is legal for US citizens but ...
text                                                   NaN
time                                          1304882921.0
Name: 0, dtype: object


In [76]:
# define basic Bag of Words function
def BOW_bin(words, vocab):
    return [1 if word in words else 0 for word in vocab]

def BOW_freq(words, vocab):
    return [words.count(word) for word in vocab]

In [77]:
# Function for getting domain name form url
def extract_domain(url):
    if not isinstance(url, str):
        return ""
    return urlparse(url).netloc

In [78]:
# define transformations of the data

class TextualTransform1(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]
        
        post["title"] = BOW_bin(preproccess(post["title"]), titles_vocab)
        post["text"] = BOW_bin(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class TextualTransform2(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]
        
        post["title"] = BOW_freq(preproccess(post["title"]), titles_vocab)
        post["text"] = BOW_freq(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class URLTransform(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]

        post["url"] = extract_domain(post["url"])

        return {'post': post, 'score': score}

class TensorTransform(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]

        title_list = post["title"]
        text_list = post["text"]
        time = post["time"]
        
        output = title_list + text_list
        output.append(time)

        output = torch.FloatTensor(output)
        score = torch.FloatTensor(score)

        return {"post": output, "score": score}
        


        


In [79]:
# define custom dataset for BoW model
class HackerNewsPostDataset(Dataset):

    def __init__(self, data, labels, cutoff = None, transforms = None):
        self.posts = data
        self.scores = labels
        self.transforms = transforms
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        post = self.posts.loc[index]
        score = self.scores[index]

        if self.cutoff:
            score = [(1 if isinstance(score, float) and score > self.cutoff else 0)]

        sample = {'post': post, 'score': score}

        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)

        return sample

In [80]:
# define a index dict for getting word embeddings
titles_to_index = {word: index for index, word in enumerate(titles_vocab)}
text_to_index = {word: index for index, word in enumerate(text_vocab)}

In [81]:
# define custom dataset for word embeddings model
class EmbeddingsDataset(Dataset):

    def __init__(self, data, lables, cutoff):
        self.posts = data
        self.scores = lables
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):

        post = self.posts.loc[index]
        score = self.scores[index]

        title_indexs = preproccess(post["title"])
        text_indexs = preproccess(post["text"])

        if title_indexs == []:
            title_indexs = ["<|empty|>"]

        if text_indexs == []:
            text_indexs = ["<|empty|>"]

        time = post["time"]

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        #score = torch.FloatTensor(score)

        sample = ((title_indexs, text_indexs, time), score)
        return sample


In [82]:
# non url training dataset for BoW model
transforms = [TextualTransform1(), TensorTransform()]
cutoff = 20

post_training_dataset = HackerNewsPostDataset(training_data_indexed, scores, cutoff, transforms)
post_testing_dataset = HackerNewsPostDataset(testing_data_indexed, testing_scores, cutoff, transforms)

In [83]:
# create embedding datasets
embedding_dataset_train = EmbeddingsDataset(training_data_indexed, scores, cutoff)
embedding_dataset_test = EmbeddingsDataset(testing_data_indexed, testing_scores, cutoff)

In [84]:
# As dataset is imbalanced define a weighted sampler
from torch.utils.data import WeightedRandomSampler

num_popular = (scores > cutoff).sum()

unpopular_weight = 1
popular_weight = (len(embedding_dataset_train) - num_popular) / num_popular


class_weights = [unpopular_weight, popular_weight]
weights = [0] * len(embedding_dataset_train)

for index, score in enumerate(scores):
    weight = class_weights[int(score > cutoff)]
    weights[index] = weight

sampler = WeightedRandomSampler(weights, num_samples = len(weights), replacement = True)

In [85]:
print(unpopular_weight, popular_weight)

1 11.369248642368897


In [86]:
embedding_dataset_train[0]

((['sure', 'legal', 'citizens', 'play', 'poker'], [''], 1304882921.0), 0)

In [87]:
# define function for collating batches
def collate_batch_embed(batch):
    lables, texts, offsets = [], [], [0]
    for post, score in batch:

        titles_indexs = [titles_to_index[word] for word in post[0]]
        text_indexs = [text_to_index[word] for word in post[1]]
        
        proccessed_input = torch.tensor(titles_indexs + text_indexs, dtype = torch.int64).to(device)

        texts.append(proccessed_input)
        lables.append([score])
        offsets.append(proccessed_input.size(0))

    lables = torch.tensor(lables, dtype = torch.int64).to(device)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim = 0).to(device)
    texts = torch.cat(texts)
    return lables, texts, offsets



In [88]:
print(post_testing_dataset[0])

{'post': tensor([0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
        1.4248e+09]), 'score': tensor([0.])}


In [89]:
1960 * 5

9800

In [90]:
# create dataloaders
batch_size = 128
num_iterations = 9800
num_epochs = 5
train_loader = torch.utils.data.DataLoader(dataset=post_training_dataset, batch_size=batch_size, shuffle=True)

In [91]:
test_loader = torch.utils.data.DataLoader(dataset=post_testing_dataset, batch_size=100, shuffle=True)

In [92]:
train_loader_embed = torch.utils.data.DataLoader(dataset = embedding_dataset_train, batch_size=batch_size, sampler=sampler, 
                                                collate_fn = collate_batch_embed)
test_loader_embed = torch.utils.data.DataLoader(dataset = embedding_dataset_test, batch_size=batch_size, sampler=sampler, 
                                                collate_fn = collate_batch_embed)

In [93]:
it = iter(train_loader_embed)
first = next(it)
print(len(first[0]), first[0])
print(len(first[1]))
print(len(first[2]), first[2])

128 tensor([[0],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        

In [94]:
# basic Feed Forward Neural Network

class FFNetwork(nn.Module):
    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super(FFNetwork, self).__init__()

        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)

        self.nonlinear = nn.ReLU()
    
        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)

        self.sigmoid = torch.sigmoid
    
    def forward(self, x):

        x = self.linear1(x)

        x = self.nonlinear(x)

        output = self.linear2(x)

        output = self.sigmoid(output)
        return output
        

In [95]:
# basic Feed Forward Neural Network (Regression)

class FFNetworkReg(nn.Module):
    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super(FFNetworkReg, self).__init__()

        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)

        self.nonlinear = nn.ReLU()
    
        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)
    
    def forward(self, x):

        x = self.linear1(x)

        x = self.nonlinear(x)

        output = self.linear2(x)

        return output

In [96]:
# Define Fully Connected FF network for the Embedding model

class FFNetworkEmbedding(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim):
        super(FFNetworkEmbedding, self).__init__()

        self.embed = nn.EmbeddingBag(input_dim, embedding_dim)

        self.linear1 = nn.Linear(embedding_dim, 64)
        self.linear2 = nn.Linear(64, 16)
        self.linear3 = nn.Linear(16, output_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        

    def forward(self, x, offsets):

        # apply an emedding bag layer to get average of all embeddings
        x = self.embed(x, offsets)

        # apply linear functions
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.relu(self.linear3(x))

        # apply dropout to avoid overfitting
        x = self.dropout(x)

        return x
        

In [97]:
# define the dimensions of the basic model
input_dimensions = len(text_vocab) + len(titles_vocab) + 1
hidden_dimensions = 1000
output_dimensions = 1

# instantiate the class we are using for this model
model = FFNetwork(input_dimensions, hidden_dimensions, output_dimensions).to(device)

In [98]:
# define the dimensions of the basic model
input_dimensions_reg = 100
hidden_dimensions_reg = 1000
output_dimensions_reg = 1

# instantiate the class we are using for this model
model_reg = FFNetworkReg(input_dimensions_reg, hidden_dimensions_reg, output_dimensions_reg).to(device)

In [99]:
# define dimensions of embedding model
input_dim_embed = max(len(text_vocab), len(titles_vocab))
embedding_dim = 1024
output_dimensions_embed = 1

# instansiate the embedding model
model_embed = FFNetworkEmbedding(input_dim_embed, embedding_dim, output_dimensions_embed).to(device)

In [100]:
#it = iter(train_loader)
#print(model(torch.stack(next(it)["post"])))

In [101]:
# define loss functions class
# we choose to use BCEWithLogitsLoss as it provides the functionality of both a
# sigmoid layer and BCE loss in one. Is more efficient (look up details for diss)
loss_func = nn.BCEWithLogitsLoss()

In [102]:
# define optimizer class
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [103]:

#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model(batch["post"]))

In [104]:
#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model_reg(batch["post"]))

In [105]:
# define function for calculating the accuracy of a model (current only used for BoW model)
def get_model_accuracy(model, loader):
    correct = 0
    total = 0

    for (batch_index, batch) in enumerate(loader):
        
        model.float()
        posts = batch["post"]
        scores = batch["score"]

        # get predirction probablities
        predictions_prob = model(posts)

        # get class predictions
        _, predictied = torch.max(predictions_prob.data, 1)

        # calculate tota samples predicted and correct
        total = total + scores.size(0)
        correct = correct + (predictied == scores).sum()

    accuracy = 100 * correct / total
    return accuracy,

In [106]:
# define function for training a model (also only usable for the BoW model)
def train_model(model, train_loader, test_loader, loss, optimizer):
    iteration = 0
    
    for epoch in range(num_epochs):
        print("Starting Epoch: " + str(epoch))
        for (batch_index, batch) in enumerate(train_loader):
            #print("Iteration " + str(iteration))
            model.float()
            posts = batch["post"]
            scores = batch["score"]

            # set grads to 0
            optimizer.zero_grad()

            predictions = model(posts)


            # calculate loss
            loss = loss_func(predictions, scores)

            # backwards pass to calculate gradients
            loss.backward()

            # update parameters
            optimizer.step()

            if iteration % 50 == 0:
                print("\n")
                accuracy = get_model_accuracy(model, test_loader)
                print("Iteration {}. Loss {}. Accuracy {}".format(iteration, loss.item(), accuracy))
                print("\n")

            iteration += 1

    return model

In [107]:
# define function for training a embedding model
def train_model_embed(model, train_loader, test_loader, loss, optimizer):
    iterations = 0
    accuracy_list = []
    loss_list = []

    # make sure model is being trained on gpu
    model.to(device)

    for epoch in range(num_epochs):
        print("Epoch {}.".format(epoch))
        running_loss = 0

        for index, batch in enumerate(train_loader):
            lables, texts, offsets = batch

            # set the model to track graidents for training
            model.train(True)

            # reset optimizer grads to zero
            optimizer.zero_grad()

            # forward pass of the model
            predictions = model(texts, offsets)

            # calculate loss
            loss = loss_func(predictions, lables.float())

            # perform backwards pass
            loss.backward()

            # tune weights with optimizer
            optimizer.step()

            # stop gradient tracking for writing current model metrics
            model.train(False)          

            # track the running loss
            running_loss += loss.item()

            if iterations != 0 and iterations % 100 == 0:

                print(" Iteration {}. Running Loss {}.".format( iterations, running_loss / 100))
    
                # reset running loss
                running_loss = 0

                """
                for test_index, test_batch in enumerate(test_loader):

                    test_lables, test_texts, test_offsets = test_batch

                    output_preds = model(test_texts, test_offsets)
                    output_preds = [1 if output > 0.0 else 0 for output in output_preds]

                    total = total + test_lables.size(0)

                    for i in range(len(test_lables)):
                        if output_preds[i] == test_lables[i]:
                            correct += 1
                

                accuracy = correct / total
                print("Iteration {}. Loss {}. Accuracy {}".format(iterations, loss.item(), accuracy))
                accuracy_list.append(accuracy)
                loss_list.append(loss.item())
                """
            iterations = iterations + 1
    return model, accuracy_list, loss_list

In [108]:
# train embedding model
model_embed_final, accs, losses = train_model_embed(model_embed, train_loader_embed, test_loader_embed, loss_func, optimizer)

Epoch 0.
 Iteration 100. Running Loss 0.7000538402795792.
 Iteration 200. Running Loss 0.6931304585933685.
 Iteration 300. Running Loss 0.6931707602739334.
 Iteration 400. Running Loss 0.6930930346250535.


KeyboardInterrupt: 

In [None]:
# train BoW model
# currently this model takes way to long and takes up too much memory to work
model_final = train_model(model, train_loader, test_loader, loss_func, optimizer)