In [1]:
# import standard libraries
import os
import numpy as np

import warnings
import pandas as pd

from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import datetime
import pickle
from nltk.corpus import stopwords
import re
import regex
import string
from urllib.parse import urlparse

In [2]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset

In [3]:
# import openai and define key
import openai
openai.api_key = "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"

In [4]:
# import training data
with open("../data/training_data", "rb") as fb:
    training_data = pickle.load(fb)

In [5]:
# import testing data
with open("../data/testing_data", "rb") as fb:
    testing_data = pickle.load(fb)

In [6]:
# try set gpu as training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
#device = "cpu"

cuda


In [7]:
# preprocessing
training_data = training_data.loc[training_data.type == "story"]
testing_data = testing_data.loc[testing_data.type == "story"]

In [8]:
training_data_indexed = training_data.reset_index(drop=True)
testing_data_indexed = testing_data.reset_index(drop=True)
print(training_data_indexed)

                                                    title  \
0       Not sure if this is legal for US citizens but ...   
1       What happens to Americans who film police viol...   
2                                                     NaN   
3       Big data startup Gainsight uses big data to im...   
4                                                     NaN   
...                                                   ...   
457818  US lawmaker injects ISP throttle into Obama re...   
457819           Failure-Oblivious Computing (2004) [pdf]   
457820  Everything I Needed to Know I Learned in Math ...   
457821   Twitter employee live tweets WSOP from the table   
457822  Deep-learning vision system anticipates human ...   

                                                      url text  dead  \
0                                        https://betco.in  NaN   NaN   
1       http://www.theguardian.com/us-news/2015/aug/15...  NaN   NaN   
2                                                  

In [9]:
# define preproccessing function
stop_words = set(stopwords.words('english'))

def preproccess(text):
    if isinstance(text, float):
        return [""]
    
    # split into tokens
    tokens = re.split('\s+', text)

    # remove punctuation
    tokens = ["".join([i for i in x if i not in string.punctuation]) for x in tokens]

    # remove numbers
    tokens = [re.sub("\d+", "", x) for x in tokens]

    # make all tokens lowercase
    tokens = [x.lower() for x in tokens]

    # remove tokens which are too short or too long
    tokens = [token for token in tokens if len(token) > 2 and len(token) < 15]

    # remove hyperlinks
    tokens = [token for token in tokens if not (token.startswith("http") or token.startswith("www") or token.endswith("com"))]

    # remove stop words
    #final = [word for word in tokens if word not in stop_words]

    if isinstance(tokens, float):
        return [""]

    return tokens

In [10]:
# apply preproccessing 

#training_titles_processed = [preproccess(x) for x in training_data_indexed["title"]]
#training_text_processed = [preproccess(x) for x in training_data_indexed["text"]]
training_titles_processed = training_data_indexed["title"].apply(lambda x: preproccess(x))
training_text_processed = training_data_indexed["text"].apply(lambda x: preproccess(x))
testing_titles_processed = testing_data_indexed["title"].apply(lambda x: preproccess(x))
testing_text_processed = testing_data_indexed["text"].apply(lambda x: preproccess(x))

In [11]:
# flag for whether to create and save vocabs
write_vocabs = False

In [12]:

if write_vocabs:
    # get unique words for titles
    titles_vocab_train = list(dict.fromkeys(training_titles_processed.apply(pd.Series).stack().reset_index(drop = True)))
    titles_vocab_test = list(dict.fromkeys(testing_titles_processed.apply(pd.Series).stack().reset_index(drop = True)))
    titles_vocab = list(dict.fromkeys(titles_vocab_train + titles_vocab_test))


In [13]:
if write_vocabs:
    # get unique words for texts
    text_vocab_train = list(dict.fromkeys(training_text_processed.apply(pd.Series).stack().reset_index(drop = True)))
    text_vocab_test = list(dict.fromkeys(testing_text_processed.apply(pd.Series).stack().reset_index(drop = True)))
    text_vocab = list(dict.fromkeys(text_vocab_train + text_vocab_test))


In [14]:
# save created vocabs
if write_vocabs:
    with open("titles_vocab", "wb") as fb:
        pickle.dump(titles_vocab, fb)

    with open("text_vocab", "wb") as fb:
        pickle.dump(text_vocab, fb)


In [15]:
# load created vocabs
with open("titles_vocab", "rb") as fb:
    titles_vocab = pickle.load(fb)

with open("text_vocab", "rb") as fb:
    text_vocab = pickle.load(fb)

In [16]:
print(titles_vocab)



In [17]:
# add a special empty token for posts with empty titles or text
titles_vocab.append("<|empty|>")
text_vocab.append("<|empty|>")

In [18]:
print(training_data_indexed.loc[0])

title          Not sure if this is legal for US citizens but ...
url                                             https://betco.in
text                                                         NaN
dead                                                         NaN
by                                                  dickchaninin
score                                                        3.0
time                                                1304882921.0
type                                                       story
id                                                       2526540
parent                                                       NaN
descendants                                                  0.0
ranking                                                      NaN
deleted                                                      NaN
timestamp                                   2011-05-08T19:28:41Z
dates                                        2011-05-08 20:28:41
Name: 0, dtype: object


In [19]:
# define cols which are used in model
req_cols_with_url = ["title", "url", "text", "time"]
req_cols_without_url = ["title", "text", "time"]

In [20]:
# extract specific cols needed
scores = training_data_indexed.score
training_data_indexed = training_data_indexed[req_cols_without_url]

testing_scores = testing_data_indexed.score
testing_data_indexed = testing_data_indexed[req_cols_without_url]

In [21]:
training_data_indexed.title = training_data_indexed.title.fillna("")
training_data_indexed.text = training_data_indexed.fillna("")

testing_data_indexed.title = testing_data_indexed.title.fillna("")
testing_data_indexed.text = testing_data_indexed.text.fillna("")

In [22]:
print(training_data_indexed.loc[0])

title    Not sure if this is legal for US citizens but ...
text     Not sure if this is legal for US citizens but ...
time                                          1304882921.0
Name: 0, dtype: object


In [23]:
# define basic Bag of Words function
def BOW_bin(words, vocab):
    return [1 if word in words else 0 for word in vocab]

def BOW_freq(words, vocab):
    return [words.count(word) for word in vocab]

In [24]:
# Function for getting domain name form url
def extract_domain(url):
    if not isinstance(url, str):
        return ""
    return urlparse(url).netloc

In [25]:
# Function for gettings the embeddings of text
def get_embeddings(text, model = "text-similarity-ada-001"):
    embeddings = openai.Embedding.create(input = text, model = model)
    return embeddings["data"][0]["embedding"]

In [26]:
#training_data_indexed["title"] = training_data_indexed["title"].fillna("")
#training_data_indexed["text"] = training_data_indexed["text"].fillna("")

In [27]:
#training_titles_embeddings = training_data_indexed["title"].apply(lambda x: get_embeddings(x))
#text_titles_embeddings = training_data_indexed["text"].apply(lambda x: get_embeddings(x))

In [28]:
# define transformations of the data

class TextualTransform1(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]
        
        post["title"] = BOW_bin(preproccess(post["title"]), titles_vocab)
        post["text"] = BOW_bin(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class TextualTransform2(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]
        
        post["title"] = BOW_freq(preproccess(post["title"]), titles_vocab)
        post["text"] = BOW_freq(preproccess(post["text"]), text_vocab)

        return {'post': post, 'score': score}

class URLTransform(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]

        post["url"] = extract_domain(post["url"])

        return {'post': post, 'score': score}

class TensorTransform(object):

    def __call__(self, sample):
        post, score = sample["post"], sample["score"]

        title_list = post["title"]
        text_list = post["text"]
        time = post["time"]
        
        output = title_list + text_list
        output.append(time)

        output = torch.FloatTensor(output)
        score = torch.FloatTensor(score)

        return {"post": output, "score": score}
        


        


In [29]:
# define custom dataset for BoW model
class HackerNewsPostDataset(Dataset):

    def __init__(self, data, labels, cutoff = None, transforms = None):
        self.posts = data
        self.scores = labels
        self.transforms = transforms
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        post = self.posts.loc[index]
        score = self.scores[index]

        if self.cutoff:
            score = [(1 if isinstance(score, float) and score > self.cutoff else 0)]

        sample = {'post': post, 'score': score}

        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)

        return sample

In [30]:
# define a index dict for getting word embeddings
titles_to_index = {word: index for index, word in enumerate(titles_vocab)}
text_to_index = {word: index + len(titles_to_index) for index, word in enumerate(text_vocab)}

In [31]:
print(titles_to_index)



In [32]:
# define custom dataset for word embeddings model
class EmbeddingsDataset(Dataset):

    def __init__(self, data, lables, cutoff):
        self.posts = data
        self.scores = lables
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):

        post = self.posts.loc[index]
        score = self.scores[index]

        title_indexs = preproccess(post["title"])
        text_indexs = preproccess(post["text"])

        if title_indexs == []:
            title_indexs = ["<|empty|>"]

        if text_indexs == []:
            text_indexs = ["<|empty|>"]

        time = post["time"]

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        #score = torch.FloatTensor(score)

        sample = ((title_indexs, text_indexs, time), score)
        return sample


In [33]:
from ipynb.fs.defs.Embeddings import BertEmbeddings
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel

In [34]:
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [35]:
class BertTitleEmbeddingDataset(Dataset):

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, index):

        post = self.posts.loc[index]
        score = self.labels[index]

        bert_embedding_model = BertEmbeddings(bert_model, bert_tokenizer)

        sentence_embedding = bert_embedding_model.get_embedding(post["title"])

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        sample = (sentence_embedding, score)
        
        return sample


In [36]:
cutoff = 20

In [37]:
# non url training dataset for BoW model
transforms = [TextualTransform1(), TensorTransform()]

post_training_dataset = HackerNewsPostDataset(training_data_indexed, scores, cutoff, transforms)
post_testing_dataset = HackerNewsPostDataset(testing_data_indexed, testing_scores, cutoff, transforms)

In [38]:
# create embedding datasets
embedding_dataset_train = EmbeddingsDataset(training_data_indexed, scores, cutoff)
embedding_dataset_test = EmbeddingsDataset(testing_data_indexed, testing_scores, cutoff)

In [39]:
# create berd embedding datasets
bert_embedding_dataset_train = BertTitleEmbeddingDataset(training_data_indexed, scores, cutoff)
bert_embedding_dataset_test = BertTitleEmbeddingDataset(testing_data_indexed, testing_scores, cutoff)

In [40]:
# As dataset is imbalanced define a weighted sampler
from torch.utils.data import WeightedRandomSampler

In [41]:
def define_sampler():
    num_popular = (scores > cutoff).sum()
    unpopular_weight = 1
    popular_weight = (len(scores) - num_popular) / num_popular


    class_weights = [unpopular_weight, popular_weight]
    weights = [0] * len(scores)

    for index, score in enumerate(scores):
        weight = class_weights[int(score > cutoff)]
        weights[index] = weight

    sampler = WeightedRandomSampler(weights, num_samples = len(weights), replacement = True)

    return sampler

In [42]:
sampler = define_sampler()

In [43]:
embedding_dataset_train[0]

((['not',
   'sure',
   'this',
   'legal',
   'for',
   'citizens',
   'but',
   'you',
   'can',
   'play',
   'poker',
   'here',
   'now'],
  ['not',
   'sure',
   'this',
   'legal',
   'for',
   'citizens',
   'but',
   'you',
   'can',
   'play',
   'poker',
   'here',
   'now'],
  1304882921.0),
 0)

In [44]:
# define function for collating batches
def collate_batch_embed(batch):
    lables, texts, offsets = [], [], [0]
    for post, score in batch:

        titles_indexs = [titles_to_index[word] for word in post[0]]
        text_indexs = [text_to_index[word] for word in post[1]]
        
        proccessed_input = torch.tensor(titles_indexs + text_indexs, dtype = torch.int64).to(device)

        texts.append(proccessed_input)
        lables.append([score])
        offsets.append(proccessed_input.size(0))

    lables = torch.tensor(lables, dtype = torch.int64).to(device)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim = 0).to(device)
    texts = torch.cat(texts)
    return lables, texts, offsets



In [45]:
print(post_testing_dataset[0])

{'post': tensor([0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
        1.4248e+09]), 'score': tensor([0.])}


In [46]:
1960 * 5

9800

In [47]:
# create dataloaders
batch_size = 128
num_iterations = 9800
num_epochs = 5
train_loader = torch.utils.data.DataLoader(dataset=post_training_dataset, batch_size=batch_size, shuffle=True)

In [48]:
test_loader = torch.utils.data.DataLoader(dataset=post_testing_dataset, batch_size=100, shuffle=True)

In [49]:
train_loader_embed = torch.utils.data.DataLoader(dataset = embedding_dataset_train, batch_size=batch_size, sampler=sampler, 
                                                collate_fn = collate_batch_embed)
test_loader_embed = torch.utils.data.DataLoader(dataset = embedding_dataset_test, batch_size=batch_size, sampler=sampler, 
                                                collate_fn = collate_batch_embed)

In [50]:
train_loader_embed_single = torch.utils.data.DataLoader(dataset = embedding_dataset_train, batch_size=1, sampler=sampler, collate_fn = collate_batch_embed)

In [51]:
bert_train_loader = torch.utils.data.DataLoader(dataset=bert_embedding_dataset_train, batch_size=1, sampler=sampler)
bert_test_loader = torch.utils.data.DataLoader(dataset=bert_embedding_dataset_test, batch_size=1, sampler=sampler)

In [52]:
bert_train_loader_2_weighted = torch.utils.data.DataLoader(dataset=bert_embedding_dataset_train, batch_size = 100, sampler=sampler)
bert_test_loader_2_weighted = torch.utils.data.DataLoader(dataset=bert_embedding_dataset_test, batch_size=100, sampler=sampler)
bert_train_loader_2_unweighted = torch.utils.data.DataLoader(dataset=bert_embedding_dataset_train, batch_size = 100, shuffle=True)
bert_test_loader_2_unweighted = torch.utils.data.DataLoader(dataset=bert_embedding_dataset_test, batch_size=100, shuffle=True)

In [53]:
# basic Feed Forward Neural Network

class FFNetwork(nn.Module):
    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super(FFNetwork, self).__init__()

        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)

        self.nonlinear = nn.ReLU()
    
        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)

        self.sigmoid = torch.sigmoid
    
    def forward(self, x):

        x = self.linear1(x)

        x = self.nonlinear(x)

        output = self.linear2(x)

        output = self.sigmoid(output)
        return output
        

In [54]:
# basic Feed Forward Neural Network (Regression)

class FFNetworkReg(nn.Module):
    def __init__(self, input_dimensions, hidden_dimensions, output_dimensions):
        super(FFNetworkReg, self).__init__()

        self.linear1 = nn.Linear(input_dimensions, hidden_dimensions)

        self.nonlinear = nn.ReLU()
    
        self.linear2 = nn.Linear(hidden_dimensions, output_dimensions)
    
    def forward(self, x):

        x = self.linear1(x)

        x = self.nonlinear(x)

        output = self.linear2(x)

        return output

In [55]:
# Define Fully Connected FF network for the Embedding model

class FFNetworkEmbedding(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim):
        super(FFNetworkEmbedding, self).__init__()

        self.embed = nn.EmbeddingBag(input_dim, embedding_dim)

        self.hidden = nn.Sequential(
            nn.Linear(embedding_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, output_dim),
            nn.ReLU()
        )

        self.dropout = nn.Dropout(0.3)
        

    def forward(self, x, offsets):

        # apply an emedding bag layer to get average of all embeddings
        embeddings = self.embed(x, offsets)

        # apply linear functions
        output = self.hidden(embeddings)

        # apply dropout to avoid overfitting
        output = self.dropout(output)

        return output, embeddings
        

In [56]:
# Define Fully Connected FF network for the Embedding model

class FFNetworkEmbedding2(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim):
        super(FFNetworkEmbedding2, self).__init__()

        self.embed = nn.EmbeddingBag(input_dim, embedding_dim)

        self.hidden = nn.Sequential(
            nn.Linear(embedding_dim, 2048),
            nn.ReLU(),

            nn.Linear(2048, 1024),
            nn.ReLU(),

            nn.Linear(1024, 512),
            nn.ReLU(),

            nn.Linear(512, 256),
            nn.ReLU(),

            nn.Linear(256, 128),
            nn.ReLU(),

            nn.Linear(128, 64),
            nn.ReLU(),
            
            nn.Linear(64, output_dim),
            nn.ReLU()
        )

        self.dropout = nn.Dropout(0.3)
        

    def forward(self, x, offsets):

        # apply an emedding bag layer to get average of all embeddings
        embeddings = self.embed(x, offsets)

        # apply linear functions
        output = self.hidden(embeddings)

        # apply dropout to avoid overfitting
        output = self.dropout(output)

        return output, embeddings

In [57]:
# define Fully Connected Network which uses open AI embeddings

class FFNetworkOpenEmbedding(nn.Module):

    def __init__(self):
        super(FFNetworkOpenEmbedding, self).__init__()

        

    def forward(self, x):

        

        return x

In [58]:
# define Fully Connected Network which uses bert embeddings

class FFNetworkBertEmbedding(nn.Module):

    def __init__(self, output_dim, embedding_dim = 768):
        super(FFNetworkBertEmbedding, self).__init__()

        self.hidden = nn.Sequential(
            nn.Linear(embedding_dim, 2048),
            nn.ReLU(),

            nn.Linear(2048, 1024),
            nn.ReLU(),

            nn.Linear(1024, 512),
            nn.ReLU(),

            nn.Linear(512, 256),
            nn.ReLU(),

            nn.Linear(256, 128),
            nn.ReLU(),

            nn.Linear(128, 64),
            nn.ReLU(),
            
            nn.Linear(64, output_dim),
        )

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        
        output = self.hidden(x)

        output = self.dropout(output)

        return output
        

In [59]:
# define the dimensions of the basic model
input_dimensions = len(text_vocab) + len(titles_vocab) + 1
hidden_dimensions = 1000
output_dimensions = 1

# instantiate the class we are using for this model
model = FFNetwork(input_dimensions, hidden_dimensions, output_dimensions).to(device)

In [60]:
# define the dimensions of the basic model
input_dimensions_reg = 100
hidden_dimensions_reg = 1000
output_dimensions_reg = 1

# instantiate the class we are using for this model
model_reg = FFNetworkReg(input_dimensions_reg, hidden_dimensions_reg, output_dimensions_reg).to(device)

In [61]:
print(len(text_vocab), len(text_to_index))
print(len(titles_vocab), len(titles_to_index))
print(len(list(dict.fromkeys(text_vocab + titles_vocab))))

146957 146957
174270 174270
268589


In [62]:
# define dimensions of embedding model
input_dim_embed = len(text_vocab) + len(titles_vocab)
embedding_dim = 1024
output_dimensions_embed = 1

# instansiate the embedding model
model_embed = FFNetworkEmbedding(input_dim_embed, embedding_dim, output_dimensions_embed).to(device)
model_embed_2 = FFNetworkEmbedding2(input_dim_embed, embedding_dim, output_dimensions_embed).to(device)

In [63]:
# define dimensions of bert model
bert_embedding_dim = 768
bert_output_dim = 1

# instantiate bert model
bert_model_1 = FFNetworkBertEmbedding(bert_output_dim, bert_embedding_dim)

In [64]:
#it = iter(train_loader)
#print(model(torch.stack(next(it)["post"])))

In [65]:
# define loss functions class
# we choose to use BCEWithLogitsLoss as it provides the functionality of both a
# sigmoid layer and BCE loss in one. Is more efficient (look up details for diss)
loss_func = nn.BCEWithLogitsLoss()

In [66]:
# define optimizer class
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [67]:
# define embedding optimizer class
optimizer_embed = torch.optim.SGD(model_embed.parameters(), lr = learning_rate)

In [68]:
# define optimizer class for bert model
optimizer_bert_1 = torch.optim.SGD(bert_model_1.parameters(), lr = learning_rate)

In [69]:

#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model(batch["post"]))

In [70]:
#for (batch_index, batch) in enumerate(train_loader):
#    if(batch_index > 0):
#        break
#    print(model_reg(batch["post"]))

In [71]:
# define function for calculating the accuracy of a model (current only used for BoW model)
def get_model_accuracy(model, loader):
    correct = 0
    total = 0

    for (batch_index, batch) in enumerate(loader):
        
        model.float()
        posts = batch["post"]
        scores = batch["score"]

        # get predirction probablities
        predictions_prob = model(posts)

        # get class predictions
        _, predictied = torch.max(predictions_prob.data, 1)

        # calculate tota samples predicted and correct
        total = total + scores.size(0)
        correct = correct + (predictied == scores).sum()

    accuracy = 100 * correct / total
    return accuracy,

In [72]:
# define function for training a model (also only usable for the BoW model)
def train_model(model, train_loader, test_loader, loss, optimizer):
    iteration = 0
    
    for epoch in range(num_epochs):
        print("Starting Epoch: " + str(epoch))
        for (batch_index, batch) in enumerate(train_loader):
            #print("Iteration " + str(iteration))
            model.float()
            posts = batch["post"]
            scores = batch["score"]

            # set grads to 0
            optimizer.zero_grad()

            predictions = model(posts)


            # calculate loss
            loss = loss_func(predictions, scores)

            # backwards pass to calculate gradients
            loss.backward()

            # update parameters
            optimizer.step()

            if iteration % 50 == 0:
                print("\n")
                accuracy = get_model_accuracy(model, test_loader)
                print("Iteration {}. Loss {}. Accuracy {}".format(iteration, loss.item(), accuracy))
                print("\n")

            iteration += 1

    return model

In [73]:
# define function for training a embedding model
def train_model_embed_bag(model, train_loader, test_loader, loss, optimizer):
    iterations = 1
    accuracy_list = []
    loss_list = []
    embeddings_list = []

    # make sure model is being trained on gpu
    model.to(device)

    for epoch in range(num_epochs):
        print("Epoch {}.".format(epoch))
        running_loss = 0

        for index, batch in enumerate(train_loader):
            lables, texts, offsets = batch

            # set the model to track graidents for training
            model.train(True)

            # reset optimizer grads to zero
            optimizer.zero_grad()

            # forward pass of the model
            predictions, embeddings = model(texts, offsets)

            # calculate loss
            loss = loss_func(predictions, lables.float())

            # perform backwards pass
            loss.backward()

            # tune weights with optimizer
            optimizer.step()

            # stop gradient tracking for writing current model metrics
            model.train(False)          

            # track the running loss
            running_loss += loss.item()


            # add loss and embeddings to lists

            loss_list.append(loss.item())
            embeddings_list.append(embeddings)

            if iterations % 100 == 0:

                print(" Iteration {}. Running Loss {}.".format( iterations, running_loss / 1000))
    
                # reset running loss
                running_loss = 0

                """
                for test_index, test_batch in enumerate(test_loader):

                    test_lables, test_texts, test_offsets = test_batch

                    output_preds = model(test_texts, test_offsets)
                    output_preds = [1 if output > 0.0 else 0 for output in output_preds]

                    total = total + test_lables.size(0)

                    for i in range(len(test_lables)):
                        if output_preds[i] == test_lables[i]:
                            correct += 1
                

                accuracy = correct / total
                print("Iteration {}. Loss {}. Accuracy {}".format(iterations, loss.item(), accuracy))
                accuracy_list.append(accuracy)
                loss_list.append(loss.item())
                """
            iterations = iterations + 1
    return model, embeddings_list, loss_list

In [75]:
# define function for training a embedding model
def train_model_bert(model, train_loader, loss_func, optimizer):
    iterations = 1
    loss_list = []

    # make sure model is being trained on gpu
    model = model.to(device)

    for epoch in range(num_epochs):
        print("Epoch {}.".format(epoch))
        running_loss = 0

        for batch_index, batch in enumerate(train_loader):

            (embeddings, score) = batch

            embeddings = embeddings.to(device)
            score = score.to(device)

            # set the model to track graidents for training
            model.train(True)

            # reset optimizer grads to zero
            optimizer.zero_grad()

            # forward pass of the model
            predictions = model(embeddings)

            # calculate loss
            loss = loss_func(predictions, score.float().unsqueeze(1))

            # perform backwards pass
            loss.backward()

            # tune weights with optimizer
            optimizer.step()

            # stop gradient tracking for writing current model metrics
            model.train(False)          

            # track the running loss
            running_loss += loss.item()

            if iterations % 1000 == 0:

                print("Iteration {}. Running Loss {}.".format(iterations, running_loss / 1000))

                # add running loss = list
                loss_list.append(running_loss / 1000)

                # reset running loss
                running_loss = 0

            iterations = iterations + 1
    return model, loss_list

In [None]:
# train embedding model
model_embed_final, embeddings, losses = train_model_embed_bag(model_embed, train_loader_embed, test_loader_embed, loss_func, optimizer_embed)

In [None]:
model_embed_final_2, embeddings, losses = train_model_embed_bag(model_embed_2, train_loader_embed, test_loader_embed, loss_func, optimizer_embed)

In [76]:
model_bert_final, losses = train_model_bert(bert_model_1, bert_train_loader, loss_func, optimizer_bert_1)

Epoch 0.
Iteration 1000. Running Loss 0.7013723753690719.
Iteration 2000. Running Loss 0.7017070130109787.
Iteration 3000. Running Loss 0.7024943488538266.
Iteration 4000. Running Loss 0.7033841575682164.
Iteration 5000. Running Loss 0.6983439990878105.
Iteration 6000. Running Loss 0.7048793269097805.
Iteration 7000. Running Loss 0.6998509249687195.
Iteration 8000. Running Loss 0.7009232802689076.
Iteration 9000. Running Loss 0.6924948480029125.
Iteration 10000. Running Loss 0.6869441149921622.
Iteration 11000. Running Loss 0.6839974978136597.
Iteration 12000. Running Loss 0.6800896648082416.
Iteration 13000. Running Loss 0.6917835260708162.
Iteration 14000. Running Loss 0.6861135723820334.
Iteration 15000. Running Loss 0.6858046215696523.
Iteration 16000. Running Loss 0.686300017739497.
Iteration 17000. Running Loss 0.6852141328487306.
Iteration 18000. Running Loss 0.690586153404689.
Iteration 19000. Running Loss 0.6861344170545126.
Iteration 20000. Running Loss 0.6895740008927678.
It

KeyboardInterrupt: 

In [None]:
model_bert_final_2, losses_2 = train_model_bert(bert_model_1, bert_train_loader_2_unweighted, loss_func, optimizer_bert_1)

In [None]:
model_bert_final_3_weighted, losses_3 = train_model_bert(bert_model_1, bert_train_loader_2_weighted, loss_func, optimizer_bert_1)

In [None]:
with open("losses_bert", "wb") as fb:
    pickle.dump(losses, fb)

In [None]:
with open("model_bert", "wb") as fb:
    pickle.dump(model_bert_final, fb)

In [None]:
import matplotlib.pyplot as plt

In [None]:
loss_2 = [loss / 11 if loss > 1 else loss for loss in losses]

In [None]:
plt.figure(figsize = (20, 5))
plt.plot(np.log(np.arange(len(loss_2))), loss_2)
plt.xlabel("Training Iteration")
plt.ylabel("Average Loss")
plt.show()

In [None]:
plt.figure(figsize = (20, 5))
plt.boxplot(losses, vert = False, showfliers=False)

plt.show()

In [None]:
from torch.utils.tensorboard import SummaryWriter

In [None]:
len(losses) / 5

In [None]:
writer = SummaryWriter()
for i, loss in enumerate(losses):
    title = "Loss/Train" + str(np.floor((i / (len(losses) / 5))))
    writer.add_scalar(title, loss, i % (len(losses) / 5))
writer.close()

In [None]:
# train BoW model
# currently this model takes way to long and takes up too much memory to work
model_final = train_model(model, train_loader, test_loader, loss_func, optimizer)

In [None]:
def get_embeddings(loader, model):
    embeddings = []
    texts = []
    for index, batch in enumerate(loader):
        lables, text, offsets = batch
        _, embedding = model(text, offsets)
        embeddings.append(embedding)
        texts.append(text)
    return embeddings, texts

In [None]:
embeddings = get_embeddings(train_loader_embed, model_embed)

In [None]:
embeddings

In [None]:
embeddings2, texts = get_embeddings(train_loader_embed_single, model_embed)

In [None]:
embeddings2[0]

In [None]:
def get_bert_tokens(text):
    # add special start and end tokens
    text = "[CLS] " + text + " [SEP]"

    # tokenize sentence
    tokens = bert_tokenizer.tokenize(text)

    # get vocab indicies
    tokens_index = bert_tokenizer.convert_tokens_to_ids(tokens)

    # convert into pytorch tensors
    output = torch.tensor([tokens_index])

    return output

In [None]:
def get_bert_embedding(tokens, model):
    output = tokens.apply(lambda x: model(x))
    return output

In [None]:
training_titles = training_data_indexed["title"].fillna("")

In [None]:
titles_bert = training_titles.apply(lambda x: get_bert_tokens(x))

In [None]:
bert_embeddings = get_bert_embedding(titles_bert, bert_model)