In [1]:
# import standard libraries
import datetime
import os
import pickle
import re
import string
import time
import warnings
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import regex
from nltk.corpus import stopwords
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
# import ML libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset

In [3]:
# import embeddings modules
import openai
# Read the API key from the OPENAI_API_KEY environment variable instead of
# committing it to the notebook. The key that used to be hardcoded on this line
# has been exposed in the file history and should be revoked.
# Raises KeyError if the variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel

In [None]:
# Load the pickled training data object from ../data/training_data
# (later cells use it as a pandas DataFrame).
# NOTE: pickle.load can execute arbitrary code while unpickling, so this file
# must come from a trusted source.
with open("../data/training_data", "rb") as fb:
    training_data = pickle.load(fb)

In [None]:
# keep only the rows whose type is "story" (i.e. drop comments) and renumber the index from 0
is_story = training_data.type == "story"
training_data = training_data.loc[is_story].reset_index(drop=True)

In [None]:
# Extract the relevant columns and replace missing values: empty string for the
# text-like columns, 0 for the score.
# A single fillna with a per-column mapping builds a new frame instead of
# assigning columns one by one on a column subset, which is the pattern that
# raises SettingWithCopyWarning.
training_data = training_data[["title", "text", "url", "score"]].fillna(
    {"title": "", "text": "", "url": "", "score": 0}
)

In [None]:
# instantiate the pretrained BERT tokenizer and model from the same checkpoint;
# output_hidden_states=True is requested so the model output also carries the
# hidden states that BertEmbeddings.apply_model reads
_bert_checkpoint = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(_bert_checkpoint)
bert_model = BertModel.from_pretrained(_bert_checkpoint, output_hidden_states=True)

In [None]:
# define bert class
class BertEmbeddings(object):
    """Produce sentence and word embeddings from a BERT model and its tokenizer.

    The model is expected to return its per-layer hidden states as the third
    element of its output (see ``apply_model``); the tokenizer must provide
    ``tokenize`` and ``convert_tokens_to_ids``.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer used by every other method."""
        self.model = model
        self.tokenizer = tokenizer

    def set_model(self, model):
        """Replace the model."""
        self.model = model

    def set_tokenizer(self, tokenizer):
        """Replace the tokenizer."""
        self.tokenizer = tokenizer

    def tokenize_text(self, text, max_length=512):
        """Convert text into a ``(1, n_tokens)`` tensor of vocabulary ids.

        Args:
            text: The sentence to encode.
            max_length: Maximum number of tokens kept, including the special
                start and end tokens. The default of 512 is the position limit
                of the standard BERT checkpoints; pass ``None`` to disable
                truncation.

        Returns:
            A tensor holding one row of token ids.
        """
        # add special start and end tokens
        text = "[CLS] " + text + " [SEP]"

        # tokenize sentence
        tokens = self.tokenizer.tokenize(text)

        # truncate over-long inputs but keep the final end-of-sequence token
        if max_length is not None and len(tokens) > max_length:
            tokens = tokens[:max_length - 1] + tokens[-1:]

        # get vocab indices
        tokens_index = self.tokenizer.convert_tokens_to_ids(tokens)

        # wrap in a list so the tensor has a leading batch dimension of 1
        return torch.tensor([tokens_index])

    def apply_model(self, tokens):
        """Run the model on one tokenized input without tracking gradients.

        Returns:
            ``output[2]`` of the model call. With ``output_hidden_states=True``
            this is presumably the tuple of per-layer hidden states -- confirm
            against the model library's documented output order.
        """
        # evaluation mode disables dropout so embeddings are deterministic
        self.model.eval()

        with torch.no_grad():
            output = self.model(tokens)
        return output[2]

    def reshape_token_embeddings(self, token_embeddings):
        """Turn the per-layer outputs into a ``[tokens, layers, hidden]`` tensor.

        Args:
            token_embeddings: Sequence of per-layer tensors, each assumed to be
                ``[1, tokens, hidden]`` (a batch of exactly one sentence).
        """
        # combine all of the layers of the model -> [layers, 1, tokens, hidden]
        stacked = torch.stack(token_embeddings, dim=0)

        # get rid of the batch dimension as we only use 1 sentence per input
        stacked = torch.squeeze(stacked, dim=1)

        # swap the layer and token axes -> [tokens, layers, hidden]
        return stacked.permute(1, 0, 2)

    def get_word_embedding_concat(self, token_embeddings, index):
        """Embed one token by concatenating its vectors from the last four layers.

        Args:
            token_embeddings: ``[tokens, layers, hidden]`` tensor from
                ``reshape_token_embeddings``.
            index: Position of the token to embed.

        Returns:
            A 1-D tensor of length ``4 * hidden``, ordered from the last layer
            back to the fourth-to-last.
        """
        token_layers = token_embeddings[index]
        return torch.cat(
            (token_layers[-1], token_layers[-2], token_layers[-3], token_layers[-4]),
            dim=0,
        )

    def get_word_embedding_sum(self, token_embeddings, index):
        """Embed one token by summing its vectors from the last four layers.

        Args:
            token_embeddings: ``[tokens, layers, hidden]`` tensor from
                ``reshape_token_embeddings``.
            index: Position of the token to embed.

        Returns:
            A 1-D tensor of length ``hidden``.
        """
        return torch.sum(token_embeddings[index][-4:], dim=0)

    def get_sentence_embeddings(self, token_embeddings):
        """Average all token vectors of the second-to-last layer.

        Args:
            token_embeddings: ``[tokens, layers, hidden]`` tensor from
                ``reshape_token_embeddings``.

        Returns:
            A 1-D tensor of length ``hidden``.
        """
        # The layer axis is dim 1 after reshape_token_embeddings, so the layer
        # must be selected there; indexing dim 0 would pick a token instead.
        token_tensors = token_embeddings[:, -2, :]

        # mean over the token axis
        return torch.mean(token_tensors, dim=0)

    def get_embedding(self, input):
        """Return the sentence embedding for a single input string."""
        tokens = self.tokenize_text(input)

        token_embeddings = self.apply_model(tokens)

        token_embeddings = self.reshape_token_embeddings(token_embeddings)

        return self.get_sentence_embeddings(token_embeddings)
    
  

In [None]:
# def pytorch embedding model class
class PytorchEmbeddings(object):
    """Bag-of-words text embeddings backed by a ``torch.nn.EmbeddingBag``.

    The embedding layer is created fresh in ``__init__``; no trained weights
    are loaded by this class.
    """

    def __init__(self, vocab_path, embedding_dim):
        """Load the vocabulary and build the embedding layer.

        Args:
            vocab_path: Path to a pickled iterable of vocabulary words.
            embedding_dim: Size of each embedding vector.
        """
        self.stopwords = set(stopwords.words('english'))
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(vocab_path, "rb") as fb:
            vocab = pickle.load(fb)

        self.vocab_to_index = self._build_vocab_index(vocab)
        self.embedding = nn.EmbeddingBag(len(self.vocab_to_index), embedding_dim)

    @staticmethod
    def _build_vocab_index(vocab):
        """Map each distinct word to a contiguous index in first-seen order.

        Numbering by distinct word (rather than by position in ``vocab``) keeps
        every index below ``len(mapping)`` even if ``vocab`` has duplicates, so
        lookups can never fall outside the embedding table.
        """
        vocab_to_index = {}
        for word in vocab:
            vocab_to_index.setdefault(word, len(vocab_to_index))
        return vocab_to_index

    def preproccess(self, text):
        """Split text into cleaned, lower-cased tokens.

        Args:
            text: The string to tokenize. A float (e.g. a NaN from pandas) is
                treated as missing text.

        Returns:
            A list of tokens, or ``[""]`` when ``text`` is a float.
        """
        if isinstance(text, float):
            return [""]

        # split into tokens
        tokens = re.split(r'\s+', text)

        # remove punctuation
        tokens = ["".join(ch for ch in token if ch not in string.punctuation) for token in tokens]

        # remove numbers
        tokens = [re.sub(r"\d+", "", token) for token in tokens]

        # make all tokens lowercase
        tokens = [token.lower() for token in tokens]

        # remove tokens which are too short or too long
        tokens = [token for token in tokens if 2 < len(token) < 15]

        # remove hyperlinks (punctuation is already stripped, so match on the
        # remaining prefix/suffix; this also drops ordinary words ending in "com")
        tokens = [
            token for token in tokens
            if not (token.startswith(("http", "www")) or token.endswith("com"))
        ]

        # stop-word removal is currently disabled; self.stopwords is loaded in
        # __init__ but not applied here
        return tokens

    def get_indexs(self, text):
        """Return the vocabulary indices of the known tokens in ``text``.

        Tokens that are not in the vocabulary are skipped instead of raising
        ``KeyError``, so unseen words and missing text yield a shorter (possibly
        empty) index tensor.

        Returns:
            A 1-D ``torch.int64`` tensor.
        """
        tokens = self.preproccess(text)

        indicies = [
            self.vocab_to_index[token] for token in tokens if token in self.vocab_to_index
        ]

        return torch.tensor(indicies, dtype=torch.int64)

    def get_nn_embeddings(self, text):
        """Embed ``text`` as a single bag of its known tokens.

        Returns:
            The output of the embedding layer for one bag, i.e. a tensor with
            one row.
        """
        indicies = self.get_indexs(text)

        with torch.no_grad():
            # offsets [0] puts every index into one bag
            output = self.embedding(indicies, torch.tensor([0]))

        return output

    

In [None]:
# build the BERT embedding helper from the pretrained model and tokenizer created above
bert = BertEmbeddings(bert_model, bert_tokenizer)

In [None]:
# build the EmbeddingBag-based helper: vocabulary is read from the pickled file
# "titles_vocab" (path relative to the working directory), embedding size 1024
nnModel = PytorchEmbeddings("titles_vocab", 1024)

In [None]:
from scipy.spatial.distance import cosine

In [None]:
def get_simularity(embeddings1, embeddings2):
    """Return the cosine similarity of two 1-D embedding vectors.

    scipy's ``cosine`` gives the cosine *distance*, so the similarity is its
    complement with respect to 1.
    """
    cosine_distance = cosine(embeddings1, embeddings2)
    return 1 - cosine_distance

In [None]:
# compare the BERT sentence embeddings of two single-word inputs
t1, t2 = (bert.get_embedding(word) for word in ("cat", "dog"))
print(get_simularity(t1, t2))

In [None]:
# same comparison with the EmbeddingBag helper; [0] takes the single bag's vector
t3, t4 = (nnModel.get_nn_embeddings(word)[0] for word in ("cat", "dog"))
print(get_simularity(t3, t4))

In [4]:
# Load the pickled train / validation / test splits from ../data.
# NOTE: pickle.load can execute arbitrary code while unpickling, so these files
# must come from a trusted source.
with open("../data/data_train", "rb") as fb:
    data_train = pickle.load(fb)

with open("../data/data_valid", "rb") as fb:
    data_valid = pickle.load(fb)

with open("../data/data_test", "rb") as fb:
    data_test = pickle.load(fb)

In [None]:
# replace missing titles with an empty string in every split
for split in (data_train, data_valid, data_test):
    split.title = split.title.fillna("")

In [None]:
def get_embedding(text):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Uses the "text-embedding-ada-002" model and returns the ``'embedding'``
    field of the first entry in the response's ``'data'`` list.
    """
    response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    first_result = response['data'][0]
    return first_result['embedding']

In [None]:
# sanity check: print the length of the embedding returned for an empty input string
probe_response = openai.Embedding.create(input="", model="text-embedding-ada-002")
print(len(probe_response['data'][0]['embedding']))

In [None]:
def print_embedding_index(x):
    """Print a running call counter, then return the embedding of ``x``.

    The counter starts at 0 and is kept on the function object itself. The
    previous version incremented a bare ``ind`` inside the function, which made
    ``ind`` a local name and raised UnboundLocalError on the first call.

    Args:
        x: Text passed through to ``get_embedding``.

    Returns:
        Whatever ``get_embedding(x)`` returns.
    """
    call_index = getattr(print_embedding_index, "_calls", 0)
    print(call_index)
    print_embedding_index._calls = call_index + 1
    return get_embedding(x)

In [None]:
# one slot per training title; 0 is the "not embedded yet" sentinel that the
# loops below test for. Re-running this cell resets every slot to 0.
title_embeddings = [0] * len(data_train.title)

In [None]:
# single pass over the training titles: fill every slot that is still the 0 sentinel
for ind, title in enumerate(data_train.title):
    if title_embeddings[ind] != 0:
        continue
    title_embeddings[ind] = get_embedding(title)

In [None]:
# Embed every training title, retrying after API failures. Slots that already
# hold an embedding are skipped, so each retry resumes where the last one stopped.
max_consecutive_failures = 5
consecutive_failures = 0
while True:
    try:
        for ind, title in enumerate(data_train.title):
            if title_embeddings[ind] == 0:
                print(ind)
                title_embeddings[ind] = get_embedding(title)
                # progress was made, so the failure budget starts over
                consecutive_failures = 0

        break
    except Exception as err:
        # Exception (not a bare except) so a kernel interrupt still stops the loop
        consecutive_failures += 1
        print("broke", repr(err))
        # give up on a persistent error instead of retrying forever
        if consecutive_failures >= max_consecutive_failures:
            raise
        # exponential backoff before the next attempt
        time.sleep(2 ** consecutive_failures)

In [None]:
# one slot per validation / test title; 0 is the "not embedded yet" sentinel
# that the loops below test for. Re-running this cell resets every slot to 0.
valid_title_embeddings = [0] * len(data_valid.title)
test_title_embeddings = [0] * len(data_test.title)

In [None]:
# Embed every validation title, retrying after API failures. Slots that already
# hold an embedding are skipped, so each retry resumes where the last one stopped.
max_consecutive_failures = 5
consecutive_failures = 0
while True:
    try:
        for ind, title in enumerate(data_valid.title):
            if valid_title_embeddings[ind] == 0:
                print(ind)
                valid_title_embeddings[ind] = get_embedding(title)
                # progress was made, so the failure budget starts over
                consecutive_failures = 0

        break
    except Exception as err:
        # Exception (not a bare except) so a kernel interrupt still stops the loop
        consecutive_failures += 1
        print("broke", repr(err))
        # give up on a persistent error instead of retrying forever
        if consecutive_failures >= max_consecutive_failures:
            raise
        # exponential backoff before the next attempt
        time.sleep(2 ** consecutive_failures)

In [None]:
# Embed every test title, retrying after API failures. Slots that already
# hold an embedding are skipped, so each retry resumes where the last one stopped.
max_consecutive_failures = 5
consecutive_failures = 0
while True:
    try:
        for ind, title in enumerate(data_test.title):
            if test_title_embeddings[ind] == 0:
                print(ind)
                test_title_embeddings[ind] = get_embedding(title)
                # progress was made, so the failure budget starts over
                consecutive_failures = 0

        break
    except Exception as err:
        # Exception (not a bare except) so a kernel interrupt still stops the loop
        consecutive_failures += 1
        print("broke", repr(err))
        # give up on a persistent error instead of retrying forever
        if consecutive_failures >= max_consecutive_failures:
            raise
        # exponential backoff before the next attempt
        time.sleep(2 ** consecutive_failures)

In [None]:
# persist the training-title embeddings list to "train_embeddings" in the working directory
with open("train_embeddings", "wb") as fb:
    pickle.dump(title_embeddings, fb)

In [None]:
# persist the validation-title embeddings list to "valid_embeddings" in the working directory
with open("valid_embeddings", "wb") as fb:
    pickle.dump(valid_title_embeddings, fb)

In [66]:
# persist the test-title embeddings list to "test_embeddings" in the working directory
with open("test_embeddings", "wb") as fb:
    pickle.dump(test_title_embeddings, fb)

In [None]:
# report the position of the first training title that still has no embedding
# (prints nothing when every slot has been filled)
for i, slot in enumerate(title_embeddings):
    if slot == 0:
        print(i)
        break

In [None]:
# NOTE(review): this requests a fresh embedding for every training title even
# though title_embeddings was filled from the same titles above -- consider
# reusing that list instead of calling the API again.
data_train["title_ada_embeddings"] = data_train.title.apply(print_embedding_index)

In [None]:
# bare last expression so the notebook renders the training frame, including
# the new title_ada_embeddings column
data_train