In [2]:
# import required libraries
import numbers
import re
import string

from nltk.corpus import stopwords

In [3]:
# import ml modules
import torch
from torch.utils.data import Dataset

In [5]:
# import embedding modules
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel
from ipynb.fs.defs.Embeddings import BertEmbeddings

In [6]:
# define the text preprocessing function and its stop-word set
# English stop words from the NLTK corpus, held in a set.
stop_words = {*stopwords.words('english')}

def preproccess(text):
    """Turn raw post text into a list of cleaned, lowercase tokens.

    Each whitespace-separated token has ASCII punctuation and digits removed
    and is lowercased; it is then kept only if it is 3 to 14 characters long
    and does not look like a hyperlink.

    Args:
        text: Raw text. Any non-string value (for example a float NaN or
            ``None`` standing in for a missing field) is treated as missing.

    Returns:
        list[str]: ``[""]`` for missing input, otherwise the surviving tokens
        in their original order (``[]`` when nothing survives).
    """
    # Missing values are not strings; return the same sentinel the float-only
    # check used to return instead of crashing inside re.split().
    if not isinstance(text, str):
        return [""]

    tokens = []
    for raw_token in re.split(r'\s+', text):
        # remove punctuation
        token = "".join(ch for ch in raw_token if ch not in string.punctuation)

        # remove numbers and make the token lowercase
        token = re.sub(r"\d+", "", token).lower()

        # remove tokens which are too short or too long
        if not 2 < len(token) < 15:
            continue

        # remove hyperlinks; this runs on the punctuation-stripped token, so
        # "www.example.com" is seen as "wwwexamplecom", and any ordinary word
        # ending in "com" is dropped as well
        if token.startswith(("http", "www")) or token.endswith("com"):
            continue

        tokens.append(token)

    # NOTE: stop-word filtering against `stop_words` is currently disabled.
    return tokens

In [7]:
# define custom dataset for BoW model
class HackerNewsPostDataset(Dataset):
    """Posts paired with their scores, served as dict samples for the BoW model.

    Args:
        data: Posts, looked up with ``data.loc[index]`` (presumably a pandas
            DataFrame or Series -- TODO confirm against the caller).
        labels: Scores, looked up with ``labels[index]``.
        cutoff: Optional threshold. When it is not ``None`` the score is
            replaced by a one-element list: ``[1]`` if the score is a real
            number strictly greater than ``cutoff``, otherwise ``[0]``.
            ``None`` (the default) keeps the raw score.
        transforms: Optional iterable of callables; each receives the sample
            dict and returns the (possibly new) sample. Applied in order.
    """

    def __init__(self, data, labels, cutoff = None, transforms = None):
        self.posts = data
        self.scores = labels
        self.transforms = transforms
        self.cutoff = cutoff

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Return ``{'post': ..., 'score': ...}`` for ``index`` after all transforms."""
        # Tensor indices are converted to plain Python values before lookup.
        if torch.is_tensor(index):
            index = index.tolist()

        post = self.posts.loc[index]
        score = self.scores[index]

        # Compare against None explicitly so that a cutoff of 0 still
        # binarises the score instead of being mistaken for "no cutoff".
        if self.cutoff is not None:
            # Any real number counts (ints as well as floats); non-numeric
            # scores, and NaN (which never compares greater), map to 0.
            above_cutoff = isinstance(score, numbers.Real) and score > self.cutoff
            score = [1 if above_cutoff else 0]

        sample = {'post': post, 'score': score}

        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)

        return sample

In [8]:
# define custom dataset for word embeddings model
class EmbeddingsDataset(Dataset):
    """Tokenised posts with binary labels for the word-embeddings model.

    Args:
        data: Posts, looked up with ``data.loc[index]``; every post must
            support ``post["title"]``, ``post["text"]`` and ``post["time"]``.
        lables: Scores, looked up with ``lables[index]`` (the parameter name
            keeps its original spelling so keyword callers continue to work).
        cutoff: Threshold; a score strictly greater than it is labelled 1.
    """

    # Placeholder substituted when preprocessing leaves no tokens at all.
    _EMPTY_TOKEN = "<|empty|>"

    def __init__(self, data, lables, cutoff):
        self.posts = data
        self.scores = lables
        self.cutoff = cutoff

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Return ``((title_tokens, text_tokens, time), label)`` for ``index``."""
        post = self.posts.loc[index]
        score = self.scores[index]

        title_indexs = preproccess(post["title"])
        text_indexs = preproccess(post["text"])

        # NOTE(review): preproccess() returns [""] (not []) for missing
        # values, so such fields do NOT receive the placeholder token --
        # confirm that this is intended.
        if not title_indexs:
            title_indexs = [self._EMPTY_TOKEN]

        if not text_indexs:
            text_indexs = [self._EMPTY_TOKEN]

        time = post["time"]

        # Any real number counts (ints as well as floats); non-numeric scores,
        # and NaN (which never compares greater), are labelled 0.
        above_cutoff = isinstance(score, numbers.Real) and score > self.cutoff
        label = 1 if above_cutoff else 0

        return ((title_indexs, text_indexs, time), label)

In [10]:
class BertTitleEmbeddingDataset(Dataset):
    """Raw post titles embedded through ``BertEmbeddings``, with binary labels.

    Construction loads the pretrained ``'bert-base-uncased'`` model (with
    ``output_hidden_states=True``) and its tokenizer.

    Args:
        data: Posts, looked up with ``data.loc[index]``; every post must
            support ``post["title"]``.
        labels: Scores, looked up with ``labels[index]``.
        cutoff: Threshold; a score strictly greater than it is labelled 1.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Return ``(title_embedding, label)`` for ``index``.

        The embedding is whatever ``BertEmbeddings.get_embedding`` returns for
        the untouched title.
        """
        post = self.posts.loc[index]
        score = self.labels[index]

        # NOTE(review): the wrapper is rebuilt for every item; if its
        # constructor is cheap and stateless it could be created once in
        # __init__ -- confirm against the Embeddings notebook.
        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        sentence_embedding = bert_embedding_model.get_embedding(post["title"])

        # Any real number counts (ints as well as floats); non-numeric scores,
        # and NaN (which never compares greater), are labelled 0.
        above_cutoff = isinstance(score, numbers.Real) and score > self.cutoff
        label = 1 if above_cutoff else 0

        return (sentence_embedding, label)


In [12]:
class BertProcessedTitleEmbeddingDataset(Dataset):
    """Preprocessed post titles embedded through ``BertEmbeddings``, with binary labels.

    Titles are cleaned with ``preproccess`` and the surviving tokens are
    re-joined with single spaces before being embedded. Construction loads the
    pretrained ``'bert-base-uncased'`` model (with
    ``output_hidden_states=True``) and its tokenizer.

    Args:
        data: Posts, looked up with ``data.loc[index]``; every post must
            support ``post["title"]``.
        labels: Scores, looked up with ``labels[index]``.
        cutoff: Threshold; a score strictly greater than it is labelled 1.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Return ``(title_embedding, label)`` for ``index``.

        The embedding is whatever ``BertEmbeddings.get_embedding`` returns for
        the cleaned title.
        """
        post = self.posts.loc[index]
        score = self.labels[index]

        # NOTE(review): the wrapper is rebuilt for every item; if its
        # constructor is cheap and stateless it could be created once in
        # __init__ -- confirm against the Embeddings notebook.
        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        # Join with a space: concatenating the tokens directly would fuse the
        # whole title into one run-on word.
        title = " ".join(preproccess(post["title"]))

        sentence_embedding = bert_embedding_model.get_embedding(title)

        # Any real number counts (ints as well as floats); non-numeric scores,
        # and NaN (which never compares greater), are labelled 0.
        above_cutoff = isinstance(score, numbers.Real) and score > self.cutoff
        label = 1 if above_cutoff else 0

        return (sentence_embedding, label)