In [1]:
# import required libraries
from nltk.corpus import stopwords
import string
import re
import datetime

In [2]:
# import ml modules
import torch
from torch.utils.data import Dataset

In [3]:
# import embedding modules
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel
from ipynb.fs.defs.Embeddings import BertEmbeddings

In [4]:
# define preprocessing function
# English stop-word set, built once from the NLTK stopwords corpus when this cell runs.
# NOTE(review): nothing visible in this notebook reads `stop_words` (the stop-word
# filter inside preproccess() is disabled) - confirm before removing it.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Matches any character outside the ranges U+0000-U+05C0 and U+2100-U+214F.
_OUT_OF_RANGE_CHAR = re.compile("[^\u0000-\u05C0\u2100-\u214F]")


def preproccess(text):
    """Split raw text into a list of cleaned, lowercase tokens.

    Each whitespace-separated token goes through these steps, in order:

    1. punctuation characters (``string.punctuation``) are deleted,
    2. digits are deleted,
    3. the token is lowercased,
    4. tokens with 2 or fewer, or 15 or more, characters are dropped,
    5. tokens starting with ``http``/``www`` or ending with ``com`` are dropped,
    6. tokens containing any character outside U+0000-U+05C0 and
       U+2100-U+214F are dropped.

    Stop-word removal is intentionally not applied.

    Args:
        text: The raw text. Any non-string value (for example a float NaN
            used as a missing-value marker, or ``None``) is treated as
            missing.

    Returns:
        list[str]: The surviving tokens, possibly empty. Missing input
        yields ``[""]``.
    """
    # Missing values: floats already returned [""]; other non-strings used to
    # crash inside re.split, so they now get the same sentinel.
    if not isinstance(text, str):
        return [""]

    tokens = []
    # Every step is a per-token map or filter, so a single pass is equivalent
    # to the sequence of list-wide passes and keeps the original step order.
    for raw_token in re.split(r"\s+", text):
        token = raw_token.translate(_PUNCTUATION_TABLE)
        token = re.sub(r"\d+", "", token)
        token = token.lower()

        # remove tokens which are too short or too long
        if not 2 < len(token) < 15:
            continue

        # remove hyperlinks (punctuation is already stripped at this point,
        # so the check relies on the leftover prefix/suffix)
        if token.startswith(("http", "www")) or token.endswith("com"):
            continue

        # remove tokens containing characters outside the accepted ranges
        if _OUT_OF_RANGE_CHAR.search(token):
            continue

        tokens.append(token)

    # stop-word removal is disabled:
    # tokens = [word for word in tokens if word not in stop_words]

    return tokens

In [5]:
def convert_time_to_date(time, tz=None):
    """Convert a POSIX timestamp into a ``datetime.datetime``.

    Args:
        time: Seconds since the epoch, as accepted by
            ``datetime.datetime.fromtimestamp``.
        tz: Optional ``datetime.tzinfo``. With the default ``None`` the
            result is a naive datetime in the machine's local timezone,
            so derived fields such as the hour depend on where the code
            runs. Pass e.g. ``datetime.timezone.utc`` for a
            machine-independent result.

    Returns:
        datetime.datetime: Naive local time when ``tz`` is ``None``,
        otherwise an aware datetime in ``tz``.
    """
    return datetime.datetime.fromtimestamp(time, tz)

In [6]:
# define custom dataset for BoW model
class HackerNewsPostDataset(Dataset):
    """Dataset yielding ``{'post': ..., 'score': ...}`` samples.

    Args:
        data: Container of posts looked up with ``data.loc[index]``
            (presumably a pandas DataFrame - confirm against the caller).
        labels: Container of scores looked up with ``labels[index]``.
        cutoff: Optional threshold. When given (including ``0``), the score
            is replaced by the one-element list ``[1]`` if it is a float
            strictly greater than ``cutoff`` and ``[0]`` otherwise. When
            ``None`` the raw score is returned unchanged.
        transforms: Optional iterable of callables applied to the sample
            dict in order; each receives the sample and returns the new one.
    """

    def __init__(self, data, labels, cutoff = None, transforms = None):
        self.posts = data
        self.scores = labels
        self.transforms = transforms
        self.cutoff = cutoff

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Return the sample dict for ``index`` (an int or a tensor index)."""
        if torch.is_tensor(index):
            index = index.tolist()

        post = self.posts.loc[index]
        score = self.scores[index]

        # Compare against None explicitly: a plain truthiness test would
        # silently skip binarisation for a legitimate cutoff of 0.
        if self.cutoff is not None:
            # NOTE(review): non-float scores (e.g. Python/NumPy ints) always
            # map to 0 because of the isinstance check - confirm intended.
            score = [(1 if isinstance(score, float) and score > self.cutoff else 0)]

        sample = {'post': post, 'score': score}

        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)

        return sample

In [7]:
# define custom dataset for word embeddings model
class EmbeddingsDataset(Dataset):
    """Dataset yielding ``((title_tokens, text_tokens, time), label)`` samples.

    Titles and texts are tokenised with ``preproccess``; an empty token list
    is replaced by the single placeholder token ``"<|empty|>"``. The label is
    1 when the score is a float strictly greater than ``cutoff``, else 0.
    """

    def __init__(self, data, lables, cutoff):
        # `lables` keeps its original spelling: it is part of the signature.
        self.posts, self.scores, self.cutoff = data, lables, cutoff

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Build the token/time features and binary label for ``index``."""
        row = self.posts.loc[index]
        raw_score = self.scores[index]

        title_tokens = preproccess(row["title"])
        text_tokens = preproccess(row["text"])

        # Substitute a placeholder so neither token list is ever empty.
        placeholder = "<|empty|>"
        if title_tokens == []:
            title_tokens = [placeholder]
        if text_tokens == []:
            text_tokens = [placeholder]

        posted_at = row["time"]

        if isinstance(raw_score, float) and raw_score > self.cutoff:
            label = 1
        else:
            label = 0

        features = (title_tokens, text_tokens, posted_at)
        return (features, label)

In [8]:
class BertTitleEmbeddingDataset(Dataset):
    """Dataset yielding ``(title_embedding, label)`` for raw post titles.

    The title is passed unmodified to ``BertEmbeddings.get_embedding``. The
    label is 1 when the score is a float strictly greater than ``cutoff``,
    else 0.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        # Pretrained 'bert-base-uncased' model (hidden states exposed) and
        # its matching tokenizer, loaded once per dataset instance.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Embed the title at ``index`` and pair it with its binary label."""
        row = self.posts.loc[index]
        raw_score = self.labels[index]

        embedder = BertEmbeddings(self.bert_model, self.bert_tokenizer)
        title_embedding = embedder.get_embedding(row["title"])

        if isinstance(raw_score, float) and raw_score > self.cutoff:
            label = 1
        else:
            label = 0

        return (title_embedding, label)


In [9]:
class BertProcessedTitleEmbeddingDataset(Dataset):
    """Dataset yielding ``(title_embedding, label)`` for preprocessed titles.

    The title is tokenised with ``preproccess``, re-joined into a single
    space-separated string and passed to ``BertEmbeddings.get_embedding``.
    The label is 1 when the score is a float strictly greater than
    ``cutoff``, else 0.

    Args:
        data: Container of posts looked up with ``data.loc[index]``; each
            post must provide a ``"title"`` entry.
        labels: Container of scores looked up with ``labels[index]``.
        cutoff: Score threshold for the positive class.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        # Pretrained 'bert-base-uncased' model (hidden states exposed) and
        # its matching tokenizer, loaded once per dataset instance.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Embed the cleaned title at ``index`` and pair it with its label."""
        post = self.posts.loc[index]
        score = self.labels[index]

        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        # Join with a space: joining with "" would fuse all tokens into one
        # long pseudo-word and destroy the word boundaries before embedding.
        title = " ".join(preproccess(post["title"]))

        sentence_embedding = bert_embedding_model.get_embedding(title)

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        sample = (sentence_embedding, score)

        return sample

In [10]:
class BertProcessedTitleEmbeddingTitleAndTextDataset(Dataset):
    """Dataset yielding ``(embedding, label)`` built from title and text.

    Title and text are each tokenised with ``preproccess``, re-joined into a
    space-separated string and embedded with ``BertEmbeddings.get_embedding``;
    the two embeddings are concatenated along the last dimension. The label
    is 1 when the score is a float strictly greater than ``cutoff``, else 0.

    Args:
        data: Container of posts looked up with ``data.loc[index]``; each
            post must provide ``"title"`` and ``"text"`` entries.
        labels: Container of scores looked up with ``labels[index]``.
        cutoff: Score threshold for the positive class.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        # Pretrained 'bert-base-uncased' model (hidden states exposed) and
        # its matching tokenizer, loaded once per dataset instance.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Embed title and text at ``index`` and pair them with the label."""
        post = self.posts.loc[index]
        score = self.labels[index]

        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        # Join with a space: joining with "" would fuse all tokens into one
        # long pseudo-word and destroy the word boundaries before embedding.
        title = " ".join(preproccess(post["title"]))
        text = " ".join(preproccess(post["text"]))

        sentence_embedding_title = bert_embedding_model.get_embedding(title)
        sentence_embedding_text = bert_embedding_model.get_embedding(text)

        # torch.cat requires both embeddings to be tensors whose shapes agree
        # on every dimension except the last.
        sentence_embedding = torch.cat((sentence_embedding_title, sentence_embedding_text), dim = -1)

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        sample = (sentence_embedding, score)

        return sample

In [11]:
class BertProcessedTitleEmbeddingTitleAndTimeDataset(Dataset):
    """Dataset yielding ``(embedding, label)`` built from title and time.

    The title is tokenised with ``preproccess``, re-joined into a
    space-separated string and embedded with ``BertEmbeddings.get_embedding``;
    the post's ``"time"`` value is appended as one extra component along the
    last dimension. The label is 1 when the score is a float strictly
    greater than ``cutoff``, else 0.

    Args:
        data: Container of posts looked up with ``data.loc[index]``; each
            post must provide ``"title"`` and ``"time"`` entries.
        labels: Container of scores looked up with ``labels[index]``.
        cutoff: Score threshold for the positive class.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        # Pretrained 'bert-base-uncased' model (hidden states exposed) and
        # its matching tokenizer, loaded once per dataset instance.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Embed the title at ``index``, append its time, and add the label."""
        post = self.posts.loc[index]
        score = self.labels[index]

        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        # Join with a space: joining with "" would fuse all tokens into one
        # long pseudo-word and destroy the word boundaries before embedding.
        title = " ".join(preproccess(post["title"]))
        time = post["time"]

        sentence_embedding_title = bert_embedding_model.get_embedding(title)

        # Append the time as one extra trailing component. The earlier code
        # used `+` inside torch.cat, which added the time to every embedding
        # component instead of concatenating it. new_full matches the
        # embedding's dtype/device and leading dimensions, so this works for
        # embeddings shaped (dim,) as well as (batch, dim).
        # NOTE(review): the time value is appended unscaled and cast to the
        # embedding dtype - consider normalising it upstream.
        time_shape = tuple(sentence_embedding_title.shape[:-1]) + (1,)
        time_feature = sentence_embedding_title.new_full(time_shape, float(time))
        embedding = torch.cat((sentence_embedding_title, time_feature), dim=-1)

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        sample = (embedding, score)

        return sample

In [12]:
class BertProcessedTitleEmbeddingMulticlass(Dataset):
    """Dataset yielding ``(title_embedding, one_hot_label)`` with four classes.

    The title is tokenised with ``preproccess``, re-joined into a
    space-separated string and embedded with ``BertEmbeddings.get_embedding``.
    The score is bucketed into a 4-element one-hot list:

    * ``score <= cutoff0``            -> ``[1, 0, 0, 0]``
    * ``cutoff0 < score < cutoff1``   -> ``[0, 1, 0, 0]``
    * ``cutoff1 <= score < cutoff2``  -> ``[0, 0, 1, 0]``
    * otherwise                       -> ``[0, 0, 0, 1]``

    Args:
        data: Container of posts looked up with ``data.loc[index]``; each
            post must provide a ``"title"`` entry.
        labels: Container of scores looked up with ``labels[index]``.
        cutoff0, cutoff1, cutoff2: Bucket boundaries, expected in
            increasing order.
    """

    def __init__(self, data, labels, cutoff0, cutoff1, cutoff2):
        self.posts = data
        self.labels = labels
        self.cutoff0 = cutoff0
        self.cutoff1 = cutoff1
        self.cutoff2 = cutoff2
        # Pretrained 'bert-base-uncased' model (hidden states exposed) and
        # its matching tokenizer, loaded once per dataset instance.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Embed the cleaned title at ``index`` and one-hot encode its score."""
        post = self.posts.loc[index]
        score = self.labels[index]

        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        # Join with a space: joining with "" would fuse all tokens into one
        # long pseudo-word and destroy the word boundaries before embedding.
        title = " ".join(preproccess(post["title"]))

        sentence_embedding = bert_embedding_model.get_embedding(title)

        # NOTE(review): a NaN score fails every comparison below and therefore
        # lands in the last (highest) class - confirm that is acceptable.
        if score <= self.cutoff0:
            score = [1, 0, 0, 0]
        elif score < self.cutoff1:
            score = [0, 1, 0, 0]
        elif score < self.cutoff2:
            score = [0, 0, 1, 0]
        else:
            score = [0, 0, 0, 1]

        sample = (sentence_embedding, score)

        return sample

In [None]:
class OpenAIEmbeddingDataset(Dataset):
    """Dataset yielding ``(embedding, label)`` from precomputed embeddings.

    Args:
        data: Container of posts; only its length is used (by ``__len__``).
        labels: Container of scores looked up with ``labels[index]``.
        cutoff: Score threshold; the label is 1 when the score is a float
            strictly greater than ``cutoff``, else 0.
        embeddings: Container of precomputed embeddings looked up with
            ``embeddings[index]``.
    """

    def __init__(self, data, labels, cutoff, embeddings):
        # Keep the posts: __len__ reads self.posts, which was never assigned
        # before and made len(dataset) raise AttributeError.
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        self.embeddings = embeddings

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Return the stored embedding at ``index`` with its binary label."""
        score = self.labels[index]

        sentence_embedding = self.embeddings[index]

        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        sample = (sentence_embedding, score)

        return sample

In [None]:
class FirstSentenceEmbeddingDataset(Dataset):
    """Dataset yielding ``(embedding, label)`` for the title's first sentence.

    The part of the title before the first ``"."`` is tokenised with
    ``preproccess``, re-joined into a space-separated string and embedded
    with ``BertEmbeddings.get_embedding``. The label is 1 when the score is
    a float strictly greater than ``cutoff``, else 0.

    Args:
        data: Container of posts looked up with ``data.loc[index]``; each
            post must provide a ``"title"`` entry.
        labels: Container of scores looked up with ``labels[index]``.
        cutoff: Score threshold for the positive class.
    """

    def __init__(self, data, labels, cutoff):
        self.posts = data
        self.labels = labels
        self.cutoff = cutoff
        # Pretrained 'bert-base-uncased' model (hidden states exposed) and
        # its matching tokenizer, loaded once per dataset instance.
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, index):
        """Embed the first sentence of the title at ``index`` with its label."""
        post = self.posts.loc[index]
        score = self.labels[index]

        bert_embedding_model = BertEmbeddings(self.bert_model, self.bert_tokenizer)

        raw_title = post["title"]
        # Only strings can be split; a missing title (e.g. a float NaN) is
        # handed to preproccess unchanged instead of crashing on .split().
        first_sent = raw_title.split(".")[0] if isinstance(raw_title, str) else raw_title

        # Join with a space: joining with "" would fuse all tokens into one
        # long pseudo-word and destroy the word boundaries before embedding.
        title = " ".join(preproccess(first_sent))

        sentence_embedding = bert_embedding_model.get_embedding(title)
        score = 1 if (isinstance(score, float) and score > self.cutoff) else 0

        sample = (sentence_embedding, score)
        return sample

        
        
        

In [None]:
class NonEmbeddingDataset(Dataset):
    """Dataset yielding ``(features, label)`` from post metadata only.

    ``features`` is a float32 tensor with eight components, in this order:
    ``has_url, has_title, has_text, hour, day, month, year, num_comments``.
    The label is 1 when the score is strictly greater than ``cutoff``,
    else 0.

    Args:
        data: Container of posts looked up with ``data.loc[index]``; each
            post must provide ``"url"``, ``"title"``, ``"text"``,
            ``"descendants"`` and ``"time"`` entries.
        lables: Container of scores looked up with ``lables[index]``
            (the spelling is part of the existing signature).
        cutoff: Score threshold for the positive class.
    """

    def __init__(self, data, lables, cutoff):
        self.posts = data
        self.scores = lables
        self.cutoff = cutoff

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    @staticmethod
    def _is_present(value):
        """Return 1 if ``value`` is a non-missing, non-empty field, else 0."""
        if value is None:
            return 0
        # NaN is the only float that differs from itself; a plain `!= ""`
        # test would count a missing (NaN) field as present.
        if isinstance(value, float) and value != value:
            return 0
        return 1 if value != "" else 0

    def __getitem__(self, index):
        """Build the metadata feature tensor and binary label for ``index``."""
        post = self.posts.loc[index]
        score = self.scores[index]

        has_url = self._is_present(post["url"])
        has_title = self._is_present(post["title"])
        has_text = self._is_present(post["text"])
        num_comments = post["descendants"]

        # Calendar fields come from the helper's datetime; with the helper's
        # default behaviour they follow the machine's local timezone.
        time = convert_time_to_date(post["time"])

        day = time.day
        month = time.month
        year = time.year
        hour = time.hour

        features = torch.tensor([has_url, has_title, has_text, hour, day, month, year, num_comments], dtype = torch.float32)
        score = 1 if score > self.cutoff else 0

        sample = (features, score)
        return sample