In [5]:
import torch
import datasets

dataset = datasets.load_dataset('tweets_hate_speech_detection')

In [6]:
import re 

def split_tokens(row):
    row['all_tokens'] = [i for i in re.split(r"[+ ]", re.sub(r"[^a-z@# ]","",row["tweet"].lower()) )
              if len(i)]
    return row

In [7]:
dataset = dataset.map(split_tokens)

In [10]:
from collections import Counter
counts = Counter([i for s in dataset["train"]["all_tokens"] for i in s])
counts = {key: value for (key, value) in counts.items() if value>10}
vocab = list(counts.keys())
n_v = len(vocab)

In [12]:
tok2id = {}
id2tok   = {}
for ind, word in enumerate(vocab):
    tok2id[word] =ind
    id2tok [ind] =word
def remove_rare_tokens(row):
    row["token"] = [i for i in row["all_tokens"] if i in vocab]
    return row

dataset = dataset.map(remove_rare_tokens)

In [21]:
from collections import defaultdict
cooccurence_counts = defaultdict(float)
wsize = 3
for i in range(len(dataset)):
    row = dataset["train"][i]
    for ind, w in enumerate(row["token"]):
        for j in range(max(0, ind-wsize) , min(ind+wsize+1, len(row["token"]))):
            if ind!=j:
                cooccurence_counts[(tok2id[w], tok2id[row["token"][j]])]+= (1/abs(ind-j))
    

In [22]:
coocurrence_matrix = [(words[0], words[1], count)
                              for words, count in cooccurence_counts.items()]

In [23]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
class GloVeDataSet(Dataset):

    def __init__(self, coocurrence_matrix):
        self._coocurrence_matrix = coocurrence_matrix

    def __getitem__(self, index):
        return self._coocurrence_matrix[index]

    def __len__(self):
        return len(self._coocurrence_matrix)


class NotTrainedError(Exception):
    pass


class NotFitToCorpusError(Exception):
    pass



In [24]:
glove_dataset = GloVeDataSet(coocurrence_matrix)

In [25]:
import torch
import torch.nn as nn
from torch import optim

class GloVeModel(nn.Module):
    """Implement GloVe model with Pytorch
    """

    def __init__(self, embedding_size,  vocab_size, min_occurrance=1, x_max=100, alpha=3 / 4):
        super(GloVeModel, self).__init__()

        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.alpha = alpha
        self.min_occurrance = min_occurrance
        self.x_max = x_max

        self._focal_embeddings = nn.Embedding(
            vocab_size, embedding_size).type(torch.float64)
        self._context_embeddings = nn.Embedding(
            vocab_size, embedding_size).type(torch.float64)
        self._focal_biases = nn.Embedding(vocab_size, 1).type(torch.float64)
        self._context_biases = nn.Embedding(vocab_size, 1).type(torch.float64)
        


    def _loss(self, focal_input, context_input, coocurrence_count):
        x_max, alpha = self.x_max, self.alpha

        focal_embed = self._focal_embeddings(focal_input)
        context_embed = self._context_embeddings(context_input)
        focal_bias = self._focal_biases(focal_input)
        context_bias = self._context_biases(context_input)

        # count weight factor
        weight_factor = torch.pow(coocurrence_count / x_max, alpha)
        weight_factor[weight_factor > 1] = 1

        embedding_products = torch.sum(focal_embed * context_embed, dim=1)
        log_cooccurrences = torch.log(coocurrence_count)

        distance_expr = (embedding_products + focal_bias +
                         context_bias + log_cooccurrences) ** 2

        single_losses = weight_factor * distance_expr
        mean_loss = torch.mean(single_losses)
        return mean_loss

In [26]:
def train():
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 
    embedding_size = 128
    num_epoch  = 100
    batch_size = 2
    learning_rate = 0.01
    vocab_size= n_v
    loop_interval=10
    
    glove_dataloader = DataLoader(glove_dataset, batch_size)
    total_loss = 0
    model = GloVeModel(embedding_size,  vocab_size)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epoch):
        for idx, batch in enumerate(glove_dataloader):
            optimizer.zero_grad()

            i_s, j_s, counts = batch
            i_s = i_s.to(device)
            j_s = j_s.to(device)
            counts = counts.to(device)
            loss = model._loss(i_s, j_s, counts)

            total_loss += loss.item()
            if idx % loop_interval == 0:
                avg_loss = total_loss / loop_interval
                print("epoch: {}, current step: {}, average loss: {}".format(
                    epoch, idx, avg_loss))
                total_loss = 0

            loss.backward()
            optimizer.step()

        print("finish glove vector training")


 

In [27]:
train()

epoch: 0, current step: 0, average loss: 0.40114140982275226
epoch: 0, current step: 10, average loss: 3.151820792712834
epoch: 0, current step: 20, average loss: 3.450219623899547
epoch: 0, current step: 30, average loss: 5.059381217822585
epoch: 0, current step: 40, average loss: 2.944037662372412
epoch: 0, current step: 50, average loss: 2.5466736065374103
epoch: 0, current step: 60, average loss: 3.597549830513109
finish glove vector training
epoch: 1, current step: 0, average loss: 0.05241783985176228
epoch: 1, current step: 10, average loss: 1.1584430211956578
epoch: 1, current step: 20, average loss: 1.2362486139956546
epoch: 1, current step: 30, average loss: 1.2775365811649995
epoch: 1, current step: 40, average loss: 0.6437174300498401
epoch: 1, current step: 50, average loss: 0.1826464806594807
epoch: 1, current step: 60, average loss: 0.6854064829924085
finish glove vector training
epoch: 2, current step: 0, average loss: 0.011886699275514627
epoch: 2, current step: 10, ave