Names: Jorge Mazariegos & Cameron Knopp

In [6]:
# imports statements
import time
from collections import defaultdict, Counter
import string
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import torch
#from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
set(stopwords.words('english'))

%matplotlib inline
plt.style.use('seaborn-paper')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camknopp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# preprocess turns raw text, or sentences parsed from the .xml datasets, into clean tokens
def preprocess(data):
    """
    Lowercase the input, then drop non-alphabetic tokens and English stop words.

    Args:
        data (str or iterable): either one raw text string, or an iterable of
            sentences. Each sentence may itself be a raw string (it is then
            split with ``word_tokenize``) or an iterable of word strings
            (each word is used as-is, e.g. the per-sentence word lists built
            from the XML file).
    Returns: a list of tokens
        A flat list of lowercase, purely alphabetic tokens with the NLTK
        English stop words removed, in their original order.

    """
    ### YOUR CODE BELOW ###
    # Iterating a bare string yields characters, not words, so treat a raw
    # string as a single sentence that still has to be tokenized.
    sentences = [data] if isinstance(data, str) else data

    lowered = []
    for sentence in sentences:
        if isinstance(sentence, str):
            lowered.extend(word_tokenize(sentence.lower()))
        else:
            lowered.extend(word.lower() for word in sentence)

    # A set gives constant-time membership tests; the stop-word list is
    # consulted once per token.
    stop = set(stopwords.words('english'))

    # isalpha() removes punctuation, numbers and mixed tokens such as "dog2".
    tokens = [token for token in lowered if token.isalpha() and token not in stop]
    ### YOUR CODE ABOVE ###

    return tokens

In [8]:
# DATA PROCESSING #
# Read the whole XML file; the context manager closes the handle even if
# reading fails.
with open('datasets/data/test/subtask1-heterographic-test.xml', 'r', encoding='utf8') as f:
    data = f.read()


soup = BeautifulSoup(data, 'xml')
ids = soup.find_all('text')    # each <text> element is treated as one sentence below
words = soup.find_all('word')  # not used in this cell; kept in case later cells rely on it

# Build one list of word strings per <text> element.
punList = []
for text_node in ids:
    sentence = []
    for line in text_node:      # children of the <text> element
        for word in line:       # contents of each child
            if word != '\n':    # skip the newlines between elements
                sentence.append(word)
    punList.append(sentence)
tokens = preprocess(punList)


In [9]:
class Vocabulary:
    """Two-way mapping between tokens and integer ids, with occurrence counts.

    Attributes:
        w2idx (dict): token -> id.
        idx2w (dict): id -> token.
        w2cnt (defaultdict): token -> number of times it was passed to
            ``add_tokens``.
        special_tokens: optional iterable of tokens that ``prune`` never removes.
    """

    def __init__(self, special_tokens=None):
        """Create an empty vocabulary and register ``special_tokens`` if given."""
        self.w2idx = {}
        self.idx2w = {}
        self.w2cnt = defaultdict(int)
        self.special_tokens = special_tokens
        if special_tokens is not None:
            self.add_tokens(special_tokens)

    def add_tokens(self, tokens):
        """Register every token in ``tokens`` and count each occurrence."""
        for tok in tokens:
            self.add_token(tok)
            self.w2cnt[tok] += 1

    def add_token(self, token):
        """Give ``token`` the next free id unless it already has one.

        The occurrence count is not touched here; only ``add_tokens`` counts.
        """
        if token in self.w2idx:
            return
        next_id = len(self)
        self.w2idx[token] = next_id
        self.idx2w[next_id] = token

    def prune(self, min_cnt=2):
        """Remove tokens counted fewer than ``min_cnt`` times and renumber ids.

        Special tokens are kept regardless of their count. After pruning, ids
        are reassigned consecutively in the order the surviving tokens appear
        in ``w2cnt``.
        """
        rare = {tok for tok in self.w2idx if self.w2cnt[tok] < min_cnt}
        if self.special_tokens is not None:
            rare -= set(self.special_tokens)

        for tok in rare:
            self.w2cnt.pop(tok)

        self.w2idx = {tok: new_id for new_id, tok in enumerate(self.w2cnt)}
        self.idx2w = {new_id: tok for tok, new_id in self.w2idx.items()}

    def __contains__(self, item):
        """Return True when ``item`` is a known token."""
        return item in self.w2idx

    def __getitem__(self, item):
        """Look up an id by token (``str``) or a token by id (``int``).

        Raises:
            TypeError: if ``item`` is neither ``str`` nor ``int``.
            KeyError: if the token or id is unknown.
        """
        if isinstance(item, str):
            return self.w2idx[item]
        if isinstance(item, int):
            return self.idx2w[item]
        raise TypeError("Supported indices are int and str")

    def __len__(self):
        """Number of tokens that currently have an id."""
        return len(self.w2idx)

In [10]:
class SkipGramDataset(Dataset):
    """Dataset of (centre word, context word) pairs for skip-gram training.

    Pairs are generated once at construction time and stored as token tuples;
    they are converted to vocabulary ids when an item is fetched.
    """

    def __init__(self, data, vocab, skip_window=3):
        """
        Args:
            data: sequence of tokens.
            vocab: object supporting ``token in vocab`` and ``vocab[token]``.
            skip_window (int): number of neighbours considered on each side
                of the centre token.
        """
        super().__init__()

        self.vocab = vocab
        self.data = data
        self.skip_window = skip_window

        self.pairs = self._generate_pairs(data, skip_window)

    def _generate_pairs(self, data, skip_window):
        """
        Args: input data (a list of tokens) and the window radius
        Returns: a list of (centre, context) token tuples, ordered by centre
            position and then by context position; any pair containing a
            token missing from the vocabulary is left out
        """
        total = len(data)
        pairs = []

        for centre_pos, centre in enumerate(data):
            if centre not in self.vocab:
                continue
            # Clamp the window to the bounds of the sequence.
            first = max(0, centre_pos - skip_window)
            last = min(total, centre_pos + skip_window + 1)
            for context_pos in range(first, last):
                if context_pos == centre_pos:
                    continue
                context = data[context_pos]
                if context in self.vocab:
                    pairs.append((centre, context))
        return pairs

    def __getitem__(self, idx):
        """
        Args:
            idx: position of the pair in ``self.pairs``
        Returns:
            a two-element list ``[centre_id, context_id]`` of vocabulary ids
        """
        centre, context = self.pairs[idx]
        return [self.vocab[centre], self.vocab[context]]

    def __len__(self):
        """
        Returns: the number of generated pairs
        """
        return len(self.pairs)

In [11]:
import torch.nn.functional as F

class SkipGramModel(torch.nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection
    onto the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size (int): vocabulary size
            embedding_dim (int): the dimension of word embeddings
        """
        super().__init__()
        ### INSERT YOUR CODE BELOW ###
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)
        ### INSERT YOUR CODE ABOVE ###

    def forward(self, inputs):
        """
        Perform the forward pass of the skip-gram model.

        Args:
            inputs (torch.LongTensor): tensor of word ids. A 1-D batch of
                shape [B] gives an output of shape [BxV]; any extra input
                dimensions are carried through to the output unchanged.
        Returns:
            outputs (torch.FloatTensor): unnormalized scores over the
                vocabulary for every input id (no softmax is applied here)
        """
        ### INSERT YOUR CODE BELOW ###
        embeds = self.embedding(inputs)
        outputs = self.linear(embeds)
        ### INSERT YOUR CODE ABOVE ###
        return outputs

    def save_embeddings(self, voc, path):
        """
        Save the embedding matrix to a specified path.

        The file is written as text: a header line "<rows> <columns>" followed
        by one line per word containing the word and its vector components,
        separated by single spaces.

        Args:
            voc (Vocabulary): the Vocabulary object for id-to-token mapping
            path (str): the location of the target file
        """
        ### INSERT YOUR CODE BELOW ###
        embeds = self.embedding.weight.detach().cpu().numpy()
        # Take the header from the matrix itself so the method does not depend
        # on notebook-level globals and always matches what is written.
        num_words, num_dims = embeds.shape

        # The context manager guarantees the file is flushed and closed.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(str(num_words) + ' ' + str(num_dims) + '\n')

            for idx in range(num_words):
                word = voc.idx2w[idx]
                embedding = ' '.join(map(str, embeds[idx]))
                f.write(word + ' ' + embedding + '\n')
        ### INSERT YOUR CODE ABOVE ###
        print("Successfuly saved to {}".format(path))

In [12]:
# TRAINING PARAMETERS #
embedding_dim = 128   # size of each word vector
skip_window = 2       # context words taken on each side of the centre word
batch_size = 512
lr = 0.1
num_epochs = 100
report_every = 1      # print progress every N epochs

# DATA PROCESSING #
# Only the first 1,000,000 characters of the corpus are preprocessed.
with open('text8.txt') as f:
    data = f.read()
tokens = preprocess(data[:1000000])

# CONSTRUCTING VOCABULARY #
# Tokens counted fewer than 5 times are pruned from the vocabulary.
voc = Vocabulary()
voc.add_tokens(tokens)
voc.prune(5)
vocab_size = len(voc)

# DATASET
dataset = SkipGramDataset(tokens, voc, skip_window=skip_window)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# MODEL
model = SkipGramModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
if torch.cuda.is_available():
    model = model.cuda()

# LOSS AND OPTIMIZER
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)

FileNotFoundError: [Errno 2] No such file or directory: 'text8.txt'

In [None]:
# TRAINING #
# Device availability does not change during training, so query it once.
use_cuda = torch.cuda.is_available()

tick = time.time()
epoch_losses = []  # mean batch loss of every epoch, in order
for epoch_num in range(1, num_epochs + 1):
    batch_losses = []
    for i, batch in enumerate(data_loader):
        ### YOUR CODE BELOW ###
        # Zero the gradients
        optimizer.zero_grad()
        # Extract the inputs and the targets
        inputs, targets = batch

        # Transfer the inputs and the targets to GPUs, if available
        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # Run the model
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, targets)

        # Backpropagate the error
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Append the loss as a plain Python float so no graph is kept alive
        batch_losses.append(loss.item())

        ### YOUR CODE ABOVE ###

    epoch_loss = np.mean(np.array(batch_losses))
    epoch_losses.append(epoch_loss)

    if epoch_num % report_every == 0:
        tock = time.time()
        print("Epoch {}. Loss {:.4f}. Elapsed {:.0f} seconds".format(epoch_num, epoch_loss, tock-tick))

# Take a fresh timestamp: `tock` is otherwise only set inside the reporting
# branch, so it would be undefined if no epoch reported and stale whenever
# the last epoch was not a reporting epoch.
tock = time.time()
print("Total time elapsed: {:.0f} minutes".format((tock-tick)/60))

In [None]:
# Plot the learning curve
### YOUR CODE BELOW ###

# NOTE: this cell needs `epoch_losses` from the training cell, so training must
# be run in the same kernel session first. Its output is missing from the saved
# notebook because the kernel was closed after retraining the model and saving
# the embeddings, before this cell was executed, and there was no time to
# retrain. The author reports that the curve decreases steadily over time.

fig, ax = plt.subplots()
# Epochs are numbered from 1, matching the numbers printed during training.
ax.plot(range(1, len(epoch_losses) + 1), epoch_losses)
ax.set_xlabel("Epoch")
ax.set_ylabel("Mean training loss")
ax.set_title("Skip-gram learning curve")
plt.show()

### YOUR CODE ABOVE