Names: Jorge Mazariegos & Cameron Knopp

In [1]:
# imports statements
import time
from collections import defaultdict, Counter
import string
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import torch
#from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader
from nltk import word_tokenize, sent_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
set(stopwords.words('english'))

%matplotlib inline
plt.style.use('seaborn-paper')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mazar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
class Vocabulary:
    def __init__(self, special_tokens=None):
        self.w2idx = {}
        self.idx2w = {}
        self.w2cnt = defaultdict(int)
        self.special_tokens = special_tokens
        if self.special_tokens is not None:
            self.add_tokens(special_tokens)

    def add_tokens(self, tokens):
        for token in tokens:
            self.add_token(token)
            self.w2cnt[token] += 1

    def add_token(self, token):
        if token not in self.w2idx:
            cur_len = len(self)
            self.w2idx[token] = cur_len
            self.idx2w[cur_len] = token

    def prune(self, min_cnt=2):
        to_remove = set([token for token in self.w2idx if self.w2cnt[token] < min_cnt])
        if self.special_tokens is not None:
            to_remove = to_remove.difference(set(self.special_tokens))
        
        for token in to_remove:
            self.w2cnt.pop(token)
            
        self.w2idx = {token: idx for idx, token in enumerate(self.w2cnt.keys())}
        self.idx2w = {idx: token for token, idx in self.w2idx.items()}
    
    def __contains__(self, item):
        return item in self.w2idx
    
    def __getitem__(self, item):
        if isinstance(item, str):
            return self.w2idx[item]
        elif isinstance(item , int):
            return self.idx2w[item]
        else:
            raise TypeError("Supported indices are int and str")
    
    def __len__(self):
        return(len(self.w2idx))

In [92]:
#######################################################
# Using skipgrams we can create the wordpairs described
# in the research paper.
#######################################################

class SkipGramDataset(Dataset):
    def __init__(self, data, vocab, skip_window=3):
        super().__init__()

        #######################################################
        # Unlike before data will be a list of strings handed
        # all at once.
        #######################################################
        self.vocab = vocab
        self.data = data
        self.skip_window = skip_window

        self.pairs = self._generate_pairs(data, skip_window)
        
    #######################################################
    #
    #######################################################
    def _generate_pairs(self, data, skip_window):
        """
        Args: input data (a list of tokens)
        Returns: all possible pairs for the SkipGram mode
        """
        pairs = []

        for i in range(len(data)):
            for j in range(-skip_window, skip_window + 1):
                context_idx = i + j
                if j == 0 or context_idx < 0 or context_idx >= len(data):
                    continue
                if data[i] not in self.vocab or data[context_idx] not in self.vocab:
                    continue
                pairs.append((data[i], data[context_idx]))
        return pairs
    
    #######################################################
    #
    #######################################################
    def __getitem__(self, idx):
        """
        Args:
            idx
        Returns:

        """
        pair = self.pairs[idx]
        pair = [self.vocab[t] for t in pair]
        return pair
    
    #######################################################
    #
    #######################################################
    def __len__(self):
        """
        Returns
        """
        return len(self.pairs)

In [5]:
import torch.nn.functional as F

class SkipGramModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        """
        Args:
            vocab_size (int): vocabulary size
            embedding_dim (int): the dimension of word embeddings
        """
        ### INSERT YOUR CODE BELOW ###
        #self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        #self.linear = torch.nn.Linear(1, vocab_size)
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)
        
        ### INSERT YOUR CODE ABOVE ###

    def forward(self, inputs):
        """
        Perform the forward pass of the skip-gram model.
        
        Args:
            inputs (torch.LongTensor): input tensor containing batches of word ids [Bx1]
        Returns:
            outputs (torch.FloatTensor): output tensor with unnormalized probabilities over the vocabulary [BxV]
        """
        ### INSERT YOUR CODE BELOW ###
        embeds = self.embedding(inputs)
        #embeds = self.embedding(inputs)
        outputs = self.linear(embeds)
        outputs=outputs
        #output = F.log_softmax(self.linear(embeds), dim=1)
        ### INSERT YOUR CODE ABOVE ###
        return outputs
    
    def save_embeddings(self, voc, path):
        """
        Save the embedding matrix to a specified path.
        
        Args:
            voc (Vocabulary): the Vocabulary object for id-to-token mapping
            path (str): the location of the target file
        """
        ### INSERT YOUR CODE BELOW ###
        embeds = self.embedding.weight.data.cpu().numpy()
        f = open(path, 'w')
        f.write(str(vocab_size) + ' ' + str(embedding_dim) + '\n')
        
        for idx in range(len(embeds)):
            word = voc.idx2w[idx]
            embedding = ' '.join(map(str,embeds[idx]))
            f.write(word + ' '+ embedding + '\n')
        ### INSERT YOUR CODE ABOVE ###
        print("Successfuly saved to {}".format(path))

In [109]:
# preprocess should take in the datasets (.xml) and prepare them to be used
def preprocess(data):
    """
    Args:
        data (list(str)):
    Returns: a list of tokens and a list of tokenized sentences

    """
    #######################################################
    # initialize variables to use in preprocess
    #######################################################
    puns = []
    tokens = []
    stop = stopwords.words('english')
    
    #######################################################
    # Given a sentence, tokenize it and append it to a list
    #######################################################
    for sentence in data:
        puns.append(word_tokenize(sentence.lower())) # creates the list of all sentences
        
    #######################################################
    # Every sentence is tokenized, but let's grab each
    # individual word to make a vocab out of.
    #######################################################
    for sentence in puns:
        for word in sentence:
            if(word.isalpha()): # filter out punctuation
                tokens.append(word)
    #######################################################
    # Remove stop words from tokens
    #######################################################
    tokens_with_stop_words = tokens
    tokens = [token for token in tokens_with_stop_words if token not in stop]

    return tokens, puns

In [110]:
# DATA PROCESSING #
#######################################################
# Open the dataset/'s we will be using and process the
# text within to be used by our code.
#######################################################
f = open('datasets/data/test/subtask1-heterographic-test.xml', 'r', encoding = 'utf8')
data = f.read()

#######################################################
# Using Beautiful Soup we can easily extract the puns
# from the given datasets.
#######################################################
soup = BeautifulSoup(data, 'xml')
ids = soup.find_all('text')
words = soup.find_all('word')

#######################################################
# Create a list of all puns within the dataset to hand
# over to our preprocess function
#######################################################
wurd = ""
sentence = ""
sentences = []
punList = []
for i in range(0, len(ids)):
    for line in ids[i]:
        for word in line:
            if(word != '\n' or word == '\''):
                if(word.isalpha()): # If not punctuation
                    wurd = word
                    if(sentence == ""): # If the start of the sentence
                        sentence = sentence + wurd
                    else: # If not the start of the sentence
                        sentence = sentence + " " + wurd
                else: # If punctuation we don't want to put a space between the character and it.
                    wurd = word
                    sentence = sentence + wurd
                wurd = "" # clear the current word
    sentences.append(sentence) # append the created string sentence to our list.
    sentence = ""
#######################################################
# Create a list of tokens to make a vocabulary of and
# create a list of sentences to create make word pairs
# from.
#######################################################
tokens, punList = preprocess(sentences)

#######################################################
# Create our Vocabulary
#######################################################
voc = Vocabulary()
voc.add_tokens(tokens)
vocab_size = len(voc)



In [113]:
#######################################################
# Create our Dataset
#######################################################

#dataset = SkipGramDataset(punList, voc, skip_window=2)
print(punList)

[["''", 'i', "'", 'm', 'halfway', 'up', 'a', 'mountain', ',', "''", 'tom', 'alleged', '.'], ['i', "'", 'd', 'like', 'to', 'be', 'a', 'chinese', 'laborer', ',', 'said', 'tom', 'coolly', '.'], ['no', ',', 'baby', 'oil', 'does', 'not', 'come', 'from', 'squeezing', 'dead', 'babies', '.'], ['dentists', 'don', "'", 't', 'like', 'a', 'hard', 'day', 'at', 'the', 'orifice', '.'], ['are', 'evil', 'wildebeests', 'bad', 'gnus', '?'], ['if', 'you', 'can', "'", 't', 'be', 'good', ',', 'be', 'careful', '.'], ['a', 'busy', 'barber', 'is', 'quite', 'harried', '.'], ['my', 'name', 'is', 'avery', '.', 'i', 'raise', 'birds', '.'], ['two', 'construction', 'workers', 'had', 'a', 'stairing', 'contest', '.'], ['in', 'the', 'winter', ',', 'some', 'horses', 'are', 'friesian', '.'], ['it', "'", 's', 'between', 'my', 'sole', 'and', 'my', 'heel', ',', 'said', 'tom', 'archly', '.'], ['old', 'electricians', 'never', 'die', ',', 'they', 'just', 'do', 'it', 'until', 'it', 'hz', '.'], ['yesterday', 'i', 'accidentally',

In [None]:
# TRAINING #
tick = time.time()
epoch_losses = []
for epoch_num in range(1, num_epochs + 1):
    batch_losses = []
    for i, batch in enumerate(data_loader):
        ### YOUR CODE BELOW ###
        # Zero the gradients
        optimizer.zero_grad()
        # Extract the inputs and the targets
        inputs, targets = batch
        
        # Transfer the inputs and the targets to GPUs, if available
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Run the model
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs,targets)
        
        # Backpropagate the error
        loss.backward()
        
        # Update the parameters
        optimizer.step()
        
        # Append the loss
        batch_losses.append(float(loss))
        
        ### YOUR CODE ABOVE ###
        
    epoch_loss = np.mean(np.array(batch_losses))
    epoch_losses.append(epoch_loss)

    if epoch_num % report_every == 0:
        tock = time.time()
        print("Epoch {}. Loss {:.4f}. Elapsed {:.0f} seconds".format(epoch_num, epoch_loss, tock-tick))

print("Total time elapsed: {:.0f} minutes".format((tock-tick)/60))

In [None]:
# Plot the learning curve
### YOUR CODE BELOW ###

# this normally will work, but it didn't run this time because I exited out after retraining the model and
# ...saving the embeddings. However, I didn't run this cell before exiting out, so in order for this cell to run, I
#.. have to retrain the dataset, which I don't have the time to do currently. But anyways, it shows a graph that is steadily
# .. decreasing over time

plt.plot(epoch_losses)

### YOUR CODE ABOVE