Names: Jorge Mazariegos & Cameron Knopp

In [15]:
# imports statements
import time
from collections import defaultdict, Counter
import string
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import torch
#from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
set(stopwords.words('english'))

%matplotlib inline
plt.style.use('seaborn-paper')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camknopp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# preprocess should take in the datasets (.xml) and prepare them to be used
def preprocess(data):
    """
    Args:
        data (list(str)):
    Returns: a list of tokens and a list of tokenized sentences

    """
    #######################################################
    # initialize variables to use in preprocess
    #######################################################
    puns = []
    tokens = []
    stop = stopwords.words('english')
    
    #######################################################
    # Given a sentence, tokenize it and append it to a list
    #######################################################
    for sentence in data:
        puns.append(word_tokenize(sentence.lower())) # creates the list of all sentences
        
    #######################################################
    # Every sentence is tokenized, but let's grab each
    # individual word to make a vocab out of.
    #######################################################
    for sentence in puns:
        for word in sentence:
            if(word.isalpha()): # filter out punctuation
                tokens.append(word)
    #######################################################
    # Remove stop words from tokens
    #######################################################
    tokens_with_stop_words = tokens
    tokens = [token for token in tokens_with_stop_words if token not in stop]

    return tokens, puns

In [None]:
# DATA PROCESSING #
#######################################################
# Open the dataset/'s we will be using and process the
# text within to be used by our code.
#######################################################
f = open('datasets/data/test/subtask1-heterographic-test.xml', 'r', encoding = 'utf8')
data = f.read()

#######################################################
# Using Beautiful Soup we can easily extract the puns
# from the given datasets.
#######################################################
soup = BeautifulSoup(data, 'xml')
ids = soup.find_all('text')
words = soup.find_all('word')
#######################################################
# Create a list of all puns within the dataset to hand
# over to our preprocess function
#######################################################
wurd = ""
sentence = ""
sentences = []
punList = []
for i in range(0, len(ids)):
    for line in ids[i]:
        for word in line:
            if(word != '\n' or word == '\''):
                if(word.isalpha()): # If not punctuation
                    wurd = word
                    if(sentence == ""): # If the start of the sentence
                        sentence = sentence + wurd
                    else: # If not the start of the sentence
                        sentence = sentence + " " + wurd
                else: # If punctuation we don't want to put a space between the character and it.
                    wurd = word
                    sentence = sentence + wurd
                wurd = "" # clear the current word
    sentences.append(sentence) # append the created string sentence to our list.
    sentence = ""
#######################################################
# Create a list of tokens to make a vocabulary of and
# create a list of sentences to create make word pairs
# from.
#######################################################
tokens, punList = preprocess(sentences)

#######################################################
# Create our Vocabulary
#######################################################
voc = Vocabulary()
voc.add_tokens(tokens)
vocab_size = len(voc)

In [16]:
class Vocabulary:
    def __init__(self, special_tokens=None):
        self.w2idx = {}
        self.idx2w = {}
        self.w2cnt = defaultdict(int)
        self.special_tokens = special_tokens
        if self.special_tokens is not None:
            self.add_tokens(special_tokens)

    def add_tokens(self, tokens):
        for token in tokens:
            self.add_token(token)
            self.w2cnt[token] += 1

    def add_token(self, token):
        if token not in self.w2idx:
            cur_len = len(self)
            self.w2idx[token] = cur_len
            self.idx2w[cur_len] = token

    def prune(self, min_cnt=2):
        to_remove = set([token for token in self.w2idx if self.w2cnt[token] < min_cnt])
        if self.special_tokens is not None:
            to_remove = to_remove.difference(set(self.special_tokens))
        
        for token in to_remove:
            self.w2cnt.pop(token)
            
        self.w2idx = {token: idx for idx, token in enumerate(self.w2cnt.keys())}
        self.idx2w = {idx: token for token, idx in self.w2idx.items()}
    
    def __contains__(self, item):
        return item in self.w2idx
    
    def __getitem__(self, item):
        if isinstance(item, str):
            return self.w2idx[item]
        elif isinstance(item , int):
            return self.idx2w[item]
        else:
            raise TypeError("Supported indices are int and str")
    
    def __len__(self):
        return(len(self.w2idx))

In [141]:
#######################################################
# Using skipgrams we can create the wordpairs described
# in the N-Hance research paper.
#######################################################

class SkipGramDataset(Dataset):
    def __init__(self, data, vocab, skip_window=3):
        super().__init__()

        #######################################################
        # Unlike before, data will be a list of strings handed
        # all at once.
        #######################################################
        self.vocab = vocab
        self.data = data
        self.skip_window = skip_window
        self.pairs = self._generate_pairs(data, skip_window)
        
    #######################################################
    #
    #######################################################
    def _generate_pairs(self, data, skip_window):
        """
        Args: input data (a list of lists of words for each sentence (i.e, each list of words is a sentence))
        Returns: a list of lists. Each list will contain the word pairs for a given sentence in the input dataset
        """
        pairs = [[]]  # list of word pairs for each sentence
        curr_sentence_pairs = [] # list of word pairs for current sentence
        pruned_pairs = []
        
        for sent in data: 
            for i in range(len(sent)):
                for j in range(-skip_window, skip_window + 1):
                    context_idx = i + j
                    if j == 0 or context_idx < 0 or context_idx >= len(sent):
                        continue
                    if sent[i] not in self.vocab or sent[context_idx] not in self.vocab:
                        continue
                    curr_sentence_pairs.append((sent[i], sent[context_idx]))
                    
            for (a, b) in list(curr_sentence_pairs):
                print("({}, {}) exists in cur_sentence_pairs".format(a,b))
                if (b, a) in curr_sentence_pairs:
                    print("deleting ({}, {})".format(b, a))
                    del (b, a)
                
            
            pairs.append(curr_sentence_pairs.copy()) # need to append a copy so that it is not cleared with we call clear() in the next line
            curr_sentence_pairs.clear()
            #print(pairs[len(pairs)-1])
                    
        return pairs
    
    def __prunepair__(self, pair_list, tuple(item_one, item_two)):
        for i in range(len(pair_list)):
            if pair_list[i] == (item_one, item_two):
                delete pair_list[i]
        
    
    #######################################################
    #
    #######################################################
    def __getitem__(self, idx):
        """
        Args:
            idx
        Returns:

        """
        pair = self.pairs[idx]
        print(pair)
        #print(pair)
        #pair = [self.vocab[t] for t in pair]
        #pair = [self.vocab.__getitem__(t) for t in pair]
        return pair
    
    #######################################################
    #
    #######################################################
    def __len__(self):
        """
        Returns
        """
        return len(self.pairs)

SyntaxError: invalid syntax (<ipython-input-141-6cfb16edbfb8>, line 54)

In [137]:
#######################################################
# Create our Dataset
#######################################################

dataset = SkipGramDataset(punList, voc, skip_window=2)
pairs = dataset.__getitem__(1)
print(pairs)


(tom, alleged) exists in cur_sentence_pairs
deleting (alleged, tom)
(alleged, tom) exists in cur_sentence_pairs
deleting (tom, alleged)
(chinese, laborer) exists in cur_sentence_pairs
deleting (laborer, chinese)
(laborer, chinese) exists in cur_sentence_pairs
deleting (chinese, laborer)
(laborer, said) exists in cur_sentence_pairs
deleting (said, laborer)
(said, laborer) exists in cur_sentence_pairs
deleting (laborer, said)
(said, tom) exists in cur_sentence_pairs
deleting (tom, said)
(said, coolly) exists in cur_sentence_pairs
deleting (coolly, said)
(tom, said) exists in cur_sentence_pairs
deleting (said, tom)
(tom, coolly) exists in cur_sentence_pairs
deleting (coolly, tom)
(coolly, said) exists in cur_sentence_pairs
deleting (said, coolly)
(coolly, tom) exists in cur_sentence_pairs
deleting (tom, coolly)
(baby, oil) exists in cur_sentence_pairs
deleting (oil, baby)
(oil, baby) exists in cur_sentence_pairs
deleting (baby, oil)
(come, squeezing) exists in cur_sentence_pairs
deleting 

deleting (designed, destroyer)
(afraid, heights) exists in cur_sentence_pairs
deleting (heights, afraid)
(heights, afraid) exists in cur_sentence_pairs
deleting (afraid, heights)
(afraid, widths) exists in cur_sentence_pairs
deleting (widths, afraid)
(widths, afraid) exists in cur_sentence_pairs
deleting (afraid, widths)
(obese, editor) exists in cur_sentence_pairs
deleting (editor, obese)
(obese, started) exists in cur_sentence_pairs
deleting (started, obese)
(editor, obese) exists in cur_sentence_pairs
deleting (obese, editor)
(editor, started) exists in cur_sentence_pairs
deleting (started, editor)
(started, obese) exists in cur_sentence_pairs
deleting (obese, started)
(started, editor) exists in cur_sentence_pairs
deleting (editor, started)
(started, weight) exists in cur_sentence_pairs
deleting (weight, started)
(weight, started) exists in cur_sentence_pairs
deleting (started, weight)
(weight, redaction) exists in cur_sentence_pairs
deleting (redaction, weight)
(weight, program) e

deleting (want, know)
(old, chinese) exists in cur_sentence_pairs
deleting (chinese, old)
(old, cooks) exists in cur_sentence_pairs
deleting (cooks, old)
(chinese, old) exists in cur_sentence_pairs
deleting (old, chinese)
(chinese, cooks) exists in cur_sentence_pairs
deleting (cooks, chinese)
(chinese, never) exists in cur_sentence_pairs
deleting (never, chinese)
(cooks, old) exists in cur_sentence_pairs
deleting (old, cooks)
(cooks, chinese) exists in cur_sentence_pairs
deleting (chinese, cooks)
(cooks, never) exists in cur_sentence_pairs
deleting (never, cooks)
(cooks, die) exists in cur_sentence_pairs
deleting (die, cooks)
(never, chinese) exists in cur_sentence_pairs
deleting (chinese, never)
(never, cooks) exists in cur_sentence_pairs
deleting (cooks, never)
(never, die) exists in cur_sentence_pairs
deleting (die, never)
(die, cooks) exists in cur_sentence_pairs
deleting (cooks, die)
(die, never) exists in cur_sentence_pairs
deleting (never, die)
(wok, away) exists in cur_sentence

(every, royal) exists in cur_sentence_pairs
deleting (royal, every)
(every, flush) exists in cur_sentence_pairs
deleting (flush, every)
(every, time) exists in cur_sentence_pairs
deleting (time, every)
(time, flush) exists in cur_sentence_pairs
deleting (flush, time)
(time, every) exists in cur_sentence_pairs
deleting (every, time)
(sternly, warned) exists in cur_sentence_pairs
deleting (warned, sternly)
(warned, sternly) exists in cur_sentence_pairs
deleting (sternly, warned)
(warned, circus) exists in cur_sentence_pairs
deleting (circus, warned)
(circus, warned) exists in cur_sentence_pairs
deleting (warned, circus)
(circus, clown) exists in cur_sentence_pairs
deleting (clown, circus)
(clown, circus) exists in cur_sentence_pairs
deleting (circus, clown)
(light, fuse) exists in cur_sentence_pairs
deleting (fuse, light)
(fuse, light) exists in cur_sentence_pairs
deleting (light, fuse)
(inside, cannon) exists in cur_sentence_pairs
deleting (cannon, inside)
(cannon, inside) exists in cur

(peddling, salesman) exists in cur_sentence_pairs
deleting (salesman, peddling)
(peddling, used) exists in cur_sentence_pairs
deleting (used, peddling)
(peddling, bikes) exists in cur_sentence_pairs
deleting (bikes, peddling)
(used, peddling) exists in cur_sentence_pairs
deleting (peddling, used)
(used, bikes) exists in cur_sentence_pairs
deleting (bikes, used)
(bikes, peddling) exists in cur_sentence_pairs
deleting (peddling, bikes)
(bikes, used) exists in cur_sentence_pairs
deleting (used, bikes)
(tried, record) exists in cur_sentence_pairs
deleting (record, tried)
(record, tried) exists in cur_sentence_pairs
deleting (tried, record)
(record, album) exists in cur_sentence_pairs
deleting (album, record)
(album, record) exists in cur_sentence_pairs
deleting (record, album)
(reptile, shop) exists in cur_sentence_pairs
deleting (shop, reptile)
(shop, reptile) exists in cur_sentence_pairs
deleting (reptile, shop)
(terrible, gecko) exists in cur_sentence_pairs
deleting (gecko, terrible)
(g

deleting (rule, proves)
(rule, proves) exists in cur_sentence_pairs
deleting (proves, rule)
(first, climb) exists in cur_sentence_pairs
deleting (climb, first)
(climb, first) exists in cur_sentence_pairs
deleting (first, climb)
(climb, mount) exists in cur_sentence_pairs
deleting (mount, climb)
(climb, everest) exists in cur_sentence_pairs
deleting (everest, climb)
(mount, climb) exists in cur_sentence_pairs
deleting (climb, mount)
(mount, everest) exists in cur_sentence_pairs
deleting (everest, mount)
(everest, climb) exists in cur_sentence_pairs
deleting (climb, everest)
(everest, mount) exists in cur_sentence_pairs
deleting (mount, everest)
(said, tom) exists in cur_sentence_pairs
deleting (tom, said)
(said, hilariously) exists in cur_sentence_pairs
deleting (hilariously, said)
(tom, said) exists in cur_sentence_pairs
deleting (said, tom)
(tom, hilariously) exists in cur_sentence_pairs
deleting (hilariously, tom)
(hilariously, said) exists in cur_sentence_pairs
deleting (said, hilar

deleting (churn, dairy)
(churn, dairy) exists in cur_sentence_pairs
deleting (dairy, churn)
(less, said) exists in cur_sentence_pairs
deleting (said, less)
(said, less) exists in cur_sentence_pairs
deleting (less, said)
(fighting, dragon) exists in cur_sentence_pairs
deleting (dragon, fighting)
(dragon, fighting) exists in cur_sentence_pairs
deleting (fighting, dragon)
(dragon, easier) exists in cur_sentence_pairs
deleting (easier, dragon)
(easier, dragon) exists in cur_sentence_pairs
deleting (dragon, easier)
(easier, slayed) exists in cur_sentence_pairs
deleting (slayed, easier)
(slayed, easier) exists in cur_sentence_pairs
deleting (easier, slayed)
(slayed, done) exists in cur_sentence_pairs
deleting (done, slayed)
(done, slayed) exists in cur_sentence_pairs
deleting (slayed, done)
(rabbits, generally) exists in cur_sentence_pairs
deleting (generally, rabbits)
(rabbits, lead) exists in cur_sentence_pairs
deleting (lead, rabbits)
(generally, rabbits) exists in cur_sentence_pairs
dele

(wife, always) exists in cur_sentence_pairs
deleting (always, wife)
(always, wife) exists in cur_sentence_pairs
deleting (wife, always)
(children, allowed) exists in cur_sentence_pairs
deleting (allowed, children)
(allowed, children) exists in cur_sentence_pairs
deleting (children, allowed)
(allowed, dig) exists in cur_sentence_pairs
deleting (dig, allowed)
(dig, allowed) exists in cur_sentence_pairs
deleting (allowed, dig)
(dig, coal) exists in cur_sentence_pairs
deleting (coal, dig)
(coal, dig) exists in cur_sentence_pairs
deleting (dig, coal)
(coal, would) exists in cur_sentence_pairs
deleting (would, coal)
(would, coal) exists in cur_sentence_pairs
deleting (coal, would)
(would, still) exists in cur_sentence_pairs
deleting (still, would)
(still, would) exists in cur_sentence_pairs
deleting (would, still)
(still, miners) exists in cur_sentence_pairs
deleting (miners, still)
(miners, still) exists in cur_sentence_pairs
deleting (still, miners)
(went, saw) exists in cur_sentence_pairs

(inn, crowd) exists in cur_sentence_pairs
deleting (crowd, inn)
(crowd, inn) exists in cur_sentence_pairs
deleting (inn, crowd)
(old, ranchers) exists in cur_sentence_pairs
deleting (ranchers, old)
(old, never) exists in cur_sentence_pairs
deleting (never, old)
(ranchers, old) exists in cur_sentence_pairs
deleting (old, ranchers)
(ranchers, never) exists in cur_sentence_pairs
deleting (never, ranchers)
(ranchers, die) exists in cur_sentence_pairs
deleting (die, ranchers)
(never, old) exists in cur_sentence_pairs
deleting (old, never)
(never, ranchers) exists in cur_sentence_pairs
deleting (ranchers, never)
(never, die) exists in cur_sentence_pairs
deleting (die, never)
(die, ranchers) exists in cur_sentence_pairs
deleting (ranchers, die)
(die, never) exists in cur_sentence_pairs
deleting (never, die)
(breed, last) exists in cur_sentence_pairs
deleting (last, breed)
(last, breed) exists in cur_sentence_pairs
deleting (breed, last)
(king, never) exists in cur_sentence_pairs
deleting (nev

deleting (day, one)
(one, whispered) exists in cur_sentence_pairs
deleting (whispered, one)
(whispered, one) exists in cur_sentence_pairs
deleting (one, whispered)
(bad, skin) exists in cur_sentence_pairs
deleting (skin, bad)
(skin, bad) exists in cur_sentence_pairs
deleting (bad, skin)
(pore, excuse) exists in cur_sentence_pairs
deleting (excuse, pore)
(excuse, pore) exists in cur_sentence_pairs
deleting (pore, excuse)
(taste, like) exists in cur_sentence_pairs
deleting (like, taste)
(taste, girl) exists in cur_sentence_pairs
deleting (girl, taste)
(like, taste) exists in cur_sentence_pairs
deleting (taste, like)
(like, girl) exists in cur_sentence_pairs
deleting (girl, like)
(like, scouts) exists in cur_sentence_pairs
deleting (scouts, like)
(girl, taste) exists in cur_sentence_pairs
deleting (taste, girl)
(girl, like) exists in cur_sentence_pairs
deleting (like, girl)
(girl, scouts) exists in cur_sentence_pairs
deleting (scouts, girl)
(scouts, like) exists in cur_sentence_pairs
dele

deleting (sent, message)
(message, friends) exists in cur_sentence_pairs
deleting (friends, message)
(friends, message) exists in cur_sentence_pairs
deleting (message, friends)
(unable, attend) exists in cur_sentence_pairs
deleting (attend, unable)
(attend, unable) exists in cur_sentence_pairs
deleting (unable, attend)
(report, concluded) exists in cur_sentence_pairs
deleting (concluded, report)
(concluded, report) exists in cur_sentence_pairs
deleting (report, concluded)
(good, time) exists in cur_sentence_pairs
deleting (time, good)
(time, good) exists in cur_sentence_pairs
deleting (good, time)
(time, heard) exists in cur_sentence_pairs
deleting (heard, time)
(heard, time) exists in cur_sentence_pairs
deleting (time, heard)
(music, stores) exists in cur_sentence_pairs
deleting (stores, music)
(stores, music) exists in cur_sentence_pairs
deleting (music, stores)
(cd, part) exists in cur_sentence_pairs
deleting (part, cd)
(part, cd) exists in cur_sentence_pairs
deleting (cd, part)
(pa

(todd, sweeney) exists in cur_sentence_pairs
deleting (sweeney, todd)
(good, source) exists in cur_sentence_pairs
deleting (source, good)
(source, good) exists in cur_sentence_pairs
deleting (good, source)
(source, sheer) exists in cur_sentence_pairs
deleting (sheer, source)
(sheer, source) exists in cur_sentence_pairs
deleting (source, sheer)
(sheer, terror) exists in cur_sentence_pairs
deleting (terror, sheer)
(terror, sheer) exists in cur_sentence_pairs
deleting (sheer, terror)
(need, silence) exists in cur_sentence_pairs
deleting (silence, need)
(silence, need) exists in cur_sentence_pairs
deleting (need, silence)
(tom, allowed) exists in cur_sentence_pairs
deleting (allowed, tom)
(allowed, tom) exists in cur_sentence_pairs
deleting (tom, allowed)
(tom, guessed) exists in cur_sentence_pairs
deleting (guessed, tom)
(guessed, tom) exists in cur_sentence_pairs
deleting (tom, guessed)
(big, piece) exists in cur_sentence_pairs
deleting (piece, big)
(piece, big) exists in cur_sentence_pa

(sand, got) exists in cur_sentence_pairs
deleting (got, sand)
(said, tom) exists in cur_sentence_pairs
deleting (tom, said)
(said, grittily) exists in cur_sentence_pairs
deleting (grittily, said)
(tom, said) exists in cur_sentence_pairs
deleting (said, tom)
(tom, grittily) exists in cur_sentence_pairs
deleting (grittily, tom)
(grittily, said) exists in cur_sentence_pairs
deleting (said, grittily)
(grittily, tom) exists in cur_sentence_pairs
deleting (tom, grittily)
(westinghouse, stopped) exists in cur_sentence_pairs
deleting (stopped, westinghouse)
(westinghouse, everything) exists in cur_sentence_pairs
deleting (everything, westinghouse)
(stopped, westinghouse) exists in cur_sentence_pairs
deleting (westinghouse, stopped)
(stopped, everything) exists in cur_sentence_pairs
deleting (everything, stopped)
(everything, westinghouse) exists in cur_sentence_pairs
deleting (westinghouse, everything)
(everything, stopped) exists in cur_sentence_pairs
deleting (stopped, everything)
(needed, b

deleting (great, came)
(came, wail) exists in cur_sentence_pairs
deleting (wail, came)
(carry, many) exists in cur_sentence_pairs
deleting (many, carry)
(many, carry) exists in cur_sentence_pairs
deleting (carry, many)
(many, buckets) exists in cur_sentence_pairs
deleting (buckets, many)
(many, start) exists in cur_sentence_pairs
deleting (start, many)
(buckets, many) exists in cur_sentence_pairs
deleting (many, buckets)
(buckets, start) exists in cur_sentence_pairs
deleting (start, buckets)
(buckets, feeling) exists in cur_sentence_pairs
deleting (feeling, buckets)
(start, many) exists in cur_sentence_pairs
deleting (many, start)
(start, buckets) exists in cur_sentence_pairs
deleting (buckets, start)
(start, feeling) exists in cur_sentence_pairs
deleting (feeling, start)
(start, pail) exists in cur_sentence_pairs
deleting (pail, start)
(feeling, buckets) exists in cur_sentence_pairs
deleting (buckets, feeling)
(feeling, start) exists in cur_sentence_pairs
deleting (start, feeling)
(fe

deleting (educational, created)
(educational, created) exists in cur_sentence_pairs
deleting (created, educational)
(educational, porpoises) exists in cur_sentence_pairs
deleting (porpoises, educational)
(porpoises, educational) exists in cur_sentence_pairs
deleting (educational, porpoises)
(ford, without) exists in cur_sentence_pairs
deleting (without, ford)
(without, ford) exists in cur_sentence_pairs
deleting (ford, without)
(without, battery) exists in cur_sentence_pairs
deleting (battery, without)
(battery, without) exists in cur_sentence_pairs
deleting (without, battery)
(asked, tom) exists in cur_sentence_pairs
deleting (tom, asked)
(asked, crankily) exists in cur_sentence_pairs
deleting (crankily, asked)
(tom, asked) exists in cur_sentence_pairs
deleting (asked, tom)
(tom, crankily) exists in cur_sentence_pairs
deleting (crankily, tom)
(crankily, asked) exists in cur_sentence_pairs
deleting (asked, crankily)
(crankily, tom) exists in cur_sentence_pairs
deleting (tom, crankily)


deleting (herb, today)
(today, gone) exists in cur_sentence_pairs
deleting (gone, today)
(gone, today) exists in cur_sentence_pairs
deleting (today, gone)
(gone, tomorrow) exists in cur_sentence_pairs
deleting (tomorrow, gone)
(tomorrow, gone) exists in cur_sentence_pairs
deleting (gone, tomorrow)
(old, hairdressers) exists in cur_sentence_pairs
deleting (hairdressers, old)
(old, never) exists in cur_sentence_pairs
deleting (never, old)
(hairdressers, old) exists in cur_sentence_pairs
deleting (old, hairdressers)
(hairdressers, never) exists in cur_sentence_pairs
deleting (never, hairdressers)
(hairdressers, die) exists in cur_sentence_pairs
deleting (die, hairdressers)
(never, old) exists in cur_sentence_pairs
deleting (old, never)
(never, hairdressers) exists in cur_sentence_pairs
deleting (hairdressers, never)
(never, die) exists in cur_sentence_pairs
deleting (die, never)
(die, hairdressers) exists in cur_sentence_pairs
deleting (hairdressers, die)
(die, never) exists in cur_senten

(received, award) exists in cur_sentence_pairs
deleting (award, received)
(award, received) exists in cur_sentence_pairs
deleting (received, award)
(award, sheer) exists in cur_sentence_pairs
deleting (sheer, award)
(sheer, award) exists in cur_sentence_pairs
deleting (award, sheer)
(sheer, delete) exists in cur_sentence_pairs
deleting (delete, sheer)
(delete, sheer) exists in cur_sentence_pairs
deleting (sheer, delete)
(name, abbie) exists in cur_sentence_pairs
deleting (abbie, name)
(abbie, name) exists in cur_sentence_pairs
deleting (name, abbie)
(mother, superior) exists in cur_sentence_pairs
deleting (superior, mother)
(superior, mother) exists in cur_sentence_pairs
deleting (mother, superior)
(gruel, unusual) exists in cur_sentence_pairs
deleting (unusual, gruel)
(unusual, gruel) exists in cur_sentence_pairs
deleting (gruel, unusual)
(unusual, punishment) exists in cur_sentence_pairs
deleting (punishment, unusual)
(punishment, unusual) exists in cur_sentence_pairs
deleting (unusu

deleting (bird, fouls)
(ballet, shoes) exists in cur_sentence_pairs
deleting (shoes, ballet)
(ballet, made) exists in cur_sentence_pairs
deleting (made, ballet)
(shoes, ballet) exists in cur_sentence_pairs
deleting (ballet, shoes)
(shoes, made) exists in cur_sentence_pairs
deleting (made, shoes)
(made, ballet) exists in cur_sentence_pairs
deleting (ballet, made)
(made, shoes) exists in cur_sentence_pairs
deleting (shoes, made)
(made, pointe) exists in cur_sentence_pairs
deleting (pointe, made)
(pointe, made) exists in cur_sentence_pairs
deleting (made, pointe)
(believe, restaurants) exists in cur_sentence_pairs
deleting (restaurants, believe)
(believe, always) exists in cur_sentence_pairs
deleting (always, believe)
(restaurants, believe) exists in cur_sentence_pairs
deleting (believe, restaurants)
(restaurants, always) exists in cur_sentence_pairs
deleting (always, restaurants)
(restaurants, overcook) exists in cur_sentence_pairs
deleting (overcook, restaurants)
(always, believe) exist

(upon, discovering) exists in cur_sentence_pairs
deleting (discovering, upon)
(discovering, upon) exists in cur_sentence_pairs
deleting (upon, discovering)
(discovering, deadly) exists in cur_sentence_pairs
deleting (deadly, discovering)
(deadly, discovering) exists in cur_sentence_pairs
deleting (discovering, deadly)
(deadly, virus) exists in cur_sentence_pairs
deleting (virus, deadly)
(deadly, carried) exists in cur_sentence_pairs
deleting (carried, deadly)
(virus, deadly) exists in cur_sentence_pairs
deleting (deadly, virus)
(virus, carried) exists in cur_sentence_pairs
deleting (carried, virus)
(carried, deadly) exists in cur_sentence_pairs
deleting (deadly, carried)
(carried, virus) exists in cur_sentence_pairs
deleting (virus, carried)
(carried, surinamese) exists in cur_sentence_pairs
deleting (surinamese, carried)
(surinamese, carried) exists in cur_sentence_pairs
deleting (carried, surinamese)
(surinamese, toads) exists in cur_sentence_pairs
deleting (toads, surinamese)
(toads

deleting (language, knew)
(sign, knew) exists in cur_sentence_pairs
deleting (knew, sign)
(sign, language) exists in cur_sentence_pairs
deleting (language, sign)
(language, knew) exists in cur_sentence_pairs
deleting (knew, language)
(language, sign) exists in cur_sentence_pairs
deleting (sign, language)
(mute, point) exists in cur_sentence_pairs
deleting (point, mute)
(point, mute) exists in cur_sentence_pairs
deleting (mute, point)
(saw, something) exists in cur_sentence_pairs
deleting (something, saw)
(saw, similar) exists in cur_sentence_pairs
deleting (similar, saw)
(something, saw) exists in cur_sentence_pairs
deleting (saw, something)
(something, similar) exists in cur_sentence_pairs
deleting (similar, something)
(similar, saw) exists in cur_sentence_pairs
deleting (saw, similar)
(similar, something) exists in cur_sentence_pairs
deleting (something, similar)
(similar, moss) exists in cur_sentence_pairs
deleting (moss, similar)
(moss, similar) exists in cur_sentence_pairs
deletin

(bluntly, tom) exists in cur_sentence_pairs
deleting (tom, bluntly)
(crack, open) exists in cur_sentence_pairs
deleting (open, crack)
(open, crack) exists in cur_sentence_pairs
deleting (crack, open)
(open, walnuts) exists in cur_sentence_pairs
deleting (walnuts, open)
(walnuts, open) exists in cur_sentence_pairs
deleting (open, walnuts)
(walnuts, several) exists in cur_sentence_pairs
deleting (several, walnuts)
(several, walnuts) exists in cur_sentence_pairs
deleting (walnuts, several)
(several, officers) exists in cur_sentence_pairs
deleting (officers, several)
(several, foolishly) exists in cur_sentence_pairs
deleting (foolishly, several)
(officers, several) exists in cur_sentence_pairs
deleting (several, officers)
(officers, foolishly) exists in cur_sentence_pairs
deleting (foolishly, officers)
(officers, pounded) exists in cur_sentence_pairs
deleting (pounded, officers)
(foolishly, several) exists in cur_sentence_pairs
deleting (several, foolishly)
(foolishly, officers) exists in 

In [128]:
def detectPuns(skipgram):
    """
    input:
        list of lists of list of words representing the input sentences
    output:
        list of boolean values, one boolean value for each input sentence.
        True = yes, the sentence did contain a pun
        False = no, the sentence did not contain a pun
    """
    # calculate the pmi scores between the two words in each pair & store this in a dictionary with the index...
    # ... being the word pair and the value being the pmi score
    # generate threshold value for each sentence of pairs
    # if any pmi score is above this threshold value for a given sentence, then that sentence contains a pun
    
    # generate bigrams for each sentence in order to determine PMI scores (since we need to know the probability of a word)
    bgrams = [nltk.bigrams(sent) for sent in skipgram.data]
   
    
    for i in range(skipgram.__len__()):
        for pair in skipgram.__getitem__(i):
            pass
            
            
    
    

SyntaxError: invalid syntax (<ipython-input-128-17a018ea203a>, line 17)

In [None]:
import torch.nn.functional as F

class SkipGramModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        """
        Args:
            vocab_size (int): vocabulary size
            embedding_dim (int): the dimension of word embeddings
        """
        ### INSERT YOUR CODE BELOW ###
        #self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        #self.linear = torch.nn.Linear(1, vocab_size)
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)
        
        ### INSERT YOUR CODE ABOVE ###

    def forward(self, inputs):
        """
        Perform the forward pass of the skip-gram model.
        
        Args:
            inputs (torch.LongTensor): input tensor containing batches of word ids [Bx1]
        Returns:
            outputs (torch.FloatTensor): output tensor with unnormalized probabilities over the vocabulary [BxV]
        """
        ### INSERT YOUR CODE BELOW ###
        embeds = self.embedding(inputs)
        #embeds = self.embedding(inputs)
        outputs = self.linear(embeds)
        outputs=outputs
        #output = F.log_softmax(self.linear(embeds), dim=1)
        ### INSERT YOUR CODE ABOVE ###
        return outputs
    
    def save_embeddings(self, voc, path):
        """
        Save the embedding matrix to a specified path.
        
        Args:
            voc (Vocabulary): the Vocabulary object for id-to-token mapping
            path (str): the location of the target file
        """
        ### INSERT YOUR CODE BELOW ###
        embeds = self.embedding.weight.data.cpu().numpy()
        f = open(path, 'w')
        f.write(str(vocab_size) + ' ' + str(embedding_dim) + '\n')
        
        for idx in range(len(embeds)):
            word = voc.idx2w[idx]
            embedding = ' '.join(map(str,embeds[idx]))
            f.write(word + ' '+ embedding + '\n')
        ### INSERT YOUR CODE ABOVE ###
        print("Successfuly saved to {}".format(path))