# N-Hance system recreation for SemEval2017 Task 7

Names: Jorge Mazariegos & Cameron Knopp

In [1]:
# imports statements
import time
import string
import itertools
import operator
import math
import matplotlib.pyplot as plt
import numpy as np
import torch
import nltk
import re
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from scipy.stats import iqr
from statistics import median
from collections import defaultdict, OrderedDict, Counter
from bs4 import BeautifulSoup
#from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
set(stopwords.words('english'))
  

%matplotlib inline
plt.style.use('seaborn-paper')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/camknopp/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camknopp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/camknopp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import pywsd

Warming up PyWSD (takes ~10 secs)... took 6.07002592086792 secs.


In [3]:
# preprocess should take in the datasets (.xml) and prepare them to be used
def preprocess(data):
    """
    Args:
        sentences (list(str)):
    Returns: a list of tokens and a list of tokenized sentences

    """
    #######################################################
    # initialize variables to use in preprocess
    #######################################################
    puns = []
    tokens = []
    stop = stopwords.words('english') + list(string.punctuation)
        
    #######################################################
    # Given a sentence, tokenize it and append it to a list
    #######################################################
    for sentence in data:
        puns.append(word_tokenize(sentence.lower())) # creates the list of all sentences
        
    #######################################################
    # Every sentence is tokenized, but let's grab each
    # individual word to make a vocab out of.
    #######################################################
    for sentence in puns:
        for word in sentence:
            if(word.isalpha()): # filter out punctuation
                tokens.append(word)
    #######################################################
    # Remove stop words from tokens
    #######################################################
    tokens_with_stop_words = tokens
    tokens = [token for token in tokens_with_stop_words if token not in stop]

    return tokens, puns

In [26]:
def data_process(file):

    # DATA PROCESSING #
    #######################################################
    # Open the dataset/'s we will be using and process the
    # text within to be used by our code.
    #######################################################
    #f = open('datasets/data/test/subtask1-heterographic-test.xml', 'r', encoding = 'utf8')
    
    f = open(file, 'r', encoding = 'utf8')
    data = f.read()

    #######################################################
    # Using Beautiful Soup we can easily extract the puns
    # from the given datasets.
    #######################################################
    soup = BeautifulSoup(data, 'xml')
    ids = soup.find_all('text')
    words = soup.find_all('word')
    
    """
    add each word to the dictionary pa
    """
    #######################################################
    # Create a list of all sentences within the dataset to hand
    # over to our preprocess function
    #######################################################
    wurd = ""
    sentence = ""
    sentences = []
    pun_list = []
    
    #this will be a dictionary of {sentence : sentence_id}
    sentences_dict = dict()
    
    # this will contain the mapping of sentence id -> list(tuple(word, word_id))
    word_dict = dict(list())
    
    for i in range(len(words)):
        if words[i].parent['id'] in word_dict:
            word_dict[words[i].parent['id']].append((words[i], words[i]['id']))
        else:
            word_dict[words[i].parent['id']] = [(words[i], words[i]['id'])]
        
    #print(word_dict)
    
    
    for i in range(len(ids)):
        for line in ids[i]:
            for word in line:
                if(word != '\n' or word == '\''):
                    if(word.isalpha()): # If not punctuation
                        wurd = word
                        if(sentence == ""): # If the start of the sentence
                            sentence = sentence + wurd
                        else: # If not the start of the sentence
                            sentence = sentence + " " + wurd
                    else: # If punctuation we don't want to put a space between the character and it.
                        wurd = word
                        sentence = sentence + wurd
                    wurd = "" # clear the current word
        sentences.append(sentence) # append the created string sentence to our list.
        
        sentences_dict.update({sentence : ids[i]['id']}) # map the sentence to it's sentence id in the .xml file
        sentence = ""
        
    #######################################################
    # Create a list of tokens to make a vocabulary of and
    # create a list of sentences to create make word pairs
    # from.
    #######################################################
    
    
    tokens, pun_list = preprocess(sentences)
    return tokens, pun_list, sentences_dict, word_dict
    

In [27]:
class Vocabulary:
    def __init__(self, special_tokens=None):
        self.w2idx = {}
        self.idx2w = {}
        self.w2cnt = defaultdict(int)
        self.special_tokens = special_tokens
        if self.special_tokens is not None:
            self.add_tokens(special_tokens)

    def add_tokens(self, tokens):
        for token in tokens:
            self.add_token(token)
            self.w2cnt[token] += 1

    def add_token(self, token):
        if token not in self.w2idx:
            cur_len = len(self)
            self.w2idx[token] = cur_len
            self.idx2w[cur_len] = token

    def prune(self, min_cnt=2):
        to_remove = set([token for token in self.w2idx if self.w2cnt[token] < min_cnt])
        if self.special_tokens is not None:
            to_remove = to_remove.difference(set(self.special_tokens))
        
        for token in to_remove:
            self.w2cnt.pop(token)
            
        self.w2idx = {token: idx for idx, token in enumerate(self.w2cnt.keys())}
        self.idx2w = {idx: token for token, idx in self.w2idx.items()}
    
    def __contains__(self, item):
        return item in self.w2idx
    
    def __getitem__(self, item):
        if isinstance(item, str):
            return self.w2idx[item]
        elif isinstance(item , int):
            return self.idx2w[item]
        else:
            raise TypeError("Supported indices are int and str")
    
    def __len__(self):
        return(len(self.w2idx))

In [28]:
#######################################################
# Using skipgrams we can create the wordpairs described
# in the N-Hance research paper.
#######################################################

class SkipGramDataset(Dataset):
    def __init__(self, data, vocab, skip_window=3):
        super().__init__()

        #######################################################
        # Unlike before, data will be a list of strings handed
        # all at once.
        #######################################################
        self.vocab = vocab
        self.data = data
        # set skip_window to the length of the longest sentence in the data set
        self.skip_window =  max(data, key=len)
        self.pairs = self._generate_pairs(data, skip_window)
        
        
    #######################################################
    # generate word pairs given list of lists of words representing each sentence
    #######################################################
    def _generate_pairs(self, data, skip_window):

        pairs = [[]]  # list of word pairs for each sentence
        curr_sentence_pairs = [] # list of word pairs for current sentence
        pruned_pairs = []
        

        for sent in data: 
            for i in range(len(sent)):
                for j in range(-skip_window, skip_window + 1):
                    context_idx = i + j
                    if j == 0 or context_idx < 0 or context_idx >= len(sent):
                        continue
                    if sent[i] not in self.vocab or sent[context_idx] not in self.vocab:
                        continue
                        
                    # only add in this sentence if the reverse does not already exist in the list
                    if (sent[context_idx], sent[i]) not in curr_sentence_pairs:
                        curr_sentence_pairs.append((sent[i], sent[context_idx]))
                    
            pairs.append(curr_sentence_pairs.copy()) # need to append a copy so that it is not cleared with we call clear() in the next line
            curr_sentence_pairs.clear()
                    
        return pairs
    
    #######################################################
    # returns the list of word_pairs for the sentence at the given index
    #######################################################
    def __getitem__(self, idx):

        pair = self.pairs[idx]

        #pair = [self.vocab[t] for t in pair]
        #pair = [self.vocab.__getitem__(t) for t in pair]
        return pair
    
    #######################################################
    # returns the number of sentences
    #######################################################
    def __len__(self):

        return len(self.pairs)

In [29]:
def generate_pmi_scores(file):
    """
    returns a list of dictionaries (one for each sentence) of {word_pair : pmi_score}
    each dictionary is ordered from highest to lowest pmi score
    """
    
    """
        Because pun words seldom appear in Wikipedia, we added test datasets to guarantee words co-occur at least once and thus the
        system is able to compute PMI scores for each
        word pair. 
    
    """
    
#     corpus = open("corpus.txt").read().replace('\n','')
#     corpus = word_tokenize(corpus)

#     # Call the bigram method within NLTK
#     bigram_measures = nltk.collocations.BigramAssocMeasures()

#     # Apply the Moby Dick text to the above bigram method
#     bgram_finder = BigramCollocationFinder.from_words(corpus, 20)

#     # Score the bigrams
#     bgrams_scored = bgram_finder.score_ngrams(bigram_measures.raw_freq)
    
#     print(bgrams_scored[:5])

    # Tokenize dataset and Create a Vocabulary using the tokens
    tokens, pun_list, sentences_dict, word_dict = data_process(file)
    voc = Vocabulary()
    voc.add_tokens(tokens)
    
    # create skipgram dataset using vocab and puns
    skipgram = SkipGramDataset(pun_list, voc, skip_window=2)
    
    # create a Counter object to get counts of individual words    
    stop = stopwords.words('english') + list(string.punctuation) + ["''", '""', "..."]
    all_sentences = list(itertools.chain.from_iterable(pun_list.copy()))
    all_sentences = [word for word in all_sentences if word not in stop]
    
    word_counts = Counter(all_sentences)
    total_words = len(all_sentences)
    
    # get list of lists of word_pairs for each sentence
    word_pairs = skipgram.pairs.copy()
    word_pairs = [[(a,b) for (a,b) in sent] for sent in word_pairs] 
        
    # create Counter object to get counts for each word_pair
    all_word_pairs= list(itertools.chain.from_iterable(word_pairs.copy())) # join all sentences together
    all_word_pairs = [(a,b) for (a,b) in all_word_pairs] 
    total_word_pairs = len(all_word_pairs)
    word_pair_count = Counter(all_word_pairs)
    
   # print(all_word_pairs)
    # create a list of dictionaries for each sentence { word_pair : pmi_score }
    pmi_scores = list(dict())
    
    # now we will calculate the PMI score for each word_pair
    # the formula for PMI score is: log[p(x,y) / (p(x)*p(y))]
    for i in range(skipgram.__len__()):
        current_dict = {}
        # for each sentence, find pmi score for each individual word_pair
        for w_pair in word_pairs[i]:
    
            numerator = word_pair_count[w_pair] / total_word_pairs
            denominator = (word_counts[w_pair[0]] / total_words) * (word_counts[w_pair[1]] / total_words)
            current_pmi =  numerator / denominator
            current_pmi = math.log2(current_pmi)
            current_dict.update({w_pair : current_pmi}) # add bigram's pmi score to dictionary at index i (the current sentence)
        
        pmi_scores.append(current_dict.copy())
        current_dict.clear()
        
    
    # now we sort the dictionary entries from highest->lowest based on value (PMI score)
    ordered_pmi_scores = list(OrderedDict())
    
    for i in range(len(pmi_scores)):
        current_dict = pmi_scores[i]
        # convert to dictionary ordered by value (which is the pmi score in this case)
        current_ordered_dict = OrderedDict(sorted(current_dict.items(), key=lambda x: x[1], reverse=True))
        ordered_pmi_scores.append(current_ordered_dict.copy())
        current_ordered_dict.clear()
   
    print(ordered_pmi_scores)
    return ordered_pmi_scores, skipgram, word_dict, sentences_dict
    

In [30]:
def detect_puns(file, heterographic):
    """
    create word_pairs for sentences in given file
    calculate pmi scores for all given word_pairs
    calculate the interquartile range for the pmi scores of word_pairs in each sentence
    find the median value of the interquartile ranges across all sentences in the given dataset
    for each sentence, if the highest pmi score - second highest pmi score > median interquartile range ...
    (cont.) then that means that that sentence contains a pun
    """
    
    # homographic pun 5 would be referred to as hom5 in the final list (this is based on the N-Hance system's guidelines)
    if heterographic:
        prefix = "het"
    else:
        prefix = "hom"
    
    # get pmi scores for all word_pairs in the file
    ordered_pmi_scores, skipgram = generate_pmi_scores(file)[:2]
    
    # now we need to find the interquartile range for each dictionary in the list using iqr from scipy.stats
    iqr_values = []
    
    for dictionary in ordered_pmi_scores:
        iqr_values.append(iqr(list(dictionary.values())))
    
    
    # now we take the median of these iqr values and take that as our iqr value of the current dataset
    median_iqr = median(iqr_values)
    
    # this will contain a 0 or 1 at each sentence id, 
    # 1 = contains pun; 0 = does not contain pun
    contains_pun = []
     
    for i in range(1, len(ordered_pmi_scores)):
        curr_dict = list(ordered_pmi_scores[i].items())
        
        if len(curr_dict) > 1 :
            # if the difference between the highest pmi score and second highest pmi score (cont.)
            #... is greater than the median iqr, then the sentence contains a pun
            if float(curr_dict[0][1] - curr_dict[1][1]) > median_iqr:
                contains_pun.append(prefix + "_" + str(i) + " 1" )
            else:
                contains_pun.append(prefix + "_" + str(i) + " 0" )
        else:
            contains_pun.append(prefix + "_" + str(i) + " 0" )        

    # returning more than just one value because these other values are needed in subtask 2 later on
    return contains_pun, ordered_pmi_scores, skipgram

In [31]:
"""
Completes subtask 1 (pun detection)
Outputs the results for heterographic and homographic puns to two seperate files

In order to run the scoring system for the heterographic puns:
open terminal and change directory to ~/datasets/scoring/bin
then, type (your file paths will vary):
java de.tudarmstadt.ukp.semeval2017.task7.scorer.PunScorer -d ~/Desktop/GitHub/NLP-Final-Project/datasets/data/test/subtask1-heterographic-test.gold ~/Desktop/GitHub/NLP-Final-Project/system_output/subtask1_heterographic.txt ~/Desktop/GitHub/NLP-Final-Project/scorer_results/subtask1_heterographic_results.txt

for homographic puns:
open terminal and change directory to ~/datasets/scoring/bin
then, type (your file paths will vary): 
java de.tudarmstadt.ukp.semeval2017.task7.scorer.PunScorer -d ~/Desktop/GitHub/NLP-Final-Project/datasets/data/test/subtask1-homographic-test.gold ~/Desktop/GitHub/NLP-Final-Project/system_output/subtask1_homographic.txt ~/Desktop/GitHub/NLP-Final-Project/scorer_results/subtask1_homographic_results.txt

"""
def subtask1():
    contains_pun_heterographic = detect_puns('datasets/data/test/subtask1-heterographic-test.xml', True)[0]
    with open('system_output/subtask1_heterographic.txt', 'w') as filehandle:
        for pun_result in contains_pun_heterographic:
            filehandle.write('{}\n'.format(pun_result))

    contains_pun_homographic = detect_puns('datasets/data/test/subtask1-homographic-test.xml', False)[0]
    with open('system_output/subtask1_homographic.txt', 'w') as filehandle:
        for pun_result in contains_pun_homographic:
            filehandle.write('{}\n'.format(pun_result))
            
subtask1()


[OrderedDict(), OrderedDict([(('tom', 'alleged'), 6.056495362270003)]), OrderedDict([(('chinese', 'laborer'), 10.87027655348704), (('laborer', 'said'), 6.300420945156093), (('said', 'coolly'), 6.300420945156093), (('tom', 'coolly'), 6.056495362270003), (('said', 'tom'), 5.531033873297509)]), OrderedDict([(('squeezing', 'babies'), 13.87027655348704), (('squeezing', 'dead'), 11.548348458599678), (('dead', 'babies'), 11.548348458599678), (('come', 'squeezing'), 9.87027655348704), (('baby', 'oil'), 9.548348458599678)]), OrderedDict([(('hard', 'day'), 7.0373865393222985), (('like', 'hard'), 5.641457862991159)]), OrderedDict([(('wildebeests', 'gnus'), 12.87027655348704), (('evil', 'wildebeests'), 11.062921631429436), (('wildebeests', 'bad'), 9.7828137122367), (('bad', 'gnus'), 8.7828137122367), (('evil', 'bad'), 6.975458790179097)]), OrderedDict(), OrderedDict([(('busy', 'barber'), 10.548348458599678), (('quite', 'harried'), 10.285314052765884), (('barber', 'quite'), 9.285314052765884)]), Or



In [32]:
def locate_puns(file, heterographic):
    """
    generate pmi scores
    the second word in the word pair with the highest pmi score in a given sentence is the pun word
    if a given index contains a pun,
    check the highest pmi score in the dictionary at that index & find the second word
    Append this to the results list
    in order to get the correct sentence id for a given pun, I need to find the sentence in the 
    """
    
    formatted_results = []
    stop = stopwords.words('english') + list(string.punctuation)
   
    # get pmi scores in the form of a dictionary for each sentence mapping each word_pair to a pmi score
    ordered_pmi_scores, skipgram, word_dict, sentences_dict = generate_pmi_scores(file)
    
    # this will be a dict mapping sentence_id to the pun_word_index (starting from 1, not 0) in that sentence
    sent2punidx = dict()
    sentences = list(sentences_dict.keys())

    for i in range(1, len(ordered_pmi_scores)):
        if (len(ordered_pmi_scores[i]) == 0):
            continue
            
        curr_dict = ordered_pmi_scores[i]
        highest_pmi = list(curr_dict.items())[0][0] # get word_pair in current sentence with highest pmi
        pun_index = 0
        # now we need to find the sentence_id that this word_pair belongs to
        
        for j in range(len(sentences)):
            found_sentence = False
            
            if highest_pmi[0] and highest_pmi[1] in sentences[j].lower():
                found_sentence = True
                curr_sent = word_tokenize(sentences[j].lower())
                sentence_id = sentences_dict[sentences[j]]
              
                for word in word_dict[sentence_id]:
                    if word[0].string == highest_pmi[1]:
                        pun_index = word[1]
                        sent2punidx[sentence_id] = pun_index
                        break
                    
            # no need to check the rest of the sentences since we already found where the pun is located
            if found_sentence: 
                break 
                        

   # print(list(sent2punidx.items()))
        
    for pun in list(sent2punidx.items()):
        formatted_results.append(str(pun[0]) + " " + str(pun[1]))
#     for pun in list(sentence_to_pun_location.items()):
#         formatted_results.append(str(pun[0]) + " " + str(pun[0]) + "_" + str(pun[1]))
    
    #print(formatted_results)
    
    return formatted_results     

In [33]:
"""
Completes subtask 2 (pun location)
Outputs the results for heterographic and homographic puns to two seperate files

In order to run the scoring system for the heterographic puns:
open terminal and change directory to ~/datasets/scoring/bin
then, enter (your file paths will vary): 
java de.tudarmstadt.ukp.semeval2017.task7.scorer.PunScorer -l ~/Desktop/GitHub/NLP-Final-Project/datasets/data/test/subtask2-heterographic-test.gold ~/Desktop/GitHub/NLP-Final-Project/system_output/subtask2_heterographic.txt ~/Desktop/GitHub/NLP-Final-Project/scorer_results/subtask2_heterographic_results.txt

for homographic puns:
open terminal and change directory to ~/datasets/scoring/bin
then, enter (your file paths will vary): 
java de.tudarmstadt.ukp.semeval2017.task7.scorer.PunScorer -l ~/Desktop/GitHub/NLP-Final-Project/datasets/data/test/subtask2-homographic-test.gold ~/Desktop/GitHub/NLP-Final-Project/system_output/subtask2_homographic.txt ~/Desktop/GitHub/NLP-Final-Project/scorer_results/subtask2_homographic_results.txt
"""
def subtask2():
    locate_pun_heterographic = locate_puns('datasets/data/test/subtask2-heterographic-test.xml', True)
    with open('system_output/subtask2_heterographic.txt', 'w') as filehandle:
        for pun_location in locate_pun_heterographic:
            filehandle.write('{}\n'.format(pun_location))

    locate_pun_homographic = locate_puns('datasets/data/test/subtask2-homographic-test.xml', False)
    with open('system_output/subtask2_homographic.txt', 'w') as filehandle:
        for pun_location in locate_pun_homographic:
            filehandle.write('{}\n'.format(pun_location))
            
subtask2()

[OrderedDict(), OrderedDict([(('tom', 'alleged'), 5.658427108384172)]), OrderedDict([(('chinese', 'laborer'), 10.47220829960121), (('laborer', 'said'), 5.917619447923572), (('said', 'coolly'), 5.917619447923572), (('tom', 'coolly'), 5.658427108384172), (('said', 'tom'), 5.148232376064988)]), OrderedDict([(('hard', 'day'), 7.980355203271535), (('like', 'hard'), 6.395392702550379)]), OrderedDict([(('evil', 'wildebeests'), 12.47220829960121), (('wildebeests', 'gnus'), 12.47220829960121), (('wildebeests', 'bad'), 9.887245798880054), (('evil', 'bad'), 8.887245798880054), (('bad', 'gnus'), 8.887245798880054)]), OrderedDict([(('busy', 'barber'), 11.47220829960121), (('quite', 'harried'), 9.887245798880054), (('barber', 'quite'), 8.887245798880054)]), OrderedDict([(('raise', 'birds'), 12.47220829960121), (('name', 'avery'), 8.26275493397226)]), OrderedDict([(('stairing', 'contest'), 11.887245798880054), (('construction', 'workers'), 10.56531770399269), (('two', 'construction'), 7.7173207974377



In [34]:
# this will install pretrained work vectors on your machine, which we also used in homework 3.

# !wget --no-clobber https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
# !unzip -n wiki-news-300d-1M.vec.zip
    

In [35]:
#!ls -lh wiki-news-300d-1M*


In [36]:
#word_vectors = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec', binary=False)

In [37]:
def interpret_puns(file):
    """ 
    process the .xml file
    Find the pun word for each sentence (the one when word.sense == '2')
    get the first relevant sense of the word by using pywsd simple lesk method
        *** this will the overlap between the pun word sentence and the dictionary entries
    for the second sense of the pun word, extract all senses and synyonms for given pun word
    to measure the similarity of two words, use word2vec and take cosine of vectors of those words
    word2vec model created using gensim library and fed using wikipedia data. Vec size of 128-dimensional and window size of 10

    """
    print("starting")
    ordered_pmi_scores, skipgram, word_dict, sentences_dict = generate_pmi_scores(file)
    
    
    puns = dict()
    print(word_dict)
    
    for sent in word_dict:
        print("iteration")
        for word2id in sent:
            if word2id[0]['senses'] == '2':
                puns[sent] = word2id[0].string
                continue
    print("finished")
    print(puns)
    print("finished")

    
    
    return
    

In [38]:
def subtask3():
    interpret_puns('datasets/data/test/subtask3-heterographic-test.xml')