# Importing the necessary libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import os
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

# NLTK part-of-speech (POS) tag meanings
- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: “there is” … think of it like “there exists”)
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective ‘big’
- JJR adjective, comparative ‘bigger’
- JJS adjective, superlative ‘biggest’
- LS list marker 1)
- MD modal could, will
- NN noun, singular ‘desk’
- NNS noun plural ‘desks’
- NNP proper noun, singular ‘Harrison’
- NNPS proper noun, plural ‘Americans’
- PDT predeterminer ‘all the kids’
- POS possessive ending parent’s
- PRP personal pronoun I, he, she
- PRP$ possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO, to go ‘to’ the store.
- UH interjection, errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP$ possessive wh-pronoun whose
- WRB wh-adverb where, when

## We discard these categories for our task: CC, CD, DT, EX, IN, LS, MD, PDT, POS, PRP, TO, UH, WDT, WP, WP, WRB

# spaCy named-entity label meanings (not used here, just for reference)
- PERSON	People, including fictional.
- NORP	Nationalities or religious or political groups.
- FAC	Buildings, airports, highways, bridges, etc.
- ORG	Companies, agencies, institutions, etc.
- GPE	Countries, cities, states.
- LOC	Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT	Objects, vehicles, foods, etc. (Not services.)
- EVENT	Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART	Titles of books, songs, etc.
- LAW	Named documents made into laws.
- LANGUAGE	Any named language.
- DATE	Absolute or relative dates or periods.
- TIME	Times smaller than a day.
- PERCENT	Percentage, including ”%“.
- MONEY	Monetary values, including unit.
- QUANTITY	Measurements, as of weight or distance.
- ORDINAL	“first”, “second”, etc.
- CARDINAL	Numerals that do not fall under another type.

# GloVe embeddings
- Link: http://nlp.stanford.edu/data/glove.6B.zip
- Download the zip file from the link above and extract it
- The following code loads the 50-dimensional GloVe vectors from the extracted `glove.6B.50d.txt` file

In [4]:
#Loading embeddings from GloVe 
# `embeddings` maps each vocabulary token to its float32 vector.
embeddings = {}

# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configurable constant so the notebook runs on other machines.
# The `with` block guarantees the handle is closed even if a line fails to
# parse part-way through the file.
with open('/home/andrea/Notebooks/Saama AI/glove.6B/glove.6B.50d.txt',encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line carries no token; skip it instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs

print('Loaded %s word vectors.' % len(embeddings))

Loaded 400000 word vectors.


In [5]:
#stopwatch used to report how long a lookup took
class ElapsedTimer(object):
    """Wall-clock stopwatch that starts counting as soon as it is constructed."""

    def __init__(self):
        self.start_time = time.time()

    def elapsed(self,sec):
        """Format a duration in seconds as "<n> sec", "<n> min" or "<n> hr"."""
        one_minute = 60
        one_hour = 60 * 60
        if sec < one_minute:
            return str(sec) + " sec"
        if sec < one_hour:
            return str(sec / one_minute) + " min"
        return str(sec / one_hour) + " hr"

    def elapsed_time(self):
        """Print the time that has passed since the timer was created."""
        duration = time.time() - self.start_time
        print("Elapsed: %s " % self.elapsed(duration) )

In [6]:
#to tokenise a sentence and POS-tag its tokens using nltk
def entities_with_nltk(sent):
    """Tokenise `sent` with `nltk.word_tokenize` and return `nltk.pos_tag` of the tokens."""
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [7]:
#to get the nearest neighbours using cosine similarity
def get_neighbours(s):
    """Rank every vocabulary word by cosine distance to the word `s`.

    Parameters
    ----------
    s : str
        Query word, looked up in the module-level `embeddings` dict.

    Returns
    -------
    list of str
        Every key of `embeddings`, closest first (the query word itself is
        included). An empty list when `s` has no embedding.
    """
    # Imported here so the function does not depend on `import scipy` having
    # loaded the `scipy.spatial` submodule as a side effect.
    from scipy.spatial.distance import cosine

    # The vocabulary check and the query vector do not change per candidate,
    # so resolve them once instead of on every loop iteration.
    if s not in embeddings:
        return []
    target = embeddings[s]

    distances = {candidate: cosine(vector, target) for candidate, vector in embeddings.items()}
    return sorted(distances, key=distances.get)

In [8]:
#to get the words that can be used for prediction
def get_possible_words(s):
    """Select the tokens of a POS-tagged sentence that are useful as search seeds.

    Parameters
    ----------
    s : iterable
        Items indexable as (token, tag), e.g. the output of `entities_with_nltk`.

    Returns
    -------
    list of str
        Tokens, in input order, whose tag is not excluded and whose length
        is greater than 3.
    """
    # Tags that carry no content for this task. The original list repeated
    # 'WP'; the second entry is the possessive wh-pronoun tag 'WP$'.
    # NOTE(review): 'PRP$' (possessive pronoun) is not excluded -- confirm
    # whether it should be.
    exceptions = {'CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP',
                  'TO', 'UH', 'WDT', 'WP', 'WP$', 'WRB', '.', ','}
    return [pair[0] for pair in s if pair[1] not in exceptions and len(pair[0]) > 3]

In [9]:
#to find the answer among the neighbours of the seed words
def get_answer(words,word):
    """Return the first neighbour of any seed in `words` that fits the mask `word`.

    Seeds are tried in order, and for each seed its neighbours are scanned in
    the order `get_neighbours` returns them. Returns "" when nothing fits.
    """
    for seed in words:
        for candidate in get_neighbours(seed):
            if len(candidate) != len(word):
                continue
            if compare(candidate, word) == True:
                return candidate
    return ""

In [10]:
#to compare a candidate word against a masked word
def compare(a,b):
    """Check whether candidate `a` fits the masked pattern `b`.

    Parameters
    ----------
    a : str
        Candidate word.
    b : str
        Pattern in which '_' stands for any single character.

    Returns
    -------
    bool
        True when both strings have the same length and every non-'_'
        character of `b` equals the character of `a` at that position.
        Strings of different length never match; previously a longer `a`
        raised IndexError and a shorter `a` could match on its prefix alone.
    """
    if len(a) != len(b):
        return False
    return all(mask == '_' or letter == mask for letter, mask in zip(a, b))

In [11]:
#to predict the masked word from its meaning
def get_word(sent,word):
    """Guess the masked `word` from the sentence `sent` and print the search time.

    The sentence is POS-tagged and filtered down to seed words; the timer
    covers only the neighbour search done by `get_answer`. Returns whatever
    `get_answer` returns.
    """
    seeds = get_possible_words(entities_with_nltk(sent))
    stopwatch = ElapsedTimer()
    answer = get_answer(seeds, word)
    stopwatch.elapsed_time()
    return answer

In [12]:
# Load the evaluation set from eval.csv (comma-separated)
evaluation = pd.read_csv('eval.csv',sep = ',')

In [13]:
evaluation.head()

Unnamed: 0,Masked,Meaning
0,C o _ _ n t h,the modern Greek port near the site of the anc...
1,_ e c e i _ e,get something; come into possession of
2,_ o l l a g _,a paste-up made by sticking together pieces of...
3,t _ _ s h,worthless material that is to be disposed of
4,D e _ a w _ r _,a river that rises in the Catskills in southea...


In [14]:
def remove_space(x):
    """Lower-case `x` and remove every space character from it."""
    return x.lower().replace(' ', '')
# Normalise every masked word: lower-case, no spaces between letters
evaluation['Masked'] = evaluation['Masked'].apply(remove_space)

# Let us try evaluating this to see how it goes

In [15]:
# Demo input: a definition and the masked word to recover ('_' marks a hidden letter)
sent = "a garden plant with purple flowers that smell very pleasant."
word = "l_v_nd_r"

In [16]:
# Fetch the NLTK resources used by word_tokenize ('punkt') and pos_tag
# ('averaged_perceptron_tagger'); already-present packages are not re-downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/andrea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andrea/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
get_word(sent,word)

Elapsed: 36.17349362373352 sec 


'lavender'

# Now applying it to the entire dataset

In [18]:
# Solve every row: `Meaning` is the clue sentence, `Masked` the pattern to fill in.
# get_word prints one "Elapsed" line per row.
evaluation['Answer']=evaluation.apply(lambda x: get_word(x.Meaning, x.Masked), axis=1)

Elapsed: 36.98732399940491 sec 


KeyboardInterrupt: 

# The previous function was run on a GPU enabled notebook 
- It used about 20 seconds per word. 
- The answers were stored in the column 'Answer'
- The results obtained are stored in a file called 'results.csv' and loaded here for reference
- It was able to predict the answer for most of the words, in some cases (very rare) predicted the wrong words

In [19]:
# Load the previously computed answers
results = pd.read_csv('results.csv')
# Drop the first CSV column -- presumably the row index written when the file was saved
results = results.drop(results.columns[0],axis=1)

In [20]:
results

Unnamed: 0,Masked,Meaning,Answer
0,co__nth,the modern Greek port near the site of the anc...,corinth
1,_ecei_e,get something; come into possession of,receive
2,_ollag_,a paste-up made by sticking together pieces of...,collage
3,t__sh,worthless material that is to be disposed of,trash
4,de_aw_r_,a river that rises in the Catskills in southea...,delaware
...,...,...,...
246,hic_e_,a small inflamed elevation of the skin; a pust...,hickey
247,no,a negative,no
248,sol_e_t,a liquid substance capable of dissolving other...,solvent
249,_ump_,resembling a garbage dump,lumpy


In [21]:
#answers that weren't found
results[results['Answer'].isnull()]

Unnamed: 0,Masked,Meaning,Answer
11,f_otp_d,a highwayman who robs on foot,
35,skyja__,subject an aircraft to air piracy,
81,su__osa_l_,capable of being inferred on slight grounds,
84,admi_is___b_e,capable of being administered or managed,
86,__rmute,change the order or arrangement of,
114,__calm,make steady,
120,shu_d__y,provoking fear terror,
125,_a_writ_r,someone who writes comic material for public p...,
132,_eth_sel__,(Old Testament,
205,c_ns_ern__e,"fill with anxiety, dread, dismay, or confusion",


# Using clustering to group similar words and improve performance
- Here I have experimented with KMeans from sklearn library to group similar words into 5 clusters to see if it improves the run time

# Importing the necessary libraries

In [22]:
from sklearn.cluster import KMeans
import pickle

# Initialising KMeans() with 5 clusters and 300 iterations
- So the idea now is to find the word's cluster and search for words within that cluster

In [23]:
# KMeans configuration:
#   init="random"    -- initial centroids are picked at random from the data
#   n_clusters=5     -- number of clusters to form
#   n_init=10        -- number of runs with different centroid seeds
#   max_iter=300     -- upper bound on iterations for a single run
#   random_state=42  -- fixed seed for the centroid initialisation
kmeans = KMeans(
    init="random",
    n_clusters=5,
    n_init=10,
    max_iter=300,
    random_state=42
)

# Fitting the model
- I have stored the model I have trained in the files. Try not to run this again as it can lead to different clusters
- To save the model: pickle.dump(model, open(filename, 'wb')) 
- To load saved model from local directory: model = pickle.load(open(filename, 'rb')) 

In [24]:
#getting embeddings alone
dat = np.array(list(embeddings.values()))

In [25]:
kmeans.fit(dat)

KMeans(init='random', n_clusters=5, random_state=42)

In [26]:
# Persist the fitted model. The `with` block closes the file handle, which the
# original one-liner never did.
with open('kmeans_model','wb') as model_file:
    pickle.dump(kmeans, model_file)

# Getting the words alone from the embeddings dictionary

In [27]:
corpus_words = list(embeddings.keys())

In [28]:
cluster_centers = kmeans.cluster_centers_

# Storing labels

In [29]:
labels = kmeans.labels_

# For evaluation let's check the words from a given cluster to see if they hold similarities.
- From this we observe that the words in the cluster with label 1 are related to places and locations. Voila! It has worked!

In [30]:
# Print up to 30 sample words from the cluster labelled 1, then that cluster's size.
# Iterating over `labels` itself (instead of a hard-coded 400000) keeps the
# cell valid for any vocabulary size.
count=0
for i, label in enumerate(labels):
    if label==1:
        count+=1
        if(count<=30):
            print(corpus_words[i])
print(count)

municipality
township
suburb
voivodeship
commune
gmina
nova
pradesh
bah
prefecture
romanized
dhaka
punjab
perth
headquartered
s.e.
bengal
unincorporated
f.c.
glacier
essex
canton
scotia
brunswick
south-east
neighbourhood
sussex
yangon
sur
saskatchewan
58288


# Creating a dictionary representing these clusters

In [31]:
# Map each cluster label to the list of words assigned to it.
# zip() pairs every label with its word without assuming a vocabulary size of
# 400000. Keys are stored as plain ints; NumPy integer labels still look up
# the same entries.
cluster_dict={}
for label, cluster_word in zip(labels, corpus_words):
    cluster_dict.setdefault(int(label), []).append(cluster_word)

# Now for the testing!

In [35]:
def get_embeddings(word_list,word):
    """Sort the words of `word_list` by cosine distance to `word`.

    Parameters
    ----------
    word_list : iterable of str
        Candidate words; those without an entry in the module-level
        `embeddings` dict are ignored.
    word : str
        Reference word.

    Returns
    -------
    list of str
        The candidates that have an embedding, closest first. An empty list
        when `word` itself has no embedding.
    """
    # Imported here so the function does not depend on `import scipy` having
    # loaded the `scipy.spatial` submodule as a side effect.
    from scipy.spatial.distance import cosine

    # Whether the reference word is known does not change per candidate, so
    # decide it (and fetch its vector) once.
    if word not in embeddings:
        return []
    target = embeddings[word]

    word_dict = {}
    for candidate in word_list:
        if candidate in embeddings:
            word_dict[candidate] = cosine(embeddings[candidate], target)
    return sorted(word_dict, key=word_dict.get)

In [36]:
def get_answer_with_kmeans(sent,word):
    """Guess the masked `word` by searching only the cluster nearest to each seed word.

    Parameters
    ----------
    sent : str
        Sentence describing the word; it is POS-tagged and filtered into
        seed words.
    word : str
        Masked pattern ('_' for unknown letters) the answer must fit.

    Returns
    -------
    str
        The first candidate of matching length that `compare` accepts, or ""
        when no seed yields one. The elapsed time is printed in both cases,
        matching `get_word`; the original printed it only on success.
    """
    # Imported here so the function does not depend on `import scipy` having
    # loaded the `scipy.spatial` submodule as a side effect.
    from scipy.spatial.distance import cosine

    categories = entities_with_nltk(sent)
    words = get_possible_words(categories)
    timer = ElapsedTimer()
    for seed in words:
        # A seed without an embedding cannot be placed in a cluster. The
        # original gave it sentinel distances, picked cluster 0 and searched
        # an empty candidate list. Skipping also means no candidate list from
        # a previous seed can be reused by accident.
        if seed not in embeddings or len(cluster_centers) == 0:
            continue
        seed_vector = embeddings[seed]
        centre_distances = [cosine(seed_vector, centre) for centre in cluster_centers]
        nearest = int(np.argmin(centre_distances))
        # Candidates from the nearest cluster, closest to the seed first
        for candidate in get_embeddings(cluster_dict[nearest], seed):
            if len(candidate) == len(word) and compare(candidate, word):
                timer.elapsed_time()
                return candidate
    timer.elapsed_time()
    return ""
    

# By the looks of it, it was able to predict only 167 of 251 words but at almost 1/10th of the time taken previously! awesome ain't it?

In [38]:
# NOTE(review): the answers are computed from `evaluation` but stored on `results`;
# pandas aligns the assigned Series by row index, so this presumes both frames
# share the same index -- confirm.
results['Answer with KMeans'] = evaluation.apply(lambda x: get_answer_with_kmeans(x.Meaning, x.Masked), axis=1)

Elapsed: 4.177642822265625 sec 
Elapsed: 3.8582141399383545 sec 


KeyboardInterrupt: 

In [None]:
results[results['Answer with KMeans']==""]

In [None]:
results[results['Answer']==results['Answer with KMeans']]

Unnamed: 0,Masked,Meaning,Answer,Answer with KMeans
1,_ecei_e,get something; come into possession of,receive,receive
3,t__sh,worthless material that is to be disposed of,trash,trash
4,de_aw_r_,a river that rises in the Catskills in southea...,delaware,delaware
6,ic__a_e_ron,any polyhedron having twenty plane faces,icosahedron,icosahedron
8,_ala_si_,a constitutional monarchy in southeastern Asia...,malaysia,malaysia
...,...,...,...,...
245,b_ut,(sports,bout,bout
246,hic_e_,a small inflamed elevation of the skin; a pust...,hickey,hickey
247,no,a negative,no,no
248,sol_e_t,a liquid substance capable of dissolving other...,solvent,solvent


In [None]:
results

Unnamed: 0,Masked,Meaning,Answer,Answer with KMeans
0,co__nth,the modern Greek port near the site of the anc...,corinth,
1,_ecei_e,get something; come into possession of,receive,receive
2,_ollag_,a paste-up made by sticking together pieces of...,collage,
3,t__sh,worthless material that is to be disposed of,trash,trash
4,de_aw_r_,a river that rises in the Catskills in southea...,delaware,delaware
...,...,...,...,...
246,hic_e_,a small inflamed elevation of the skin; a pust...,hickey,hickey
247,no,a negative,no,no
248,sol_e_t,a liquid substance capable of dissolving other...,solvent,solvent
249,_ump_,resembling a garbage dump,lumpy,jumps


In [None]:
results.to_csv('results_with_kmeans.csv')

# I had so much fun coding this! I definitely learnt a lot; any suggestions to improve this are welcome :)