# Importing the necessary libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import os
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

# NLTK Tokenise meanings
- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: “there is” … think of it like “there exists”)
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective ‘big’
- JJR adjective, comparative ‘bigger’
- JJS adjective, superlative ‘biggest’
- LS list marker 1)
- MD modal could, will
- NN noun, singular ‘desk’
- NNS noun plural ‘desks’
- NNP proper noun, singular ‘Harrison’
- NNPS proper noun, plural ‘Americans’
- PDT predeterminer ‘all the kids’
- POS possessive ending parent’s
- PRP personal pronoun I, he, she
- PRP possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO, to go ‘to’ the store.
- UH interjection, errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP possessive wh-pronoun whose
- WRB wh-abverb where, when

## We discard these categories for our task: CC, CD, DT, EX, IN, LS, MD, PDT, POS, PRP, TO, UH, WDT, WP, WP, WRB

# Spacy tokenise meanings (not using this, just for reference)
- PERSON	People, including fictional.
- NORP	Nationalities or religious or political groups.
- FAC	Buildings, airports, highways, bridges, etc.
- ORG	Companies, agencies, institutions, etc.
- GPE	Countries, cities, states.
- LOC	Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT	Objects, vehicles, foods, etc. (Not services.)
- EVENT	Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART	Titles of books, songs, etc.
- LAW	Named documents made into laws.
- LANGUAGE	Any named language.
- DATE	Absolute or relative dates or periods.
- TIME	Times smaller than a day.
- PERCENT	Percentage, including ”%“.
- MONEY	Monetary values, including unit.
- QUANTITY	Measurements, as of weight or distance.
- ORDINAL	“first”, “second”, etc.
- CARDINAL	Numerals that do not fall under another type.

In [2]:
#words from the given corpus
wordss = []
for line in open('Targets.txt'):
    wordss.append(line[:-1])

# GloVe embeddings
- Link: http://nlp.stanford.edu/data/glove.6B.zip
- Run the following code to get the embeddings from the official site for GloVe
- zip file was downloaded and extracted

In [6]:
os.chdir('glove.6B')

In [7]:
os.listdir()

['glove.6B.200d.txt',
 'glove.6B.50d.txt',
 'glove.6B.100d.txt',
 'glove.6B.300d.txt']

In [8]:
#Loading embeddings from GloVe 
embeddings = {}

f = open('glove.6B.100d.txt',encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings))

Loaded 400000 word vectors.


In [9]:
#to calculate elapsed time
class ElapsedTimer(object):
    def __init__(self):
        self.start_time = time.time()
    def elapsed(self,sec):
        if sec < 60:
            return str(sec) + " sec"
        elif sec < (60 * 60):
            return str(sec / 60) + " min"
        else:
            return str(sec / (60 * 60)) + " hr"
    def elapsed_time(self):
        print("Elapsed: %s " % self.elapsed(time.time() - self.start_time) )

In [10]:
#to get entities using nltk
def entities_with_nltk(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [28]:
#to get the nearest neighbours using cosine similarity
def get_neighbours(s):
    res = {}
    for i in embeddings:
        if s not in embeddings:
            return res
        res[i] = scipy.spatial.distance.cosine(embeddings[i],embeddings[s])
    sorted_keys = sorted(res, key=res.get) 
    return sorted_keys

In [13]:
#to get the words that can be used for prediction
def get_possible_words(s):
    words = []
    exceptions = ['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'TO', 'UH', 'WDT', 'WP', 'WP', 'WRB','.',',']
    for i in s:
        if i[1] not in exceptions and len(i[0])>3:
            words.append(i[0])
    return words

In [14]:
#to get the answer
def get_answer(words,word):
    for i in words:
        s = get_neighbours(i)
        for k in s:
            if(len(k)==len(word)):
                if(compare(k,word)==True):
                    return k
    return ""

In [15]:
#to compare two words
def compare(a,b):
    for i in range(len(a)):
        if b[i]=='_':
            pass
        else:
            if(a[i]!=b[i]):
                return False
    return True

In [16]:
#to get word
def get_word(sent,word):
    categories = entities_with_nltk(sent)
    words = get_possible_words(categories)
    timer = ElapsedTimer()
    ans = get_answer(words, word)
    timer.elapsed_time()
    return ans

In [18]:
os.chdir("..")
os.listdir()

['Evaluation.csv',
 'Approach.docx',
 '.ipynb_checkpoints',
 'Consolidated-Copy1.ipynb',
 'Trials word2vec.ipynb',
 'Approach.pdf',
 'results_with_kmeans.csv',
 'results.csv',
 'glove.6B',
 'Final with KMeans-Copy1.ipynb',
 'eval.csv',
 'Final with KMeans.ipynb',
 'Targets.txt',
 'Final.ipynb',
 'kmeans_model',
 'Consolidated.ipynb',
 'Consolidated-Copy2.ipynb']

In [19]:
evaluation = pd.read_csv('eval.csv',sep = ',')

In [20]:
evaluation.head()

Unnamed: 0,Masked,Meaning
0,C o _ _ n t h,the modern Greek port near the site of the anc...
1,_ e c e i _ e,get something; come into possession of
2,_ o l l a g _,a paste-up made by sticking together pieces of...
3,t _ _ s h,worthless material that is to be disposed of
4,D e _ a w _ r _,a river that rises in the Catskills in southea...


In [21]:
def remove_space(x):
    x = x.lower()
    letters = [i for i in x]
    s = ""
    for i in letters:
        if i!=' ':
            s = s+i
    return s
evaluation['Masked'] = evaluation['Masked'].apply(lambda x: remove_space(x))

# Let us try evaluating this to see how it goes

In [22]:
sent = "a garden plant with purple flowers that smell very pleasant."
word = "l_v_nd_r"

In [26]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/andrea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andrea/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [29]:
get_word(sent,word)

Elapsed: 29.961535215377808 sec 


'lavender'

# Now applying it to the entire dataset

In [30]:
evaluation['Answer']=evaluation.apply(lambda x: get_word(x.Meaning, x.Masked), axis=1)

Elapsed: 30.494628190994263 sec 


KeyboardInterrupt: 

# The previous function was run on a GPU enabled notebook 
- It used about 20 seconds per word. 
- The answers were stored in the column 'Answers'
- The results obtained are stored in a file called 'results.csv' and loaded here for reference
- It was able to predict the answer for most of the words, in some cases (very rare) predicted the wrong words

In [31]:
results = pd.read_csv('results.csv')
results = results.drop(results.columns[0],axis=1)

In [32]:
results

Unnamed: 0,Masked,Meaning,Answer
0,co__nth,the modern Greek port near the site of the anc...,corinth
1,_ecei_e,get something; come into possession of,receive
2,_ollag_,a paste-up made by sticking together pieces of...,collage
3,t__sh,worthless material that is to be disposed of,trash
4,de_aw_r_,a river that rises in the Catskills in southea...,delaware
...,...,...,...
246,hic_e_,a small inflamed elevation of the skin; a pust...,hickey
247,no,a negative,no
248,sol_e_t,a liquid substance capable of dissolving other...,solvent
249,_ump_,resembling a garbage dump,lumpy


In [33]:
#answers that weren't found
results[results['Answer'].isnull()]

Unnamed: 0,Masked,Meaning,Answer
11,f_otp_d,a highwayman who robs on foot,
35,skyja__,subject an aircraft to air piracy,
81,su__osa_l_,capable of being inferred on slight grounds,
84,admi_is___b_e,capable of being administered or managed,
86,__rmute,change the order or arrangement of,
114,__calm,make steady,
120,shu_d__y,provoking fear terror,
125,_a_writ_r,someone who writes comic material for public p...,
132,_eth_sel__,(Old Testament,
205,c_ns_ern__e,"fill with anxiety, dread, dismay, or confusion",


# Using clustering to group similar words and improve performance
- Here I have experimented with KMeans from sklearn library to group similar words into 5 clusters to see if it improves the run time

# Importing the necessary libraries

In [34]:
from sklearn.cluster import KMeans
import pickle

# Initialising KMeans() with 5 clusters and 300 iterations
- So the idea now is to find the word's cluster and search for words within that cluster

In [35]:
kmeans = KMeans(
    init="random",
    n_clusters=5,
    n_init=10,
    max_iter=300,
    random_state=42
)

# Fitting the model
- I have stored the model I have trained in the files. Try not to run this again as it can lead to different clusters
- To save the model: pickle.dump(model, open(filename, 'wb')) 
- To load saved model from local directory: model = pickle.load(open(filename, 'rb')) 

In [36]:
#getting embeddings alone
dat = np.array(list(embeddings.values()))

In [37]:
kmeans.fit(dat)

KMeans(init='random', n_clusters=5, random_state=42)

In [38]:
pickle.dump(kmeans,open('kmeans_model','wb'))

# Getting the words alone from the embeddings dictionary

In [39]:
corpus_words = list(embeddings.keys())

In [40]:
cluster_centers = kmeans.cluster_centers_

# Storing labels

In [41]:
labels = kmeans.labels_

# For evaluation let's check the words from a given cluster to see if they hold similarities.
- From this we observe that words that correspond to the 5th cluster are related to currencies and units. Voila! it has worked!

In [42]:
count=0
for i in range(400000):
    if labels[i]==1:
        count+=1
        if(count<=30):
            print(corpus_words[i])
print(count)

kuh
née
duh
fah
luh
mahn
shuh
meh
zah
sih
juh
eduard
sahn
yoh
mehd
nuh
ehl
ehn
herve
y.
puh
vich
hahm
skee
horst
dool
reh
z.
konstantin
rahm
116190


# Creating a dictionary representing these clusters

In [43]:
cluster_dict={}
for i in range(400000):
    if labels[i] not in cluster_dict:
        cluster_dict[labels[i]]=[]
    cluster_dict[labels[i]].append(corpus_words[i])

# Now for the testing!

In [52]:
cluster_dict[1]`

['kuh',
 'née',
 'duh',
 'fah',
 'luh',
 'mahn',
 'shuh',
 'meh',
 'zah',
 'sih',
 'juh',
 'eduard',
 'sahn',
 'yoh',
 'mehd',
 'nuh',
 'ehl',
 'ehn',
 'herve',
 'y.',
 'puh',
 'vich',
 'hahm',
 'skee',
 'horst',
 'dool',
 'reh',
 'z.',
 'konstantin',
 'rahm',
 'massoud',
 'toder',
 'jerzy',
 'eed',
 'keer',
 'voh',
 'odierno',
 'meen',
 'zhahn',
 'phan',
 'norbert',
 'beel',
 'konrad',
 'asefi',
 'klitschko',
 'vitaly',
 'seh',
 'kemal',
 'mohd',
 'martino',
 'chowdhury',
 '1w',
 'mahl',
 'tih',
 '202-887-8334',
 'ereli',
 'vyacheslav',
 'avila',
 'daoud',
 'tahr',
 'shum',
 'gissin',
 'vah',
 'zalmay',
 'andya',
 'valero',
 'fawzi',
 'rohan',
 'ivanko',
 'egeland',
 'asher',
 'morin',
 'shoh',
 'yasin',
 'micheletti',
 'faris',
 'uhn',
 'murat',
 'naji',
 'joh',
 'persie',
 'regev',
 'skakel',
 'koenig',
 'khai',
 'ahk',
 'boyer',
 'matti',
 'kahl',
 'dimitris',
 '202-887-8338',
 'lih',
 'kow',
 'kohr',
 'vasily',
 'johndroe',
 'yury',
 'armin',
 'khalaf',
 'dih',
 'blazy',
 'kwah',


In [71]:
def get_embeddings(word_list,word):
    word_dict = {}
    for i in word_list:
        if i in embeddings.keys():
            word_dict[i] = scipy.spatial.distance.cosine(embeddings[i],embeddings[word])
    return sorted(word_dict, key=word_dict.get)

In [72]:
def get_answer_with_kmeans(sent,word):
    categories = entities_with_nltk(sent)
    words = get_possible_words(categories)
    timer = ElapsedTimer()
    for i in words:
        min_dist = []
        for j in cluster_centers:
            if i in embeddings:
                min_dist.append(scipy.spatial.distance.cosine(embeddings[i],j))
            else:
                min_dist.append(9999999)
        if(len(min_dist)>0):
            min_dist = np.array(min_dist)
            c = np.argmin(min_dist)
            words_to_compare = get_embeddings(cluster_dict[c],i)
        for k in words_to_compare:
            if(len(k)==len(word)):
                if compare(k,word):
                    timer.elapsed_time()
                    return k
    return ""
    

# By the looks of it, it was able to predict only 167 of 251 words but at almost 1/10000th of the time taken previously! awesome ain't it?

In [73]:
results['Answer with KMeans'] = evaluation.apply(lambda x: get_answer_with_kmeans(x.Meaning, x.Masked), axis=1)

the
,
.
of
to
and
in
a
"
's
for
-
that
on
is
was
said
with
he
as
it
by
at
(
)
from
his
''
``
an
be
has
are
have
but
were
not
this
who
they
had
i
which
will
their
:
or
its
one
after
new
been
also
we
would
two
more
'
first
about
up
when
year
there
all
--
out
she
other
people
n't
her
percent
than
over
into
last
some
government
time
$
you
years
if
no
world
can
three
do
;
president
only
state
million
could
us
most
_
against
u.s.
so
them
what
him
united
during
before
may
since
many
while
where
states
because
now
city
made
like
between
did
just
national
day
country
under
such
second
then
company
group
any
through
china
four
being
down
war
back
off
south
american
minister
police
well
including
team
international
week
officials
still
both
even
high
part
told
those
end
former
these
make
billion
work
our
home
school
party
house
old
later
get
another
tuesday
news
long
five
called
1
wednesday
military
way
used
much
next
monday
thursday
friday
game
here
?
should
take
very
my
north
security
season
yo

headed
documents
follow
bosnia
rice
super
45
stopped
carolina
directly
initial
relief
yuan
continues
earth
gives
activity
polls
bc
louis
corruption
picture
storm
backed
irish
interests
refugees
defeat
sen.
islands
initially
spot
bond
embassy
agencies
swiss
request
standards
estate
trust
ceremony
300
guy
fully
occurred
kingdom
shortly
iranian
bob
accept
c
diplomatic
cold
signs
facilities
struck
discovered
contact
regime
airlines
1,000
youth
safe
penalty
transport
follows
marriage
maybe
appearance
highway
millions
gain
90
poll
door
threatened
wins
silver
passengers
scott
rescue
lose
apparently
exports
showing
failure
increasing
talking
specific
fresh
card
sixth
understand
vehicle
panel
latin
refer
r
items
benefits
seem
partner
kill
communications
broadcast
faces
protests
caught
pretty
presented
setting
driver
steve
happen
capacity
fed
box
65
stores
deaths
davis
sport
prize
boost
producer
comments
goods
ford
episode
decline
immediate
ending
girls
bonds
climate
orders
destroyed
awarded
pro

singh
requirements
baltimore
underground
typical
employee
winds
economists
traveling
drawing
gang
reviews
1960
unclear
selection
scheme
churches
ceo
f.
absence
residence
resident
tens
simpson
reception
opponent
arrive
militia
congo
pilots
flood
suddenly
enjoy
flew
painting
genus
sweet
entitled
condemned
railroad
finishing
electronics
essential
regarded
notably
handling
roger
52
defined
attending
pages
ratings
harris
desert
crashed
coaches
proper
welcomed
developments
departure
thompson
restaurants
composite
hosts
quiet
interviews
equivalent
jason
machines
fat
universities
beaten
cape
external
locations
editorial
plastic
kelly
kurdish
narrow
string
nepal
inter
robinson
designated
andy
1965
bound
prosecution
boss
moore
covering
revolutionary
occur
gasoline
margin
extension
tampa
judicial
lieutenant
telecommunications
somewhat
¥
editors
comprehensive
offense
tigers
800
makers
supports
constructed
kabul
humans
alex
parent
opposite
existence
1966
magic
funeral
chechnya
steady
couples
scienc

homeless
arabic
constituency
revenge
islamabad
announcing
partial
rebellion
internationally
petersburg
reign
lakes
76
exists
airbus
bible
hassan
pink
5th
indictment
errors
74
integrity
choices
deputies
expedition
reluctant
zones
7-5
aftermath
cardinals
destination
displayed
assume
pregnant
lessons
terrible
boards
opener
sean
sprint
1.6
component
examination
embargo
novels
sub
divorce
bryant
generations
holocaust
col.
teammate
peoples
seeds
guantanamo
somewhere
vocal
nevada
unveiled
similarly
lions
tommy
inspector
acquire
phillips
vaccine
bridges
i.e.
hughes
crowds
floods
pulling
neighbor
turnout
keith
clinic
thank
merged
fatal
sustainable
steam
luck
chemicals
powered
dreams
territorial
lawsuits
notion
rubber
lifting
passion
kidnapping
peres
anticipated
beirut
doubts
revival
liked
flows
footage
loaded
select
lab
crazy
bloomberg
romanian
stimulus
incumbent
outdoor
fda
slovakia
neighborhoods
discipline
olmert
substitute
musician
cousin
addressing
maintains
judiciary
restricted
diet
citize

mercedes
agassi
oval
sometime
checking
meals
timetable
treating
jacob
150,000
bunch
cleaning
bow
evaluation
angela
dirt
ventures
franco
observation
relating
jesse
pine
myers
graduating
20-year
7,000
drinks
contracted
harassment
1940s
graduation
airplane
joke
guangdong
kandahar
jamie
freight
enhanced
crushed
ramos
sampras
surely
labels
cartoon
cpc
mature
solely
se
3/4
lease
devils
tube
heroin
rubin
retaliation
genes
sheep
sounded
slammed
justices
usage
appreciate
javier
tender
altitude
celebrating
abducted
telegram
volleyball
installation
strained
equality
assad
glad
caution
dragon
surrendered
memphis
straw
nickname
bacteria
grandmother
ruins
sailed
candidacy
skill
festivals
imminent
joan
cavalry
monopoly
spaces
salmon
diagnosed
contention
o'neal
burial
venus
celtics
productivity
platforms
programmes
shouted
ferrari
holders
shipped
pastor
ruth
dragged
debates
uzbekistan
nova
ghost
enters
naming
intend
complain
rebuilt
brokers
custom
bids
canyon
regain
enclave
embrace
archives
incredible

accuracy
kicking
posters
rage
treasure
functioning
halls
hosni
manuscript
aung
booth
betty
correction
granting
gravity
lens
obtaining
z
leap
geological
excitement
forbidden
workforce
frenchman
wash
chopped
felipe
lovers
consultation
n’t
renovation
diagnosis
115
hurdles
fossil
screenplay
biblical
wished
zhou
edgar
sticks
unexpectedly
logistics
khatami
generic
nicaragua
successes
oven
behaviour
slumped
hanoi
healing
mighty
catcher
madonna
pct
certification
testament
workplace
goodwill
instances
durham
notified
monarch
abolished
enrollment
tomatoes
1895
inclusion
networking
favorites
installations
sergio
turin
throat
reunification
academics
2.0
forthcoming
guilt
accidentally
beings
thoroughly
oracle
booked
employ
digit
particles
blend
embarrassing
councils
catches
fitzgerald
pond
acquiring
spelling
cooked
regulate
pockets
curve
rockies
sebastian
deadlock
umbrella
treasurer
fortunes
apologized
forge
layout
stint
valentine
arrange
varying
3.1
darren
kay
lure
liquidity
punjab
finishes
propel

supervised
deposed
5.6
cheers
dedication
growers
cuisine
explored
hardy
improper
guru
'em
bother
joked
gloves
overshadowed
emotionally
sufficiently
vault
stirring
4.7
stranger
rewards
presbyterian
silly
1883
laptop
damon
acknowledges
muster
spelled
106
witch
criticize
retains
packaging
compositions
angolan
judy
hyde
arguably
willis
profitability
yao
bullpen
appropriations
cites
needing
sedan
roses
doses
plymouth
builders
reinforcements
frances
cultivation
reuters
arise
grouping
lara
spokesperson
preserving
collided
screened
ditch
3.9
hayden
bureaucracy
travis
discharge
avert
rewarded
minimize
olds
daly
clerics
lender
expire
sabotage
nablus
erosion
first-class
pri
ironically
berger
commercially
pirate
touches
blade
sour
dialect
byron
feud
discovers
taped
teamed
ebay
quarterfinal
extremism
fatalities
550
petrol
pneumonia
customary
h1n1
immense
downing
illicit
clinch
discontinued
1878
shortstop
pitt
sings
amr
acknowledging
toilet
simmons
knocking
scoreless
benson
proceeding
contamination


mich.
seasonally
mccurry
boycotted
fractured
32-year
gutierrez
dispersed
compares
unfairly
psychiatrist
jew
fujian
slap
turkmenistan
extras
cheating
118
fearful
sundays
warlords
nod
mobility
dismantling
randolph
1,800
crossings
mortars
6.3
battalions
mourners
assistants
newcomer
frightened
haunted
adolf
finalized
celebrates
installing
breathe
lavrov
bothered
auxiliary
staunch
warlord
telekom
zia
pleas
overlooked
belmont
viewer
122
mediators
warheads
bnp
hanson
recycling
arrow
spilled
alleges
orchestrated
iaaf
fours
rossi
apparatus
allawi
manipulation
displaying
compassion
compelled
alma
prohibits
cares
analog
icrc
6.2
vowing
piazza
hawkins
lip
employing
re-elected
debating
batter
cruiser
'n'
tuna
convene
terminals
shandong
rapids
pottery
hancock
marginal
explode
bleak
leigh
quell
dial
mirrors
100m
interval
interact
alarmed
27th
envisioned
qualifications
maneuver
clayton
someday
phnom
nonsense
mid-1980s
swat
sociology
weld
26th
inconsistent
100th
obsessed
rom
vale
gala
crusade
walton
9.

gps
vargas
singular
blogs
09
138
secretive
liters
45,000
chao
asbestos
loyalist
traction
keating
scam
app
1841
notification
suppressed
uncertainties
indonesians
transcript
demons
revisions
breached
pros
perished
1844
attach
captures
salute
citrus
worsen
illegitimate
buckingham
genera
traumatic
comcast
botanical
karbala
europa
coroner
violently
ethnicity
draped
275
sunrise
hoover
councillor
embracing
directs
jammed
haitians
insect
awarding
anti-
ngos
bruised
dinosaurs
unanswered
collapsing
slum
standardized
tandem
tolerated
cynical
cameo
polished
ozone
abn
motorway
vein
confinement
chamberlain
breeze
aspiring
ababa
wwf
royce
overweight
manifesto
reverend
blond
asians
sadly
logs
prized
polite
resigns
beckett
astonishing
quincy
accomplishment
differing
favourites
chaplain
punches
decreasing
pullback
ballroom
spans
frogs
mccoy
helmets
heathrow
judgement
fracture
rotten
fared
shipbuilding
911
resigning
hwang
vans
anthropology
thrive
statutes
climax
macy
briefs
exert
ashton
adm.
denouncing
f

remembrance
detector
guaranteeing
parades
g7
roaring
endorsements
2,400
formulated
complicate
strides
sulfur
sticky
immigrated
reprinted
favourable
homecoming
sharia
aegean
reefs
motorcade
harmless
dwelling
satirical
unforced
balked
plenary
soils
villains
departures
qi
deduction
tyre
czar
aeronautics
wooded
intricate
retaliatory
symbolism
backstage
dinars
retention
exemptions
blackwater
spat
superiority
pornographic
mediated
dues
venerable
ardent
stave
nld
cameraman
viability
boosts
minsk
chernobyl
happier
contemporaries
tvs
5000
hua
mortal
sped
clarified
bulldozers
integrating
hinder
conditioned
shortest
nc
marital
phenomenal
compensated
denotes
compromises
wharf
ablaze
programmed
standout
misled
asphalt
jitters
hardened
gigantic
orchard
foreigner
entrances
snatch
nonproliferation
notoriety
towel
dormant
interbank
quirky
ppp
rites
payrolls
scars
blackmail
synod
librarian
sentinel
mapped
ensued
ratko
balances
rewrite
coloured
faint
hwa
midwestern
standstill
36th
calculating
infiltratio

faculties
cottages
mitigate
defer
budding
firsthand
erupt
braced
newfound
hacking
dealerships
unveils
detrimental
spheres
reinstate
constitutionally
discouraging
documenting
yongbyon
juveniles
boroughs
triggers
euthanasia
dioceses
simplify
offside
fodder
obituary
sunken
behest
combatant
19,000
stroll
vested
porous
redesign
aforementioned
hotline
clogged
sonar
enthusiastically
capping
laredo
arduous
disengagement
initiation
underscoring
symbolize
stockholders
inflict
lectured
solidly
unfold
exempted
limitation
herds
attire
diners
parlor
high-speed
dissenting
onward
reruns
landfill
envisaged
majorities
breaching
intergovernmental
deities
yearlong
adversely
pursuits
premieres
sparse
exerted
importer
unify
getaway
jolt
rotate
impromptu
populace
sophistication
best-selling
best-known
11:00
taekwondo
unintended
rainforest
lieutenants
11:30
monastic
shear
punctuated
breathtaking
mirrored
1860s
footprint
injecting
salvaged
swamped
homegrown
eighteenth
looters
invasions
hostel
tides
reservists


refiners
reorganize
subways
tabs
nearer
spawning
mpg
gardeners
erroneously
inhumane
mysteriously
unprofor
droughts
workload
exhibitors
pregame
tidy
eavesdropping
unsuitable
2010-11
intersects
fairways
jeeps
english-language
southbound
gnp
accumulating
co-operative
abolishing
corrective
empires
long-distance
cross-country
mishap
unexploded
contraband
gamers
uninterrupted
refrigerate
extravaganza
harvests
conduit
peninsular
quieter
upturn
secluded
aisles
leeway
gunfight
ligaments
dispatching
convening
definitively
fractious
left-wing
unprofitable
normalcy
jeddah
campers
twenty-five
narratives
disproportionately
prerequisite
appropriation
streamlining
arsenals
academically
installments
papacy
northbound
exceedingly
overflow
nestled
undetermined
embankment
non-fiction
continual
usaf
reputations
mitigation
sorties
secretary-general
ict
ringed
overhauling
undeveloped
unconditionally
climates
townspeople
volunteering
airfields
exhausting
demarcation
widest
yanukovich
licences
localized
previe

KeyError: 'Greek'

In [None]:
results[results['Answer with KMeans']==""]

In [49]:
results[results['Answer']==results['Answer with KMeans']]

Unnamed: 0,Masked,Meaning,Answer,Answer with KMeans
1,_ecei_e,get something; come into possession of,receive,receive
3,t__sh,worthless material that is to be disposed of,trash,trash
4,de_aw_r_,a river that rises in the Catskills in southea...,delaware,delaware
6,ic__a_e_ron,any polyhedron having twenty plane faces,icosahedron,icosahedron
8,_ala_si_,a constitutional monarchy in southeastern Asia...,malaysia,malaysia
...,...,...,...,...
245,b_ut,(sports,bout,bout
246,hic_e_,a small inflamed elevation of the skin; a pust...,hickey,hickey
247,no,a negative,no,no
248,sol_e_t,a liquid substance capable of dissolving other...,solvent,solvent


In [50]:
results

Unnamed: 0,Masked,Meaning,Answer,Answer with KMeans
0,co__nth,the modern Greek port near the site of the anc...,corinth,
1,_ecei_e,get something; come into possession of,receive,receive
2,_ollag_,a paste-up made by sticking together pieces of...,collage,
3,t__sh,worthless material that is to be disposed of,trash,trash
4,de_aw_r_,a river that rises in the Catskills in southea...,delaware,delaware
...,...,...,...,...
246,hic_e_,a small inflamed elevation of the skin; a pust...,hickey,hickey
247,no,a negative,no,no
248,sol_e_t,a liquid substance capable of dissolving other...,solvent,solvent
249,_ump_,resembling a garbage dump,lumpy,jumps


In [51]:
results.to_csv('results_with_kmeans.csv')

# I had so much fun coding this! I definitely learnt  a lot, any suggestions to improve this are welcome :)