<h1><center> Computational Lexical Semantics</center></h1>
<h5><center>LIN5580-SEM1-A-1718</center></h5>
<h3><center>Jovana Urosevic and Angelo Basile</center></h3>
<h5><center>January 31, 2018</center></h5>

-----

### Instruction on how to use this notebook to replicate the results:

1. download and install python 2
2. download and install [pipenv](https://docs.pipenv.org/)
3. install cython separately: ```pipenv install cython```
4. install the requirements from the Pipfile
5. active the virtualenvironment: ```pipenv shell```
6. ```cd dissect-master```
7. install dissect: ```pipenv run python2 setup.py install```
8. deactive the virtual environment:```exit```
9. run the notebook: ```pipenv run jupyter notebook```

#### Extra

For the POS-tagging tasks, it is required to download language models.

- ```pipen run python2 -m spacy download en``` for downloading the English model
- ```pipen run python2 -m spacy download es``` for downloading the Spanish model

In [58]:
# EXTRACTION PROGRAM
# monolingual
import spacy
import codecs
from __future__ import print_function
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to tag the text
def tag(token):
    word = token.text
    tag = token.tag_
    return word+'_'+tag

# load the imbdb corpus
import thinc.extra.datasets
data, _ = thinc.extra.datasets.imdb()
texts, _ = zip(*data[-10000:])

# filter the text based on POS
filtered = []
for doc in texts:
    filtered.append(' '.join(w.text for w in nlp(doc) if w.pos_ in ['NOUN', 'VERB']))

# build the frequency distribution for words
from collections import Counter
word_freq = Counter()
for t in filtered:
    word_freq.update(t.split())

# use only the 1550 most frequent words
vocab = [x[0] for x in word_freq.most_common(1550)]
assert len(vocab) == 1550

#build the wordXword matrix
from sklearn.feature_extraction.text import CountVectorizer
docs = filtered
count_model = CountVectorizer(vocabulary=vocab)
X = count_model.fit_transform(docs)
Xc = (X.T * X) 
Xc.setdiag(0)

# invert the vocabulary
di = dict([[v,k] for k,v in count_model.vocabulary_.items()])

# save the output in a dissect compatible format
with open('mono.dm',"w") as o:
    for word, counts in zip(vocab, Xc.toarray()):
        o.write(word.encode('utf-8')+' '+" ".join(map(str, counts)))
        o.write('\n')

with open('mono.rows', 'w') as o:
    for w in vocab:
        o.write(w.encode('utf-8'))
        o.write('\n')

# note: this is the same as before, since we are using all the words as context
with open('mono.cols', 'w') as o:
    for w in vocab:
        o.write(w.encode('utf-8'))
        o.write('\n')

# BUILD THE DSM
from composes.semantic_space.space import Space

#create a space from co-occurrence counts in dense format
mono = Space.build(data = "./mono.dm",
                   rows = "./mono.rows",
                   cols = "./mono.cols",
                   format = "dm")

# extract similar words
from composes.similarity.cos import CosSimilarity
# 5 most common
for w in word_freq.most_common(5):
    print(mono.get_neighbours(w[0], 5, CosSimilarity()))



[('is', 1.0000000000000002), ('seemed', 0.8099054023790865), ('figured', 0.8078217152278638), ('hated', 0.7998818532597823), ('hip', 0.798432862050815)]
[('was', 1.0), ('fails', 0.9583693827602454), ('feel', 0.9565436643990166), ('walk', 0.9545529508846556), ('makes', 0.9544440477548656)]
[('movie', 0.9999999999999999), ('show', 0.9789815854478494), ('allowed', 0.9726192774757002), ('accent', 0.9722384866746526), ('spot', 0.9721894717725857)]
[('film', 1.0), ('show', 0.9762035295747843), ('baby', 0.9733688913493664), ('rescue', 0.9728059173958485), ('knows', 0.9722895597600173)]
[('is', 0.0), ('was', 0.0), ('movie', 0.0), ('film', 0.0), ("'s", 0.0)]


In [63]:
# 5 least common
for w in word_freq.most_common(1500)[-5:]:
    pprint.pprint(mono.get_neighbours(w[0], 5, CosSimilarity()))

[('according', 0.9999999999999999),
 ('everything', 0.9931032867180424),
 ('place', 0.9928814512460278),
 ('says', 0.9926084092220174),
 ('keep', 0.9923692240019467)]
[('serve', 1.0),
 ('example', 0.9884668620184874),
 ('appears', 0.9881421908147624),
 ('seems', 0.9881201223258333),
 ('nature', 0.9880742693529966)]
[('breaking', 1.0),
 ('work', 0.9889140793499385),
 ('taken', 0.9885708775893547),
 ('today', 0.9884574294589742),
 ('number', 0.9882349708414532)]
[('favorites', 1.0),
 ('watching', 0.9783481244248106),
 ('loved', 0.9778277195482216),
 ('opinion', 0.9771952444770546),
 ('enjoyed', 0.9770937336798221)]
[('mistakes', 1.0),
 ('means', 0.9868802609693922),
 ('makes', 0.986564466448231),
 ('miss', 0.9861086700731426),
 ('kind', 0.9859963407857474)]


In [73]:
mono.get_neighbours('get', 15, CosSimilarity())

[('get', 1.0),
 ('everyone', 0.9964144093340506),
 ('mind', 0.9962880682030615),
 ('believe', 0.9962865266548062),
 ('come', 0.9962714101107594),
 ('give', 0.9962699881148831),
 ('reason', 0.9962526357871062),
 ('thing', 0.9962040639305845),
 ('look', 0.9961682074477383),
 ('things', 0.9960742493531183),
 ('end', 0.9960663558282294),
 ('take', 0.9960522935950892),
 ('anything', 0.9960490497511368),
 ('something', 0.9960261419004385),
 ('ca', 0.9959793968425902)]

In [None]:
# apply a transformation
# transformation
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
transformed = mono.apply(PpmiWeighting())

import pprint
for w in word_freq.most_common(5):
    pprint.pprint(transformed.get_neighbours(w[0], 5, CosSimilarity()))

In [88]:
import pandas as pd
import codecs
with codecs.open('./data/c.clean.en/data', 'r', encoding='utf-8') as f:
    en = f.read().lower().splitlines()

with codecs.open('./data/c.clean.es/data', 'r', encoding='utf-8') as f:
    es = f.read().lower().splitlines()

with open('./data/en-es_aligned.intersect/data', 'r') as f:
    align = f.read().splitlines()
    
# check shapes
assert len(en) == len(es)
assert len(en) == len(align)
print(len(en))

1997883


In [None]:
en_lemmas = []
es_lemmas = []
for doc in en:
    en_lemmas.append(' '.join(w.lemma_ for w in nlp(doc)))
    

In [None]:
for doc in es:
    es_lemmas.append(' '.join(w.lemma_ for w in nlp(doc)))

In [None]:
# filter the text based on POS
en_tagged = []
es_tagged = []
for doc in en:
    en_tagged.append(' '.join(w.text+'_'+w.pos_ for w in nlp(doc)))
for doc in es:
    en_tagged.append(' '.join(w.text+'_'+w.pos_ for w in nlp(doc)))

In [None]:
# build the frequency distribution for words
from collections import Counter
word_freq = Counter()
for t in filtered:
    word_freq.update(t.split())

In [None]:
# use only the 1550 most frequent words
vocab = [x[0] for x in word_freq.most_common(1550)]
assert len(vocab) == 1550

corpus = zip(en, es, align)
assert len(corpus) == len(en)
len(corpus)

from collections import defaultdict
mm = defaultdict(int)

for x,y,z in corpus:
    zs = z.split(' ')
    for words in zs:
        try:
            mm[x.split()[int(words.split('-')[0])],
               y.split()[int(words.split('-')[1])]] += 1
        except:
            pass

en = []
es = []
count = []
for k,v in mm.items():
    en.append(k[0])
    es.append(k[1])
    count.append(v)

assert len(en) == len(es)
assert len(en) == len(count)

sm = pd.DataFrame([en,es,count]).T
print(sm.head())

sm.to_csv('parallel.sm', index=False, sep=' ', header=False)

with open('parallel.rows',"w") as o:
    for word in en:
        o.write(word)
        o.write('\n')

with open('parallel.cols',"w") as o:
    for word in es:
        o.write(word)
        o.write('\n')

In [132]:
# parallel
from composes.semantic_space.space import Space

#create a space from co-occurrence counts in sparse format
parallel = Space.build(data = "./parallel.sm",
                       rows = "./parallel.cols",
                       cols = "./parallel.rows",
                       format = "sm")

In [52]:
# build the frequency distribution for words
from collections import Counter
wf = Counter()
for t in es:
    wf.update(t.split())

In [53]:
wf.most_common(5)

[('de', 4223),
 (',', 2956),
 ('que', 2868),
 ('a', 2563),
 ('se', 2518),
 ('la', 2497),
 ('los', 2431),
 ('las', 2220),
 ('en', 2168),
 ('no', 1869),
 ('una', 1645),
 ('un', 1497),
 ('el', 1469),
 ('con', 1343),
 ('para', 955),
 ('m\xc3\xa1s', 913),
 ('y', 872),
 ('por', 796),
 ('contra', 778),
 ('es', 691),
 ('ha', 664),
 ('parte', 625),
 ('muy', 618),
 ('sobre', 618),
 ('bien', 591),
 ('sin', 571),
 ('entre', 564),
 ('lo', 563),
 ('su', 531),
 ('gran', 524),
 ('respecto', 518),
 ('hacer', 516),
 ('al', 515),
 ('del', 501),
 ('forma', 478),
 ('hecho', 447),
 ('poco', 442),
 ('hace', 441),
 ('como', 441),
 ('esta', 429),
 ('todo', 417),
 ('tambi\xc3\xa9n', 412),
 ('conseguir', 385),
 ('ser', 373),
 ('punto', 368),
 ('importante', 366),
 ('posible', 364),
 ('est\xc3\xa1', 364),
 ('tiene', 363),
 ('son', 356)]

In [118]:
# 5 most common
for w in wf.most_common()[-50:-45]:
    try:
        pprint.pprint(parallel.get_neighbours(w[0], 5, CosSimilarity()))
    except:
        pass

In [61]:
parallel.get_neighbours('bien', 15, CosSimilarity())

[('well-armed', 1.0),
 ('well-kept', 1.0),
 ('well-hidden', 1.0),
 ('welldefined', 1.0),
 ('bien', 1.0),
 ('well-anchored', 1.0),
 ('well-sustained', 1.0),
 ('well-oriented', 1.0),
 ('well-reasoned', 1.0),
 ('well-laid', 1.0),
 ('well-identified', 1.0),
 ('well-mapped', 1.0),
 ('well-spaced', 1.0),
 ('well-clothed', 1.0),
 ('well-applied', 1.0)]

In [83]:
parallel.get_neighbours('casa', 15, CosSimilarity())

[('housecleaning', 1.0),
 ('house-by-house', 1.0),
 ('homewards', 1.0),
 ('home-worker', 1.0),
 ('house-label', 1.0),
 ('homemakers', 1.0),
 ('homebuyers', 1.0),
 ('house-to-house', 1.0),
 ('home-spun', 1.0),
 ('home-to-home', 1.0),
 ('casa', 0.9999999999999999),
 ('housewives', 0.9958705948858223),
 ('housewife', 0.9901475429766743),
 ('marries', 0.9486832980505138),
 ('housekeepers', 0.8944271909999159)]

In [46]:
parallel.get_neighbours('tinto', 15, CosSimilarity())

[('tinto', 1.0),
 ('red', 0.057016996594512454),
 ('out', 0.0),
 ('euro-apathy', 0.0),
 ('mrs', 0.0),
 ('danger', 0.0),
 ('discouraging', 0.0),
 ('allow', 0.0),
 ('settings', 0.0),
 ('readjustments', 0.0),
 ('transposing', 0.0),
 ('inter-pillar', 0.0),
 ('achieve', 0.0),
 ('securing', 0.0),
 ('willing', 0.0)]

In [136]:
parallel.get_neighbours('hacer', 15, CosSimilarity())

[('deliberados', 0.0),
 ('euroapat\xc3\xada', 0.0),
 ('eluned', 0.0),
 ('amenazados', 0.0),
 ('desalentadora', 0.0),
 ('autoricemos', 0.0),
 ('escenarios', 0.0),
 ('dispuesta', 0.0),
 ('traslaci\xc3\xb3n', 0.0),
 ('entre', 0.0),
 ('conseguirla', 0.0),
 ('logros', 0.0),
 ('desean', 0.0),
 ('adhieran', 0.0),
 ('respetad\xc3\xadsima', 0.0)]

In [67]:
from numpy import dot
from numpy.linalg import norm
import spacy

cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))

nlp = spacy.load('en_core_web_lg')

king = nlp.vocab[u'king']
man = nlp.vocab[u'man']
woman = nlp.vocab[u'woman']

result = king.vector - man.vector + woman.vector

vocabulary = [w for w in nlp.vocab if w.has_vector and w.orth_.islower() and w.lower_ not in ['king','man','woman']]

vocabulary.sort(key=lambda w: cosine(w.vector, result))
vocabulary.reverse()

print(vocabulary[0].orth_)

queen
