In [8]:
import json
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
from sklearn.decomposition import TruncatedSVD
from embeds import ppmi

In [None]:
# Load the thesaurus from JSON file (https://github.com/dariusk/ea-thesaurus)
with open('../../data/embeds_train/ea-thesaurus.json') as json_file:
    eat_dict = json.load(json_file)
    
eat_dict["0"]

In [None]:
# Finding all unique cues and responses
cues, resps = set(), set()
for key, values in tqdm(eat_dict.items()):
    cues.add(key)
    cue_resps = {list(resp_count.keys())[0] for resp_count in values}
    resps = resps.union(cue_resps)

# Initializing empty dataframe with cues and responses 
eat = pd.DataFrame(index=list(cues), columns=list(resps))
eat.shape

In [None]:
# Populating dataframe
for key, values in tqdm(eat_dict.items()):
    for resp_count in values:
        resp = list(resp_count.keys())[0]
        eat.loc[key, resp] = resp_count[resp]

eat = eat.astype(float)
eat

In [None]:
ppmi_eat = ppmi(eat).fillna(0.0)
ppmi_eat

In [10]:
svd = TruncatedSVD(n_components=300, algorithm='arpack')
ppmi_svd_eat = pd.DataFrame( svd.fit_transform(ppmi_eat.values), index=ppmi_eat.index)
ppmi_svd_eat.index = ppmi_svd_eat.index.str.lower()
ppmi_svd_eat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
GIGGLE,2.167927,0.375563,2.758483,0.854133,-0.667614,-1.641266,1.306137,-0.791301,-0.360371,0.190121,...,-0.405488,0.085836,-0.016532,0.178211,-0.443269,-0.301608,-0.418918,-0.583633,-0.117077,-0.153281
SUBURBIA,2.931710,-0.949029,-0.570196,-1.893571,-1.090748,-1.507408,-1.567403,2.263327,2.255524,-3.446940,...,0.527875,1.844376,-1.398774,-0.332089,-0.092459,0.867476,-0.360491,0.573079,-0.205977,0.189599
TING,1.347815,-0.678988,-0.132480,0.444549,0.219535,0.611710,-0.075654,0.170958,0.036595,1.380967,...,0.534689,0.482910,0.155562,0.360247,0.725048,0.239356,-0.266341,-1.037193,0.599160,-0.215805
LOTION,2.054189,-1.811100,0.736915,0.592254,0.600783,1.754077,1.074092,-0.168595,-0.645175,0.567000,...,-0.674530,-1.155365,-0.068777,0.388419,-0.561606,0.287357,-0.543839,0.063403,-0.238251,0.898187
SHOULDER,2.590813,-2.103426,-0.534780,0.592822,1.042483,0.320478,0.681497,-2.391483,-1.267528,-0.445545,...,0.438396,-0.602145,1.528370,-0.348873,-0.571695,0.109613,-0.932463,-0.492410,0.251181,-0.148256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TANGLED,1.651795,-1.397709,-0.571897,0.186481,0.834872,0.191213,0.731303,-1.003719,-1.298051,0.033736,...,-1.314355,-0.332342,-0.410520,-0.294566,-0.828750,0.568998,-0.935828,0.812024,-0.624747,1.776850
SPANNER,1.491090,-1.329350,-1.114769,-0.885721,0.241525,0.687041,0.457015,-0.975430,-0.524517,-0.191444,...,-0.665559,-0.252784,-0.341754,0.053111,-2.064653,-0.065839,-0.311614,-0.568724,-0.890954,0.424696
LEPER,2.763657,-0.801647,2.158743,-0.220582,2.691216,0.930539,-0.776617,0.025627,-0.987777,-0.936058,...,1.027818,-0.286792,-0.287484,1.142807,0.174168,0.270343,-1.693439,-0.597462,-0.399247,-0.186600
MISTRUST,4.303057,3.127893,3.208775,-0.981809,2.564949,0.401537,-3.019214,-0.542838,-1.917662,2.023895,...,-0.707933,0.639987,-0.798199,0.593236,0.638008,-0.922726,-1.145198,-0.494845,0.945255,-0.189797


In [13]:
# checking it worked
def find_closest_k(word, k=5):
    word_vec = ppmi_svd_eat.loc[word]
    return ppmi_svd_eat.dot(word_vec).nlargest(k)

find_closest_k('dog')

dog       87.824092
furry     62.784142
animal    61.627352
dogs      59.696172
mice      59.623431
dtype: float64

In [14]:
# Subsetting to only the words in psychNorms norms
to_pull = set(
    pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, low_memory=False, compression='zip').index
)
ppmi_svd_eat = ppmi_svd_eat.loc[ppmi_svd_eat.index.isin(to_pull)].astype(float)

ppmi_svd_eat.to_csv('../../data/embeds/PPMI_SVD_EAT.csv')