In [1]:
import json
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
from sklearn.decomposition import TruncatedSVD
from embeds import ppmi

In [2]:
# Load the thesaurus from JSON file (https://github.com/dariusk/ea-thesaurus)
with open('../../data/free_assoc/ea-thesaurus.json') as json_file:
    eat_dict = json.load(json_file)
    
eat_dict["0"]

[{'NOTHING': '13'},
 {'ZERO': '9'},
 {'DEAR': '3'},
 {'BLANK': '2'},
 {'NO': '2'},
 {'OH': '2'},
 {'0': '2'},
 {'<INFINITY>': '1'},
 {'?': '1'},
 {'ANOTHER': '1'},
 {'ANYTHING': '1'},
 {'BINARY': '1'},
 {'CAROL': '1'},
 {'COWBOY': '1'},
 {'GOD': '1'},
 {'HELL': '1'},
 {'KNICKERS': '1'},
 {'LUST': '1'},
 {'MY GOD': '1'},
 {'NIL': '1'},
 {'ODD': '1'},
 {'QUERY': '1'},
 {'QUESTION': '1'},
 {'ROCKETS': '1'},
 {'SHIT': '1'},
 {'SNEAKY': '1'},
 {'UNIVERSE': '1'},
 {'WOW': '1'},
 {'10': '1'},
 {'7': '1'}]

In [3]:
# Finding all unique cues and responses
cues, resps = set(), set()
for key, values in tqdm(eat_dict.items()):
    cues.add(key)
    cue_resps = {list(resp_count.keys())[0] for resp_count in values}
    resps = resps.union(cue_resps)

# Initializing empty dataframe with cues and responses 
eat = pd.DataFrame(index=list(cues), columns=list(resps))
eat.shape

  0%|          | 0/8210 [00:00<?, ?it/s]

(8210, 22764)

In [4]:
# Populating dataframe
for key, values in tqdm(eat_dict.items()):
    for resp_count in values:
        resp = list(resp_count.keys())[0]
        eat.loc[key, resp] = resp_count[resp]

eat = eat.astype(float)
eat

  0%|          | 0/8210 [00:00<?, ?it/s]

Unnamed: 0,Unnamed: 1,REQUIEM,PLASTER,ZOOLOGY,GROWN,STEPMOTHER,SMOKED,GALORE,INLAY,OBSERVER,...,BAILEY,MOTIONS,HICCUP,LACQUER,HEPWORTH'S,ANYHOW,ACTIVITY,BLOC,IMPORTANT,BLOTCH
GHOST,,,,,,,,,,,...,,,,,,,,,,
PLASTER,,,,,,,,,,,...,,,,,,,,,,
ZERO,,,,,,,,,,,...,,,,,,,,,,
ZOOLOGY,,,,,,,,,,,...,,,,,,,,,,
GROWN,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CLERK,,,,,,,,,,,...,,,,,,,,,,
ACTIVITY,,,,,,,,,,,...,,1.0,,,,,,,,
MAJOR,,,,,,,,,,,...,,,,,,,,,1.0,
IMPORTANT,,,,,,,,,,,...,,,,,,,,,2.0,


In [5]:
ppmi_eat = ppmi(eat).fillna(0.0)
ppmi_eat

Unnamed: 0,Unnamed: 1,REQUIEM,PLASTER,ZOOLOGY,GROWN,STEPMOTHER,SMOKED,GALORE,INLAY,OBSERVER,...,BAILEY,MOTIONS,HICCUP,LACQUER,HEPWORTH'S,ANYHOW,ACTIVITY,BLOC,IMPORTANT,BLOTCH
GHOST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
PLASTER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ZERO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ZOOLOGY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
GROWN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CLERK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ACTIVITY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.670195,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
MAJOR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,4.281393,0.0
IMPORTANT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,4.953487,0.0


In [6]:
svd = TruncatedSVD(n_components=300, algorithm='arpack')
ppmi_svd_eat = pd.DataFrame(svd.fit_transform(ppmi_eat.values), index=ppmi_eat.index)
ppmi_svd_eat.index = ppmi_svd_eat.index.str.lower()
ppmi_svd_eat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
ghost,3.143124,-0.806364,0.773935,-1.076283,1.296427,-0.908678,0.120029,2.305159,-1.310765,1.314072,...,1.146717,0.442320,-1.435152,0.269102,1.289817,-0.640381,0.662700,-0.696323,-1.180133,-0.014110
plaster,1.780208,-1.636817,-0.898971,-0.116473,0.285668,0.162246,0.122672,-0.521240,-0.695946,-0.113130,...,1.094155,-0.367318,0.632618,0.291718,-0.129338,0.585291,0.120472,0.143188,-0.394475,0.092080
zero,2.096199,1.567034,-1.103064,0.702470,-0.385038,0.499772,0.002027,0.559389,-0.496517,-0.451998,...,0.338512,0.315372,-0.042440,-0.258977,-0.411403,-0.077875,-0.394323,-0.894552,0.167536,0.167830
zoology,1.907977,-0.424283,0.238040,-1.674460,-0.872163,2.004265,0.786844,0.920888,-0.736738,-3.000628,...,-0.128995,0.943013,0.630946,0.351703,0.607503,-0.451141,0.086735,0.615756,-0.481182,0.493982
grown,2.963737,-1.077703,0.142566,0.030426,-0.928049,-1.123956,-0.837416,-0.775601,-0.460619,-2.145318,...,-0.364134,0.655117,0.637483,-0.402205,-0.373959,-0.134339,0.211661,0.243471,-0.679902,0.117196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
clerk,2.818131,0.104776,-1.297067,-3.798199,-2.619697,0.769770,0.500773,-1.475586,0.609723,-0.294400,...,-0.526537,-0.074440,-0.829320,0.644218,1.080006,-0.320259,0.210638,0.831609,-1.217698,-0.468904
activity,4.256556,0.212074,-1.463351,-1.668155,0.122512,-0.135960,3.046359,0.282145,3.764399,-2.026787,...,0.636082,-0.791405,1.668595,0.424841,1.029997,-0.608965,-0.178420,-0.339075,-0.112263,0.344899
major,1.386894,0.213983,-0.364428,-2.017267,-0.534038,0.086321,-0.648365,-0.237859,0.177452,-0.208857,...,-1.165940,0.002336,-0.408652,-0.602253,0.290610,1.025158,0.031239,-0.303142,0.699018,0.075317
important,5.501912,4.966271,0.023412,0.007275,-1.371886,1.647454,0.631481,0.479704,-0.802232,-0.840455,...,0.652841,1.625469,0.049265,0.402497,-1.232409,1.302855,-0.122502,0.273065,-0.001663,-1.000856


In [7]:
# checking it worked
def find_closest_k(word, k=5):
    word_vec = ppmi_svd_eat.loc[word]
    return ppmi_svd_eat.dot(word_vec).nlargest(k)

find_closest_k('dog')

dog       87.824092
furry     62.784142
animal    61.627352
dogs      59.696172
mice      59.623431
dtype: float64

In [8]:
# Subsetting to only the words in psychNorms norms
to_pull = set(
    pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, low_memory=False, compression='zip').index
)
ppmi_svd_eat = ppmi_svd_eat.loc[ppmi_svd_eat.index.isin(to_pull)].astype(float)

ppmi_svd_eat.to_csv('../../data/embeds/PPMI_SVD_EAT.csv')