In [1]:
import pandas as pd
from embeds import ppmi

# Preprocessing

In [2]:
# Reshape the wide SWOW table into long (cue, resp) pairs:
# the response columns R1, R2 and R3 are stacked into a single `resp` column.
swow_wide = pd.read_csv('../../data/free_assoc/SWOW-EN.R100.csv')
swow_long = swow_wide.melt(id_vars='cue',
                           value_vars=['R1', 'R2', 'R3'],
                           value_name='resp')
# `variable` only records which of R1/R2/R3 a response came from; it is not used further.
swow_long = swow_long.drop(columns=['variable'])
# Drop rows with a missing cue or response *before* casting to str,
# so that NaN never turns into the literal string 'nan'.
swow = swow_long.dropna(axis=0).astype(str)

swow

Unnamed: 0,cue,resp
0,although,nevertheless
1,deal,no
2,music,notes
3,inform,tell
4,way,path
...,...,...
3684595,strange,stranger
3684596,sunset,clause
3684597,useless,worth
3684598,volume,key


In [3]:
# Keep only responses that occur at least 5 times in the whole table.
# The number of distinct responses is printed before and after the filter.
print(swow['resp'].nunique())
resp_counts = swow['resp'].value_counts().to_dict()
# Look up each row's response frequency, then compare vectorised.
is_frequent = swow['resp'].map(resp_counts) >= 5
swow = swow[is_frequent]
print(swow['resp'].nunique())

134213
32312


# Building Embedding

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
# Pivoting into cue-resp-count df
# 1) count how often each (cue, resp) row occurs
pair_counts = swow.value_counts().reset_index()
# 2) older pandas names the counts column 0; newer pandas already calls it
#    'count', in which case this rename is a no-op
pair_counts = pair_counts.rename(columns={0: 'count'})
# 3) spread into a cue x resp table; pairs that never occur are NaN.
#    No `values=` is passed, so the columns are a ('count', resp) MultiIndex.
swow = pair_counts.pivot(index='cue', columns='resp')

swow

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
resp,!,$,%,'sup,-,-ly,.,.com,0,007,...,zombies,zone,zones,zoo,zoology,zoom,zoom in,zucchini,zygote,zz
cue,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Abel,,,,,,,,,,,...,,,,,,,,,,
Aboriginal,,,,,,,,,,,...,,,,,,,,,,
Adam,,,,,,,,,,,...,,,,,,,,,,
Advil,,,,,,,,,,,...,,,,,,,,,,
Africa,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombies,,,,,,,,,,,...,,,,,,,,,,
zone,,,,,,,,,,,...,,,,,,,,,,
zoo,,,,,,,,,,,...,,,,,2.0,,,,,
zoom,,,,,,,,,,,...,,,,,,2.0,,,,


In [6]:
# Apply the project's `ppmi` helper to the cue x resp count table
# (presumably positive pointwise mutual information -- see embeds.ppmi),
# then replace every remaining NaN cell with 0.0.
swow_ppmi = ppmi(swow)
swow_ppmi = swow_ppmi.fillna(0.0)
swow_ppmi

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
resp,!,$,%,'sup,-,-ly,.,.com,0,007,...,zombies,zone,zones,zoo,zoology,zoom,zoom in,zucchini,zygote,zz
cue,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Abel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
Aboriginal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
Adam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
Advil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
Africa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
zoo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.844534,0.000000,0.0,0.0,0.0,0.0
zoom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,5.351898,0.0,0.0,0.0,0.0


In [9]:
# SVD: project each cue's PPMI row onto 300 components, keeping the cue index.
# random_state is pinned so that Restart & Run All reproduces the same
# embedding: scikit-learn seeds the ARPACK start vector from random_state,
# so an unseeded fit is not guaranteed to be repeatable.
# NOTE(review): confirm this holds for the installed scikit-learn version.
svd = TruncatedSVD(n_components=300, algorithm='arpack', random_state=0)
swow_ppmi_svd = pd.DataFrame(svd.fit_transform(swow_ppmi.values),
                             index=swow_ppmi.index)
swow_ppmi_svd

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
cue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abel,2.596794,0.299556,0.143823,0.967294,-1.288471,0.643886,0.342490,1.972009,0.813801,1.830905,...,0.312574,0.459090,-1.271551,-0.210143,0.287171,-0.146275,-0.129649,0.156938,0.038982,-0.624552
Aboriginal,3.453354,-1.418625,-0.309232,0.004895,-2.848235,1.015647,-1.200593,2.947173,-0.088245,0.923150,...,0.141971,0.653498,-0.648082,-0.143290,0.430632,1.266303,1.949117,-0.386414,0.365505,0.554787
Adam,2.677174,-0.203603,0.138737,1.568495,-1.295039,0.638541,-0.352811,1.274088,0.263471,0.912076,...,0.780021,0.997879,-0.725559,0.011951,0.111412,0.174819,-0.482924,0.155790,0.037018,-0.618753
Advil,1.989590,-1.322967,0.847095,-0.910866,1.733291,-0.558487,0.400698,-0.966160,1.952438,2.605908,...,-0.921765,1.388033,1.425312,-1.285927,-0.368075,-1.062752,0.284415,-0.071896,1.907177,1.389733
Africa,4.718788,-2.920645,-0.462586,-0.754773,-3.635738,1.059833,0.565142,3.903512,0.603554,0.994736,...,-1.289675,-0.646093,3.391136,0.256842,0.880635,-1.097716,-0.170871,-0.405151,-0.171273,-0.387788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombies,4.595400,-1.789345,3.450252,-1.167854,-0.383953,-1.637413,-2.532979,2.006725,0.992685,1.257853,...,-0.163420,-0.744620,-0.509140,0.610003,0.439289,-0.035196,-0.714640,-0.650996,-1.116779,0.401379
zone,4.943383,0.099102,-1.760140,-3.450767,-1.652865,0.129844,1.237090,1.679624,0.098315,-0.505914,...,0.068277,-0.459860,0.552081,0.500685,-0.720834,-0.660114,-0.080828,-0.137270,0.018611,1.301483
zoo,3.914599,-2.112934,-0.080263,-0.844014,-1.830328,-0.480745,-0.113124,2.081342,-0.501086,0.117836,...,-1.738868,0.140101,0.545498,-0.436228,1.063970,-0.725592,-0.711304,0.489148,0.309838,-0.454112
zoom,4.114652,-0.383222,-2.097423,-3.037944,-1.393578,-2.728006,-1.217700,-0.440577,0.397753,-2.757840,...,-0.206498,-0.420049,-1.570691,0.003422,-0.350275,-0.006150,0.117937,-1.013131,1.355192,1.033279


In [10]:
# Keep only the cues that also appear in the psychNorms index, then save.
# The first column of the psychNorms file is used as its index
# (presumably the normed words -- verify against the data file).
psych_norms_index = pd.read_csv(
    '../../data/psychNorms/psychNorms.zip',
    index_col=0,
    low_memory=False,
    compression='zip',
).index
to_pull = set(psych_norms_index)

in_norms = swow_ppmi_svd.index.isin(to_pull)
swow_ppmi_svd = swow_ppmi_svd[in_norms].astype(float)

swow_ppmi_svd.to_csv('../../data/embeds/PPMI_SVD_SWOW.csv')