In [1]:
import pandas as pd
from embeds import ppmi

## Preprocessing

In [2]:
# Load the South Florida cue/response pairs, keeping only the two columns used
# below and giving them short lower-case names.
# NOTE: the ' TARGET' key carries a leading space, matching the header that
# `usecols` selects from the raw CSV.
sf_columns = {'CUE': 'cue', ' TARGET': 'resp'}
sf = pd.read_csv(
    '../../data/embed_train/south_florida.csv', usecols=list(sf_columns)
).rename(columns=sf_columns)
sf

Unnamed: 0,cue,resp
0,A,B
1,A,ALPHABET
2,A,THE
3,A,GRADE
4,A,LETTER
...,...,...
72171,ZUCCHINI,EGGPLANT
72172,ZUCCHINI,FRUIT
72173,ZUCCHINI,GROSS
72174,ZUCCHINI,ITALIAN


In [3]:
# Drop every response that occurs only once in the dataset, printing the
# number of distinct responses before and after the filter.
n_resp_before = len(sf['resp'].unique())
print(n_resp_before)
resp_counts = sf['resp'].value_counts().to_dict()
seen_at_least_twice = sf['resp'].map(resp_counts).ge(2)
sf = sf.loc[seen_at_least_twice]
print(len(sf['resp'].unique()))

10469
6129


## PPMI SVD

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
# Count each (cue, resp) pair, then spread the counts into a cue x resp table.
pair_counts = sf.value_counts().reset_index()
# Older pandas labels the counts column 0, newer releases already call it
# 'count'; the rename makes both cases end up with the same column name.
pair_counts = pair_counts.rename(columns={0: 'count'})
# No `values=` is given, so the result has ('count', resp) MultiIndex columns.
sf = pair_counts.pivot(index='cue', columns='resp')
sf.shape

(5018, 6129)

In [6]:
# Run the count table through `ppmi` (imported from the local `embeds` module;
# presumably positive PMI - confirm there), then replace missing cells with
# 0.0 and force a float dtype.
ppmi_raw = ppmi(sf)
ppmi_sf = ppmi_raw.fillna(0.0).astype(float)
# Preview the top-left corner of the count table (this shows `sf`, not `ppmi_sf`).
sf.iloc[:10, :10]

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count
resp,25 CENTS,7-ELEVEN,A,ABANDON,ABBREVIATION,ABC,ABIDE,ABILITY,ABLE,ABNORMAL
cue,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
A,,,,,,,,,,
AARDVARK,,,,,,,,,,
ABDOMEN,,,,,,,,,,
ABDUCT,,,,,,,,,,
ABILITY,,,,,,,,,1.0,
ABLE,,,,,,,,1.0,,
ABNORMAL,,,,,,,,,,
ABOVE,,,,,,,,,,
ABSENCE,,,,,,,,,,
ABSENT,,,,,,,,,,


In [7]:
# Reduce the PPMI matrix to dense 300-dimensional embeddings, one row per cue.
# The ARPACK solver is initialised from a random start vector, so the seed is
# pinned to keep the embeddings identical across Restart & Run All.
svd = TruncatedSVD(n_components=300, algorithm='arpack', random_state=0)
ppmi_svd_sf = pd.DataFrame(
    svd.fit_transform(ppmi_sf),
    index=sf.index.str.lower()  # row labels are the cues, lower-cased
)
ppmi_svd_sf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
cue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,0.968818,-0.516965,-0.662094,1.133238,0.150394,0.211033,0.151098,0.001893,-0.136514,-0.078739,...,0.085147,0.289515,0.390023,-0.095107,-0.554990,0.387167,0.141066,0.237608,0.103889,0.550102
aardvark,1.235736,0.584157,0.078721,-0.298430,-0.668566,0.305736,-0.706096,1.095671,-2.011559,-0.323914,...,-0.712772,0.662506,0.486665,0.043652,0.067880,0.487528,-0.676963,0.044486,-0.094738,0.054499
abdomen,1.165667,0.045823,0.833086,-0.258936,-0.256806,0.984543,-1.455135,-0.718998,0.513940,-0.213127,...,-0.747342,-0.282365,0.138461,0.064205,0.048690,-0.286551,1.155925,0.183626,0.269733,-0.384604
abduct,0.897200,-0.539818,0.139589,-0.107784,0.509113,-0.905412,-0.067202,-0.056037,0.366422,-0.076956,...,-0.398564,0.212886,-0.118456,-0.085660,0.203745,-0.023937,0.881739,-0.589141,0.009632,0.296884
ability,1.602899,-1.223389,-1.164496,0.827359,0.251059,0.804518,-0.034790,-0.007837,0.884569,-0.563494,...,-0.008285,0.396632,-0.203328,0.691478,1.413046,0.884936,-0.274899,-0.034012,-0.659182,-0.831511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zipper,1.400228,0.506143,-0.714224,-1.344043,-0.211120,-0.546714,-0.761437,-1.304518,0.007818,0.173199,...,0.079078,-0.282321,-1.003569,-0.112786,0.788214,-0.481913,0.435197,0.631677,-0.201884,-0.243150
zit,1.316361,1.197682,0.820141,0.041448,-0.210493,0.205414,-0.460607,-0.697768,-1.265050,-0.465189,...,-0.377835,0.665946,0.154780,-0.933315,-0.122098,0.704564,0.168952,-0.216047,-0.194981,-0.160195
zone,2.396152,-0.445935,-1.141692,-1.088805,0.913552,0.355692,-0.209030,0.682715,0.471312,-1.714680,...,0.570625,0.980730,0.387639,0.503277,0.718129,1.118771,-1.217579,-1.274145,0.538615,0.160385
zoo,0.786850,0.578637,0.277241,-0.450464,-0.262061,-0.534664,-0.355080,0.586022,-1.607770,-1.440782,...,0.301800,-0.082206,-0.234388,0.519533,0.757565,0.827374,0.344871,0.304993,0.350342,-0.779548


In [8]:
# checking it worked
def find_closest_k(word, k=5, embeds=None):
    """Return the k rows scoring highest against `word`'s embedding.

    Parameters
    ----------
    word : label
        Row label of the query word in the embedding table.
    k : int, default 5
        Number of rows to return.
    embeds : pandas.DataFrame, optional
        Embedding table with one row per word. When omitted, the
        notebook-level `ppmi_svd_sf` is looked up at call time, which is the
        original behaviour.

    Returns
    -------
    pandas.Series
        Scores indexed by word, sorted in descending order.

    Raises
    ------
    KeyError
        If `word` is not a row label of the embedding table.

    Notes
    -----
    The score is a raw dot product, not cosine similarity, so rows with a
    large norm are favoured. The query word itself is not excluded from the
    result.
    """
    if embeds is None:
        # Fall back to the notebook's global table, resolved at call time.
        embeds = ppmi_svd_sf
    word_vec = embeds.loc[word]
    return embeds.dot(word_vec).nlargest(k)

# Sanity check: show the five cues scoring highest against 'dog'
find_closest_k('dog')

cue
dog        59.967436
paw        56.222049
animal     49.175692
animals    48.215066
pet        47.111557
dtype: float64

In [9]:
# Keep only the embeddings whose word also appears in the psychNorms index,
# then write the result to disk.
norms_index = pd.read_csv(
    '../../data/psychNorms/psychNorms.zip',
    index_col=0, low_memory=False, compression='zip'
).index
to_pull = set(norms_index)
in_norms = ppmi_svd_sf.index.isin(to_pull)
ppmi_svd_sf = ppmi_svd_sf.loc[in_norms].astype(float)

ppmi_svd_sf.to_csv('../../data/embeds/PPMI_SVD_South_Florida.csv')