In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import minmax_scale
from tqdm.notebook import tqdm_notebook as tqdm

In [2]:
# Load the combined similarity/relatedness ratings in long format
# (one row per dataset / word pair / rating).
SIM_REL_PATH = '../../data/sim_rel/sim_rel_combined.csv'
sims_long = pd.read_csv(SIM_REL_PATH)
sims_long

Unnamed: 0,data,word_1,word_2,value,type
0,AG203,king,cabbage,0.023,similarity
1,AG203,professor,cucumber,0.031,similarity
2,AG203,noon,string,0.054,similarity
3,AG203,chord,smile,0.054,similarity
4,AG203,rooster,voyage,0.062,similarity
...,...,...,...,...,...
27439,Zie30,Eminem,Music,4.137,relatedness
27440,Zie30,Currency Converter,Exchange Rates,4.137,relatedness
27441,Zie30,Cars,Honda,4.176,relatedness
27442,Zie30,EasyJet,Cheap Flights,4.294,relatedness


In [3]:
# Names of the source datasets present in the combined file
pd.unique(sims_long['data'])

array(['AG203', 'BakerVerb', 'MartinezAldana', 'MC30', 'MEN3000', 'RG65',
       'SimLex999', 'SimVerb3500', 'SL7576sem', 'SL7576vis', 'WP300',
       'YP130', 'Atlasify240', 'GM30', 'MT287', 'MT771', 'Rel122',
       'RW2034', 'WordSim353', 'Zie25', 'Zie30'], dtype=object)

In [6]:
# Rescale the ratings of every dataset to [0, 1] separately, so each
# dataset's own minimum maps to 0 and its own maximum maps to 1.
dataset_names = sims_long['data'].unique()
for name in tqdm(dataset_names):
    in_dataset = sims_long['data'].eq(name)
    raw_values = sims_long.loc[in_dataset, 'value']
    # Written back in place: `sims_long['value']` holds scaled ratings afterwards
    sims_long.loc[in_dataset, 'value'] = minmax_scale(raw_values)

sims_long

  0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,data,word_1,word_2,value,type
0,AG203,king,cabbage,0.000000,similarity
1,AG203,professor,cucumber,0.008188,similarity
2,AG203,noon,string,0.031730,similarity
3,AG203,chord,smile,0.031730,similarity
4,AG203,rooster,voyage,0.039918,similarity
...,...,...,...,...,...
27439,Zie30,Eminem,Music,0.944963,relatedness
27440,Zie30,Currency Converter,Exchange Rates,0.944963,relatedness
27441,Zie30,Cars,Honda,0.957090,relatedness
27442,Zie30,EasyJet,Cheap Flights,0.993781,relatedness


In [7]:
# Canonicalise every pair so that (a, b) and (b, a) share one key:
# word_a is the row-wise smaller of the two words, word_b the larger.
# NOTE: this rebinds `sims_long` to a frame without word_1/word_2, so this
# cell can only be re-run after re-running the load and scaling cells.
word_cols = sims_long[['word_1', 'word_2']]
sims_long = pd.DataFrame({
    'word_a': word_cols.min(axis=1),
    'word_b': word_cols.max(axis=1),
    'value': sims_long['value']
})

# Average the scaled ratings of pairs that were rated more than once
sims_long = sims_long.groupby(['word_a', 'word_b'], as_index=False).mean()

# Vocabulary in order of first appearance (word_a entries first, then word_b)
voc = pd.concat([sims_long['word_a'], sims_long['word_b']]).unique()

# Pivot to a symmetric word-by-word matrix.
# The matrix is allocated as float64 NaN and filled with positional,
# vectorised assignment. An empty `pd.DataFrame(index=..., columns=...)`
# would be object dtype, which makes cell-by-cell `.loc` writes slow and
# forces a deprecated object -> float downcast when the frame is filled later.
voc_index = pd.Index(voc)
row_pos = voc_index.get_indexer(sims_long['word_a'])
col_pos = voc_index.get_indexer(sims_long['word_b'])
pair_values = sims_long['value'].to_numpy(dtype=float)

sims_matrix = np.full((len(voc), len(voc)), np.nan)
sims_matrix[row_pos, col_pos] = pair_values  # (a, b) entries
sims_matrix[col_pos, row_pos] = pair_values  # mirrored (b, a) entries
sims_wide = pd.DataFrame(sims_matrix, index=voc, columns=voc)

sims_wide.shape

  0%|          | 0/19230 [00:00<?, ?it/s]

(6837, 6837)

In [8]:
# Keep only columns that have at least MIN_VALUES_PER_COLUMN observed
# (non-missing) values, then treat every remaining missing pair as 0.0.
# Only columns are filtered; rows are all kept, so a word whose observed
# values all sat in dropped columns ends up as an all-zero row.
MIN_VALUES_PER_COLUMN = 5
sims_wide = (
    sims_wide
    .dropna(axis=1, thresh=MIN_VALUES_PER_COLUMN)
    .fillna(0.0)
)
sims_wide.shape

  sims_wide = sims_wide.fillna(0.0)


(6837, 1754)

In [9]:
# Reduce the word-by-word matrix to 300 components with truncated SVD.
# The result keeps one row per word (the index of `sims_wide`).
svd = TruncatedSVD(algorithm='arpack', n_components=300, random_state=42)
embedding = svd.fit_transform(sims_wide)
sims = pd.DataFrame(embedding, index=sims_wide.index)
sims

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
4Chan,1.928169e-07,-2.567416e-07,2.200080e-07,4.151120e-06,3.368289e-07,-3.115937e-08,8.713444e-08,-1.543855e-07,-1.316228e-07,-4.946256e-07,...,7.257943e-05,-1.205384e-07,2.190560e-05,-4.154759e-06,-3.204082e-05,1.745400e-05,2.649955e-07,2.506515e-05,-3.121751e-05,4.464578e-05
ABBA,-1.172135e-16,9.959386e-17,2.229186e-17,1.959649e-17,2.779049e-17,1.119426e-16,-8.561995e-17,-2.308180e-16,1.537380e-17,-1.337232e-17,...,8.113083e-17,-3.676039e-17,9.797920e-17,-5.636274e-18,-2.220127e-17,-8.514396e-17,-9.072177e-17,1.648171e-16,2.454819e-29,-1.854649e-31
AOL Instant Messenger,1.633085e-11,-4.801111e-11,4.397708e-11,9.065854e-10,8.777473e-11,-1.232771e-11,3.959957e-11,-7.903366e-11,-7.442125e-11,-3.056925e-10,...,-8.072857e-06,2.356529e-06,-5.161654e-06,-1.951415e-06,2.963532e-06,2.961440e-07,2.827138e-06,-5.040901e-07,2.416639e-06,-4.936657e-06
Abortion,-2.317434e-17,6.452515e-17,1.963720e-16,2.321183e-16,-1.564090e-16,-7.956750e-18,2.506332e-16,-1.194685e-16,1.571083e-17,1.078124e-16,...,-1.884009e-16,-2.319248e-17,1.491355e-16,-6.790407e-17,-9.248179e-17,-7.466715e-17,7.078841e-29,1.141705e-29,-4.329082e-30,2.534644e-31
Africa,8.170211e-07,-1.100324e-06,9.439845e-07,1.783941e-05,1.452384e-06,-1.355746e-07,3.804035e-07,-6.760704e-07,-5.779222e-07,-2.177043e-06,...,6.702510e-04,-1.614633e-06,2.034998e-04,-3.831733e-05,-3.003825e-04,1.640447e-04,2.104760e-06,2.377431e-04,-2.974277e-04,4.265574e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
winners,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,...,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00
weekend,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,...,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00
willingness,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,...,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00
wireless network,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,...,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00


In [10]:
# Write the SVD embeddings to disk (the word index is written as the first column)
EMBEDS_PATH = '../../data/embeds/SVD_sim_rel.csv'
sims.to_csv(EMBEDS_PATH)