In [1]:
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os
import itertools
from rsa import compute_rsa
import gc
import json

In [2]:
# Read the mapping from data type to the embedding names of that type
with open('../../data/raw/dtype_to_embed.json') as embed_file:
    dtype_to_embed = json.load(embed_file)

# Flatten the three groups into one list, in the order text -> behavior -> brain
names = [
    embed_name
    for dtype in ('text', 'behavior', 'brain')
    for embed_name in dtype_to_embed[dtype]
]

# Every unordered pair of embeddings that will be compared
name_combs = [*itertools.combinations(names, 2)]
len(name_combs)

325

In [3]:
# Function to process each file pair
def process_file_pair(f_name_i, f_name_j, dir_path):
    """Load two pickled RSMs from ``dir_path`` and compute the RSA between them.

    Parameters
    ----------
    f_name_i, f_name_j : str
        File names (extension included) of the two pickled RSMs.
    dir_path : str
        Directory holding both files. A trailing path separator is optional:
        the paths are built with ``os.path.join`` rather than by string
        concatenation.

    Returns
    -------
    tuple
        ``(corr, n_words)`` exactly as returned by ``compute_rsa``.
    """
    # Load the RSMs
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # pickle files produced by a trusted pipeline.
    rsm_i = pd.read_pickle(os.path.join(dir_path, f_name_i))
    rsm_j = pd.read_pickle(os.path.join(dir_path, f_name_j))

    # Compute RSA
    # NOTE(review): max_n is forwarded to compute_rsa as-is; presumably it caps
    # the number of words used in the comparison -- confirm in the rsa module.
    corr, n_words = compute_rsa(rsm_i, rsm_j, max_n=10000)
    print(f"spearman_r={corr}, n_words={n_words}")
    print('-------------------------------------------------')

    # Drop the local references and force a collection before returning so the
    # two matrices are not kept around while the caller loads the next pair
    del rsm_i, rsm_j
    gc.collect()

    return corr, n_words


# Make the output directory (exist_ok avoids the check-then-create race and
# makes the cell safe to re-run)
output_dir_path = '../../data/final/'
os.makedirs(output_dir_path, exist_ok=True)

# Compute RSA for every pair of names, one pair at a time (sequential loop:
# each iteration loads two RSMs, compares them and releases them again)
rsm_dir_path = '../../data/processed/rsms/'
rsa = []
for name_i, name_j in tqdm(name_combs):
    print(f"{name_i, name_j}")
    spearman, n = process_file_pair(f'{name_i}.pkl', f'{name_j}.pkl', rsm_dir_path)
    rsa.append([name_i, name_j, spearman, n])

# Save the results inside the output directory created above, one row per pair
rsa_df = pd.DataFrame(rsa, columns=['name_i', 'name_j', 'spearman', 'n_words'])
rsa_df.to_csv(os.path.join(output_dir_path, 'rsa.csv'), index=False)

  0%|          | 0/325 [00:00<?, ?it/s]

('CBOW_GoogleNews', 'fastText_CommonCrawl')
spearman_r=0.6317678524987212, n_words=42682
-------------------------------------------------
('CBOW_GoogleNews', 'fastText_Wiki_News')
spearman_r=0.5389631486108302, n_words=42258
-------------------------------------------------
('CBOW_GoogleNews', 'fastTextSub_OpenSub')
spearman_r=0.46191460641278564, n_words=40062
-------------------------------------------------
('CBOW_GoogleNews', 'GloVe_CommonCrawl')
spearman_r=0.49966771829980583, n_words=42660
-------------------------------------------------
('CBOW_GoogleNews', 'GloVe_Twitter')
spearman_r=0.2195737672427991, n_words=32663
-------------------------------------------------
('CBOW_GoogleNews', 'GloVe_Wikipedia')
spearman_r=0.45665182904321394, n_words=39066
-------------------------------------------------
('CBOW_GoogleNews', 'LexVec_CommonCrawl')
spearman_r=0.498118467934779, n_words=41679
-------------------------------------------------
('CBOW_GoogleNews', 'morphoNLM')
spearman_r=0