In [1]:
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os
import itertools
from rsa import compute_rsa
from joblib import Parallel, delayed
import gc
import json
import numpy as np

In [2]:
# Read the mapping from data type to the embedding names of that type.
with open('../../data/raw/dtype_to_embed.json') as f:
    dtype_to_embed = json.load(f)

# Pool the names in a fixed order (text, behavior, brain), then enumerate
# every unordered pair of distinct names.
names = [
    embed_name
    for dtype in ('text', 'behavior', 'brain')
    for embed_name in dtype_to_embed[dtype]
]
name_combs = [*itertools.combinations(names, 2)]
len(name_combs)

300

In [None]:
# Function to process each file pair
def process_file_pair(name_i, name_j, dir_path):
    """Load two pickled RSMs and compute the RSA score between them.

    Args:
        name_i: Base name of the first RSM file, without the ``.pkl``
            extension (the extension is appended here).
        name_j: Base name of the second RSM file, without the ``.pkl``
            extension.
        dir_path: Path prefix that is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        A list ``[name_i, name_j, corr, n_words]`` where ``corr`` and
        ``n_words`` are the two values returned by ``compute_rsa`` and the
        names are exactly the ones passed in.
    """
    print(f"{name_i, name_j}")

    # Load the RSMs
    rsm_i = pd.read_pickle(dir_path + name_i + '.pkl')
    print(rsm_i.shape)
    rsm_j = pd.read_pickle(dir_path + name_j + '.pkl')
    print(rsm_j.shape)

    # Compute RSA
    corr, n_words = compute_rsa(rsm_i, rsm_j, max_n=10000)

    # The names are passed in without a file extension, so they are reported
    # as-is. Slicing off the last four characters (as if stripping '.pkl')
    # would truncate the actual name.
    print(f"spearman_r={corr}, n_words={n_words}")
    print('-------------------------------------------------')

    # Free memory by deleting the RSMs before the next pair is loaded
    del rsm_i, rsm_j
    gc.collect()

    return [name_i, name_j, corr, n_words]

# Input and output locations
rsm_dir_path = '../../data/processed/rsms/'
output_dir_path = '../../data/final/'

# Make the output directory; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
os.makedirs(output_dir_path, exist_ok=True)

# Compute RSA for every pair of names. Pairs are processed one at a time
# (sequentially), with tqdm showing progress.
rsa = [
    process_file_pair(name_i, name_j, rsm_dir_path)
    for name_i, name_j in tqdm(name_combs)
]

# Save the results into the output directory created above
rsa_df = pd.DataFrame(rsa, columns=['name_i', 'name_j', 'spearman', 'n_words'])
rsa_df.to_csv(os.path.join(output_dir_path, 'rsa.csv'), index=False)

  0%|          | 0/300 [00:00<?, ?it/s]

('CBOW_GoogleNews', 'fastText_CommonCrawl')
(42522, 42522)
(44091, 44091)
Filled self-correlations with nan
spearman_r=0.6342034857810949, n_words=42374
-------------------------------------------------
('CBOW_GoogleNews', 'fastText_Wiki_News')
(42522, 42522)
