In [None]:
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os
import json
from rsa import compute_rsm

In [None]:
# Load the JSON mapping from data type (the keys used in this notebook are
# 'brain', 'behavior' and 'text') to the names of the embeddings of that type.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('../../data/dtype_to_embed.json', 'r', encoding='utf-8') as f:
    dtype_to_embed = json.load(f)

brain_behav_names = dtype_to_embed['brain'] + dtype_to_embed['behavior']

# Build the union of the vocabularies (row labels) of all brain and behavior
# embeds. Per the original author's note, the embeds have already been
# subsetted to their intersection with the union of all the norm vocabs.
embeds_path = '../../data/embeds/'
brain_behav_union = set()
for name in tqdm(brain_behav_names):
    # Only the row labels are needed, so parse just the first CSV column
    # instead of loading the whole embedding matrix.
    vocab = set(pd.read_csv(embeds_path + name + '.csv', index_col=0, usecols=[0]).index)
    # In-place update avoids re-copying the growing union on every iteration.
    brain_behav_union.update(vocab)

# Displayed as the cell output: size of the combined brain/behavior vocabulary.
len(brain_behav_union)

In [None]:
# Create the output directory. exist_ok=True makes this a no-op when the
# directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
output_dir = '../../data/rsms/'
os.makedirs(output_dir, exist_ok=True)

# Compute one RSM per embedding and pickle it to output_dir.
# Brain and behavior embeds are processed first, then the text embeds.
order_to_compute = dtype_to_embed['brain'] + dtype_to_embed['behavior'] + dtype_to_embed['text']
for embed_name in tqdm(order_to_compute):
    print(embed_name)

    path = os.path.join(embeds_path, embed_name + '.csv')
    embed = pd.read_csv(path, index_col=0)

    # Heuristic: a non-square table is treated as a word embedding
    # (rows = words, columns = dimensions); a square table is assumed to
    # already be an RSM and is saved unchanged.
    # NOTE(review): an embedding whose word count happens to equal its
    # dimensionality would be mistaken for an RSM -- confirm this cannot occur.
    if embed.shape[0] != embed.shape[1]:
        # Restrict to words that occur in at least one brain/behavior vocab
        # before computing the RSM.
        embed = embed[embed.index.isin(brain_behav_union)]
        print(embed.shape)
        embed = compute_rsm(embed)

    embed.to_pickle(os.path.join(output_dir, f'{embed_name}.pkl'))