In [1]:
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os
import json
from rsa import compute_rsm

In [2]:
# Read the JSON mapping from data type to the embedding names of that type
with open('../../data/raw/dtype_to_embed.json', 'r') as json_file:
    dtype_to_embed = json.load(json_file)

# Names of every embedding listed under the 'brain' or 'behavior' data types
brain_behav_names = dtype_to_embed['brain'] + dtype_to_embed['behavior']

# Walk the raw embedding files and accumulate the union of the row-index
# vocabularies of those whose name is a brain or behavior embedding
embeds_path = '../../data/raw/embeds/'
brain_behav_union = set()
for f_name in tqdm(os.listdir(embeds_path)):
    # The embedding name is the part of the file name before the first dot
    embed_name = f_name.split('.')[0]
    if embed_name not in brain_behav_names:
        continue

    # First CSV column is the index; its labels are this embedding's vocabulary
    vocab_index = pd.read_csv(embeds_path + f_name, index_col=0).index
    brain_behav_union.update(vocab_index)

# Size of the combined vocabulary (displayed as the cell output)
len(brain_behav_union)

  0%|          | 0/26 [00:00<?, ?it/s]

45882

In [3]:
# Creating output directory. exist_ok=True makes this idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
output_dir = '../../data/processed/rsms/'
os.makedirs(output_dir, exist_ok=True)

# Computing RSMs: one output per file found in embeds_path
for f_name in tqdm(os.listdir(embeds_path)):
    print(f_name)

    # First CSV column is used as the row index
    embed = pd.read_csv(embeds_path + f_name, index_col=0)

    if embed.shape[0] != embed.shape[1]: # If not square, then it's a word embedding (not a RSM)
        # Keep only rows whose index label is in the brain/behavior vocabulary union
        embed = embed[embed.index.isin(brain_behav_union)]
        print(embed.shape)
        # compute_rsm comes from the project's rsa module; presumably it returns
        # the representational similarity matrix of the rows -- TODO confirm
        embed = compute_rsm(embed)

    # Save under output_dir (single source of truth for the destination) rather
    # than a second hard-coded copy of the path.
    # NOTE(review): the pickle keeps the input's file name, so it still ends in
    # '.csv'; left unchanged here because downstream readers may rely on it.
    embed.to_pickle(os.path.join(output_dir, f_name))

  0%|          | 0/26 [00:00<?, ?it/s]

fMRI_text_hyper_align.csv
(1205, 1000)
norms_sensorimotor.csv
(36854, 11)
EEG_text.csv
(3355, 104)
LexVec_CommonCrawl.csv
(43727, 300)


KeyboardInterrupt: 