In [3]:
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os
import json
from rsa import compute_rsm
import numpy as np

In [4]:
# Load the mapping from data type (e.g. 'brain', 'behavior') to embedding names
with open('../../data/raw/dtype_to_embed.json', 'r') as json_file:
    dtype_to_embed = json.load(json_file)
# Combined names of all brain and behavior embeddings
brain_behav_names = dtype_to_embed['brain'] + dtype_to_embed['behavior']

# Walk the pulled-embeddings directory and accumulate the union of the index
# labels (vocabularies) of every brain / behavior embedding file found there
pulled_embeds_path = '../../data/raw/embeds/'
brain_behav_union = set()
for f_name in tqdm(os.listdir(pulled_embeds_path)):
    # The embedding name is whatever precedes the first dot of the file name
    if f_name.split('.')[0] not in brain_behav_names:
        continue
    # NOTE(review): the next cell reads these same files with
    # pd.read_pickle(..., compression='zip') -- confirm read_csv is the right
    # reader for the files currently in this directory.
    embed_index = pd.read_csv(os.path.join(pulled_embeds_path, f_name), index_col=0).index
    brain_behav_union.update(embed_index)

# Size of the combined brain + behavior vocabulary (displayed as the cell output)
len(brain_behav_union)

  0%|          | 0/26 [00:00<?, ?it/s]

45884

In [5]:
# Convert every pulled embedding into a representational similarity matrix
# (RSM) and save it under the processed-data directory.
# Relies on `pulled_embeds_path` and `brain_behav_union` defined in the previous
# cell. Any unit norming presumably happens inside `compute_rsm` -- TODO confirm.
rsms_path = '../../data/processed/rsms/'
# to_pickle does not create missing directories, so make sure the target exists
os.makedirs(rsms_path, exist_ok=True)

for f_name in tqdm(os.listdir(pulled_embeds_path)):
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    embed = pd.read_pickle(os.path.join(pulled_embeds_path, f_name), compression='zip')
    if embed.shape[0] != embed.shape[1]:  # If not square, then it's a word embedding (not a RSM)
        # Restrict to the brain/behavior vocabulary before computing similarities
        embed = embed[embed.index.isin(brain_behav_union)]
        embed = compute_rsm(embed).astype(np.float32)
    # Square inputs are treated as precomputed RSMs and are saved as loaded
    # (not filtered to the vocabulary and not cast to float32).

    print(f_name, embed.shape)
    # pandas infers output compression from the file extension, so a `.pkl`
    # name is written uncompressed even though the input was read as zip.
    embed.to_pickle(os.path.join(rsms_path, f_name))

  0%|          | 0/26 [00:00<?, ?it/s]

tungsten_microarray.pkl (621, 621)
