In [20]:
import pandas as pd
import os
import sys
from embeds import fix_corrupt, multi_inner_align
import numpy as np
from hypertools.tools import align
import torch
from tqdm.notebook import tqdm

# Put the directory two levels above the working directory on the import path
# (the same root that the '../../data/...' paths below resolve against).
# The original passed the builtin `help` object to os.path.abspath, which
# raises TypeError; a notebook has no __file__, so use the working directory.
# NOTE(review): this runs after the `from embeds import ...` line above, so on
# a fresh kernel it cannot help that import -- consider moving it before the imports.
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))


In [14]:
def read_txt(f):
    """Parse a whitespace-delimited embedding text stream into a float DataFrame.

    Every non-blank line is read as ``word v1 v2 ...``: the first token becomes
    the row label and the remaining tokens its vector. If a word occurs more
    than once, the last occurrence wins.

    Args:
        f: Iterable of text lines (e.g. an open text file).

    Returns:
        DataFrame with one row per word, cast to float.
    """
    pulled = {}
    for line in f:
        fields = line.split()
        if not fields:
            # Blank / whitespace-only line: unpacking it would raise ValueError
            continue
        word, *vec = fields
        pulled[word] = vec
    # Project helper from `embeds`; presumably repairs malformed entries -- TODO confirm
    pulled = fix_corrupt(pulled)
    return pd.DataFrame(pulled).T.astype(float)


def read_fmri(path: str) -> list:
    """Load every fMRI embedding file in a directory, one DataFrame per file.

    Files ending in ``.txt`` are parsed with ``read_txt``. Files whose name
    ends in ``pth`` are loaded with ``torch.load`` and must contain a dict with
    a ``'dico'`` entry (used as the row index) and a ``'vectors'`` entry
    exposing ``.numpy()``. Any other file is ignored. The collected frames are
    then passed through ``multi_inner_align``.

    Args:
        path: Directory holding the embedding files; a trailing path separator
            is optional.

    Returns:
        List of the frames produced by ``multi_inner_align`` (presumably the
        inputs restricted to their shared index -- confirm against ``embeds``).
    """
    dfs = []

    # os.listdir returns names in arbitrary order; sorting makes the position
    # of each file's frame in the result (e.g. dfs[0]) reproducible.
    for f_name in sorted(os.listdir(path)):
        f_path = os.path.join(path, f_name)

        if f_name.endswith('.txt'):
            with open(f_path, 'r') as f:
                dfs.append(read_txt(f))

        elif f_name.endswith('pth'):
            # NOTE(review): weights_only=False unpickles arbitrary objects;
            # only load files from a trusted source.
            fmri_dict = torch.load(f_path, weights_only=False)
            voc, vecs = fmri_dict['dico'], fmri_dict['vectors'].numpy()
            dfs.append(pd.DataFrame(vecs, index=voc, dtype=float))

    # Align indices
    dfs = list(multi_inner_align(dfs))

    return dfs

# Per-dataset lists of participant-level frames (cognival text / speech)
fmris_text_cognival, fmris_speech_cognival = (
    read_fmri(folder)
    for folder in (
        '../../data/fmri_text_cognival/',
        '../../data/fmri_speech_cognival/',
    )
)

# Denoised text fMRI, one directory per embedding width
fmris_text_denoise_128d, fmris_text_denoise_256d, fmris_text_denoise_512d = (
    read_fmri(folder)
    for folder in (
        '../../data/fmri_text_denoise/128d/',
        '../../data/fmri_text_denoise/256d/',
        '../../data/fmri_text_denoise/512d/',
    )
)

# Number of frames loaded for each dataset
tuple(map(len, (
    fmris_text_cognival,
    fmris_speech_cognival,
    fmris_text_denoise_128d,
    fmris_text_denoise_256d,
    fmris_text_denoise_512d,
)))

(8, 27, 8, 8, 8)

# Hyperaligning individual data

In [15]:
def hyper_align(dfs: list) -> pd.DataFrame:
    """Hyperalign a list of frames and average them element-wise.

    Args:
        dfs: Frames to align; the first one supplies the row index of the result.

    Returns:
        DataFrame holding the mean (over the aligned inputs) of the output of
        ``align(dfs, align='hyper')``, indexed like ``dfs[0]``.
    """
    aligned = align(dfs, align='hyper')
    # Collapse the aligned frames into one by averaging across them
    averaged = np.mean(aligned, axis=0)
    row_labels = dfs[0].index
    return pd.DataFrame(averaged, index=row_labels)
    
# One hyperaligned average frame per dataset.
# cognival
fMRI_text_cognival, fMRI_speech_cognival = (
    hyper_align(frames)
    for frames in (fmris_text_cognival, fmris_speech_cognival)
)

# denoise
fMRI_text_denoise_128d, fMRI_text_denoise_256d, fMRI_text_denoise_512d = (
    hyper_align(frames)
    for frames in (
        fmris_text_denoise_128d,
        fmris_text_denoise_256d,
        fmris_text_denoise_512d,
    )
)

## Model comparison

In [16]:
# First frame of each per-participant list
single_participant_embeds = {
    'fMRI_text_cognival_participant1': fmris_text_cognival[0],
    'fMRI_speech_cognival_participant1': fmris_speech_cognival[0],
    'fMRI_text_denoise_128d_participant1': fmris_text_denoise_128d[0],
    'fMRI_text_denoise_256d_participant1': fmris_text_denoise_256d[0],
    'fMRI_text_denoise_512d_participant1': fmris_text_denoise_512d[0],
}

# Hyperaligned averages
hyperaligned_embeds = {
    'fMRI_text_cognival': fMRI_text_cognival,
    'fMRI_speech_cognival': fMRI_speech_cognival,
    'fMRI_text_denoise_128d': fMRI_text_denoise_128d,
    'fMRI_text_denoise_256d': fMRI_text_denoise_256d,
    'fMRI_text_denoise_512d': fMRI_text_denoise_512d,
}

# Everything to be compared, single-participant entries first
embeds = {**single_participant_embeds, **hyperaligned_embeds}

# (rows, columns) of every embedding
{label: frame.shape for label, frame in embeds.items()}

{'fMRI_text_cognival_participant1': (1288, 1000),
 'fMRI_speech_cognival_participant1': (588, 6),
 'fMRI_text_denoise_128d_participant1': (1405, 128),
 'fMRI_text_denoise_256d_participant1': (1405, 256),
 'fMRI_text_denoise_512d_participant1': (1405, 512),
 'fMRI_text_cognival': (1288, 1000),
 'fMRI_speech_cognival': (588, 6),
 'fMRI_text_denoise_128d': (1405, 128),
 'fMRI_text_denoise_256d': (1405, 256),
 'fMRI_text_denoise_512d': (1405, 512)}

In [19]:
# Ratings table: first CSV column is used as the index
psychNorms = pd.read_csv(
    '../../data/psychNorms/psychNorms_processed.zip',
    index_col=0,
    low_memory=False,
    compression='zip',
)

# Per-norm metadata, indexed by the 'norm' column
psychNorms_meta = pd.read_csv(
    '../../data/psychNorms/psychNorms_metadata_processed.csv',
    index_col='norm',
)

psychNorms

Unnamed: 0_level_0,frequency_lund,frequency_kucera,frequency_subtlexus,frequency_subtlexuk,frequency_blog_gimenes,frequency_twitter_gimenes,frequency_news_gimenes,frequency_written_cobuild,frequency_spoken_cobuild,context_diversity_subtlexus,...,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear,sensory_experience_juhasz2013
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,2.62,4.38,2.93,
post office,,,,,,,,,,,...,,,,,,,3.79,3.07,5.29,
fishing rod,,,,,,,,,,,...,,,,,,,2.29,3.38,5.64,
March,,,,,,,,,,,...,,,,,,,3.43,2.76,3.50,


In [None]:
def run_rca(embeds: dict, norms: pd.DataFrame, norm_meta: pd.DataFrame, embed_to_type) -> pd.DataFrame:
    """Iterate over every (embedding, norm) pair and prepare aligned data for it.

    NOTE(review): this function is unfinished. It aligns each embedding with
    each norm and looks up the norm's type, but nothing is appended to
    ``results`` and nothing is returned yet, despite the annotation.

    Args:
        embeds: Mapping from embedding name to a DataFrame.
        norms: DataFrame with one column per norm.
        norm_meta: DataFrame indexed by norm name with a ``'type'`` column.
        embed_to_type: Not used by the code written so far.

    Returns:
        Currently ``None`` (see the note above).
    """

    results = []
    for embed_name in tqdm(embeds.keys()):
        # Index by the loop key; the original `embeds[embed]` read the local
        # `embed` before it was assigned and raised UnboundLocalError.
        embed = embeds[embed_name]

        to_print = []
        for norm_name in tqdm(norms.columns, desc=embed_name):

            # Aligning vocabs: keep only the rows present in both the
            # embedding and the non-missing norm values
            y = norms[norm_name].dropna()
            X, y = embed.align(y, axis=0, join='inner', copy=True)

            # Checking norm dtype
            norm_dtype = norm_meta.loc[norm_name, 'type']

            # TODO: the rest of the per-norm analysis has not been written



# Saving

In [None]:
# Words present in the psychNorms index; only these rows are kept
to_pull = set(
    pd.read_csv(
        '../../data/psychNorms/psychNorms.zip',
        index_col=0,
        low_memory=False,
        compression='zip',
    ).index
)

# Boolean row masks: True where the row label is one of the psychNorms words
text_in_norms = fMRI_text_cognival.index.isin(to_pull)
speech_in_norms = fMRI_speech_cognival.index.isin(to_pull)

fMRI_text_cognival = fMRI_text_cognival.loc[text_in_norms].astype(float)
fMRI_speech_cognival = fMRI_speech_cognival.loc[speech_in_norms].astype(float)

# Write the filtered frames out as CSV
fMRI_text_cognival.to_csv('../../data/embeds/fMRI_text_cognival.csv')
fMRI_speech_cognival.to_csv('../../data/embeds/fMRI_speech_cognival.csv')