In [1]:
import pandas as pd
import os
import sys
from embeds import fix_corrupt, multi_inner_align, standardize
import numpy as np
from hypertools.tools import align
import torch

sys.path.append('..')
from rca.rca import run_rca

  from pkg_resources import get_distribution


## Cognival

In [2]:
def read_txt(f) -> pd.DataFrame:
    """For reading the cognival data.

    Parses a whitespace-delimited text stream in which each non-blank line
    holds a word followed by its vector components. Blank or whitespace-only
    lines are skipped rather than raising on unpacking.

    Args:
        f: An iterable of text lines (e.g. an open file handle).

    Returns:
        A float DataFrame with one row per word, built from whatever
        `fix_corrupt` returns for the parsed ``{word: components}`` mapping.
    """
    pulled = {}
    for line in f:
        tokens = line.split()
        if not tokens:
            # Blank line (e.g. trailing newline): `word, *vec = []` would raise ValueError
            continue
        word, *vec = tokens
        pulled[word] = vec
    # fix_corrupt comes from the project's `embeds` module; presumably it repairs
    # malformed entries — TODO confirm against its implementation.
    pulled = fix_corrupt(pulled)
    return pd.DataFrame(pulled).T.astype(float)

def read_individual_fmri(path: str) -> list:
    """Load every `.txt` embedding file found directly inside a directory.

    Args:
        path: Directory holding the `.txt` files. A trailing path separator
            is optional.

    Returns:
        A list with one entry per `.txt` file (in sorted file-name order),
        after passing the parsed frames through `multi_inner_align`.
    """
    dfs = []

    # os.listdir yields entries in arbitrary order; sorting keeps the order of
    # the returned frames (and anything downstream that depends on it) reproducible.
    for f_name in sorted(os.listdir(path)):
        if not f_name.endswith('.txt'):
            continue
        # os.path.join works whether or not `path` ends with a separator
        with open(os.path.join(path, f_name), 'r') as f:
            dfs.append(read_txt(f))

    # Align indices
    return list(multi_inner_align(dfs))

# Load the per-file fMRI frames for the text and speech Cognival datasets
fmris_text_cognival, fmris_speech_cognival = map(
    read_individual_fmri,
    ('../../data/fmri_text_cognival/', '../../data/fmri_speech_cognival/'),
)

# How many frames were loaded for each dataset
tuple(map(len, (fmris_text_cognival, fmris_speech_cognival)))

8 27 8 8 8


In [23]:
# Standardize every frame before hyper-aligning
fmris_text_cognival = list(map(standardize, fmris_text_cognival))
fmris_speech_cognival = list(map(standardize, fmris_speech_cognival))

# --- Hyper aligning individuals ---
def hyper_align(dfs: list) -> pd.DataFrame:
    """Hyper-align a list of frames and average them element-wise.

    Args:
        dfs: Frames to align; the index of the first one labels the result.

    Returns:
        DataFrame holding the mean over the aligned inputs (axis 0 of the
        aligned collection), indexed like ``dfs[0]``.
    """
    aligned = align(dfs, align='hyper')
    averaged = np.mean(aligned, axis=0)
    row_labels = dfs[0].index
    return pd.DataFrame(averaged, index=row_labels)

# Collapse each dataset's frames into a single hyper-aligned average
fMRI_text_cognival, fMRI_speech_cognival = (
    hyper_align(frames) for frames in (fmris_text_cognival, fmris_speech_cognival)
)

## Denoised (Antonia)

In [None]:
def read_from_torch(f_path) -> pd.DataFrame:
    """Load an embedding table that was serialized with `torch.save`.

    The file must contain a mapping with a 'dico' entry (used as the row
    labels) and a 'vectors' entry (a tensor of embeddings).

    Args:
        f_path: Location of the serialized file.

    Returns:
        A float DataFrame of the vectors, indexed by the 'dico' entry.
    """
    # SECURITY: weights_only=False unpickles arbitrary objects — only load trusted files.
    # map_location='cpu' lets files saved from a GPU load on CPU-only machines.
    fmri_dict = torch.load(f_path, map_location='cpu', weights_only=False)
    voc = fmri_dict['dico']
    # detach()/cpu() keep .numpy() valid for tensors that require grad or live on a GPU
    vecs = fmri_dict['vectors'].detach().cpu().numpy()
    return pd.DataFrame(vecs, index=voc, dtype=float)

# FIXME: the three paths below are empty-string placeholders — fill in the
# locations of the denoised embedding files (128/256/512 per the variable
# names) before running this cell; later cells reference all three variables.
fMRI_text_denoise_128d = read_from_torch('')
fMRI_text_denoise_256d = read_from_torch('')
fMRI_text_denoise_512d = read_from_torch('')

# Display the first table as a sanity check
fMRI_text_denoise_128d

In [1]:
# Finding best denoise dimensionality: collect an independent copy of each candidate
to_compare = {
    name: embed.copy()
    for name, embed in [
        ('fMRI_text_cognival', fMRI_text_cognival),
        ('fMRI_text_denoise_128d', fMRI_text_denoise_128d),
        ('fMRI_text_denoise_256d', fMRI_text_denoise_256d),
        ('fMRI_text_denoise_512d', fMRI_text_denoise_512d),
    ]
}

# Aligning for fair comparison (names are re-paired with the aligned frames in order)
to_compare = dict(zip(to_compare, multi_inner_align(to_compare.values())))

# Standardizing
to_compare = {name: standardize(to_compare[name]) for name in to_compare}

# Loading norm data and its metadata
norms = pd.read_csv(
    '../../data/psychNorms/psychNorms_processed.zip',
    index_col=0,
    compression='zip',
    low_memory=False,
)
norms_meta = pd.read_csv(
    '../../data/psychNorms/psychNorms_metadata_processed.csv',
    index_col='norm',
)
norms

NameError: name 'fMRI_text_denoise_128d' is not defined

In [27]:
# Run `run_rca` (imported from rca.rca) on the aligned, standardized embeddings
# together with the norms table and its metadata, then display what it returns.
results = run_rca(to_compare, norms, norms_meta)
results

  0%|          | 0/8 [00:00<?, ?it/s]

fMRI_text_cognival_participant:   0%|          | 0/88 [00:00<?, ?it/s]

                                  norm  train_n   r2_mean     r2_sd check
56         n_semantic_neighbors_shaoul      881  0.018786  0.037550  pass
29                    haptic_lancaster      712  0.007802  0.014477  pass
57  distance_semantic_neighbors_shaoul      881  0.005233  0.035730  pass
45                    valence_mohammad      501  0.003122  0.016986  pass
36                  hand_arm_lancaster      712 -0.001156  0.024811  pass
50                    arousal_mohammad      501 -0.002012  0.016566  pass
53                  dominance_mohammad      501 -0.004229  0.026664  pass
32             interoceptive_lancaster      712 -0.004505  0.010642  pass
16                prevalence_brysbaert      718 -0.005420  0.009512  pass
74              naming_accuracy_balota      882 -0.005446  0.009649  pass


fMRI_text_cognival:   0%|          | 0/88 [00:00<?, ?it/s]

                                 norm  train_n   r2_mean     r2_sd check
2                 frequency_subtlexus      877  0.088550  0.109966  pass
3                 frequency_subtlexuk      880  0.081935  0.100987  pass
0                      frequency_lund      905  0.079998  0.099975  pass
13     context_diversity_news_gimenes      884  0.078482  0.104070  pass
6              frequency_news_gimenes      884  0.078479  0.105064  pass
12  context_diversity_twitter_gimenes      884  0.076833  0.102312  pass
1                    frequency_kucera      872  0.076807  0.136276  pass
5           frequency_twitter_gimenes      884  0.076595  0.102471  pass
11     context_diversity_blog_gimenes      884  0.076045  0.109708  pass
4              frequency_blog_gimenes      884  0.075911  0.112028  pass


fMRI_text_denoise_128d_participant:   0%|          | 0/88 [00:00<?, ?it/s]

                                norm  train_n   r2_mean     r2_sd check
29                  haptic_lancaster      712  0.027954  0.018120  pass
68  visual_lexical_accuracy_keuleers      793 -0.001844  0.002291  pass
32           interoceptive_lancaster      712 -0.002735  0.005123  pass
82                 perc_known_winter      599 -0.003276  0.004154  pass
66    visual_lexical_accuracy_balota      882 -0.003288  0.003635  pass
50                  arousal_mohammad      501 -0.003595  0.008310  pass
28                auditory_lancaster      712 -0.003620  0.011794  pass
18                     aoa_brysbaert      519 -0.003833  0.004064  pass
16              prevalence_brysbaert      718 -0.004339  0.004053  pass
36                hand_arm_lancaster      712 -0.004518  0.027546  pass


fMRI_text_denoise_256d_participant:   0%|          | 0/88 [00:00<?, ?it/s]

                                norm  train_n   r2_mean     r2_sd check
28                auditory_lancaster      712  0.008462  0.004362  pass
32           interoceptive_lancaster      712 -0.000387  0.005901  pass
68  visual_lexical_accuracy_keuleers      793 -0.002260  0.002215  pass
66    visual_lexical_accuracy_balota      882 -0.003531  0.003644  pass
50                  arousal_mohammad      501 -0.003619  0.008356  pass
18                     aoa_brysbaert      519 -0.003827  0.004000  pass
16              prevalence_brysbaert      718 -0.004493  0.004113  pass
67   visual_lexical_accuracy_mandera      718 -0.004872  0.007874  pass
82                 perc_known_winter      599 -0.005466  0.003974  pass
61            cue_probability_nelson      431 -0.005599  0.008637  pass


fMRI_text_denoise_512d_participant:   0%|          | 0/88 [00:00<?, ?it/s]

                                   norm  train_n   r2_mean     r2_sd check
28                   auditory_lancaster      712  0.036367  0.023751  pass
77  recognition_memory_accuracy_cortese      292  0.024766  0.037466  pass
35               mouth_throat_lancaster      712  0.007442  0.007997  pass
56          n_semantic_neighbors_shaoul      881  0.004639  0.009409  pass
82                    perc_known_winter      599  0.003538  0.007849  pass
57   distance_semantic_neighbors_shaoul      881  0.000957  0.007721  pass
29                     haptic_lancaster      712  0.000274  0.010233  pass
36                   hand_arm_lancaster      712 -0.000448  0.016242  pass
32              interoceptive_lancaster      712 -0.001242  0.006653  pass
68     visual_lexical_accuracy_keuleers      793 -0.001707  0.002498  pass


fMRI_text_denoise_128d:   0%|          | 0/88 [00:00<?, ?it/s]

                                   norm  train_n   r2_mean     r2_sd check
28                   auditory_lancaster      712  0.032517  0.015234  pass
29                     haptic_lancaster      712  0.011990  0.016320  pass
77  recognition_memory_accuracy_cortese      292  0.011964  0.016190  pass
36                   hand_arm_lancaster      712  0.005515  0.039182  pass
56          n_semantic_neighbors_shaoul      881 -0.002658  0.012463  pass
22                 concreteness_glasgow      303 -0.003258  0.023928  pass
32              interoceptive_lancaster      712 -0.003628  0.017648  pass
82                    perc_known_winter      599 -0.003841  0.003803  pass
18                        aoa_brysbaert      519 -0.003900  0.004064  pass
16                 prevalence_brysbaert      718 -0.004332  0.004230  pass


fMRI_text_denoise_256d:   0%|          | 0/88 [00:00<?, ?it/s]

                                   norm  train_n   r2_mean     r2_sd check
28                   auditory_lancaster      712  0.060283  0.025268  pass
23                 imageability_glasgow      303  0.027633  0.072153  pass
77  recognition_memory_accuracy_cortese      292  0.022207  0.042576  pass
21               concreteness_brysbaert      712  0.001204  0.016222  pass
32              interoceptive_lancaster      712  0.000119  0.015421  pass
82                    perc_known_winter      599 -0.000346  0.002209  pass
50                     arousal_mohammad      501 -0.000626  0.009732  pass
29                     haptic_lancaster      712 -0.001221  0.010973  pass
57   distance_semantic_neighbors_shaoul      881 -0.001361  0.009737  pass
68     visual_lexical_accuracy_keuleers      793 -0.002046  0.002284  pass


fMRI_text_denoise_512d:   0%|          | 0/88 [00:00<?, ?it/s]

                                  norm  train_n   r2_mean     r2_sd check
28                  auditory_lancaster      712  0.047891  0.009600  pass
23                imageability_glasgow      303  0.034378  0.064394  pass
22                concreteness_glasgow      303  0.031313  0.051942  pass
56         n_semantic_neighbors_shaoul      881  0.005782  0.009343  pass
32             interoceptive_lancaster      712  0.005345  0.003233  pass
57  distance_semantic_neighbors_shaoul      881  0.003011  0.008346  pass
35              mouth_throat_lancaster      712  0.001949  0.008341  pass
29                    haptic_lancaster      712  0.001466  0.013712  pass
36                  hand_arm_lancaster      712 -0.000632  0.015734  pass
68    visual_lexical_accuracy_keuleers      793 -0.001959  0.002592  pass


Unnamed: 0,embed,embed_type,norm,train_n,test_n,p,r2_mean,r2_sd,check
0,fMRI_text_cognival_participant,,frequency_lund,905,227,1000,-0.028576,0.029151,pass
1,fMRI_text_cognival_participant,,frequency_kucera,872,219,1000,-0.049635,0.057808,pass
2,fMRI_text_cognival_participant,,frequency_subtlexus,877,220,1000,-0.046786,0.077060,pass
3,fMRI_text_cognival_participant,,frequency_subtlexuk,880,221,1000,-0.037376,0.066358,pass
4,fMRI_text_cognival_participant,,frequency_blog_gimenes,884,222,1000,-0.026952,0.036523,pass
...,...,...,...,...,...,...,...,...,...
699,fMRI_text_denoise_512d,,iconicity_winter2023,599,150,512,-0.035473,0.027085,pass
700,fMRI_text_denoise_512d,,aoa_schock,126,32,512,-0.027804,0.034623,pass
701,fMRI_text_denoise_512d,,aoa_rt_schock,126,32,512,-0.041706,0.036968,pass
702,fMRI_text_denoise_512d,,iconicity_winter2017,320,80,512,-0.068191,0.064661,pass


## Summarizing results by norm category

In [28]:
# Adding norm category: look each norm up in the metadata, then swap underscores for spaces
norm_categories = results['norm'].apply(lambda norm: norms_meta.loc[norm]['category'])
results['norm_category'] = norm_categories.replace({'_': ' '}, regex=True)

# Median r2_mean for every (norm category, embedding) pair; rows with NaN are dropped
r2_by_category = results[['norm_category', 'embed', 'r2_mean']]
results_avg = (
    r2_by_category
    .groupby(['norm_category', 'embed'], as_index=False)
    .median()
    .dropna()
)

# One row per category, one column per embedding
results_avg_piv = results_avg.pivot(index='norm_category', columns='embed', values='r2_mean')
results_avg_piv.round(2)

embed,fMRI_text_cognival,fMRI_text_cognival_participant,fMRI_text_denoise_128d,fMRI_text_denoise_128d_participant,fMRI_text_denoise_256d,fMRI_text_denoise_256d_participant,fMRI_text_denoise_512d,fMRI_text_denoise_512d_participant
norm_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
age of acquisition,-0.02,-0.04,-0.03,-0.03,-0.03,-0.03,-0.03,-0.03
arousal,-0.02,-0.03,-0.02,-0.03,-0.02,-0.02,-0.01,-0.02
auditory lexical decision,-0.03,-0.03,-0.03,-0.03,-0.03,-0.03,-0.03,-0.03
concreteness,-0.01,-0.02,-0.01,-0.02,-0.0,-0.02,0.01,-0.01
dominance,0.0,-0.02,-0.02,-0.01,-0.03,-0.02,-0.05,-0.02
familiarity,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01
frequency,0.08,-0.04,-0.03,-0.04,-0.04,-0.04,-0.02,-0.02
iconicity/transparency,-0.03,-0.06,-0.05,-0.06,-0.05,-0.06,-0.05,-0.06
imageability,-0.0,-0.05,-0.02,-0.02,0.01,-0.02,-0.0,-0.02
motor,-0.02,-0.02,-0.02,-0.02,-0.01,-0.02,-0.01,-0.01


In [None]:
# Finding the top-performing fmri_text_denoise
# `results` is long-format (one row per embed/norm pair), so none of its columns
# are named after an embedding and filtering it by 'denoise' yields nothing.
# The pivoted table `results_avg_piv` has one column per embedding, so filter that.
# Values are the mean over norm categories of the median r2, sorted ascending
# (the best performer is the last entry).
sorted_denoise = results_avg_piv.filter(like='denoise', axis=1).mean().sort_values()
sorted_denoise

## Saving

In [None]:
# Subsetting to only the words in norms
to_pull = set(
    pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, low_memory=False, compression='zip').index
)

# Select the denoised embedding to export instead of leaving a `None`
# placeholder (which would raise AttributeError on `.to_csv`): take the
# candidate with the highest mean, over norm categories, of the median r2
# stored in `results_avg_piv`.
# NOTE(review): confirm this is the intended selection criterion.
denoise_candidates = {
    'fMRI_text_denoise_128d': fMRI_text_denoise_128d,
    'fMRI_text_denoise_256d': fMRI_text_denoise_256d,
    'fMRI_text_denoise_512d': fMRI_text_denoise_512d,
}
best_denoise = results_avg_piv[list(denoise_candidates)].mean().idxmax()
fMRI_text_denoise = denoise_candidates[best_denoise]

fMRI_text_cognival = fMRI_text_cognival.loc[fMRI_text_cognival.index.isin(to_pull)].astype(float)
fMRI_speech_cognival = fMRI_speech_cognival.loc[fMRI_speech_cognival.index.isin(to_pull)].astype(float)
fMRI_text_denoise = fMRI_text_denoise.loc[fMRI_text_denoise.index.isin(to_pull)].astype(float)

# Saving 
fMRI_text_cognival.to_csv('../../data/embeds/fMRI_text_cognival.csv')
fMRI_speech_cognival.to_csv('../../data/embeds/fMRI_speech_cognival.csv')
fMRI_text_denoise.to_csv('../../data/embeds/fMRI_text_denoise.csv')