In [1]:
import pandas as pd
import os
import sys
from embeds import fix_corrupt, multi_inner_align, standardize
import numpy as np
from hypertools.tools import align
import torch

sys.path.append('..')
from rca.rca import run_rca

  from pkg_resources import get_distribution


## Cognival

In [2]:
def read_txt(f) -> pd.DataFrame:
    """Parse a whitespace-delimited embedding file into a float DataFrame.

    For reading the cognival data. Every non-blank line is split on whitespace:
    the first token is the word, the remaining tokens are its vector components.
    If a word occurs on several lines, the last occurrence wins.

    Args:
        f: An iterable of text lines (e.g. an open file handle).

    Returns:
        DataFrame indexed by word, with one float column per vector component.
    """
    pulled = {}
    for line in f:
        tokens = line.split()
        # Blank / whitespace-only lines (e.g. a trailing newline) carry no word and
        # would otherwise make the unpacking below raise ValueError.
        if not tokens:
            continue
        word, *vec = tokens
        pulled[word] = vec
    # Helper imported from `embeds`; presumably repairs malformed entries -- TODO confirm.
    pulled = fix_corrupt(pulled)
    return pd.DataFrame(pulled).T.astype(float)

def read_individual_fmri(path: str) -> list:
    """Load every ``.txt`` embedding file in a directory and align the resulting frames.

    Args:
        path: Directory holding the ``.txt`` files. A trailing path separator is optional.

    Returns:
        List with one DataFrame per ``.txt`` file, in sorted file-name order, after
        being passed through ``multi_inner_align``.
    """
    dfs = []

    # os.listdir yields entries in arbitrary order; sorting keeps the order of the
    # returned frames reproducible across machines and runs.
    for f_name in sorted(os.listdir(path)):
        if not f_name.endswith('.txt'):
            continue
        # os.path.join works whether or not `path` ends with a separator.
        with open(os.path.join(path, f_name), 'r') as f:
            dfs.append(read_txt(f))

    # Align indices
    return list(multi_inner_align(dfs))

# One DataFrame per individual, for the text and the speech recordings respectively
fmris_text_cognival, fmris_speech_cognival = (
    read_individual_fmri('../../data/fmri_text_cognival/'),
    read_individual_fmri('../../data/fmri_speech_cognival/'),
)

# How many individuals were loaded for each
tuple(map(len, (fmris_text_cognival, fmris_speech_cognival)))

(8, 27)

In [3]:
# Every individual's frame is standardized first; hyper-alignment happens afterwards
fmris_text_cognival = list(map(standardize, fmris_text_cognival))
fmris_speech_cognival = list(map(standardize, fmris_speech_cognival))

# --- Hyper aligning individuals ---
def hyper_align(dfs: list) -> pd.DataFrame:
    """Hyper-align the given frames and return their element-wise mean.

    The returned frame reuses the index of the first frame in ``dfs``.
    """
    aligned = align(dfs, align='hyper')
    averaged = np.mean(aligned, axis=0)
    return pd.DataFrame(averaged, index=dfs[0].index)

# Collapse the individual frames into a single averaged frame per recording type
fMRI_text_cognival, fMRI_speech_cognival = (
    hyper_align(fmris_text_cognival),
    hyper_align(fmris_speech_cognival),
)

## Denoised (Antonia)

In [4]:
def read_from_torch(f_path) -> pd.DataFrame:
    """Load a saved embedding dictionary and return it as a float DataFrame.

    The file must hold a mapping with a ``'dico'`` entry (row labels) and a
    ``'vectors'`` entry (a torch tensor with one row per label).

    Args:
        f_path: Path to the ``.pth`` file.

    Returns:
        DataFrame of the vectors, indexed by the ``'dico'`` entries, with float dtype.
    """
    # NOTE(security): weights_only=False unpickles arbitrary Python objects -- only
    # load files that come from a trusted source.
    # map_location='cpu' lets files that were saved from a GPU load on CPU-only machines.
    fmri_dict = torch.load(f_path, map_location='cpu', weights_only=False)
    voc = fmri_dict['dico']
    # .numpy() refuses tensors that still track gradients, so detach first.
    vecs = fmri_dict['vectors'].detach().cpu().numpy()
    return pd.DataFrame(vecs, index=voc, dtype=float)

denoise_path_template = '../../data/fmri_text_denoise/fMRI_text_denoise_{}d.pth'

# One frame per available dimensionality, loaded in ascending order
(
    fMRI_text_denoise_128d,
    fMRI_text_denoise_256d,
    fMRI_text_denoise_512d,
    fMRI_text_denoise_1024d,
) = (
    read_from_torch(denoise_path_template.format(n_dims))
    for n_dims in (128, 256, 512, 1024)
)

fMRI_text_denoise_128d

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
Harry,-143.982681,51.265324,-73.016899,74.548111,141.569458,-3.107077,14.127753,-29.177664,12.766369,21.466639,...,-6.536313,-1.969234,6.133821,-0.842457,0.370705,2.955463,-4.847620,-1.390713,2.575084,1.202820
had,-127.148003,-417.956604,299.856567,-150.026901,-94.690819,93.597816,63.608559,41.971951,-9.625628,46.413219,...,-5.443430,7.407501,2.746191,-5.817649,-0.660896,2.467710,10.343082,-8.100626,-2.753611,7.861194
never,-725.864990,-410.511505,211.946350,-221.292572,-96.619041,424.433136,212.477356,1.393820,-152.869812,90.387222,...,17.369629,27.869184,-11.553320,-13.483357,10.268998,0.918990,-11.896314,-13.483791,19.849094,16.825212
believed,-2717.895020,-1213.610840,1656.710449,-414.832001,138.131638,981.171082,517.020569,-92.849586,16.785938,559.375793,...,52.605537,54.383560,-109.776634,-44.306526,-11.851886,32.966724,-47.805988,-70.414574,82.728691,16.483789
he,-348.185120,-86.909981,161.300644,425.559357,253.294922,29.610430,-20.238287,-30.480759,-27.076218,36.708912,...,5.568176,-2.478137,-9.667665,0.090169,7.073146,-6.388587,7.460380,-5.873054,17.383780,-3.726576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
grubby,5493.725586,-860.738159,96.581200,-114.270538,-524.003601,248.288269,201.176880,-826.146423,10.335138,271.975098,...,-12.817405,-53.413204,15.273238,11.075395,-32.439362,-51.257465,36.036182,-43.363522,1.828963,-52.249451
vault,5521.819336,-888.447327,73.344925,-55.472729,-534.191345,239.823303,245.571671,-894.114502,-169.258316,58.534554,...,-8.671092,-55.320320,-13.919680,21.021137,-20.458317,-27.349249,54.737080,-37.851021,0.523778,-30.322697
seven,5515.584473,-892.888733,57.494415,-48.431969,-528.394165,241.797775,261.311951,-903.200195,-216.163422,22.045902,...,-9.460995,-51.728687,-28.285343,16.569675,-13.407843,-11.184761,65.148331,-30.104380,-0.721720,-25.696573
hundred,5517.106934,-894.937744,60.638649,-49.630238,-528.597656,241.865143,261.439636,-898.800964,-215.590195,18.218752,...,-9.753736,-51.207165,-29.787228,12.147860,-8.493997,-7.011636,65.823349,-25.347975,-0.434391,-23.963270


In [5]:
# Finding best denoise dimensionality
# Work on copies so the frames loaded earlier stay untouched
to_compare = {
    name: embed.copy()
    for name, embed in [
        ('fMRI_text_cognival', fMRI_text_cognival),
        ('fMRI_text_denoise_128d', fMRI_text_denoise_128d),
        ('fMRI_text_denoise_256d', fMRI_text_denoise_256d),
        ('fMRI_text_denoise_512d', fMRI_text_denoise_512d),
        ('fMRI_text_denoise_1024d', fMRI_text_denoise_1024d),
    ]
}

# Aligning for fair comparison
to_compare = {
    name: embed
    for name, embed in zip(to_compare.keys(), multi_inner_align(to_compare.values()))
}

# Standardizing
to_compare = dict(zip(to_compare.keys(), map(standardize, to_compare.values())))

# Loading norm data
norms = pd.read_csv(
    '../../data/psychNorms/psychNorms_processed.zip',
    index_col=0,
    low_memory=False,
    compression='zip',
)
norms_meta = pd.read_csv(
    '../../data/psychNorms/psychNorms_metadata_processed.csv',
    index_col='norm',
)
norms

Unnamed: 0_level_0,frequency_lund,frequency_kucera,frequency_subtlexus,frequency_subtlexuk,frequency_blog_gimenes,frequency_twitter_gimenes,frequency_news_gimenes,frequency_written_cobuild,frequency_spoken_cobuild,context_diversity_subtlexus,...,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear,sensory_experience_juhasz2013
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,2.62,4.38,2.93,
post office,,,,,,,,,,,...,,,,,,,3.79,3.07,5.29,
fishing rod,,,,,,,,,,,...,,,,,,,2.29,3.38,5.64,
March,,,,,,,,,,,...,,,,,,,3.43,2.76,3.50,


In [6]:
# Run `run_rca` (imported from rca.rca) on every embedding in `to_compare` against the
# norms and their metadata; n_jobs=10 is forwarded to run_rca unchanged.
results = run_rca(to_compare, norms, norms_meta, n_jobs=10)
results

  0%|          | 0/5 [00:00<?, ?it/s]

fMRI_text_cognival:   0%|          | 0/291 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Plotting

In [None]:
# Adding norm category
# Look up each row's norm in `norms_meta` (indexed by norm) to get its 'category',
# then turn underscores into spaces. Note: this adds a column to `results` in place.
results['norm_category'] = (
    results['norm']
    .apply(lambda norm: norms_meta.loc[norm]['category'])
    .replace({'_': ' '}, regex=True)
)

# Median r2_mean for every (norm category, embedding) pair; pairs whose median is
# missing are dropped.
results_avg = (
    results[['norm_category', 'embed', 'r2_mean']]
    .groupby(['norm_category', 'embed'], as_index=False).median()
    .dropna()
)

# Wide table: one row per norm category, one column per embedding
results_avg_piv = results_avg.pivot(columns='embed', index='norm_category', values='r2_mean')
results_avg_piv.round(2)

In [None]:
# Finding the top-performing fmri_text_denoise
# `results` is in long format: embedding names are *values* of its 'embed' column, so
# filtering its columns for 'denoise' would select nothing. The per-embedding columns
# live in the pivoted table, so the denoise variants are ranked there (mean over norm
# categories, ascending -- the best variant is last).
sorted_denoise = results_avg_piv.filter(like='denoise', axis=1).mean().sort_values()
sorted_denoise

## Saving

In [None]:
# Subsetting to only the words in norms
to_pull = set(
    pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, low_memory=False, compression='zip').index
)

# Choosing which denoise variant to save: the one with the highest mean score across
# norm categories in the pivoted results table. Previously `fMRI_text_denoise` was left
# as None, which made the `.to_csv` call below raise AttributeError.
# NOTE(review): confirm this is the intended selection criterion.
denoise_candidates = {
    'fMRI_text_denoise_128d': fMRI_text_denoise_128d,
    'fMRI_text_denoise_256d': fMRI_text_denoise_256d,
    'fMRI_text_denoise_512d': fMRI_text_denoise_512d,
    'fMRI_text_denoise_1024d': fMRI_text_denoise_1024d,
}
best_denoise_name = results_avg_piv.filter(like='denoise', axis=1).mean().idxmax()
# The original (not aligned / standardized) frame is saved, like the cognival frames
fMRI_text_denoise = denoise_candidates[best_denoise_name]

fMRI_text_cognival = fMRI_text_cognival.loc[fMRI_text_cognival.index.isin(to_pull)].astype(float)
fMRI_speech_cognival = fMRI_speech_cognival.loc[fMRI_speech_cognival.index.isin(to_pull)].astype(float)
fMRI_text_denoise = fMRI_text_denoise.loc[fMRI_text_denoise.index.isin(to_pull)].astype(float)

# Saving
fMRI_text_cognival.to_csv('../../data/embeds/fMRI_text_cognival.csv')
fMRI_speech_cognival.to_csv('../../data/embeds/fMRI_speech_cognival.csv')
fMRI_text_denoise.to_csv('../../data/embeds/fMRI_text_denoise.csv')