In [None]:
import pandas as pd
import os
import sys
from embeds import fix_corrupt, multi_inner_align, standardize
import numpy as np
from hypertools.tools import align
import torch

sys.path.append('..')
from rca.rca import run_rca

## Cognival

In [None]:
def read_txt(f) -> pd.DataFrame:
    """Parse whitespace-separated word vectors (the cognival text format) into a DataFrame.

    Every non-blank line is expected to look like ``word v1 v2 ... vn``.

    Args:
        f: Iterable of text lines, e.g. an open text-mode file handle.

    Returns:
        DataFrame with one row per word (words form the index) and the vector
        components as float columns. If a word occurs more than once, the
        last occurrence wins.

    Raises:
        ValueError: If the vectors left after ``fix_corrupt`` differ in length
            or contain tokens that cannot be converted to float.
    """
    pulled = {}
    for line in f:
        tokens = line.split()
        if not tokens:
            # Blank / whitespace-only line (e.g. a trailing empty line):
            # there is no word to unpack, so skip it instead of raising.
            continue
        word, *vec = tokens
        pulled[word] = vec
    # fix_corrupt comes from the project's `embeds` module; presumably it
    # repairs or drops malformed entries -- TODO confirm.
    pulled = fix_corrupt(pulled)
    # Dict keys become columns, so transpose to get one row per word.
    return pd.DataFrame(pulled).T.astype(float)

def read_individual_fmri(path: str) -> list:
    """Read every ``.txt`` embedding file in a directory and align the resulting frames.

    Args:
        path: Directory containing the ``.txt`` files (presumably one per
            individual -- confirm against the data layout). A trailing path
            separator is optional.

    Returns:
        List with one DataFrame per ``.txt`` file, in sorted file-name order,
        after passing through ``multi_inner_align`` (presumably an inner join
        on the shared index -- confirm in `embeds`).
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the returned frames reproducible across machines and runs.
    txt_names = sorted(name for name in os.listdir(path) if name.endswith('.txt'))

    dfs = []
    for f_name in txt_names:
        # os.path.join also works when `path` has no trailing separator.
        with open(os.path.join(path, f_name), 'r') as f:
            dfs.append(read_txt(f))

    # Align indices
    return list(multi_inner_align(dfs))

# Load the per-file fMRI frames for both Cognival stimulus modalities
fmris_text_cognival, fmris_speech_cognival = map(
    read_individual_fmri,
    ('../../data/fmri_text_cognival/', '../../data/fmri_speech_cognival/'),
)

# Cell output: how many frames were read for each modality
len(fmris_text_cognival), len(fmris_speech_cognival)

In [None]:
# Each individual's frame is standardized prior to the hyper-alignment step
fmris_text_cognival = list(map(standardize, fmris_text_cognival))
fmris_speech_cognival = list(map(standardize, fmris_speech_cognival))

# --- Hyper aligning individuals ---
def hyper_align(dfs: list) -> pd.DataFrame:
    """Hyper-align a list of frames and collapse them into their element-wise mean.

    Args:
        dfs: Non-empty list of DataFrames; assumed to already share the same
            row index -- TODO confirm upstream alignment.

    Returns:
        DataFrame holding the mean of the aligned arrays, indexed like ``dfs[0]``.
    """
    aligned = align(dfs, align='hyper')
    # Average across the list axis, i.e. across the aligned inputs
    averaged = np.mean(aligned, axis=0)
    row_labels = dfs[0].index
    return pd.DataFrame(averaged, index=row_labels)

# One group-level frame per modality: hyper-align the individual frames and average them
fMRI_text_cognival, fMRI_speech_cognival = (
    hyper_align(individual_dfs)
    for individual_dfs in (fmris_text_cognival, fmris_speech_cognival)
)

## Denoised (Antonia)

In [None]:
def read_from_torch(f_path) -> pd.DataFrame:
    """Load an embedding checkpoint written with ``torch.save`` into a DataFrame.

    The file must hold a mapping with two entries: ``'dico'`` (used as the row
    labels) and ``'vectors'`` (a tensor; presumably one row per label --
    confirm against the code that wrote the checkpoint).

    Args:
        f_path: Path (or file-like object) accepted by ``torch.load``.

    Returns:
        Float DataFrame of the vectors, indexed by the ``'dico'`` entry.
    """
    # SECURITY: weights_only=False unpickles arbitrary Python objects and can
    # execute code while loading -- only use this on trusted checkpoint files.
    # map_location='cpu' lets checkpoints saved from a GPU load on CPU-only machines.
    fmri_dict = torch.load(f_path, map_location='cpu', weights_only=False)
    voc = fmri_dict['dico']
    # detach()/cpu() keep the numpy conversion working for tensors that
    # require grad or were stored on an accelerator.
    vecs = fmri_dict['vectors'].detach().cpu().numpy()
    return pd.DataFrame(vecs, index=voc, dtype=float)

# Path pattern of the denoised checkpoints; `{}` is filled with the dimensionality in the file name
denoise_path_template = '../../data/fmri_text_denoise/fMRI_text_denoise_{}d.pth'

# One frame per available dimensionality
(
    fMRI_text_denoise_128d,
    fMRI_text_denoise_256d,
    fMRI_text_denoise_512d,
    fMRI_text_denoise_1024d,
) = (
    read_from_torch(denoise_path_template.format(n_dims))
    for n_dims in (128, 256, 512, 1024)
)

# Cell output: the 128-d frame
fMRI_text_denoise_128d

In [None]:
# Candidates for finding the best denoise dimensionality (the Cognival text frame is included too).
# Copies are taken so the frames defined in earlier cells stay untouched.
to_compare = {
    name: embed.copy()
    for name, embed in (
        ('fMRI_text_cognival', fMRI_text_cognival),
        ('fMRI_text_denoise_128d', fMRI_text_denoise_128d),
        ('fMRI_text_denoise_256d', fMRI_text_denoise_256d),
        ('fMRI_text_denoise_512d', fMRI_text_denoise_512d),
        ('fMRI_text_denoise_1024d', fMRI_text_denoise_1024d),
    )
}

# Align the candidates with each other so the comparison is fair
to_compare = dict(zip(to_compare, multi_inner_align(to_compare.values())))

# Standardize every aligned candidate
to_compare = {name: standardize(to_compare[name]) for name in to_compare}

# Norm data plus its metadata table (the metadata is indexed by its 'norm' column)
norms = pd.read_csv(
    '../../data/psychNorms/psychNorms_processed.zip',
    compression='zip',
    index_col=0,
    low_memory=False,
)
norms_meta = pd.read_csv(
    '../../data/psychNorms/psychNorms_metadata_processed.csv',
    index_col='norm',
)
norms

In [None]:
# Run run_rca (from rca.rca) on the aligned, standardized candidates in to_compare,
# passing the norm data and its metadata.
# n_jobs=10 is presumably the number of parallel workers -- confirm against run_rca
# and adjust it to the machine this runs on.
results = run_rca(to_compare, norms, norms_meta, n_jobs=10)
# Cell output: the returned results (used as a DataFrame in the cells below)
results

## Comparing

In [None]:
# Label every result row with its norm's category from the metadata (underscores shown as spaces)
results['norm_category'] = results['norm'].apply(
    lambda norm_name: norms_meta.loc[norm_name]['category']
).replace({'_': ' '}, regex=True)

# Median r2_mean for each (category, embedding) pair; rows with a missing value are dropped.
# Note that despite the `_avg` name this is a median, not a mean.
results_avg = (
    results.loc[:, ['norm_category', 'embed', 'r2_mean']]
    .groupby(by=['norm_category', 'embed'], as_index=False)
    .median()
    .dropna()
)

# Wide view: one row per category, one column per embedding
results_avg_piv = results_avg.pivot(index='norm_category', columns='embed', values='r2_mean')
results_avg_piv.round(2)

In [None]:
# Rank the embeddings by their mean value across the norm categories, best first,
# to find the top-performing fmri_text_denoise variant
sorted_overall = (
    results_avg_piv
    .mean()
    .sort_values(ascending=False)
)
sorted_overall

## Saving

In [None]:
# Keep only the rows whose word appears in the psychNorms index
to_pull = set(
    pd.read_csv(
        '../../data/psychNorms/psychNorms.zip',
        compression='zip',
        index_col=0,
        low_memory=False,
    ).index
)
fMRI_text_cognival = fMRI_text_cognival[fMRI_text_cognival.index.isin(to_pull)].astype(float)
fMRI_speech_cognival = fMRI_speech_cognival[fMRI_speech_cognival.index.isin(to_pull)].astype(float)
# The 128-d frame is the denoised variant carried forward as fMRI_text_denoise
fMRI_text_denoise = fMRI_text_denoise_128d[fMRI_text_denoise_128d.index.isin(to_pull)].astype(float)

# Saving
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('../../data/embeds', exist_ok=True)
fMRI_text_cognival.to_csv('../../data/embeds/fMRI_text_cognival.csv')
fMRI_speech_cognival.to_csv('../../data/embeds/fMRI_speech_cognival.csv')
fMRI_text_denoise.to_csv('../../data/embeds/fMRI_text_denoise.csv')