In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from tqdm.notebook import tqdm
import json
from rca import process_categorical, best_logistic_solver, make_binary_scoring, make_multiclass_scoring, checker
import pickle

## Loading Data

In [2]:
# Reading the RCA results (rows with missing values dropped), the norm metadata, and the norms table
rca = pd.read_csv('../../data/results/rca.csv').dropna()
meta = pd.read_csv(
    '../../data/psychNorms/psychNorms_metadata_processed.csv', index_col=0
)
norms = pd.read_csv(
    '../../data/psychNorms/psychNorms_processed.zip',
    index_col=0, compression='zip', low_memory=False
)

# Adding norm_cat to rca: each norm's category looked up in meta, with underscores shown as spaces
rca['norm_cat'] = (
    rca['norm']
    .apply(lambda norm_name: meta.loc[norm_name, 'category'])
    .replace({'_': ' '}, regex=True)
)

rca

Unnamed: 0,embed,embed_type,norm,train_n,test_n,p,r2_mean,r2_sd,check,norm_cat
0,CBOW_GoogleNews,text,frequency_lund,28012,7003,300,0.522118,0.008398,pass,frequency
1,CBOW_GoogleNews,text,frequency_kucera,19285,4822,300,0.500425,0.009710,pass,frequency
2,CBOW_GoogleNews,text,frequency_subtlexus,28636,7159,300,0.537265,0.009814,pass,frequency
3,CBOW_GoogleNews,text,frequency_subtlexuk,29316,7330,300,0.545643,0.008416,pass,frequency
4,CBOW_GoogleNews,text,frequency_blog_gimenes,31876,7969,300,0.523700,0.008448,pass,frequency
...,...,...,...,...,...,...,...,...,...,...
6979,THINGS,behavior,imageability_vanarsdall,376,95,49,0.074112,0.090266,pass,imageability
6980,THINGS,behavior,familiarity_fear,173,44,49,0.151637,0.158767,pass,familiarity
6981,THINGS,behavior,aoa_fear,173,44,49,0.015012,0.129597,pass,age of acquisition
6982,THINGS,behavior,imageability_fear,173,44,49,-0.023230,0.101781,pass,imageability


In [3]:
# Per-embed summary score: the median r2_mean within each norm category
# (median mitigates outliers within norm_cats), then the mean of those medians per embed
embed_avgs = (
    rca.groupby(['embed', 'norm_cat'])[['r2_mean']]
    .median(numeric_only=True)
    .groupby(level='embed')
    .mean()
    .rename(columns={'r2_mean': 'r2_avg'})
)
embed_avgs

Unnamed: 0_level_0,r2_avg
embed,Unnamed: 1_level_1
CBOW_GoogleNews,0.447376
EEG_speech,-0.065708
EEG_text,-0.047123
GloVe_CommonCrawl,0.313229
GloVe_Twitter,0.296789
GloVe_Wikipedia,0.299761
LexVec_CommonCrawl,0.2906
PPMI_SVD_EAT,0.296205
PPMI_SVD_SWOW,0.398496
PPMI_SVD_SouthFlorida,0.252671


In [4]:
# Adding embed types
with open('../../data/embed_to_dtype.json', 'r') as f:
    embed_to_type = json.load(f)
['type'] = embed_avgs.index.map(embed_to_type)

# Finding top 2 text 
text_name_1, text_name_2 = (
    embed_avgs.query('type == "text"').sort_values('r2_avg', ascending=False).head(2).index.tolist()
)
text_name_1, text_name_2

('CBOW_GoogleNews', 'fastText_CommonCrawl')

In [5]:
# Finding top behavior
behavior_name = (
    embed_avgs.query('type == "behavior"').sort_values('r2_avg', ascending=False).head(1).index[0]
)
behavior_name

'PPMI_SVD_SWOW'

In [6]:
# Finding top brain
brain_name = (
    embed_avgs.query('type == "brain"').sort_values('r2_avg', ascending=False).head(1).index[0]
)
brain_name

'EEG_text'

In [7]:
# Loading embeds
text_1 = pd.read_csv(f'../../data/embeds/{text_name_1}.csv', index_col=0)
text_2 = pd.read_csv(f'../../data/embeds/{text_name_2}.csv', index_col=0)
behavior = pd.read_csv(f'../../data/embeds/{behavior_name}.csv', index_col=0)
brain = pd.read_csv(f'../../data/embeds/{brain_name}.csv', index_col=0)

with open('../../data/brain_behav_union.pkl', 'rb') as f:
    brain_behav_union = list(pickle.load(f))

# Subsdetting to brain and behavior union
text_1 = text_1.loc[text_1.index.isin(brain_behav_union)]
text_2 = text_2.loc[text_2.index.isin(brain_behav_union)]
behavior = behavior.loc[behavior.index.isin(brain_behav_union)]
brain = brain.loc[brain.index.isin(brain_behav_union)]

# Standardizing within brain and behavior union (as in 2_rca.ipynb)
standardize = lambda df: (df - df.mean()) / df.std()
text_1, text_2, behavior = standardize(text_1), standardize(text_2), standardize(behavior)

# Ensembling for text vs behavior comparison
text_behav_embeds = {
    behavior_name: behavior,
    text_name_1: text_1, 
    text_name_2: text_2,
    text_name_1 + '&' + text_name_2: pd.concat([text_1, text_2], axis=1),
    text_name_1 + '&' + behavior_name: pd.concat([text_1, behavior], axis=1),
    text_name_2 + '&' + behavior_name: pd.concat([text_2, behavior], axis=1)
}

# Ensembling for text vs brain comparison
text_brain_embeds = {
    brain_name: brain,
    text_name_1: text_1,
    text_name_2: text_2,
    text_name_1 + '&' + text_name_2: pd.concat([text_1, text_2], axis=1),
    text_name_1 + '&' + brain_name: pd.concat([text_1, brain], axis=1),
    text_name_2 + '&' + brain_name: pd.concat([text_2, brain], axis=1)
}

def multi_df_index_align(embed_dict: dict) -> dict:
    """Inner-aligns the indexes of multiple dfs.

    Parameters
    ----------
    embed_dict : dict
        Maps a name to a DataFrame.

    Returns
    -------
    dict
        The same keys in the same order; each DataFrame is restricted to the index
        labels shared by every input, with rows in sorted label order. An empty
        input yields an empty dict.
    """
    if not embed_dict:
        # set.intersection() needs at least one set; with no dfs there is nothing to align
        return {}
    vocabs = [set(embed.index) for embed in embed_dict.values()]
    intersect = sorted(set.intersection(*vocabs))
    return {name: df.loc[intersect] for name, df in embed_dict.items()}

# Restricting the embeds of each comparison dict to the vocabulary they all share
text_behav_embeds = multi_df_index_align(text_behav_embeds)
text_brain_embeds = multi_df_index_align(text_brain_embeds)

# Fixing column names: every embed gets consecutive integer labels 0..n_columns-1
for comparison_dict in (text_behav_embeds, text_brain_embeds):
    for embed in comparison_dict.values():
        embed.columns = list(range(len(embed.columns)))

{name: df.shape for name, df in text_behav_embeds.items()}

{'PPMI_SVD_SWOW': (11723, 300),
 'CBOW_GoogleNews': (11723, 300),
 'fastText_CommonCrawl': (11723, 300),
 'CBOW_GoogleNews&fastText_CommonCrawl': (11723, 600),
 'CBOW_GoogleNews&PPMI_SVD_SWOW': (11723, 600),
 'fastText_CommonCrawl&PPMI_SVD_SWOW': (11723, 600)}

In [8]:
# (rows, columns) of every embed in the text vs brain comparison
{embed_name: embed_df.shape for embed_name, embed_df in text_brain_embeds.items()}

{'EEG_text': (3306, 104),
 'CBOW_GoogleNews': (3306, 300),
 'fastText_CommonCrawl': (3306, 300),
 'CBOW_GoogleNews&fastText_CommonCrawl': (3306, 600),
 'CBOW_GoogleNews&EEG_text': (3306, 404),
 'fastText_CommonCrawl&EEG_text': (3306, 404)}

## Cross Validation

In [10]:
# Changing associated_embed to more usable format
meta['associated_embed'] = meta['associated_embed'].str.split(' ')

# Ridge
min_ord, max_ord = -5, 5
alphas = np.logspace(
    min_ord, max_ord, max_ord - min_ord + 1
)
ridge = RidgeCV(alphas=alphas)

# Logistic hyperparameters
Cs = 1 / alphas
inner_cv = 5
penalty = 'l2'

# Scorers
binary_scoring = make_binary_scoring()
multiclass_scoring = make_multiclass_scoring()
continuous_scoring = 'r2'

# outer_cv setting 
outer_cv, n_jobs = 5, 6

comparisons = {
    # 'text_behav': text_behav_embeds,
    'text_brain': text_brain_embeds
}

# Running comparative RCA for text-behavior and text-brain
for comparison_name, comparison_dict in comparisons.items():

    results = []
    for norm_name in tqdm(norms.columns):
        print(f'{norm_name}:')
        y = norms[norm_name].dropna()

        to_print = []
        for embed_name, embed in comparison_dict.items():

            # Aligning embed with norm
            X, y = embed.align(y, axis='index', join='inner', copy=True)

            # Checking norm dtype
            norm_dtype = meta.loc[norm_name, 'type']

            # Solvers, scoring, estimators ir categorical or continuous
            if norm_dtype in ['binary', 'multiclass']: # categorical
                X, y = process_categorical(outer_cv, inner_cv, X, y)

                # may have switched form multi to bin after processing
                norm_dtype = 'binary' if len(y.unique()) == 2 else 'multiclass'

                # Cross validation settings for logistic regression
                solver = best_logistic_solver(y, norm_dtype)

                # Defining logistic regression
                estimator = LogisticRegressionCV(
                    Cs=Cs, penalty=penalty, cv=StratifiedKFold(inner_cv), solver=solver
                )
                scoring = binary_scoring if norm_dtype == 'binary' else multiclass_scoring
            else: # continuous
                estimator, scoring = ridge, continuous_scoring

            # Cross validation
            associated_embed = meta.loc[norm_name, 'associated_embed']
            check = checker(list(comparison_dict.keys()), y, norm_dtype, associated_embed, outer_cv)
            if check == 'pass':
                r2s = cross_val_score(estimator, X, y, cv=outer_cv, scoring=scoring, n_jobs=n_jobs) # stratification is automatically used for classification
            else:
                r2s = pd.Series([np.nan] * outer_cv)

            # Saving
            train_n = int(((outer_cv - 1) / outer_cv) * len(y))
            for i, r2 in enumerate(r2s):
                results.append([embed_name, norm_name, train_n, i + 1, r2, check])

            # For printing
            to_print.append([embed_name, r2s.mean(), r2s.std(), check])

        # Printing
        to_print = pd.DataFrame(to_print, columns=['embed', 'r2_mean', 'r2_std', 'check'])
        print(to_print.sort_values('r2_mean', ascending=False).head(10).reset_index(drop=True))
        print('--------------------------------')


    results = pd.DataFrame(
        results, columns=[
            'embed', 'norm', 'train_n', 'fold', 'r2', 'check']
    )
    results.to_csv(f'../../data/results/rca_{comparison_name}.csv', index=False)


  0%|          | 0/291 [00:00<?, ?it/s]

frequency_lund:
                                  embed   r2_mean    r2_std check
0  CBOW_GoogleNews&fastText_CommonCrawl  0.847037  0.021937  pass
1                  fastText_CommonCrawl  0.832547  0.019327  pass
2         fastText_CommonCrawl&EEG_text  0.831930  0.019700  pass
3              CBOW_GoogleNews&EEG_text  0.723892  0.023722  pass
4                       CBOW_GoogleNews  0.723272  0.022823  pass
5                              EEG_text -0.010507  0.014033  pass
--------------------------------
frequency_kucera:
                                  embed   r2_mean    r2_std check
0  CBOW_GoogleNews&fastText_CommonCrawl  0.768437  0.016121  pass
1                  fastText_CommonCrawl  0.746552  0.027365  pass
2         fastText_CommonCrawl&EEG_text  0.745482  0.026614  pass
3              CBOW_GoogleNews&EEG_text  0.662091  0.014080  pass
4                       CBOW_GoogleNews  0.661609  0.013261  pass
5                              EEG_text -0.008826  0.010033  pass
---------



KeyboardInterrupt: 