In [12]:
import os
import json
import pickle
import pandas as pd
import numpy as np
from sklearn.utils.multiclass import type_of_target
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from rca import make_binary_scoring, make_multiclass_scoring, process_categorical, best_logistic_solver, checker, k_fold_cross_val

## Preparing embeds

We drop feature_overlap because it contains many NaNs and compo_attribs because it doesn't have a large enough vocabulary and is also a identical to a 65 of the norms in the psychNorms dataset.

In [3]:
with open('../../data/brain_behav_union.pkl', 'rb') as f:
    brain_behav_union = pickle.load(f)

# Loading dictionary of dtype to embed
with open('../../data/dtype_to_embed.json', 'r') as f:
    dtype_to_embed = json.load(f)
    
brain_behav_names = dtype_to_embed['brain'] + dtype_to_embed['behavior']

# Pulling and standardising embeddings
embeds = {}
embeds_path = '../../data/embeds/'
for f_name in tqdm(os.listdir(embeds_path)):
    if f_name not in ['feature_overlap.csv', 'compo_attribs.csv']:  # dropping since contains many NaNs
        
        embed = pd.read_csv(embeds_path + f_name, index_col=0)
        embed_name = f_name.split('.')[0]
        
        # Subsetting to brain and behavior vocab
        embed = embed.loc[embed.index.intersection(brain_behav_union)]
        
        # Standardising
        embeds[embed_name] = (embed - embed.mean()) / embed.std()

{name: embed.shape for name, embed in embeds.items()}

  0%|          | 0/26 [00:00<?, ?it/s]

{'fMRI_text_hyper_align': (1205, 1000),
 'norms_sensorimotor': (36854, 11),
 'EEG_text': (3355, 104),
 'LexVec_CommonCrawl': (44082, 300),
 'fastText_CommonCrawl': (44443, 300),
 'spherical_text_Wikipedia': (35533, 300),
 'GloVe_CommonCrawl': (44278, 300),
 'EEG_speech': (1591, 130),
 'THINGS': (1562, 49),
 'fMRI_speech_hyper_align': (579, 6),
 'CBOW_GoogleNews': (42830, 300),
 'morphoNLM': (32769, 50),
 'microarray': (626, 15),
 'PPMI_SVD_SouthFlorida': (4959, 300),
 'fastText_Wiki_News': (43143, 300),
 'fastTextSub_OpenSub': (40607, 300),
 'SGSoftMaxOutput_SWOW': (25442, 300),
 'PPMI_SVD_SWOW': (11783, 300),
 'GloVe_Twitter': (32947, 200),
 'GloVe_Wikipedia': (39421, 300),
 'eye_tracking': (7486, 6),
 'SGSoftMaxInput_SWOW': (11783, 300),
 'SVD_sim_rel': (6002, 300),
 'PPMI_SVD_EAT': (7775, 300)}

In [None]:
with open('../../data/embed_to_dtype.json', 'r') as f:
    embed_to_type = json.load(f)
embed_to_type

## Preparing norms

In [14]:
# Loading norms
norms = pd.read_csv('../../data/psychNorms/psychNorms.zip', index_col=0, compression='zip', low_memory=False)
norm_meta = pd.read_csv('../../data/psychNorms/psychNorms_metadata.csv', index_col='norm')
norms

Unnamed: 0_level_0,frequency_lund,frequency_kucera,frequency_subtlexus,frequency_subtlexuk,frequency_blog_gimenes,frequency_twitter_gimenes,frequency_news_gimenes,frequency_written_cobuild,frequency_spoken_cobuild,context_diversity_subtlexus,...,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear,sensory_experience_juhasz2013
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,2.62,4.38,2.93,
post office,,,,,,,,,,,...,,,,,,,3.79,3.07,5.29,
fishing rod,,,,,,,,,,,...,,,,,,,2.29,3.38,5.64,
March,,,,,,,,,,,...,,,,,,,3.43,2.76,3.50,


In [15]:
# Adding 'associated_embed' to metadata to avoid data leakage
norm_meta['associated_embed'] = np.nan
norm_meta['associated_embed'][norm_meta.index.str.contains('_lancaster')] = 'norms_sensorimotor'
norm_meta['associated_embed'][norm_meta.index == 'association_frequency_dedeyne'] = 'PPMI_SVD_SWOW SGSoftMaxInput_SWOW SGSoftMaxOutput_SWOW'

# Adding 'type' to metadata (numeric, binary, multiclass)
norm_meta['type'] = [type_of_target(norms[name].dropna()) for name in norm_meta.index]
norm_meta['type'] = norm_meta['type'].replace('continuous', 'numeric')

# Manually fixing mistyped norms
numeric_norms = [
    'n_senses_wordnet_miller', 'n_senses_wordsmyth_rice', 'n_meanings_websters_gao', 'n_features_buchanan',
    'n_semantic_neighbors_shaoul', 'association_frequency_dedeyne', 'cue_setsize_nelson', 'difficulty_rudell',
    'likableness_anderson', 'meaningfulness_anderson'
]
for norm in norm_meta.index:
    if 'vanarsdall' in norm:
        numeric_norms.append(norm)
        
norm_meta.loc[numeric_norms, 'type'] = 'numeric'

# Saving metadata
norm_meta.to_csv('../../data/psychNorms/psychNorms_metadata_processed.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  norm_meta['associated_embed'][norm_meta.index.str.contains('_lancaster')] = 'norms_sensorimotor'


In [5]:
# Log transforming selected norms
norms_to_log = pd.read_csv('../../data/norms_to_log.csv')['norm']
norms[norms_to_log] = norms[norms_to_log].apply(np.log1p)
norms_to_log

0             Nsenses_WordNet
1           Nsenses_Wordsmyth
2         Nmeanings_Wordsmyth
3          Nmeanings_Websters
4                   NFeatures
5                       Sem_N
6         Assoc_Freq_Token123
7                 Cue_SetSize
8           LexicalD_RT_V_ELP
9           LexicalD_RT_V_ECP
10          LexicalD_RT_V_BLP
11         LexicalD_RT_A_MALD
12         LexicalD_RT_A_AELP
13              Naming_RT_ELP
14       SemanticD_RT_Calgary
15                  rt_khanna
16                     rt_ley
17               rt_chiarello
18                    rt_chen
19             aoa_rt_cortese
20    imageability_rt_cortese
21                  rt_schock
Name: norm, dtype: object

## Cross Validation

In [None]:
# Ridge
min_ord, max_ord = -5, 5
alphas = np.logspace(
    min_ord, max_ord, max_ord - min_ord + 1
)
ridge = RidgeCV(alphas=alphas)

# Logistic hyperparameters
Cs = 1 / alphas
inner_cv = 5
penalty = 'l2'

# Scorers
binary_scoring = make_binary_scoring()
multiclass_scoring = make_multiclass_scoring()
continuous_scoring = {'r2': 'r2', 'neg_mse': 'neg_mean_squared_error'}

# outer_cv setting 
outer_cv, n_jobs = 5, 10

In [None]:
# RCA
rca = []
for embed_name in tqdm(embeds.keys()):
    embed = embeds[embed_name]
    
    to_print = []
    for norm_name in tqdm(norms.columns, desc=embed_name):
        
        # Aligning data
        y = norms[norm_name].dropna()
        X, y = embed.align(y, axis=0, join='inner', copy=True) 
        
        # Checking norm dtype 
        norm_dtype = norm_meta.loc[norm_name, 'type']
        
        # Solvers, scoring, estimators
        if norm_dtype in ['binary', 'multiclass']:
            X, y = process_categorical(outer_cv, inner_cv, X, y)
            
            # may have switched form multi to bin after processing
            norm_dtype = 'binary' if len(y.unique()) == 2 else 'multiclass'
            
            # Cross validation settings for logistic regression
            solver = best_logistic_solver(X, norm_dtype)
            
            # Defining logistic regression 
            estimator = LogisticRegressionCV(
                Cs=Cs, penalty=penalty, cv=StratifiedKFold(inner_cv),
                solver=solver, n_jobs=8
            )
            scoring = binary_scoring if norm_dtype == 'binary' else multiclass_scoring
        else: # continuous
            estimator, scoring = ridge, continuous_scoring
  
        # Cross validation
        associated_embed = norm_meta.loc[norm_name, 'associated_embed']
        check = checker(embed_name, y, norm_dtype, associated_embed, outer_cv)
        if check == 'pass':
            scores = k_fold_cross_val(estimator, X, y, outer_cv, scoring, n_jobs) # stratification is automatically used for classification
            r2s, mses = scores['test_r2'], - scores['test_neg_mse']
            r2_mean, r2_sd = r2s.mean(), r2s.std()
            mse_mean, mse_sd = mses.mean(), mses.std()
        else:
            r2_mean, r2_sd = np.nan, np.nan
            mse_mean, mse_sd = np.nan, np.nan
            
        # Saving
        train_n = int(((outer_cv - 1) / outer_cv) * len(X))
        test_n = len(X) - train_n
        p = X.shape[1]
        embed_type = embed_to_type[embed_name]
        rca.append([
            embed_name, embed_type, norm_name, train_n, test_n, p, 
            r2_mean, r2_sd, mse_mean, mse_sd, check
        ])
        
        to_print.append([norm_name, train_n, r2_mean, r2_sd, check])

    to_print = pd.DataFrame(to_print, columns=['norm' , 'train_n', 'r2_mean', 'r2_sd', 'check'])
    print(to_print.sort_values('r2_mean', ascending=False).head(10))

rca = pd.DataFrame(
    rca, columns=[
        'embed', 'embed_type', 'norm', 'train_n', 'test_n', 'p', 
        'r2_mean', 'r2_sd', 'mse_mean', 'mse_sd', 'check'
    ]
)

rca.to_csv('../../data/results/rca.csv', index=False)
rca