To do:
1. double check logstic convergence issues
2. check alpha range in supplementary repo

In [1]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from rca import make_binary_scorer, make_multiclass_scorer, process_categorical, best_logistic_solver, checker, k_fold_cross_val
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

## Loading Data

In [2]:
# Pulling and standardising embeddings
embeds = {}
embeds_path = '../../data/raw/embeds/'
for f_name in tqdm(os.listdir(embeds_path)):
    embed = pd.read_csv(embeds_path + f_name, index_col=0)
    embed_name = f_name.split('.')[0]
    embeds[embed_name] = (embed - embed.mean()) / embed.std()

{name: embed.shape for name, embed in embeds.items()}

  0%|          | 0/26 [00:00<?, ?it/s]

{'fMRI_text_hyper_align': (1205, 1000),
 'norms_sensorimotor': (36854, 11),
 'EEG_text': (3355, 104),
 'LexVec_CommonCrawl': (87635, 300),
 'fastText_CommonCrawl': (88986, 300),
 'spherical_text_Wikipedia': (59012, 300),
 'GloVe_CommonCrawl': (88440, 300),
 'EEG_speech': (1591, 130),
 'PPMI_SVD_SOUTH_FLORIDA': (4959, 300),
 'THINGS': (1562, 49),
 'fMRI_speech_hyper_align': (579, 6),
 'CBOW_GoogleNews': (79279, 300),
 'compo_attribs': (534, 62),
 'morphoNLM': (50506, 50),
 'microarray': (626, 15),
 'fastText_Wiki_News': (81728, 300),
 'fastTextSub_OpenSub': (72538, 300),
 'feature_overlap': (4384, 4384),
 'SGSoftMaxOutput_SWOW': (25442, 300),
 'PPMI_SVD_SWOW': (11783, 300),
 'GloVe_Twitter': (48614, 200),
 'GloVe_Wikipedia': (68943, 300),
 'eye_tracking': (7486, 6),
 'SGSoftMaxInput_SWOW': (11783, 300),
 'SVD_sim_rel': (6002, 300),
 'PPMI_SVD_EAT': (7775, 300)}

In [3]:
norms = pd.read_csv('../../data/raw/psychNorms.zip', index_col=0, compression='zip')
norm_metadata = pd.read_csv('../../data/raw/psychNorms_metadata.csv', index_col='norm')
norm_metadata['associated_embed'] = norm_metadata['associated_embed'].astype(str)
norms

  norms = pd.read_csv('../../data/raw/psychNorms.zip', index_col=0, compression='zip')


Unnamed: 0,Freq_HAL,Freq_KF,Freq_SUBTLEXUS,Freq_SUBTLEXUK,Freq_Blog,Freq_Twitter,Freq_News,Freq_CobW,Freq_CobS,CD_SUBTLEXUS,...,reproduction_vanarsdall,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,,2.62,4.38,2.93
post office,,,,,,,,,,,...,,,,,,,,3.79,3.07,5.29
fishing rod,,,,,,,,,,,...,,,,,,,,2.29,3.38,5.64
March,,,,,,,,,,,...,,,,,,,,3.43,2.76,3.50


In [3]:
with open('../../data/raw/embed_to_dtype.json', 'r') as f:
    embed_to_type = json.load(f)
embed_to_type

{'CBOW_GoogleNews': 'text',
 'fastText_CommonCrawl': 'text',
 'fastText_Wiki_News': 'text',
 'fastTextSub_OpenSub': 'text',
 'GloVe_CommonCrawl': 'text',
 'GloVe_Twitter': 'text',
 'GloVe_Wikipedia': 'text',
 'LexVec_CommonCrawl': 'text',
 'morphoNLM': 'text',
 'spherical_text_Wikipedia': 'text',
 'eye_tracking': 'brain',
 'EEG_speech': 'brain',
 'EEG_text': 'brain',
 'fMRI_speech_hyper_align': 'brain',
 'fMRI_text_hyper_align': 'brain',
 'microarray': 'brain',
 'PPMI_SVD_SWOW': 'behavior',
 'SGSoftMaxInput_SWOW': 'behavior',
 'SGSoftMaxOutput_SWOW': 'behavior',
 'PPMI_SVD_SOUTH_FLORIDA': 'behavior',
 'THINGS': 'behavior',
 'feature_overlap': 'behavior',
 'norms_sensorimotor': 'behavior',
 'compo_attribs': 'behavior',
 'SVD_sim_rel': 'behavior'}

## Cross Validation

In [5]:
# Ridge
alphas = np.logspace(-3, 3, 6)
ridge = RidgeCV(alphas=alphas)

# Logistic hyperparameters
Cs = 1 / alphas
inner_cv = 5
penalty = 'l2'

# Scorers
binary_scorer = make_binary_scorer()
multiclass_scorer = make_multiclass_scorer()

# outer_cv setting 
outer_cv, n_jobs = 5, 8

In [6]:
# RCA
rca = []
for embed_name in tqdm(embeds.keys()):
    embed = embeds[embed_name]
    
    to_print = []
    for norm_name in tqdm(norms.columns, desc=embed_name):
        
        # Aligning data
        norm = norms[norm_name].dropna()
        embed, norm = embed.align(norm, axis=0, join='inner')
        
        # Checking norm dtype 
        norm_dtype = norm_metadata.loc[norm_name, 'type']
        
        # Solvers, scoring, estimators
        if norm_dtype in ['binary', 'multiclass']:
            embed, norm = process_categorical(embed, norm, outer_cv, inner_cv)
            
            # may have switched form multi to bin after processing
            norm_dtype = 'binary' if len(norm.unique()) == 2 else 'multiclass'
            
            # Cross validation settings for logistic regression
            solver = best_logistic_solver(embed, norm_dtype)
            
            # Defining logistic regression 
            estimator = LogisticRegressionCV(
                Cs=Cs, penalty=penalty, cv=StratifiedKFold(inner_cv),
                solver=solver, n_jobs=8
            )
            scoring = binary_scorer if norm_dtype == 'binary' else multiclass_scorer
        else: # continuous
            estimator, scoring = ridge, 'r2'

            
        # Cross validation
        check = checker(
            embed_name, norm, norm_dtype, norm_metadata, outer_cv, norm_name
        )
        if check == 'pass':
             scores = k_fold_cross_val(estimator, embed, norm, outer_cv, scoring, n_jobs)
             r2_mean, r2_sd = scores.mean(), scores.std()
        else:
            r2_mean, r2_sd = np.nan, np.nan
            
        # Saving
        train_n = int(((outer_cv - 1) / outer_cv) * len(embed))
        p = embed.shape[1]
        embed_type = embed_to_type[embed_name]
        rca.append([embed_name, embed_type, norm_name, train_n, p, r2_mean, r2_sd, check])
        
        to_print.append([norm_name, train_n, r2_mean, r2_sd, check])

    
    to_print = (
        pd.DataFrame(to_print, columns=['norm' , 'train_n', 'r2_mean', 'r2_sd', 'check'])
        .sort_values('r2_mean', ascending=False).head(10)
    )
    print(to_print)

rca = pd.DataFrame(rca, columns=['embed', 'embed_type', 'norm', 'train_n', 'p', 'r2_mean', 'r2_sd', 'check'])
# rca.to_csv('../../data/final/rca.csv', index=False)
rca

  0%|          | 0/26 [00:00<?, ?it/s]

fMRI_text_hyper_align:   0%|          | 0/292 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

              norm  train_n   r2_mean     r2_sd check
17       AoA_Kuper      288 -0.001846  0.061931  pass
23       Conc_Brys      243 -0.008377  0.035188  pass
18         AoA_LWV      277 -0.018197  0.058262  pass
12      CD_Twitter      875 -0.023236  0.099616  pass
5     Freq_Twitter      877 -0.023670  0.090382  pass
25    Imag_Glasgow      243 -0.032579  0.047846  pass
10    CD_SUBTLEXUK      875 -0.034641  0.090348  pass
9     CD_SUBTLEXUS      875 -0.034682  0.084748  pass
2   Freq_SUBTLEXUS      879 -0.036402  0.112891  pass
3   Freq_SUBTLEXUK      877 -0.036876  0.102930  pass


norms_sensorimotor:   0%|          | 0/292 [00:00<?, ?it/s]



                norm  train_n   r2_mean     r2_sd check
23         Conc_Brys     2660  0.567533  0.035037  pass
24      Conc_Glasgow     2660  0.558799  0.030787  pass
25      Imag_Glasgow     2660  0.536960  0.020668  pass
42               BOI      118  0.490491  0.094716  pass
43              CBOI       89  0.454173  0.213098  pass
44  Sem_Size_Glasgow       89  0.279290  0.130280  pass
18           AoA_LWV     3005  0.179925  0.034946  pass
17         AoA_Kuper     3111  0.164209  0.032923  pass
19       AoA_Glasgow     3005  0.162158  0.030589  pass
14       Fam_Glasgow     3196  0.144811  0.016783  pass


EEG_text:   0%|          | 0/292 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 