In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
import json
from rca import process_categorical, best_logistic_solver, k_fold_cross_val, make_binary_scoring, make_multiclass_scoring, checker

## Loading Data

In [10]:
# Read the saved results table; the following cell averages it per embedding
# to choose which embeddings to ensemble.
rca = pd.read_csv('../../data/results/rca.csv')
rca

Unnamed: 0,embed,embed_type,norm,train_n,test_n,p,r2_mean,r2_sd,mse_mean,mse_sd,check
0,CBOW_GoogleNews,text,Freq_HAL,28012,7003,300,0.522106,0.008390,2.715519,0.072449,pass
1,CBOW_GoogleNews,text,Freq_KF,19285,4822,300,0.500385,0.009733,0.156678,0.004765,pass
2,CBOW_GoogleNews,text,Freq_SUBTLEXUS,28636,7159,300,0.537246,0.009834,0.361360,0.007800,pass
3,CBOW_GoogleNews,text,Freq_SUBTLEXUK,29316,7330,300,0.545626,0.008433,0.446059,0.009071,pass
4,CBOW_GoogleNews,text,Freq_Blog,31876,7969,300,0.523688,0.008467,0.400176,0.009336,pass
...,...,...,...,...,...,...,...,...,...,...,...
7295,THINGS,behavior,familiarity_vanarsdall,376,95,49,0.060692,0.083942,3326.617511,620.203961,pass
7296,THINGS,behavior,imageability_vanarsdall,376,95,49,0.053593,0.095680,1345.018025,292.430129,pass
7297,THINGS,behavior,familiarity_fear,173,44,49,0.139160,0.160894,0.790788,0.216911,pass
7298,THINGS,behavior,aoa_fear,173,44,49,-0.021206,0.121789,0.533727,0.039008,pass


In [11]:
# One row per embedding: the mean of every numeric result column.
embed_means = rca.groupby('embed').mean(numeric_only=True)

# Label each embedding with its type, looked up in the JSON mapping
with open('../../data/embed_to_dtype.json', 'r') as f:
    embed_to_type = json.load(f)
embed_means['type'] = embed_means.index.map(embed_to_type)

# Rank the text embeddings by mean r2 (best first) and keep the top two names
text_ranked = embed_means[embed_means['type'] == 'text'].sort_values(
    'r2_mean', ascending=False
)
text_name_1, text_name_2 = text_ranked.index[:2].tolist()
text_name_1, text_name_2

('CBOW_GoogleNews', 'morphoNLM')

In [12]:
# Rank the behavior embeddings by mean r2 (best first) and keep the winner
behavior_ranked = embed_means[embed_means['type'] == 'behavior'].sort_values(
    'r2_mean', ascending=False
)
behavior_name = behavior_ranked.index[0]
behavior_name

'PPMI_SVD_SWOW'

In [13]:
def standardize(df):
    """Z-score every column: subtract the column mean, divide by the column std."""
    return (df - df.mean()) / df.std()


# Read the three selected embeddings; the first CSV column becomes the index
text_1 = pd.read_csv(f'../../data/embeds/{text_name_1}.csv', index_col=0)
text_2 = pd.read_csv(f'../../data/embeds/{text_name_2}.csv', index_col=0)
behavior = pd.read_csv(f'../../data/embeds/{behavior_name}.csv', index_col=0)

# Restrict all three to the index labels they share, in sorted order
shared_vocab = set(text_1.index) & set(text_2.index) & set(behavior.index)
intersect = sorted(shared_vocab)
text_1, text_2, behavior = (df.loc[intersect] for df in (text_1, text_2, behavior))

# Put every dimension on the same scale before concatenating embeddings
text_1, text_2, behavior = map(standardize, (text_1, text_2, behavior))

# Single text embeddings plus the pairwise ensembles to compare them against
embeds = {
    text_name_1: text_1,
    text_name_2: text_2,
    '&'.join([text_name_1, text_name_2]): pd.concat([text_1, text_2], axis=1),
    '&'.join([text_name_1, behavior_name]): pd.concat([text_1, behavior], axis=1),
    '&'.join([text_name_2, behavior_name]): pd.concat([text_2, behavior], axis=1),
}

# Concatenation leaves repeated column labels; renumber them 0..p-1 in place
for embed in embeds.values():
    embed.columns = list(range(embed.shape[1]))

{name: frame.shape for name, frame in embeds.items()}

{'CBOW_GoogleNews': (11182, 300),
 'morphoNLM': (11182, 50),
 'CBOW_GoogleNews&morphoNLM': (11182, 350),
 'CBOW_GoogleNews&PPMI_SVD_SWOW': (11182, 600),
 'morphoNLM&PPMI_SVD_SWOW': (11182, 350)}

In [14]:
# Norm metadata, indexed by its first CSV column; `associated_embed` is split
# on single spaces so each string entry becomes a list of tokens.
meta = pd.read_csv('../../data/psychNorms/psychNorms_metadata.csv', index_col=0)
meta = meta.assign(associated_embed=meta['associated_embed'].str.split(' '))

# The norms table itself (read straight from the zip archive)
norms = pd.read_csv(
    '../../data/psychNorms/psychNorms.zip',
    index_col=0,
    compression='zip',
    low_memory=False,
)
norms

Unnamed: 0,Freq_HAL,Freq_KF,Freq_SUBTLEXUS,Freq_SUBTLEXUK,Freq_Blog,Freq_Twitter,Freq_News,Freq_CobW,Freq_CobS,CD_SUBTLEXUS,...,reproduction_vanarsdall,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,,2.62,4.38,2.93
post office,,,,,,,,,,,...,,,,,,,,3.79,3.07,5.29
fishing rod,,,,,,,,,,,...,,,,,,,,2.29,3.38,5.64
March,,,,,,,,,,,...,,,,,,,,3.43,2.76,3.50


In [15]:
# Log transforming selected norms
# NOTE: pickle.load executes arbitrary code from the file; only use it on a
# trusted, project-generated norms_to_log.pkl.
with open('../../data/norms_to_log.pkl', 'rb') as f:
    norms_to_log = pickle.load(f)

# The transform mutates `norms` in place, so re-running this cell without
# reloading `norms` would apply log1p a second time. A marker stored on the
# frame makes the cell safe to re-run; the marker disappears whenever `norms`
# is read from disk again, so a fresh load is always transformed exactly once.
# (DataFrame.attrs is documented by pandas as experimental.)
if not norms.attrs.get('log1p_applied', False):
    norms[norms_to_log] = norms[norms_to_log].apply(np.log1p)
    norms.attrs['log1p_applied'] = True

## Cross Validation

In [16]:
# --- Ridge (continuous norms) ---
# One candidate penalty per order of magnitude from 10**min_ord to 10**max_ord
min_ord = -5
max_ord = 5
alphas = np.logspace(start=min_ord, stop=max_ord, num=max_ord - min_ord + 1)
ridge = RidgeCV(alphas=alphas)

# --- Logistic regression (categorical norms) ---
# The same grid, expressed as the reciprocal of each ridge alpha
penalty = 'l2'
inner_cv = 5
Cs = 1 / alphas

# --- Scorers ---
continuous_scoring = dict(r2='r2', neg_mse='neg_mean_squared_error')
binary_scoring = make_binary_scoring()
multiclass_scoring = make_multiclass_scoring()

# --- Outer cross-validation ---
outer_cv = 5  # number of outer folds
n_jobs = 6    # forwarded to k_fold_cross_val

# Names of the stand-alone embeddings, handed to `checker` for its data-leakage check
solo_embed_names = [text_name_1, text_name_2, behavior_name]

In [17]:
# RCA: cross-validate every norm against each single / ensembled embedding,
# keeping one row per (embedding, norm, outer fold).
rca_rows = []  # separate name: `rca` holds a DataFrame before and after this loop
for norm_name in tqdm(norms.columns):
    print(f'{norm_name}:')
    norm_values = norms[norm_name].dropna()

    # Per-norm metadata does not depend on the embedding, so look it up once
    declared_dtype = meta.loc[norm_name, 'type']
    associated_embed = meta.loc[norm_name, 'associated_embed']

    to_print = []
    for embed_name, embed in embeds.items():

        # Aligning embed with norm. Always start from the untouched
        # `norm_values`: re-aligning against a `y` carried over from the
        # previous embedding (possibly already filtered by
        # process_categorical) would make each result depend on loop order.
        X, y = embed.align(norm_values, axis='index', join='inner', copy=True)

        # Start from the dtype declared in the metadata for every embedding
        norm_dtype = declared_dtype

        # Solvers, scoring, estimators if categorical or continuous
        if norm_dtype in ['binary', 'multiclass']:  # categorical
            X, y = process_categorical(outer_cv, inner_cv, X, y)

            # may have switched from multi to bin after processing
            norm_dtype = 'binary' if len(y.unique()) == 2 else 'multiclass'

            # Cross validation settings for logistic regression
            solver = best_logistic_solver(y, norm_dtype)

            # Defining logistic regression
            # NOTE(review): the recorded output of this cell contains
            # "ITERATIONS REACHED LIMIT" convergence warnings; max_iter is
            # left at its default here so results stay comparable.
            estimator = LogisticRegressionCV(
                Cs=Cs, penalty=penalty, cv=StratifiedKFold(inner_cv), solver=solver
            )
            scoring = binary_scoring if norm_dtype == 'binary' else multiclass_scoring
        else:  # continuous
            estimator, scoring = ridge, continuous_scoring

        # Cross validation (only when the checker lets this norm through)
        check = checker(solo_embed_names, y, norm_dtype, associated_embed, outer_cv)
        if check == 'pass':
            scores = k_fold_cross_val(estimator, X, y, outer_cv, scoring, n_jobs)  # stratification is automatically used for classification
            # The scorer reports negated MSE; flip the sign back
            r2s, mses = scores['test_r2'], - scores['test_neg_mse']
        else:
            # Keep one placeholder row per fold so failed checks stay visible
            r2s, mses = pd.Series([np.nan] * outer_cv), pd.Series([np.nan] * outer_cv)

        # Saving: one row per outer fold (folds numbered from 1)
        train_n = int(((outer_cv - 1) / outer_cv) * len(y))  # (k-1)/k of the aligned sample
        for i, (r2, mse) in enumerate(zip(r2s, mses)):
            rca_rows.append([embed_name, norm_name, train_n, i + 1, r2, mse, check])

        # Printing
        to_print.append([embed_name, r2s.mean(), r2s.std(), check])
    to_print = pd.DataFrame(to_print, columns=['embed', 'r2_mean', 'r2_std', 'check'])
    print(to_print.sort_values('r2_mean', ascending=False).head(10).reset_index(drop=True))
    print('--------------------------------')


# Collect the per-fold rows, persist them, and display the table
rca = pd.DataFrame(
    rca_rows, columns=[
        'embed', 'norm', 'train_n', 'fold', 'r2', 'mse', 'check']
)
rca.to_csv('../../data/results/rca_ensemb.csv', index=False)
rca

  0%|          | 0/292 [00:00<?, ?it/s]

Freq_HAL:
                           embed   r2_mean    r2_std check
0      CBOW_GoogleNews&morphoNLM  0.734332  0.014650  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.722789  0.014559  pass
2        morphoNLM&PPMI_SVD_SWOW  0.693546  0.017359  pass
3                CBOW_GoogleNews  0.668775  0.011807  pass
4                      morphoNLM  0.500359  0.041678  pass
--------------------------------
Freq_KF:
                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.664336  0.018458  pass
1      CBOW_GoogleNews&morphoNLM  0.661353  0.014110  pass
2        morphoNLM&PPMI_SVD_SWOW  0.626375  0.026625  pass
3                CBOW_GoogleNews  0.610917  0.010366  pass
4                      morphoNLM  0.415260  0.048085  pass
--------------------------------
Freq_SUBTLEXUS:
                           embed   r2_mean    r2_std check
0      CBOW_GoogleNews&morphoNLM  0.735633  0.015040  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.730976  0.019344  pass
2             



                           embed   r2_mean    r2_std check
0      CBOW_GoogleNews&morphoNLM  0.738394  0.013610  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.712817  0.009234  pass
2                CBOW_GoogleNews  0.703777  0.007526  pass
3        morphoNLM&PPMI_SVD_SWOW  0.625629  0.012562  pass
4                      morphoNLM  0.478132  0.029926  pass
--------------------------------
DPoS_VanH:




                           embed   r2_mean    r2_std check
0      CBOW_GoogleNews&morphoNLM  0.667474  0.007843  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.640707  0.005588  pass
2                CBOW_GoogleNews  0.632706  0.004964  pass
3        morphoNLM&PPMI_SVD_SWOW  0.559527  0.015359  pass
4                      morphoNLM  0.449859  0.029690  pass
--------------------------------
Conc_Brys:
                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.834569  0.004902  pass
1      CBOW_GoogleNews&morphoNLM  0.784481  0.008937  pass
2        morphoNLM&PPMI_SVD_SWOW  0.770602  0.007375  pass
3                CBOW_GoogleNews  0.769221  0.010234  pass
4                      morphoNLM  0.525626  0.017414  pass
--------------------------------
Conc_Glasgow:
                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.833330  0.003774  pass
1      CBOW_GoogleNews&morphoNLM  0.799784  0.007685  pass
2                CBOW_Go



                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.414850  0.034441  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.399798  0.036357  pass
2      CBOW_GoogleNews&morphoNLM  0.351462  0.024185  pass
3                CBOW_GoogleNews  0.350730  0.022965  pass
4                      morphoNLM  0.207353  0.021248  pass
--------------------------------
Emot_Assoc_Anticipation:




                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.169642  0.031962  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.162445  0.029328  pass
2      CBOW_GoogleNews&morphoNLM  0.153669  0.018299  pass
3                CBOW_GoogleNews  0.153532  0.014987  pass
4                      morphoNLM  0.064160  0.015593  pass
--------------------------------
Emot_Assoc_Disgust:




                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.390003  0.034232  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.389310  0.040427  pass
2                CBOW_GoogleNews  0.371623  0.044903  pass
3      CBOW_GoogleNews&morphoNLM  0.367211  0.048004  pass
4                      morphoNLM  0.096472  0.009408  pass
--------------------------------
Emot_Assoc_Fear:




                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.378850  0.016828  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.368984  0.027109  pass
2      CBOW_GoogleNews&morphoNLM  0.354726  0.015381  pass
3                CBOW_GoogleNews  0.353450  0.015951  pass
4                      morphoNLM  0.209875  0.013590  pass
--------------------------------
Emot_Assoc_Joy:




                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.413917  0.044206  pass
1        morphoNLM&PPMI_SVD_SWOW  0.405087  0.040501  pass
2      CBOW_GoogleNews&morphoNLM  0.362079  0.022070  pass
3                CBOW_GoogleNews  0.358413  0.028827  pass
4                      morphoNLM  0.141075  0.028326  pass
--------------------------------
Emot_Assoc_Negative:




                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.506092  0.033437  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.505136  0.031989  pass
2                CBOW_GoogleNews  0.472723  0.032119  pass
3      CBOW_GoogleNews&morphoNLM  0.471110  0.037500  pass
4                      morphoNLM  0.232076  0.036721  pass
--------------------------------
Emot_Assoc_Positive:




                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.333749  0.053219  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.321070  0.063844  pass
2                CBOW_GoogleNews  0.286458  0.056597  pass
3      CBOW_GoogleNews&morphoNLM  0.285044  0.061219  pass
4                      morphoNLM  0.120359  0.029801  pass
--------------------------------
Emot_Assoc_Sadness:




                           embed   r2_mean    r2_std check
0        morphoNLM&PPMI_SVD_SWOW  0.387299  0.042906  pass
1  CBOW_GoogleNews&PPMI_SVD_SWOW  0.380755  0.051022  pass
2      CBOW_GoogleNews&morphoNLM  0.343850  0.048389  pass
3                CBOW_GoogleNews  0.342561  0.043115  pass
4                      morphoNLM  0.126120  0.032888  pass
--------------------------------
Emot_Assoc_Surprise:




                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.160193  0.050923  pass
1      CBOW_GoogleNews&morphoNLM  0.145313  0.040504  pass
2                CBOW_GoogleNews  0.145174  0.039846  pass
3        morphoNLM&PPMI_SVD_SWOW  0.133519  0.018387  pass
4                      morphoNLM  0.050999  0.026752  pass
--------------------------------
Emot_Assoc_Trust:




                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.226851  0.024280  pass
1        morphoNLM&PPMI_SVD_SWOW  0.220807  0.008609  pass
2                CBOW_GoogleNews  0.196456  0.011793  pass
3      CBOW_GoogleNews&morphoNLM  0.192078  0.012655  pass
4                      morphoNLM  0.076965  0.004935  pass
--------------------------------
Sem_Diversity:
                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.701378  0.009179  pass
1      CBOW_GoogleNews&morphoNLM  0.698977  0.012386  pass
2                CBOW_GoogleNews  0.687747  0.011937  pass
3        morphoNLM&PPMI_SVD_SWOW  0.566096  0.015318  pass
4                      morphoNLM  0.416885  0.022940  pass
--------------------------------
Sem_N:
                           embed   r2_mean    r2_std check
0      CBOW_GoogleNews&morphoNLM  0.677512  0.009357  pass
1        morphoNLM&PPMI_SVD_SWOW  0.641285  0.008565  pass
2  CBOW_GoogleNews&PPMI_SVD

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.487503  0.203448  pass
1        morphoNLM&PPMI_SVD_SWOW  0.483213  0.166186  pass
2                CBOW_GoogleNews  0.445316  0.126590  pass
3      CBOW_GoogleNews&morphoNLM  0.385321  0.336383  pass
4                      morphoNLM  0.042943  0.408295  pass
--------------------------------
imagery_toronto:
                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.640457  0.052027  pass
1      CBOW_GoogleNews&morphoNLM  0.618222  0.045067  pass
2        morphoNLM&PPMI_SVD_SWOW  0.606365  0.069751  pass
3                CBOW_GoogleNews  0.579941  0.039133  pass
4                      morphoNLM  0.532646  0.073656  pass
--------------------------------
concreteness_toronto:
                           embed   r2_mean    r2_std check
0  CBOW_GoogleNews&PPMI_SVD_SWOW  0.704782  0.040215  pass
1      CBOW_GoogleNews&morphoNLM  0.704015  0.045144  pass
2         

Unnamed: 0,embed,norm,train_n,fold,r2,mse,check
0,CBOW_GoogleNews,Freq_HAL,8668,1,0.660651,1.065544,pass
1,CBOW_GoogleNews,Freq_HAL,8668,2,0.659874,1.047850,pass
2,CBOW_GoogleNews,Freq_HAL,8668,3,0.678498,1.135581,pass
3,CBOW_GoogleNews,Freq_HAL,8668,4,0.657754,1.111463,pass
4,CBOW_GoogleNews,Freq_HAL,8668,5,0.687100,1.171432,pass
...,...,...,...,...,...,...,...
7295,morphoNLM&PPMI_SVD_SWOW,imageability_fear,500,1,0.512233,0.710529,pass
7296,morphoNLM&PPMI_SVD_SWOW,imageability_fear,500,2,0.586266,0.844248,pass
7297,morphoNLM&PPMI_SVD_SWOW,imageability_fear,500,3,0.555577,0.862000,pass
7298,morphoNLM&PPMI_SVD_SWOW,imageability_fear,500,4,0.437230,0.919240,pass
