In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
import itertools
import json
from rca import process_categorical, best_logistic_solver, k_fold_cross_val, make_binary_scorer, make_multiclass_scorer, checker

## Loading Data

In [2]:
# Load the previously saved RCA results table (the displayed columns include
# embed, embed_type, norm, train_n, p, r2_mean, r2_sd, check) and show it.
rca = pd.read_csv('../../data/final/rca.csv')
rca

Unnamed: 0,embed,embed_type,norm,train_n,p,r2_mean,r2_sd,check
0,CBOW_GoogleNews,text,Freq_HAL,51174,300,0.422344,0.006255,pass
1,CBOW_GoogleNews,text,Freq_KF,26605,300,0.463358,0.009852,pass
2,CBOW_GoogleNews,text,Freq_SUBTLEXUS,43939,300,0.488748,0.006706,pass
3,CBOW_GoogleNews,text,Freq_SUBTLEXUK,47398,300,0.479608,0.008173,pass
4,CBOW_GoogleNews,text,Freq_Blog,53251,300,0.463084,0.006165,pass
...,...,...,...,...,...,...,...,...
7295,THINGS,behavior,familiarity_vanarsdall,376,49,0.060692,0.083942,pass
7296,THINGS,behavior,imageability_vanarsdall,376,49,0.053593,0.095680,pass
7297,THINGS,behavior,familiarity_fear,173,49,0.139160,0.160894,pass
7298,THINGS,behavior,aoa_fear,173,49,-0.021206,0.121789,pass


In [3]:
# Average every numeric column of the RCA results within each embedding
embed_means = rca.groupby('embed').mean(numeric_only=True)

# Tag each embedding with its type using the JSON lookup table
with open('../../data/raw/embed_to_dtype.json', 'r') as type_file:
    embed_to_type = json.load(type_file)
embed_means['type'] = embed_means.index.map(embed_to_type)

# Number of top-ranked embeddings kept per type
top_n = 2

# Rank the text embeddings by their average r2_mean (best first), keep top_n names
text_ranked = embed_means.query('type == "text"').sort_values('r2_mean', ascending=False)
top_text_names = text_ranked.head(top_n).index.tolist()

# Every unordered pair of the retained text embeddings
text_text_names = list(itertools.combinations(top_text_names, r=2))
text_text_names

[('CBOW_GoogleNews', 'morphoNLM')]

In [4]:
# Ensembling top behavior
top_behavior_names = embed_means.query('type == "behavior"').sort_values('r2_mean', ascending=False).head(top_n).index.tolist()
text_behavior_names =  []
for text_name in top_text_names:
    for behavior_name in top_behavior_names:
        text_behavior_names.append((text_name, behavior_name))
text_behavior_names

[('CBOW_GoogleNews', 'PPMI_SVD_SWOW'),
 ('CBOW_GoogleNews', 'SGSoftMaxInput_SWOW'),
 ('morphoNLM', 'PPMI_SVD_SWOW'),
 ('morphoNLM', 'SGSoftMaxInput_SWOW')]

In [5]:
# Loading embeddings: one CSV per selected embedding name, with the first CSV
# column used as the row index.
# (The standardization helper lives with the cross-validation settings further
# down; the misspelled, unused copy that used to sit here has been removed.)
embeds = {
    name: pd.read_csv(f'../../data/raw/embeds/{name}.csv', index_col=0)
    for name in top_text_names + top_behavior_names
}

# Shape of every loaded embedding, keyed by embedding name
{name: embed.shape for name, embed in embeds.items()}

{'CBOW_GoogleNews': (79280, 300),
 'morphoNLM': (50508, 50),
 'PPMI_SVD_SWOW': (11783, 300),
 'SGSoftMaxInput_SWOW': (11783, 300)}

In [6]:
# Norm metadata, indexed by the first CSV column; the space-separated
# 'associated_embed' strings are split into lists
meta = pd.read_csv('../../data/raw/psychNorms_metadata.csv', index_col=0).assign(
    associated_embed=lambda frame: frame['associated_embed'].str.split(' ')
)

# Norm table read straight from the zip archive, indexed by its first column
norms = pd.read_csv(
    '../../data/raw/psychNorms.zip',
    index_col=0,
    compression='zip',
    low_memory=False,
)
norms

Unnamed: 0,Freq_HAL,Freq_KF,Freq_SUBTLEXUS,Freq_SUBTLEXUK,Freq_Blog,Freq_Twitter,Freq_News,Freq_CobW,Freq_CobS,CD_SUBTLEXUS,...,reproduction_vanarsdall,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,,2.62,4.38,2.93
post office,,,,,,,,,,,...,,,,,,,,3.79,3.07,5.29
fishing rod,,,,,,,,,,,...,,,,,,,,2.29,3.38,5.64
March,,,,,,,,,,,...,,,,,,,,3.43,2.76,3.50


## Cross Validation

In [7]:
def standardize(df):
    """Subtract `df.mean()` and divide by `df.std()` (column-wise for a DataFrame)."""
    return (df - df.mean()) / df.std()


# Ridge: one candidate penalty per power of ten from 10**min_alpha to 10**max_alpha
min_alpha, max_alpha = -3, 6
n_alphas = max_alpha - min_alpha + 1
alphas = np.logspace(min_alpha, max_alpha, n_alphas)
ridge = RidgeCV(alphas=alphas)

# Logistic hyperparameters: candidate C values are the reciprocals of the ridge alphas
Cs = 1 / alphas
inner_cv = 5
penalty = 'l2'

# Scorers
binary_scorer = make_binary_scorer()
multiclass_scorer = make_multiclass_scorer()

# outer_cv setting
outer_cv = 5
n_jobs = 8

In [8]:
# RCA
rca = []
for (text_name, behavior_name) in tqdm(text_behavior_names):
    
    # Loading text-text baseline embedding
    text_text_embed = pd.concat([embeds[name] for name in top_text_names], axis=1, join='inner')
    text_text_embed.columns = list(range(text_text_embed.shape[1]))
    text_text_name = '&'.join(top_text_names)

    # Loading text-behavior embedding
    text_behavior_embed = pd.concat([embeds[text_name], embeds[behavior_name]], axis=1, join='inner')
    text_behavior_embed.columns = list(range(text_behavior_embed.shape[1]))
    text_behavior_name = f'{text_name}&{behavior_name}'
    
    # Aligning embedding to have same vocab for fair comparison
    text_text_embed, text_behavior_embed = text_text_embed.align(
        text_behavior_embed, axis='index', join='inner', copy=True
    )
    
    # Standardizing
    text_text_embed, text_behavior_embed = standardize(text_text_embed), standardize(text_behavior_embed)
       
    for norm_name in tqdm(norms.columns, desc=text_behavior_name):
        
        # Aligning embeddings with norm
        y = norms[norm_name].dropna()
        X_tt, y = text_text_embed.align(y, axis='index', join='inner', copy=True)
        X_tb, y = text_behavior_embed.align(y, axis='index', join='inner', copy=True)
        
        # Checking norm dtype 
        norm_dtype = meta.loc[norm_name, 'type']
        
        # Solvers, scoring, estimators ir categorical or continuous
        if norm_dtype in ['binary', 'multiclass']: # categorical
            X_tt, X_tb, y = process_categorical(outer_cv, inner_cv, y, X_tt, X_tb)
            
            # may have switched form multi to bin after processing
            norm_dtype = 'binary' if len(y.unique()) == 2 else 'multiclass'
            
            # Cross validation settings for logistic regression
            solver = best_logistic_solver(y, norm_dtype)
            
            # Defining logistic regression 
            estimator = LogisticRegressionCV(
                Cs=Cs, penalty=penalty, cv=StratifiedKFold(inner_cv), solver=solver
            )
            scoring = binary_scorer if norm_dtype == 'binary' else multiclass_scorer
        else: # continuous
            estimator, scoring = ridge, 'r2'
            
        # Cross validation
        embed_names = top_text_names + [behavior_name]
        data_check = checker(embed_names, y, norm_dtype, meta, outer_cv, norm_name)
        if data_check == 'pass':
            text_text_scores = k_fold_cross_val(estimator, X_tt, y, outer_cv, scoring, n_jobs)
            text_behavior_scores = k_fold_cross_val(estimator, X_tb, y, outer_cv, scoring, n_jobs)
        else:
            text_text_scores, text_behavior_scores = [np.nan] * outer_cv, [np.nan] * outer_cv
            
        # Saving
        train_n = int(((outer_cv - 1) / outer_cv) * len(y))
        for text_score, text_behavior_score in zip(text_text_scores, text_behavior_scores):
            rca.append([
                text_text_name, text_behavior_name, norm_name, train_n, text_score, 
                text_behavior_score, data_check
            ])
 
 
rca = pd.DataFrame(
    rca, columns=[
        'text_text_name', 'text_behavior_name', 'norm', 'train_n', 
        'r2_mean_tt',  'r2_mean_tb', 'data_check'
    ]
)
rca.to_csv('../../data/final/rca_ensemb.csv', index=False)
rca

  0%|          | 0/4 [00:00<?, ?it/s]

CBOW_GoogleNews&PPMI_SVD_SWOW:   0%|          | 0/292 [00:00<?, ?it/s]

KeyboardInterrupt: 