In [1]:
import os
import json
import pickle
import pandas as pd
from tqdm.notebook import tqdm
from rca import run_rca

## Loading Data

We drop feature_overlap because it contains many NaNs and compo_attribs because it doesn't have a large enough vocabulary and is also a identical to a 65 of the norms in the psychNorms dataset.

In [2]:
with open('../../data/brain_behav_union.pkl', 'rb') as f:
    brain_behav_union = pickle.load(f)

# Loading dictionary of dtype to embed
with open('../../data/dtype_to_embed.json', 'r') as f:
    dtype_to_embed = json.load(f)
    
brain_behav_names = dtype_to_embed['brain'] + dtype_to_embed['behavior']

# Pulling and standardising embeddings
embeds = {}
embeds_path = '../../data/embeds/'
for f_name in tqdm(os.listdir(embeds_path)):
    if f_name not in ['feature_overlap.csv', 'compo_attribs.csv']:  # dropping since contains many NaNs
        
        embed = pd.read_csv(embeds_path + f_name, index_col=0)
        embed_name = f_name.split('.')[0]
        
        # Subsetting to brain and behavior vocab
        embed = embed.loc[embed.index.intersection(brain_behav_union)]
        
        # Standardising
        embeds[embed_name] = (embed - embed.mean()) / embed.std()

{name: embed.shape for name, embed in embeds.items()}

  0%|          | 0/26 [00:00<?, ?it/s]

{'norms_sensorimotor': (36854, 11),
 'EEG_text': (3355, 104),
 'LexVec_CommonCrawl': (44082, 300),
 'fastText_CommonCrawl': (44443, 300),
 'spherical_text_Wikipedia': (35533, 300),
 'GloVe_CommonCrawl': (44278, 300),
 'EEG_speech': (1591, 130),
 'THINGS': (1562, 49),
 'CBOW_GoogleNews': (42830, 300),
 'morphoNLM': (32769, 50),
 'microarray': (626, 15),
 'PPMI_SVD_SouthFlorida': (4959, 300),
 'fastText_Wiki_News': (43143, 300),
 'fastTextSub_OpenSub': (40607, 300),
 'SGSoftMaxOutput_SWOW': (25442, 300),
 'fMRI_speech_cognival': (579, 6),
 'PPMI_SVD_SWOW': (11783, 300),
 'fMRI_text_cognival': (1205, 1000),
 'GloVe_Twitter': (32947, 200),
 'GloVe_Wikipedia': (39421, 300),
 'eye_tracking': (7486, 6),
 'SGSoftMaxInput_SWOW': (11783, 300),
 'SVD_sim_rel': (6837, 300),
 'PPMI_SVD_EAT': (7775, 300)}

In [3]:
with open('../../data/embed_to_dtype.json', 'r') as f:
    embed_to_type = json.load(f)
embed_to_type

{'CBOW_GoogleNews': 'text',
 'fastText_CommonCrawl': 'text',
 'fastText_Wiki_News': 'text',
 'fastTextSub_OpenSub': 'text',
 'GloVe_CommonCrawl': 'text',
 'GloVe_Twitter': 'text',
 'GloVe_Wikipedia': 'text',
 'LexVec_CommonCrawl': 'text',
 'morphoNLM': 'text',
 'spherical_text_Wikipedia': 'text',
 'eye_tracking': 'brain',
 'EEG_speech': 'brain',
 'EEG_text': 'brain',
 'fMRI_speech_cognival': 'brain',
 'fMRI_text_cognival': 'brain',
 'microarray': 'brain',
 'PPMI_SVD_SWOW': 'behavior',
 'SGSoftMaxInput_SWOW': 'behavior',
 'SGSoftMaxOutput_SWOW': 'behavior',
 'PPMI_SVD_SouthFlorida': 'behavior',
 'PPMI_SVD_EAT': 'behavior',
 'THINGS': 'behavior',
 'feature_overlap': 'behavior',
 'norms_sensorimotor': 'behavior',
 'compo_attribs': 'behavior',
 'SVD_sim_rel': 'behavior'}

In [4]:
# Loading norms
norms = pd.read_csv('../../data/psychNorms/psychNorms_processed.zip', index_col=0, compression='zip', low_memory=False)
norm_meta = pd.read_csv('../../data/psychNorms/psychNorms_metadata_processed.csv', index_col='norm')
norms

Unnamed: 0_level_0,frequency_lund,frequency_kucera,frequency_subtlexus,frequency_subtlexuk,frequency_blog_gimenes,frequency_twitter_gimenes,frequency_news_gimenes,frequency_written_cobuild,frequency_spoken_cobuild,context_diversity_subtlexus,...,person_vanarsdall,goals_vanarsdall,movement_vanarsdall,concreteness_vanarsdall,familiarity_vanarsdall,imageability_vanarsdall,familiarity_fear,aoa_fear,imageability_fear,sensory_experience_juhasz2013
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'em,0.0,,,,,,,1.3617,1.9138,,...,,,,,,,,,,
'neath,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
're,0.0,,,,,,,0.9031,1.6335,,...,,,,,,,,,,
'shun,0.0,,,,,,,0.0000,0.0000,,...,,,,,,,,,,
'tis,0.0,,,,,,,0.4771,0.6021,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shrick,,,,,,,,,,,...,,,,,,,2.62,4.38,2.93,
post office,,,,,,,,,,,...,,,,,,,3.79,3.07,5.29,
fishing rod,,,,,,,,,,,...,,,,,,,2.29,3.38,5.64,
March,,,,,,,,,,,...,,,,,,,3.43,2.76,3.50,


## Cross Validation

In [5]:
results = run_rca(embeds, norms, norm_meta, embed_to_type)
results

  0%|          | 0/24 [00:00<?, ?it/s]

norms_sensorimotor:   0%|          | 0/291 [00:00<?, ?it/s]

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                         norm  train_n   r2_mean     r2_sd check
123              taste_binder      404  0.914986  0.025669  pass
124              smell_binder      404  0.854866  0.024746  pass
165           auditory_lynott      292  0.850399  0.031244  pass
166          olfactory_lynott      292  0.829914  0.030290  pass
167          gustatory_lynott      292  0.822489  0.049956  pass
164             haptic_lynott      292  0.773726  0.053972  pass
163             visual_lynott      292  0.756229  0.056084  pass
168  dominant_modality_lynott      275  0.753448  0.072065  pass
111              touch_binder      404  0.749546  0.022353  pass
125               head_binder      404  0.711896  0.067124  pass


EEG_text:   0%|          | 0/291 [00:00<?, ?it/s]



KeyboardInterrupt: 

In [None]:
results.to_csv('../../data/results/rca.csv', index=False)
results