In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import type_of_target

In [None]:
# Read the norms table from the zipped CSV; its first column becomes the index.
norms = pd.read_csv(
    '../../data/psychNorms/psychNorms.zip',
    index_col=0,
    compression='zip',
    low_memory=False,
)
# Read the per-norm metadata, indexed by its 'norm' column.
meta = pd.read_csv(
    '../../data/psychNorms/psychNorms_metadata.csv',
    index_col='norm',
)
norms

In [None]:
# Adding 'associated_embed' to metadata to avoid data leakage.
# The column is created with object dtype up front: it holds string labels,
# and assigning strings into a float64 (all-NaN) column relies on a silent
# dtype upcast that newer pandas versions deprecate.
meta['associated_embed'] = pd.Series(np.nan, index=meta.index, dtype=object)
meta.loc[meta.index.str.contains('_lancaster'), 'associated_embed'] = 'norms_sensorimotor'
meta.loc[meta.index == 'association_frequency_dedeyne', 'associated_embed'] = 'PPMI_SVD_SWOW SGSoftMaxInput_SWOW SGSoftMaxOutput_SWOW'

# Adding 'type' to metadata (numeric, binary, multiclass), inferred from the
# non-missing values of each norm column.
meta['type'] = [type_of_target(norms[name].dropna()) for name in meta.index]
meta['type'] = meta['type'].replace('continuous', 'numeric')

# Manually fixing mistyped norms: these are forced to 'numeric' regardless of
# what type_of_target inferred, as is every norm whose name contains 'vanarsdall'.
manually_numeric = [
    'n_senses_wordnet_miller', 'n_senses_wordsmyth_rice', 'n_meanings_websters_gao', 'n_features_buchanan',
    'n_semantic_neighbors_shaoul', 'association_frequency_dedeyne', 'cue_setsize_nelson', 'difficulty_rudell',
    'likableness_anderson', 'meaningfulness_anderson'
]
manually_numeric += [name for name in meta.index if 'vanarsdall' in name]

meta.loc[manually_numeric, 'type'] = 'numeric'

# Identifying count-based norms to investigate which need log transformation.
# A norm is a candidate when all of its non-missing values are whole numbers.
# The vectorised `% 1 == 0` check is used instead of `float.is_integer`, which
# raises TypeError on integer-dtype columns (norms with no missing values).
numeric_norms = meta.query('type == "numeric"').index
count_norms = [name for name in numeric_norms if (norms[name].dropna() % 1 == 0).all()]
meta.loc[count_norms]

In [None]:
# Visual check of one norm's raw distribution.
n_meanings_values = norms['n_meanings_wordsmyth_rice']
n_meanings_values.hist()

In [None]:
# Keeping only norms that were manually confirmed as count-based
# (this replaces the automatically detected candidate list).
count_norms = [
    'n_senses_wordnet_miller',
    'n_senses_wordsmyth_rice',
    'n_meanings_websters_gao',
    'n_features_buchanan',
    'n_semantic_neighbors_shaoul',
    'association_frequency_dedeyne',
    'cue_setsize_nelson',
]

# Numeric norms whose name contains '_rt_'
rt_norms = list(filter(lambda name: '_rt_' in name, numeric_norms))
rt_norms

In [None]:
# Log transforming (log1p) the count-based and '_rt_' norms.
# NOTE: the selected columns of `norms` are overwritten in place, so every
# execution of this cell applies the transform one more time.
to_log = count_norms + rt_norms
log_transformed = norms[to_log].apply(np.log1p)
norms[to_log] = log_transformed

# Checking it roughly worked: one histogram per transformed norm
for norm in to_log:
    print(norm)
    transformed_col = norms[norm]
    transformed_col.hist()
    plt.show()

In [None]:
# The log1p transform of the `to_log` columns has already been applied in
# place by the preceding cell. It must not be repeated here: doing so would
# write log1p(log1p(x)) to disk on a fresh Restart & Run All.

# Saving the processed norms (zipped CSV) and the annotated metadata
norms.to_csv('../../data/psychNorms/psychNorms_processed.zip', compression='zip')
meta.to_csv('../../data/psychNorms/psychNorms_metadata_processed.csv')