In [1]:
import nltk
import numpy as np
from scipy.io import loadmat
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from scipy import stats
from sklearn.decomposition import PCA
from IPython.core.debugger import set_trace
import pickle
import os
from tqdm import tqdm
import random
from joblib import Parallel, delayed

# Seed both Python's and NumPy's global random number generators.
random.seed(97)
np.random.seed(97)

# Path prefix that compute_sig prepends to every feature directory it reads from or writes to.
root_save_dir = "predictions/"
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
sub_data = "sub_space_data/"

In [2]:
def R2(Real,Pred):
    """Column-wise coefficient of determination between targets and predictions.

    Computes ``1 - mean((Real - Pred)**2, axis=0) / var(Real, axis=0)``.

    Parameters
    ----------
    Real : np.ndarray
        Observed values; the statistics are reduced over axis 0.
    Pred : np.ndarray
        Predicted values, broadcast-compatible with ``Real``.

    Returns
    -------
    np.ndarray or numpy scalar
        One score per column. Columns whose score is not finite (zero
        variance in ``Real``) are reported as 0.
    """
    SSres = np.mean((Real - Pred) ** 2, 0)
    SStot = np.var(Real, 0)
    # A zero-variance column divides by zero; that is expected here, so
    # silence the warning and neutralise the result below.
    with np.errstate(divide="ignore", invalid="ignore"):
        scores = 1 - SSres / SStot
    # Plain nan_to_num would turn -inf into about -1.8e308 instead of 0.
    return np.nan_to_num(scores, nan=0.0, posinf=0.0, neginf=0.0)


def _load_splits(directory, sub, kind, n_splits=4):
    """Load the per-split arrays ``<sub>_<kind>_<k>.npy`` (k = 0..n_splits-1) from ``directory``."""
    return [np.load(directory + sub + "_{}_{}.npy".format(kind, k)) for k in range(n_splits)]


def _split_into_blocks(arr, size):
    """Cut ``arr`` along axis 0 into consecutive chunks of ``size`` rows (the last one may be shorter)."""
    return [arr[start:start + size] for start in range(0, len(arr), size)]


def compute_sig(sub, feat1, feat2):
    """Block-bootstrap test of whether ``feat1`` predicts better than ``feat2``.

    For each of the four test splits, the stored targets and both sets of
    predictions are cut into consecutive blocks of ``blocksize`` rows. On
    every one of ``iters`` iterations the blocks of each split are resampled
    with replacement (the same draw is applied to targets and to both
    prediction sets), R2 is recomputed for both feature sets, and the
    columns where ``R2(feat1) - R2(feat2) <= 0`` are counted. The per-column
    value saved is that count divided by ``iters``, with zeros replaced by
    ``1/iters``.

    Relies on the module-level names ``root_save_dir``, ``blocksize`` and
    ``iters`` being defined before the call.

    Parameters
    ----------
    sub : str
        Subject identifier used as a file-name prefix.
    feat1, feat2 : str
        Names of the feature directories under ``root_save_dir``. Targets
        are read from the ``feat2`` directory.

    Returns
    -------
    float
        Minimum of the per-column significance values. If a result file for
        this subject and pair already exists, nothing is recomputed and the
        minimum of the cached array is returned.

    Side effects
    ------------
    Creates ``<root_save_dir><feat1>_diff_<feat2>/`` if needed and writes
    ``<sub>_sig_boot.npy`` into it.
    """
    print("Subject {} Feature 1 : {} Feature 2: {}".format(sub,feat1,feat2))

    save_dir = root_save_dir + "{}_diff_{}/".format(feat1, feat2)
    # Several parallel workers share one save_dir (same pair, different
    # subject); exist_ok avoids the exists()/mkdir() race between them.
    os.makedirs(save_dir, exist_ok=True)
    sig_path = save_dir + "{}_sig_boot.npy".format(sub)

    if os.path.exists(sig_path):
        # Report the real cached minimum instead of a hard-coded placeholder.
        return np.min(np.load(sig_path))

    feat1_dir = root_save_dir + "{}/".format(feat1)
    feat2_dir = root_save_dir + "{}/".format(feat2)

    reals = _load_splits(feat2_dir, sub, "y_test")
    preds_feat1 = _load_splits(feat1_dir, sub, "y_pred")
    preds_feat2 = _load_splits(feat2_dir, sub, "y_pred")

    real_r2_feat1 = np.load(feat1_dir + sub + "_r2s.npy")
    real_r2_feat2 = np.load(feat2_dir + sub + "_r2s.npy")
    # Columns whose stored R2 is exactly 0 are forced to 0 in every resample.
    # NOTE(review): presumably these are columns zeroed out upstream — confirm.
    zero_feat1 = real_r2_feat1 == 0
    zero_feat2 = real_r2_feat2 == 0

    greater = np.zeros(real_r2_feat1.shape[0])

    # Blocks are kept as plain lists: a split whose length is not a multiple
    # of blocksize yields a shorter final block, which cannot be packed into
    # a regular ndarray.
    blocks_feat1 = [_split_into_blocks(p, blocksize) for p in preds_feat1]
    blocks_feat2 = [_split_into_blocks(p, blocksize) for p in preds_feat2]
    blocks_real = [_split_into_blocks(r, blocksize) for r in reals]

    # NOTE(review): draws come from the global NumPy RNG; the seed set at the
    # top of the notebook may not carry over to joblib worker processes —
    # confirm if reproducible results are required.
    for _ in tqdm(range(iters)):
        new_preds_feat1 = []
        new_preds_feat2 = []
        new_reals = []
        for b1, b2, br in zip(blocks_feat1, blocks_feat2, blocks_real):
            # One draw per split, shared by targets and both prediction sets
            # so that rows stay aligned.
            sample = np.random.choice(len(b1), len(b1))
            new_preds_feat1.extend(b1[k] for k in sample)
            new_preds_feat2.extend(b2[k] for k in sample)
            new_reals.extend(br[k] for k in sample)

        new_preds_feat1 = np.vstack(new_preds_feat1)
        new_preds_feat2 = np.vstack(new_preds_feat2)
        new_reals = np.vstack(new_reals)

        shuffled_r2_feat1 = R2(new_reals, new_preds_feat1)
        shuffled_r2_feat1[zero_feat1] = 0

        shuffled_r2_feat2 = R2(new_reals, new_preds_feat2)
        shuffled_r2_feat2[zero_feat2] = 0

        shuffled_diff = shuffled_r2_feat1 - shuffled_r2_feat2
        greater[shuffled_diff <= 0] += 1

    sig = greater / iters
    # A count of zero is reported as the smallest resolvable value, 1/iters.
    sig[sig == 0] = 1 / iters
    np.save(sig_path, sig)
    return np.min(sig)

In [None]:
# Each entry is (feat1, feat2): compute_sig compares the predictions stored
# under feat1 against those stored under feat2.
all_feature_pairs = [
    # (NC + PU) - (PU)
    ("node_count_punct", "punct_final"),
    # (SS + PU) - (PU)
    ("syntactic_surprisal_punct", "punct_final"),
    # (WF + PU) - (PU)
    ("word_frequency_punct", "punct_final"),
    # (WL + PU) - (PU)
    ("word_length_punct", "punct_final"),
    # (CM + PU) - (PU)
    ("all_complexity_metrics_punct", "punct_final"),
    # PD already contains PU, so this tests (PD + CM + PU) - (CM + PU)
    ("pos_dep_tags_all_complexity_metrics", "all_complexity_metrics_punct"),
    # (CC + PD + CM + PU) - (PD + CM + PU)
    ("aggregated_contrege_comp_pos_dep_tags_all_complexity_metrics", "pos_dep_tags_all_complexity_metrics"),
    # (CI + PD + CM + PU) - (PD + CM + PU)
    ("aggregated_contrege_incomp_pos_dep_tags_all_complexity_metrics", "pos_dep_tags_all_complexity_metrics"),
    # (INC + PD + CM + PU) - (PD + CM + PU)
    ("aggregated_incontrege_pos_dep_tags_all_complexity_metrics", "pos_dep_tags_all_complexity_metrics"),
    # (BERT + CI + PD + CM + PU) - (CI + PD + CM + PU)
    ("aggregated_bert_PCA_dims_15_contrege_incomp_pos_dep_tags_all_complexity_metrics", "aggregated_contrege_incomp_pos_dep_tags_all_complexity_metrics"),
]

print(all_feature_pairs)

all_subjects = list("FGHIJKLMN")

# Bootstrap settings read as globals by compute_sig:
# rows per resampling block, and number of bootstrap iterations.
blocksize = 10
iters = 5000

# One task per (subject, feature pair), subjects in the outer loop.
# For a sequential run, call compute_sig(sub, feat1, feat2) in the same nested order.
Parallel(n_jobs=18)(
    delayed(compute_sig)(sub, feat1, feat2)
    for sub in all_subjects
    for feat1, feat2 in all_feature_pairs
)

[('node_count_punct', 'punct_final'), ('syntactic_surprisal_punct', 'punct_final'), ('word_frequency_punct', 'punct_final'), ('word_length_punct', 'punct_final'), ('all_effort_based_metrics_punct', 'punct_final'), ('pos_dep_tags_all_effort_based_metrics', 'all_effort_based_metrics_punct'), ('aggregated_contrege_pos_dep_tags_all_effort_based_metrics', 'pos_dep_tags_all_effort_based_metrics'), ('aggregated_contrege_comp_pos_dep_tags_all_effort_based_metrics', 'pos_dep_tags_all_effort_based_metrics'), ('aggregated_beam_incontrege_pos_dep_tags_all_effort_based_metrics', 'pos_dep_tags_all_effort_based_metrics'), ('aggregated_bert_PCA_dims_15_contrege_pos_dep_tags_all_effort_based_metrics', 'aggregated_contrege_pos_dep_tags_all_effort_based_metrics')]
