# Bootstrap from training sample for sa

In [1]:
import numpy as np
import pandas as pd
import os

from algo import *

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import LeaveOneOut,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import nibabel as nib
import time 

# Data preparation

In [2]:
# Directory that holds the pre-split .npy arrays used by this notebook
path = '/home/jupyter-amyang/data/attention_signature/train_test_data'

# Load the sa train/test splits, the fa arrays and the NaN mask in one pass.
# The tuple on the left and the file names on the right are in the same order.
(train_X_sa, test_X_sa, train_Y_sa, test_Y_sa, X_fa, Y_fa, mask_NaN) = (
    np.load(os.path.join(path, file_name))
    for file_name in (
        'train_X_sa.npy',
        'test_X_sa.npy',
        'train_Y_sa.npy',
        'test_Y_sa.npy',
        'X_fa.npy',
        'Y_fa.npy',
        'mask_NaN.npy',
    )
)

# Standardise the two evaluation feature matrices. Each one is scaled with its
# own statistics; train_X_sa is left unscaled here and is scaled later, after
# it has been resampled.
test_X_sa, X_fa = scale(test_X_sa), scale(X_fa)

# Result holder

In [7]:
# Placeholder first row for the stacked coefficients: one zero per entry along
# the first axis of mask_NaN.
boot_coef = np.zeros(len(mask_NaN))

# One list per metric; each gets one value appended per bootstrap iteration.
train_accuracy, test_accuracy, fa_accuracy = [], [], []
hyper_c, beta_cutoff_value, pc_num = [], [], []

# Features with the labels appended as the last column, so that a row-wise
# resample keeps every observation paired with its label.
concat_data = np.c_[train_X_sa, train_Y_sa]

# For loop

In [8]:
# Number of bootstrap resamples to draw.
boot_size = 1

# Start every run of this cell from empty accumulators so that re-running it
# never mixes results from an earlier run or misaligns the coefficient rows
# with the accuracy lists.
boot_coef_rows = []  # one back-projected coefficient vector per iteration
train_accuracy = []
test_accuracy = []
fa_accuracy = []
hyper_c = []
beta_cutoff_value = []
pc_num = []

for i in range(boot_size):
    start = time.time()

    # sample a new set of data
    # (bootstrap_data comes from `algo`; presumably resamples rows with
    # replacement -- TODO confirm)
    bootstraped_data = bootstrap_data(concat_data)
    X = bootstraped_data[:, :-1]  # every column but the last: features
    Y = bootstraped_data[:, -1]   # last column: labels

    # Loop-local names on purpose: the global train_X_sa / train_Y_sa must stay
    # untouched, otherwise rebuilding concat_data after this cell has run would
    # silently start from a resampled, scaled copy of the training set.
    X_boot = scale(X)
    Y_boot = Y

    # reduce dimension with PCA (no n_components given, so all components are kept)
    pca = PCA()
    pca.fit(X_boot)
    train_X_sa_pc = pca.transform(X_boot)
    test_X_sa_pc = pca.transform(test_X_sa)
    X_fa_pc = pca.transform(X_fa)

    # grid search for the regularisation strength C of the L1 logistic regression
    tuned_parameters = [{'C': np.arange(0.05, 1.05, 0.05)}]
    clf = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),
                       tuned_parameters, cv=10, scoring='f1')
    clf.fit(train_X_sa_pc, Y_boot)

    C_best = clf.best_params_['C']
    hyper_c.append(C_best)

    # refit a plain estimator with the selected C to read its coefficients
    clf = LogisticRegression(penalty='l1', solver='liblinear', C=C_best)
    clf.fit(train_X_sa_pc, Y_boot)
    LR_beta_mean = clf.coef_[0]

    max_abs_beta = np.max(abs(LR_beta_mean))
    if max_abs_beta == 0:
        # With every coefficient at zero the cutoff grid below would be empty
        # and the argmax would fail with an opaque message.
        raise ValueError(
            f'All logistic-regression coefficients are zero in bootstrap {i} '
            f'(C={C_best}); no principal component can be selected.'
        )

    # gradually rule out PCs with low |beta|
    # NOTE(review): the cutoff is chosen by accuracy on the test set, so the
    # test_accuracy recorded below is optimistically biased.
    PC_record = {}  # cutoff (as string) -> test accuracy with the surviving PCs
    for beta_cutoff in np.arange(0, max_abs_beta, 0.001):
        mask_beta = (abs(LR_beta_mean) > beta_cutoff)
        train_X_sa_masked = train_X_sa_pc[:, mask_beta]
        test_X_sa_masked = test_X_sa_pc[:, mask_beta]

        clf.fit(train_X_sa_masked, Y_boot)

        acc = clf.score(test_X_sa_masked, test_Y_sa)
        PC_record[str(beta_cutoff)] = acc

    # materialise the dict views as lists before handing them to pandas
    pc_pd = pd.DataFrame({'beta_cutoff': list(PC_record.keys()),
                          'accuracy': list(PC_record.values())})

    # position of the first (i.e. smallest) cutoff that reaches the best accuracy
    index = pc_pd['accuracy'].argmax()
    beta_cutoff_optimized = float(pc_pd['beta_cutoff'][index])
    beta_cutoff_value.append(beta_cutoff_optimized)

    # keep only the PCs whose |beta| exceeds the optimised cutoff
    mask_beta_optimized = (abs(LR_beta_mean) > beta_cutoff_optimized)
    pc_num.append(np.sum(mask_beta_optimized))
    train_X_sa_optimized = train_X_sa_pc[:, mask_beta_optimized]
    test_X_sa_optimized = test_X_sa_pc[:, mask_beta_optimized]
    X_fa_optimized = X_fa_pc[:, mask_beta_optimized]

    # final fit on the selected PCs of the bootstrap sample
    clf.fit(train_X_sa_optimized, Y_boot)

    # get beta and transfer back to mni size
    # (mask_back comes from `algo`; presumably re-inserts the entries removed
    # by the given mask -- TODO confirm)
    coef = clf.coef_[0]
    coef_modified = mask_back(mask_beta_optimized, coef, mask_type='beta')
    inverse_coef = pca.inverse_transform(coef_modified)
    coef_mni = mask_back(mask_NaN, inverse_coef, mask_type='mni')
    # collect rows in a list and stack once after the loop, instead of
    # re-copying the whole matrix with np.vstack on every iteration
    boot_coef_rows.append(coef_mni)

    # store accuracy
    train_accuracy.append(clf.score(train_X_sa_optimized, Y_boot))
    test_accuracy.append(clf.score(test_X_sa_optimized, test_Y_sa))
    fa_accuracy.append(clf.score(X_fa_optimized, Y_fa))

    # report
    end = time.time()
    print(f'The {i}th calculation has been completed, using time {end-start}s')

# one row per bootstrap iteration; empty (0 rows) when boot_size is 0
if boot_coef_rows:
    boot_coef = np.vstack(boot_coef_rows)
else:
    boot_coef = np.empty((0, mask_NaN.shape[0]))

# dataFrame the prediction accuracy
result_matrix = np.c_[np.array(train_accuracy), np.array(test_accuracy), np.array(fa_accuracy),
                      np.array(hyper_c), np.array(beta_cutoff_value), np.array(pc_num)]
df_result = pd.DataFrame(result_matrix, columns=['train_accuracy', 'test_accuracy', 'fa_accuracy',
                                                 'hyper_c', 'beta_cutoff', 'pc_num'])

# save_data
save_path = '/home/jupyter-amyang/workingdir/attention_results/bootstrap/sa'
# make sure the target directory exists so a long run is not lost at the last step
os.makedirs(save_path, exist_ok=True)
np.save(os.path.join(save_path, 'beta_train_sa.npy'), boot_coef)
df_result.to_csv(os.path.join(save_path, 'train_sa_accuracy.csv'))

The 0th calculation has been completed, using time 159.24211263656616s
