# Bootstrap from training samplt for fa
calculate the time required in every step of bootstrap full procedure 

In [11]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from algo import *

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import LeaveOneOut,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import nibabel as nib
import time 

# Read corresponding data

In [27]:
# define data path
path = '/nfs/s2/userhome/yanganmin/workingdir/attention_data_complete/train_test_data'

# read fa train and test data, also test on sa
train_X_fa = np.load(os.path.join(path,'train_X_fa.npy'))
test_X_fa = np.load(os.path.join(path,'test_X_fa.npy'))
train_Y_fa = np.load(os.path.join(path,'train_Y_fa.npy'))
test_Y_fa = np.load(os.path.join(path,'test_Y_fa.npy'))
X_sa = np.load(os.path.join(path,'X_sa.npy'))
Y_sa = np.load(os.path.join(path,'Y_sa.npy'))

mask_NaN = np.load(os.path.join(path,'mask_NaN.npy'))

# Scale irrelavant input 

In [17]:
test_X_fa = scale(test_X_fa)
X_sa = scale(X_sa)

# Calculate time in every bootstrap procedure

In [13]:
boot_coef = np.zeros(mask_NaN.shape[0])
train_accuracy = []
test_accuracy = []
sa_accuracy = []
concat_data = np.c_[train_X_fa,train_Y_fa]
time_dic = {}

## time of bootstrap datas from train sample 

In [14]:
start = time.time()
bootstraped_data = bootstrap_data(concat_data)
X = bootstraped_data[:,:-1] 
Y = bootstraped_data[:,-1]
end = time.time()
time_dic['bootstrap_data'] = end-start
print(f'Using time : {end-start}s.')

Using time : 132.5588836669922s.


## time of PCA 

In [29]:
start = time.time()

# scale data
train_X_fa = scale(X)

# rename Y 
train_Y_fa = Y

# reduce dimention with PCA
pca = PCA() # reduce dimentions to the number of observations
pca.fit(train_X_fa)
train_X_fa_pc = pca.transform(train_X_fa)
test_X_fa_pc = pca.transform(test_X_fa)
X_sa_pc = pca.transform(X_sa)

end = time.time()
time_dic['PCA'] = end-start
print(f'Using time : {end-start}s.')

Using time : 38.51430559158325s.


## Grid search for beta hyper-parameter 

In [19]:
start = time.time()

# grid-search for best C parameter 
tuned_parameters = [{'C': np.arange(0.05,1.05,0.05)}]
clf = GridSearchCV(LogisticRegression(penalty='l1',solver='liblinear'),tuned_parameters,cv=10,scoring='f1')
clf.fit(train_X_fa_pc, test_Y_fa)

C_best = clf.best_params_['C']

print(f'Best hyper-Parameter C is {C_best}.')

end = time.time()
time_dic['hyper parameter c'] = end-start
print(f'Using time : {end-start}s.')

Best hyper-Parameter C is 0.05.
Using time : 4.864996433258057s.


In [20]:
clf = LogisticRegression(penalty='l1',solver='liblinear',C=C_best) # C value determined by grid search

In [22]:
clf.fit(train_X_fa_pc,test_Y_fa)
LR_beta_mean = clf.coef_[0]

## Gradually rule out PCs with low beta

In [41]:
start = time.time()

PC_record = {} # dictionary to store predicition accuracy with each phase of reduction of PCs

for beta_cutoff in np.arange(0,np.max(abs(LR_beta_mean)),0.001):
    mask_beta = (abs(LR_beta_mean) > beta_cutoff)
    train_X_fa_masked = train_X_fa_pc[:,mask_beta]
    test_X_fa_masked = test_X_fa_pc[:,mask_beta]
    
    clf.fit(train_X_fa_masked,train_Y_fa)
    
    acc = clf.score(test_X_fa_masked,test_Y_fa)
    PC_record[str(beta_cutoff)] = acc

pc_pd = pd.DataFrame({'beta_cutoff':PC_record.keys(),
                     'accuracy':PC_record.values()})
pc_pd[pc_pd['accuracy'] == pc_pd.max()['accuracy']]   # 问题出在如何得到accuracy 最大的beta_cutoff 上 

Unnamed: 0,beta_cutoff,accuracy
16,0.016,0.551282
17,0.017,0.551282
18,0.018,0.551282


In [33]:
beta_cutoff_optimized = pc_pd[pc_pd['accuracy'] == pc_pd.max()['accuracy']]['beta_cutoff'][0]
beta_cutoff_optimized = float(beta_cutoff_optimized)
print(f'optimized_beta_cutoff is {beta_cutoff_optimized}.')

end = time.time()
time_dic['optimize PC num'] = end-start
print(f'Using time : {end-start}s.'

str