In [1]:
# import data
import numpy as np
import json

# Change .json filename to select animal ID number

# import standard experiment data (XS) and metadata (S_labels)
# Read the recording: data[0] holds the spectra, data[1] the window metadata.
with open('data/pyData/2705_Data_S_Notch14_1.json') as f:
    data = json.load(f)
S_labels = data[1]

# Log-transform the 3-D spectra array, then merge its first two axes and
# transpose so that rows index the third axis (one row per window, judging by
# how the rows are indexed with window ids further down).
pxx = np.log(np.array(data[0]))
a, b, c = pxx.shape
XS = pxx.reshape(a*b, c).T

# Per-day containers, keyed 1..9:
#   sID[day] -> indices of the windows recorded on that day
#   y[day]   -> matching labels, stimON = 1 vs nonstim = 0 (baseline AND stimOFF)
sID = {day: [] for day in range(1, 10)}
y = {day: [] for day in range(1, 10)}

window_subjects = S_labels['allWindows']['subject']
window_stim = S_labels['allWindows']['stimOn']
for idx, subject in enumerate(window_subjects):
    # The character at position 6 of the subject string is read as the day
    # number -- assumes a fixed-width subject id; TODO confirm the format.
    day = int(subject[6])
    sID[day].append(idx)
    y[day].append(int(window_stim[idx]))

# Full feature matrix and label vector, indexed by window id.
X_all = XS
Y_all = np.array(window_stim)

In [2]:
# nested cross validation model: 9 days of "standard experiment"
#    train model on 7 days, validate on 1, and test on 1.
#       \/ each training runs ICA, repeated for every candidate number of components (C) in `components` (5, 10, 20, 30, ..., 100)
#       \/ validate each 7-day training for every candidate value of C on 1-day validate set - get best performing C
#       \/ keep test data the same, rotate the validate set through all 8 possibilities - get confidence for best C
#    re-train model on 7 train data using the "best" value of C, and test on 1-day test data set.
#    repeat this 7-1-1 model through all 9 possible sets of test data

#    for testID = 1:9
#        for validateID = 1:8
#            for C in components   (5, 10, 20, 30, ..., 100)
#                ica
#                clf on trainID ~= testID && trainID ~= validateID
#                keep score, as function of testID, validateID, C
            

# convert data into frequency (scores) and component profiles (factors) from ICA

from sklearn.decomposition import FastICA
from sklearn import svm
from sklearn.metrics import roc_curve, auc

# Candidate numbers of ICA components to sweep over.
components = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
days = list(range(1, 10))

# roc_auc[i, j, k]: validation ROC AUC for the i-th test day, the j-th
# validation day (among the 8 remaining days) and the k-th entry of `components`.
roc_auc = np.zeros((len(days), len(days) - 1, len(components)))

# Every window index that was assigned to a day. The previous
# range(0, max(sID[9])) silently dropped the last window (range excludes its
# upper bound) and relied on day 9 holding the largest index.
list_All = list(range(len(S_labels['allWindows']['subject'])))

print('Test day iteration:')
for i_test, testDay in enumerate(days):
    # Days still available for validation/training once the test day is held out.
    val = [day for day in days if day != testDay]
    test_ids = set(sID[testDay])  # set -> O(1) membership checks
    list_notTest = [ID for ID in list_All if ID not in test_ids]
    print('\t', testDay)
    for i_val, validateDay in enumerate(val):
        validate_ids = set(sID[validateDay])
        trainID = [ID for ID in list_notTest if ID not in validate_ids]

        # The split does not depend on C, so slice the data once per fold.
        X_train, X_Validate = X_all[trainID], X_all[sID[validateDay]]
        y_train, y_Validate = Y_all[trainID], Y_all[sID[validateDay]]

        for i_comp, C in enumerate(components):
            # Fit ICA on the training windows only and project the validation
            # windows with the same unmixing. random_state pins the random
            # initialisation so the scores are reproducible across runs.
            ica = FastICA(n_components=C, max_iter=10000, tol=0.0001,
                          random_state=0)  # max = 5000, tol = 0.001
            scoresTrain = ica.fit_transform(X_train)
            scoresValidate = ica.transform(X_Validate)

            # NOTE(review): probability=True is not needed for decision_function
            # and slows fitting; kept in case a later cell uses predict_proba.
            clf = svm.SVC(kernel='linear', class_weight='balanced', probability=True)
            y_score = clf.fit(scoresTrain, y_train).decision_function(scoresValidate)
            fpr, tpr, _ = roc_curve(y_Validate, y_score)
            roc_auc[i_test, i_val, i_comp] = auc(fpr, tpr)

Test day iteration:
	 1




	 2
	 3
	 4
	 5
	 6
	 7
	 8
	 9


In [1]:
import matplotlib.pyplot as plt
# One box per candidate component count; each box pools the validation ROC AUC
# of every (test day, validation day) fold stored in roc_auc.
fig, ax = plt.subplots()
auc_by_component = roc_auc.reshape(9*8, len(components))
ax.boxplot(auc_by_component)
ax.set_ylim([0, 1])
ax.set_ylabel('ROC AUC')
# Boxes are drawn at positions 1..len(components); label them with the counts.
tick_positions = np.arange(1, 1+len(components))
ax.set_xticks(tick_positions)
ax.set_xticklabels(components)
ax.set_xlabel('Number of components')
plt.show()

NameError: name 'roc_auc' is not defined