In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pylab as P
import time, glob, os, math
from matplotlib import cm 

from sklearn.decomposition import PCA, FastICA, KernelPCA, FactorAnalysis
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.random_projection import GaussianRandomProjection
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.mixture import GaussianMixture as GM
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier as DT

In [2]:
rs = 42

In [3]:
def output_data(metafile, data):
        assert(os.path.exists(metafile)),"Path to metadata file does not exist"
        datafile = file(metafile)
        datafile.seek(0)
        for line in datafile:
                if data in line:
                        break
        line = line.split(',')
        data_value = line[-1].rstrip()
        datafile.close()
        return data_value

def plot(x, y, xlabel, ylabel):     #Create a plot
    plt.figure()
    plt.plot(x,y,color='b')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
    plt.close()
    
def scatter(ax, x, y, clr, xlabel, ylabel, title):     #Create a scatter plot

    clr_label = {0:'orange', 1:'forestgreen', 2:'deepskyblue',3:'gold', 4:'b', 5:'k', 6:'red', 7:'darkmagenta' }
    names = {'NonSpinning':0,'AlignedSpins':1,'Precessing':2}
    target_names = np.vectorize(names.get)(clr)
    target_labels =  np.vectorize(clr_label.get)(target_names)
   
    ax.scatter(x,y,facecolors=target_labels, edgecolors=None) 
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    return ax


In [4]:
#Principal Component Analysis
def func_PCA(n_comp, data):
    '''Compute PCA'''
    data_copy = np.copy(data)
    num_samples, num_features = np.shape(data_copy)
    time_init = time.time()
    clf = PCA(svd_solver='full', n_components=n_comp, random_state=rs)   
    transformed_data = clf.fit_transform(data_copy)
    var = clf.explained_variance_
    score = clf.score(data_copy)
    time_pca = time.time() - time_init
    
    #print("Projected {} samples from {} to {} with score {}" .format(num_samples, num_features, n_comp, score))
    return transformed_data, time_pca

In [12]:
#Loading the Dataset

data, wf_tag, metadata = [], [],[]
GT_filepaths = sorted(glob.glob("../../Data/FilteredData/GT*.txt"))
SXS_filepaths = sorted(glob.glob("../../Data/FilteredData/SXS*.txt"))

for f in GT_filepaths:
    dataset = np.loadtxt(f)
    
    dataset = np.concatenate(dataset.T)
    data.append(dataset)
    wf_tag.append(os.path.basename(f).split(".")[0])
    
    metafile = os.path.join("../../Data/Metadata", "Metadata_"+wf_tag[-1]+".csv")
    metadata.append(output_data(metafile, 'spin-type'))
    
    
for f in SXS_filepaths:
    dataset = np.loadtxt(f)
    dataset = np.concatenate(dataset)
    data.append(dataset)
    wf_tag.append(os.path.basename(f).split(".")[0])
    
    metafile = os.path.join("../../Data/Metadata", "Metadata_"+wf_tag[-1]+".csv")
    metadata.append(output_data(metafile, 'spin-type'))
    
data = np.matrix(data)
y = np.array(metadata)

          


In [13]:
def func_dectree(data, trainsize, pca_comp):
    data_train, data_test, y_train, y_test = train_test_split(data, y, train_size=trainsize, random_state=12)
    data_train, y_train = shuffle(data_train, y_train, random_state=rs)
    
    SS_data =StandardScaler()
    #SS_y = StandardScaler()

    data_train =  SS_data.fit_transform(data_train)
    data_test = SS_data.transform(data_test)
    data_pca, _ = func_PCA(pca_comp, data_train)
    data_pca_test, _ = func_PCA(pca_comp, data_test)
    clf = DT()
    data_dt = clf.fit_transform(data_pca, y_train)
    cv_scores = (np.mean(cross_val_score(clf, data_dt, y_train, cv=3)))
    score_train = clf.score(data_pca, y_train)
    score_test = clf.score(data_pca_test,y_test)
    print("Training Size = %f(%f), Cross Validation Score = %f, Train Score = %f \n Testing size=%d, Testing Score = %f"%(trainsize,len(y_train),cv_scores,score_train,len(y_test), score_test))
    return cv_scores, score_train, score_test, data_dt

In [14]:
#Finding optimal training data size

n  = np.arange(0.2,1,0.1)
for i in n:
    func_dectree(data, i, 40)

AttributeError: 'DecisionTreeClassifier' object has no attribute 'fit_transform'

In [None]:
#FInding optimal number of PCA components
n  = np.arange(20,42,5)
for i in n:
    func_dectree(data, 0.6,i)