In [None]:
import numpy as np
import h5py
import scipy.io


In [None]:
# Helper for load_dataset: complex field matrices -> one real feature matrix
def _stack_real_imag(fields):
    """Stack real/imaginary parts of each array and join them column-wise.

    Parameters
    ----------
    fields : sequence of 2-D arrays
        Arrays with the same number of rows; columns are individual samples.

    Returns
    -------
    features : ndarray
        For every input, the real part placed on top of the imaginary part
        (rows), with all inputs concatenated along the column axis.
    counts : list of int
        Number of columns contributed by each input, in input order.
    """
    parts = [np.vstack([np.real(v), np.imag(v)]) for v in fields]
    counts = [part.shape[1] for part in parts]
    return np.hstack(parts), counts


# Loads KRAKEN data
def load_dataset(idx):
    """Load one KRAKEN ``.mat`` file and return train/test feature matrices.

    The file ``files_116/files_VLA_rand_<idx>_116.mat`` (relative to the
    current working directory) must contain the variables ``p_cl_n``,
    ``p_si_n``, ``p_sa_n`` and ``p_gr_n`` (training data) and the same four
    names with suffixes ``1`` .. ``10`` (ten test sets).

    Parameters
    ----------
    idx : int or str
        Value inserted into the file name.

    Returns
    -------
    X_train : ndarray, shape (n_train_samples, 2 * n_rows)
        One row per sample; real parts followed by imaginary parts.
    y_train : ndarray of int
        Class id of each training sample: 0, 1, 2, 3 for the ``cl``, ``si``,
        ``sa`` and ``gr`` variables respectively.
    X_test : ndarray, shape (n_test_samples, 2 * n_rows)
        The ten test sets concatenated in order 1 .. 10.
    y_test : ndarray of int
        Class ids of the test samples, same encoding as ``y_train``.

    Notes
    -----
    Labels are derived from the number of columns actually present in each
    variable, so the four classes need not contain exactly 1000 samples each.
    """
    filepath = "files_116/files_VLA_rand_" + str(idx) + "_116.mat"
    mat = scipy.io.loadmat(filepath)

    # Order of the keys defines the class ids 0..3
    material_keys = ('p_cl_n', 'p_si_n', 'p_sa_n', 'p_gr_n')
    class_ids = np.arange(len(material_keys))

    # Training data
    signal_train, train_counts = _stack_real_imag([mat[key] for key in material_keys])
    labels_train = np.repeat(class_ids, train_counts)

    # Test data: ten sets stored under the same keys with suffixes 1..10
    test_signals = []
    test_labels = []
    for set_no in range(1, 11):
        fields = [mat[key + str(set_no)] for key in material_keys]
        signal, counts = _stack_real_imag(fields)
        test_signals.append(signal)
        test_labels.append(np.repeat(class_ids, counts))

    signal_test = np.hstack(test_signals)
    labels_test = np.concatenate(test_labels)

    # Samples are stored as columns; estimators expect one sample per row
    X_train = signal_train.transpose()
    y_train = np.array(labels_train, dtype=int)
    X_test = signal_test.transpose()
    y_test = np.array(labels_test, dtype=int)

    return X_train, y_train, X_test, y_test


In [None]:
from time import time
from sklearn import metrics
from sklearn.metrics import classification_report

# Benchmark metrics for each classifier
# Adapted from Peter Prettenhofer, et al.,
# https://scikit-learn.org/0.19/auto_examples/text/document_classification_20newsgroups.html
def benchmark(clf):
    """Fit ``clf`` on the training data and report its test-set performance.

    The data are not passed in: the function reads the notebook-level
    variables ``X_train``, ``y_train``, ``X_test`` and ``y_test``, which must
    exist before it is called.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.  If it has an
        ``estimator`` attribute (as hyper-parameter search wrappers do) that
        attribute is used for the description, otherwise ``clf`` itself.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time, clf_rep, clf_cm)`` where
        ``score`` is the test accuracy, the times are wall-clock seconds, and
        the remaining entries are strings (estimator description,
        classification report, confusion matrix).
    """
    print('_' * 80)
    print("Training: ")

    t0 = time()
    clf.fit(X_train, y_train)
    # Stop the clock right after fitting so bookkeeping below is not timed
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    # Report rows are named after the classes seen during training
    targets = np.unique(y_train)
    target_names = [str(target) for target in targets]

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Passing ``labels`` keeps the report rows aligned with ``target_names``
    # even if some class happens to be absent from y_test and pred.
    clf_rep = str(metrics.classification_report(
        y_test, pred, labels=targets, target_names=target_names))
    print("classification report:")
    print(clf_rep)

    clf_cm = str(metrics.confusion_matrix(y_test, pred))
    print("confusion matrix:")
    print(clf_cm)

    clf_descr = str(getattr(clf, "estimator", clf))
    return clf_descr, score, train_time, test_time, clf_rep, clf_cm
      

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform, expon
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


# list of (estimator, param_dist), where param_dist is used in RandomizedSearchCV
classifiers = [
    (KNeighborsClassifier(), {
        'n_neighbors': sp_randint(4, 10),
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'p': [1, 2]
    }),
    (NearestCentroid(), {
        # NOTE(review): recent scikit-learn releases may reject metrics other
        # than 'euclidean' and 'manhattan' for NearestCentroid - confirm
        # against the installed version.
        'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev'],
        'shrink_threshold': [None, .001, .0001, .1, .01]
    }),
    (SVC(kernel='linear'), {
        'C': np.logspace(-1, 3, 100),
        'tol': np.logspace(-8, -2, 100)
    }),
    (SVC(random_state=42, gamma='scale'), {
        'C': [.1, 1, 10, 100, 1000],
        'kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
        'class_weight': ['balanced', None],
        'tol': np.logspace(-8, -2, 100)
    }),
    (MLPClassifier(max_iter=100000), {
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
        'alpha': expon(scale=.1),
        'learning_rate': ['constant', 'adaptive'],
    }),
    (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        # 'auto' was an alias of 'sqrt' for classifiers and has been removed
        # from scikit-learn (1.3), so only the explicit spelling is listed.
        'max_features': [None, 'sqrt', 'log2'],
    }),
    (RandomForestClassifier(max_depth=5, n_jobs=-1), {
        "max_depth": sp_randint(2, 20),
        "n_estimators": sp_randint(2, 50),
        # 'sqrt' replaces the removed alias 'auto' (same behaviour)
        "max_features": ['sqrt', 'log2'],
        'class_weight': ['balanced', 'balanced_subsample'],
        'criterion': ['gini', 'entropy']
    }),
    (GaussianProcessClassifier(n_jobs=-1, random_state=42), {
        'max_iter_predict': [10, 50, 100],
        'multi_class': ["one_vs_rest", "one_vs_one"],
    }),
    (GaussianNB(), {
        'var_smoothing': np.logspace(-12, -8, 100)
    }),
    (LinearDiscriminantAnalysis(n_components=2), {
        'solver': ['svd', 'lsqr'],
        'tol': np.logspace(-10, -2, 100)
    }),
    (LogisticRegression(solver='newton-cg', random_state=0, max_iter=100000), {
        'C': [.1, 1, 10, 100, 1000],
        'multi_class': ["auto", "ovr", "multinomial"],
        'solver': ['sag', 'saga', 'newton-cg'],
        'tol': np.logspace(-6, -4, 100)
    }),
]


def _unique_names(estimators):
    """Return one display name per estimator, unique within the list.

    The name is the estimator's class name; the second, third, ... occurrence
    of a class gets the suffix ``_2``, ``_3``, ... so that no two entries
    share a name.
    """
    seen = {}
    unique = []
    for estimator in estimators:
        base = estimator.__class__.__name__
        seen[base] = seen.get(base, 0) + 1
        unique.append(base if seen[base] == 1 else "%s_%d" % (base, seen[base]))
    return unique


# Two entries above are SVC; with plain class names they would collide and the
# later search would overwrite the earlier one when results are stored by name.
names = _unique_names([e for e, g in classifiers])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split


data_name = ['KRAKEN 2 layer']
dset_name = data_name

noiselevel = np.array([15, 17, 19, 21, 26, 30, 32, 33, 60])
# One dict of fitted searches and one list of benchmark tuples per noise level
grid_searches = [dict() for x in range(len(noiselevel))]
results = [[] for x in range(len(noiselevel))]

# NOTE(review): training/validation data always come from the index-15 file;
# inside the loop only the *test* set changes.  This mirrors the original
# behaviour and looks deliberate - confirm.
X0, y0 = load_dataset(15)[:2]

# Split into training and validation.  The split is seeded and its inputs do
# not change, so it is done once instead of once per noise level.
X_train, X_val, y_train, y_val = train_test_split(X0, y0, test_size=.2, random_state=42)

# Preprocess: fit the scaler on the training split only and reuse it for the
# validation and test data, so every score below is computed on inputs that
# went through the same transformation as the training data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# benchmark() reads X_train / y_train / X_test / y_test from the notebook
# namespace; X and y are kept as aliases of the training split.
X = X_train
y = y_train

for ds_cnt, nl in enumerate(noiselevel):
    X_test, y_test = load_dataset(nl)[2:]
    X_test = scaler.transform(X_test)

    print('\n Noise level:', str(nl), '\n')

    # iterate over classifiers
    for name, (estimator, param_grid) in zip(names, classifiers):

        # Perform randomized search over possible hyperparameters.
        # (`iid` is not passed: it was given as the string 'true' and the
        # parameter no longer exists in scikit-learn >= 0.24.)
        clf = RandomizedSearchCV(estimator=estimator, param_distributions=param_grid,
                                 n_iter=20, cv=5, verbose=0, n_jobs=-1)

        with ignore_warnings(category=ConvergenceWarning):
            results[ds_cnt].append(benchmark(clf))

        # Store the classifier
        grid_searches[ds_cnt][name] = clf

        # Scores of the refitted best model on held-out data
        print(name)
        print('Validation Score', str(clf.score(X_val, y_val)))
        print('Test Score', str(clf.score(X_test, y_test)))
      
        
    

In [None]:
# Displaying results in a pandas dataframe
#Code adapted from https://www.kaggle.com/grfiv4/displaying-the-results-of-a-grid-search
import pandas as pd
def score_summary(grid_searches, sort_by='mean_test_score'):
    """Combine the ``cv_results_`` of several searches into one sorted table.

    Parameters
    ----------
    grid_searches : dict
        Maps a display name to a fitted search object exposing ``cv_results_``.
    sort_by : str, default 'mean_test_score'
        Column used to order the rows, best (largest) first.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated candidate.  The first column is ``estimator``
        (the dict key); columns whose name contains ``param_`` and the
        ``rank_test_score`` column are omitted.  The index is 0..n-1.  An
        empty ``grid_searches`` yields an empty frame.
    """
    frames = []
    for name, grid_search in grid_searches.items():
        frame = pd.DataFrame(grid_search.cv_results_)
        # Negative look-ahead: keep only columns not containing "param_"
        frame = frame.filter(regex='^(?!.*param_).*$')
        frame['estimator'] = [name] * len(frame)
        frames.append(frame)

    # pd.concat raises on an empty list; return an empty table instead
    if not frames:
        return pd.DataFrame(columns=['estimator'])

    df = pd.concat(frames)
    df = df.sort_values([sort_by], ascending=False)
    df = df.reset_index(drop=True)
    # Ranks are per-search and meaningless once searches are merged.
    # Keyword form is required: pandas 2.x no longer accepts a positional axis.
    df = df.drop(columns=['rank_test_score'])

    # Move the estimator name to the front
    columns = ['estimator'] + [col for col in df.columns if col != 'estimator']
    return df[columns]

In [None]:
# Summary table of the randomized searches run for the first noise level;
# the noise level is printed first, the table is the cell's displayed value.
print(noiselevel[0])
df = score_summary(grid_searches[0])
df