<a href="https://colab.research.google.com/github/cf87/Seabed-Classification-GP-Predictions/blob/main/Seabed_Classification_GP_Predictions_PUBLIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from os.path import exists
# Set our RNG seed for reproducibility.
RANDOM_STATE_SEED = 123
np.random.seed(RANDOM_STATE_SEED)


import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display

import pandas as pd
import numpy as np
import scipy.io
import glob

import itertools

import sklearn
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split,cross_val_score,KFold,RandomizedSearchCV

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from scipy.stats import uniform, expon
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from keras.utils.np_utils import to_categorical


In [None]:
# Mount Google Drive at /content/drive. The google.colab package is only
# available inside a Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
## List of (estimator, param_dist) pairs, where param_dist is the search space
## handed to RandomizedSearchCV (`param_distributions`). Lists are sampled
## uniformly; scipy.stats distributions are sampled via their rvs() method.
classifiers = [
    (KNeighborsClassifier(n_jobs=-1), {
        'n_neighbors': sp_randint(4, 10),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
        'p': [1, 2],
    }),
    (SVC(random_state=42, max_iter=20), {
        'C': np.logspace(-1, 3, 100),
        'kernel': ['rbf', 'linear'],
        'class_weight': ['balanced', None],
    }),
    (MLPClassifier(max_iter=20), {
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': expon(scale=.1),
        'learning_rate': ['constant', 'adaptive'],
    }),
    # The max_depth given to the constructor is only a starting value: the
    # search space below overrides it for every sampled candidate.
    (RandomForestClassifier(max_depth=5, n_jobs=-1), {
        "max_depth": sp_randint(2, 20),
        "n_estimators": sp_randint(2, 50),
        # 'auto' was dropped from this list: for classifiers it was an alias
        # of 'sqrt', and recent scikit-learn releases reject it outright,
        # which makes every candidate that samples it fail to fit.
        'max_features': [None, 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy'],
    }),
    (LinearDiscriminantAnalysis(), {
        'solver': ['svd', 'lsqr'],
        'tol': np.logspace(-3, -1, 3),
    }),
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases -- confirm against the scikit-learn version this notebook pins.
    (LogisticRegression(random_state=0, n_jobs=-1, max_iter=20), {
        'C': [.1, 1, 10, 100, 1000],
        'multi_class': ["auto", "ovr", "multinomial"],
        'solver': ['sag', 'saga', 'newton-cg'],
    }),
]
# Display names (estimator class names), in the same order as `classifiers`.
names = [e.__class__.__name__ for e, g in classifiers]


In [None]:
from time import time
from sklearn import metrics
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Benchmark metrics for each classifier
# Adapted from Peter Prettenhofer et al.,
# https://scikit-learn.org/0.19/auto_examples/text/document_classification_20newsgroups.html

def benchmark(model):
    """Fit ``model`` on the training split and score it on the test split.

    The data is read from the module-level names ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; the module-level ``name`` is printed as a
    progress marker. All of them must be set by the caller before this
    function is invoked.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` that ``cross_val_score`` can
        evaluate (e.g. a ``RandomizedSearchCV`` wrapper).

    Returns
    -------
    tuple
        ``(score, cv_mean, cv_std, y_test, y_test_pred, train_time,
        test_time, model_rep, model_cm)`` where ``score`` is the accuracy on
        the test split, ``cv_mean`` / ``cv_std`` summarise the
        cross-validated accuracy on the training split, the two times are
        wall-clock seconds for ``fit`` and ``predict``, and ``model_rep`` /
        ``model_cm`` are the classification report and the confusion matrix
        rendered as strings.
    """
    print(name)

    t0 = time()
    model.fit(X_train, y_train)
    train_time = time() - t0

    # Repeated stratified 5-fold CV on the training split. The splitter was
    # previously constructed but never handed to cross_val_score, so the
    # default splitter was silently used instead.
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
    train_scores = cross_val_score(
        model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1
    )

    t0 = time()
    y_test_pred = model.predict(X_test)
    test_time = time() - t0

    # Held-out accuracy. This value was returned but never computed before,
    # which raised NameError on every call.
    score = metrics.accuracy_score(y_test, y_test_pred)

    model_rep = str(metrics.classification_report(y_test, y_test_pred))
    model_cm = str(metrics.confusion_matrix(y_test, y_test_pred))
    return (score, train_scores.mean(), train_scores.std(), y_test,
            y_test_pred, train_time, test_time, model_rep, model_cm)




In [None]:
def load_ws_data(table_structure, noiseLevel=1):
    """Return the features and labels stored for one noise level.

    Parameters
    ----------
    table_structure
        Table struct as passed by the callers in this notebook (the
        ``'T_struct'`` entry of a ``scipy.io.loadmat`` result). Its
        ``[0, 0]`` element must expose ``['table']['data']`` (one cell per
        column, addressed as ``[0, 0][0, col]``) and ``['columns']`` (the
        column names). The columns ``X``, ``Range``, ``Sediment`` and
        ``Noise`` are required.
    noiseLevel
        Only rows whose ``Noise`` value equals this are returned.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, ran, sed)``: the ``X``, ``Range`` and ``Sediment`` values of
        the selected rows, in table order.
    """
    record = table_structure[0, 0]
    cells = record['table']['data'][0, 0]
    column_names = [entry[0] for entry in record['columns'][0]]

    # One DataFrame column per table column; squeeze drops the singleton
    # dimensions each stored value carries.
    datatable = pd.DataFrame({
        column: [np.squeeze(value) for value in cells[0, idx]]
        for idx, column in enumerate(column_names)
    })

    # Evaluate the noise filter once and reuse it for all three outputs.
    selected = datatable.loc[datatable.Noise == noiseLevel,
                             ['X', 'Range', 'Sediment']]

    def _column(label):
        # Convert the column to an array once, then stack its entries.
        return np.asarray(list(selected[label].to_numpy()))

    return _column('X'), _column('Range'), _column('Sediment')



In [None]:
import scipy.io as sio

# Load the MATLAB struct files from the current working directory. Each
# result is a dict; the cells below read its 'T_struct' entry through
# load_ws_data(). safari_data / mustar_data are used as test sets, and
# training_data_100 / training_data as the matching training sets
# (Safari <-> training_data_100, mustar <-> training_data).
safari_data = sio.loadmat('SafariData_struct.mat')
# No '.mat' suffix here: loadmat's default appendmat=True adds it when the
# bare name cannot be opened.
mustar_data = sio.loadmat('mustarData_struct')
training_data_100 = sio.loadmat('trainingData_100_struct.mat')
training_data = sio.loadmat('trainingData_struct.mat')


In [None]:
# Run ML classification over varying noise level, N_train, data type, and classifier.
#
# For each (N_train, dataType) pair, every classifier in `classifiers` is
# tuned with RandomizedSearchCV and scored by benchmark() at each noise level
# in `stdlist`; the per-classifier results are then saved to one .mat file.

# NOTE(review): classType and Nr only appear in the output filename and are
# not assigned anywhere in the visible notebook. Values already defined in
# the session are kept; otherwise the placeholders below are used -- confirm
# the intended values.
classType = globals().get('classType', 'SedRange')
Nr = globals().get('Nr', 1)

stdlist = [1, 2, 3, 4]
for N_train in [100, 500, 1000, 5000, 10000]:
    # NOTE(review): N_train currently only changes the output filename; the
    # training set is never subsampled to N_train rows, so every pass of this
    # loop trains on the same data. Confirm the intended sampling scheme.
    for dataType in ['Safari', 'mustar']:
        filestr = (classType + '_ML_' + dataType + '_N_train_' + str(N_train)
                   + '_' + str(Nr) + '.mat')

        print(dataType)

        # One row per classifier, one column per noise level.
        result_shape = (len(names), len(stdlist))
        scores = np.zeros(result_shape, float)
        train_scores = np.zeros(result_shape, float)
        train_stds = np.zeros(result_shape, float)
        cms = [[[] for _ in stdlist] for _ in names]
        print(names)
        train_times = np.zeros(result_shape, float)
        test_times = np.zeros(result_shape, float)

        for ds_cnt, std in enumerate(stdlist):
            if dataType == 'mustar':
                X_test, ran_test, sed_test = load_ws_data(mustar_data['T_struct'], noiseLevel=std)
                X_train, ran_train, sed_train = load_ws_data(training_data['T_struct'], noiseLevel=std)
                img_width = 199  # not used further in this cell
            elif dataType == 'Safari':
                X_test, ran_test, sed_test = load_ws_data(safari_data['T_struct'], noiseLevel=std)
                X_train, ran_train, sed_train = load_ws_data(training_data_100['T_struct'], noiseLevel=std)
                img_width = 100  # not used further in this cell

            # Train data: standardise the features and build joint
            # "<sediment>_<range>" class labels.
            X_train = StandardScaler().fit_transform(X_train)
            y_train = [sed.strip() + '_' + str(rng)
                       for sed, rng in zip(sed_train, ran_train)]

            # Test data.
            # NOTE(review): the test set is standardised with its own
            # statistics rather than with the scaler fitted on the training
            # set -- confirm this is intentional.
            X_test = StandardScaler().fit_transform(X_test)
            y_test = [sed.strip() + '_' + str(rng)
                      for sed, rng in zip(sed_test, ran_test)]

            # Collect every class label seen in either split. The encoder
            # was previously fitted on an undefined name `y` (NameError).
            encoder = LabelEncoder()
            encoder.fit(np.unique(y_train + y_test))
            labels = encoder.classes_

            # Iterate over classifiers. benchmark() reads `name`, X_train,
            # y_train, X_test and y_test from module scope, so these names
            # must stay as they are.
            for est_idx, (name, (estimator, param_grid)) in enumerate(zip(names, classifiers)):
                # Perform randomized search over possible hyperparameters.
                model = RandomizedSearchCV(estimator=estimator,
                                           param_distributions=param_grid,
                                           n_iter=5, verbose=0, n_jobs=-1)

                (score, val_score, val_std, y_test, y_test_pred,
                 train_time, test_time, model_rep, model_cm) = benchmark(model)
                scores[est_idx][ds_cnt] = score
                train_scores[est_idx][ds_cnt] = val_score
                train_stds[est_idx][ds_cnt] = val_std
                train_times[est_idx][ds_cnt] = train_time
                test_times[est_idx][ds_cnt] = test_time
                cms[est_idx][ds_cnt] = model_cm

        # One entry per classifier, each holding the values across noise levels.
        scorelist = list(scores)
        trainscorelist = list(train_scores)
        trainstdlist = list(train_stds)
        train_timeslist = list(train_times)
        test_timeslist = list(test_times)
        # Confusion matrices regrouped as [noise level][classifier].
        cmsslist = [[row[j] for row in cms] for j in range(len(stdlist))]

        # Initialise data of lists.
        data = {'Names': names,
                'Scores': scorelist,
                'trainScores': trainscorelist,
                'trainstds': trainstdlist,
                'train_time': train_timeslist, 'test_time': test_timeslist}

        # Create DataFrame. A bare `df` expression inside a loop renders
        # nothing, so the table is displayed explicitly.
        df = pd.DataFrame(data)
        display(df)

        # y_pred / y_test hold the values from the last classifier at the
        # last noise level only.
        scipy.io.savemat(filestr, {'struct': df.to_dict("list"),
                                   'Confusion': cmsslist,
                                   'y_pred': y_test_pred,
                                   'y_test': y_test})



  