In [None]:
#import required libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import pandas as pd
import numpy as np

import os

In [None]:
#read in the prepared CSVs produced by the EDA step (first column is the index)
#the paths are built with os.path.join: the old ".\Data\..." literals contained the
#invalid escape sequences "\D" and "\s" and only resolved correctly on Windows
training_read = pd.read_csv(os.path.join(".", "Data", "stellar_training_data.csv"), index_col=0)
test_read = pd.read_csv(os.path.join(".", "Data", "stellar_test_data.csv"), index_col=0)

# Define Functions for Splitting Data & Scoring

In [None]:
#cross-validation strategy shared by every model:
#10 stratified folds, shuffled with a fixed seed
CV = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


#training-set sizes to benchmark, listed from largest to smallest
DATASET_SIZES = [
    78052,
    15610,
    7805,
    1561,
    781,
    156,
    78,
]

#number of cores to be used for running the script
N_CORES = 10

#name of the CSV file the test-set results are written to
TEST_RESULTS_FILE_NAME = "benchmarking_results_12Feb22.csv"

#dataframe that accumulates the test-set results (starts empty)
TEST_RESULTS = pd.DataFrame()

In [None]:
#short function to append results

def append_predictions(clf, results_in, X, algo, size=None):
    """Predict with ``clf`` on ``X`` and append the labels to ``results_in`` as a new column.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.
    results_in : pandas.DataFrame
        Results collected so far; it is left unmodified.
    X : array-like
        Predictors handed to ``clf.predict``.
    algo : str
        Algorithm label, used as the prefix of the new column name.
    size : optional
        Suffix of the new column name. When omitted, the module-level loop
        variable ``i`` is used, which is what this function previously always
        read implicitly.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``results_in`` plus one column named ``f"{algo}_{size}"``.
        The new column carries a default 0..n-1 index and the concat aligns
        on the index.
    """
    if size is None:
        #backward-compatible fallback for callers that rely on the global loop variable
        size = i

    column_name = f"{algo}_{size}"
    y_preds = pd.DataFrame(clf.predict(X), columns=[column_name])
    return pd.concat([results_in, y_preds], axis=1)

In [None]:
def sample_scale_data(train_data, test_data, size, random_state=42):
    """Draw a class-stratified training sample and min-max scale the predictors.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows; must contain a ``"class"`` column holding the target.
    test_data : pandas.DataFrame
        Test rows with the same columns; it is scaled but never sampled.
    size : int
        Approximate number of training rows to keep. Every class contributes
        ``rint(size * class_rows / total_rows)`` rows, so rounding can make
        the total differ slightly from ``size``.
    random_state : int or None, default 42
        Seed for the sampling, matching the seed used by the other stochastic
        steps in this notebook. Pass ``None`` for an unseeded draw.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    """
    #one generator shared by all classes so the whole draw follows from a single seed
    rng = np.random.RandomState(random_state)
    n_total = len(train_data)

    #stratified sample: iterate over the class groups explicitly instead of using
    #groupby.apply, so each sampled frame keeps its "class" column no matter how
    #the installed pandas treats grouping columns inside apply
    sampled_parts = [
        class_rows.sample(n=int(np.rint(size * len(class_rows) / n_total)),
                          random_state=rng)
        for _, class_rows in train_data.groupby("class")
    ]
    train_data = pd.concat(sampled_parts)

    #split the training and test data into predictors and targets
    X_train = train_data.drop(columns=["class"])
    y_train = train_data["class"].to_numpy()

    X_test = test_data.drop(columns=["class"])
    y_test = test_data["class"].to_numpy()

    #fit the scaler on the training predictors only, then apply it to both sets
    scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

# Modelling

## K-Nearest Neighbors

In [None]:
def KNearestNeighbors_modelling():
    """Grid-search a 3-nearest-neighbours classifier and return the tuned, fitted model.

    Reads the module-level ``X_train_scaled`` and ``y_train`` as training data,
    together with the shared ``CV`` splitter and ``N_CORES``. ``n_neighbors``
    is fixed at 3; only ``algorithm`` and ``leaf_size`` are searched.

    Returns
    -------
    tuple
        ``(clf, algo)``: the fitted ``KNeighborsClassifier`` and the label ``"KNN"``.
    """
    algo = "KNN"

    #define the hyper-parameter space to grid-search across
    knn_param_space = {
        "algorithm": ["ball_tree", "kd_tree", "brute"],
        "leaf_size": np.arange(1, 21, 1),
    }

    #initialise the grid search and fit; with the default refit=True the search
    #finishes by refitting the best configuration on the full training data
    clf_grid = GridSearchCV(
        KNeighborsClassifier(n_neighbors=3),
        knn_param_space,
        cv=CV,
        n_jobs=N_CORES,
        verbose=3,
    ).fit(X_train_scaled, y_train)

    #the refitted best estimator already is the final model: it has the same
    #settings (n_neighbors=3 plus the best algorithm / leaf_size) and the same
    #training data as the second classifier that used to be built and fitted here
    clf = clf_grid.best_estimator_
    return clf, algo

## Logistic Regression

In [None]:
def LogisticRegression_modelling():
    """Grid-search the regularisation parameter ``C`` of a multinomial logistic regression.

    Reads the module-level ``X_train_scaled`` and ``y_train`` as training data,
    together with the shared ``CV`` splitter and ``N_CORES``.

    Returns
    -------
    tuple
        ``(clf, algo)``: the fitted ``LogisticRegression`` and the label
        ``"LogisticRegression"``.
    """
    algo = "LogisticRegression"

    #define the hyper-parameter space to grid-search across: odd values of C from 1 to 151
    lr_param_space = {"C": np.arange(1, 153, 2)}

    #NOTE(review): recent scikit-learn releases deprecate `multi_class` - confirm
    #against the installed version before upgrading
    base_model = LogisticRegression(random_state=42,
                                    multi_class="multinomial",
                                    solver="lbfgs",
                                    max_iter=1000)

    #initialise the grid search and fit; with the default refit=True the search
    #finishes by refitting the best configuration on the full training data
    clf_grid = GridSearchCV(base_model,
                            lr_param_space,
                            cv=CV,
                            n_jobs=N_CORES,
                            verbose=3).fit(X_train_scaled, y_train)

    #the refitted best estimator already is the final model: same seed, solver,
    #max_iter and best C as the second model that used to be built and fitted here
    clf = clf_grid.best_estimator_
    return clf, algo

## Support Vector Machine

In [None]:
def SVC_modelling():
    """Tune a support vector classifier by grid search and return the refitted model.

    Training data comes from the module-level ``X_train_scaled`` / ``y_train``;
    ``CV`` and ``N_CORES`` configure the search.

    Returns
    -------
    tuple
        ``(clf, algo)``: the fitted ``SVC`` and the label ``"SVC"``.
    """
    algo = "SVC"

    #search space: C in {5, 10} crossed with the two kernels
    svc_param_space = {
        "C": np.arange(5, 15, 5),
        "kernel": ["poly", "rbf"],
    }

    #run the cross-validated grid search
    search = GridSearchCV(SVC(random_state=42),
                          svc_param_space,
                          cv=CV,
                          n_jobs=N_CORES,
                          verbose=3)
    search.fit(X_train_scaled, y_train)

    #build the final model from the winning C / kernel; unlike the searched
    #estimator it is fitted with probability=True
    final_model = SVC(random_state=42, probability=True, **search.best_params_)
    final_model.fit(X_train_scaled, y_train)

    return final_model, algo

## MLP

In [None]:
def MLP_modelling():
    """Tune a two-hidden-layer MLP classifier by grid search and return the refitted model.

    Training data comes from the module-level ``X_train_scaled`` / ``y_train``;
    ``CV`` and ``N_CORES`` configure the search.

    Returns
    -------
    tuple
        ``(clf, algo)``: the fitted ``MLPClassifier`` and the label ``"MLP"``.
    """
    algo = "MLP"

    #layer layouts: first layer 50, 40, 30, 20, 10 units; second layer 20%, 40%
    #and 60% of the first, i.e. [50,10],[50,20],[50,30],[40,8], ... ,[10,6]
    layer_layouts = [[first, (first // 5) * step]
                     for first in range(50, 0, -10)
                     for step in (1, 2, 3)]

    #full search space
    mlp_param_space = {
        "hidden_layer_sizes": layer_layouts,
        "activation": ["identity", "logistic", "tanh", "relu"],
        "solver": ["lbfgs", "sgd", "adam"],
        "learning_rate": ["constant", "adaptive"],
    }

    #run the cross-validated grid search
    search = GridSearchCV(MLPClassifier(random_state=42, max_iter=1000),
                          mlp_param_space,
                          cv=CV,
                          n_jobs=N_CORES,
                          verbose=3)
    search.fit(X_train_scaled, y_train)

    #final model: winning settings from the search, but with a larger iteration
    #budget (10000 instead of 1000) and early stopping switched on
    final_model = MLPClassifier(random_state=42,
                                max_iter=10000,
                                early_stopping=True,
                                **search.best_params_)
    final_model.fit(X_train_scaled, y_train)

    return final_model, algo

# Experiment

In [None]:
#Benchmark loop: for every training-set size, sample and scale the data, export
#the splits, then tune each model and record its train / test predictions.
#The loop variable must stay named `i`: append_predictions reads it as a global
#to build its column names.
for i in DATASET_SIZES:
    #predictions on this size's training sample; restarted for every size
    training_results = pd.DataFrame()

    #split data (the modelling functions read X_train_scaled / y_train as globals)
    X_train_scaled, X_test_scaled, y_train, y_test = sample_scale_data(training_read, test_read, i)

    #export the splits to .csv's for MatLab; os.path.join replaces the old
    #".\Data\..." literals, which contained the invalid escape sequences "\D"
    #and "\s" and only resolved correctly on Windows
    training = np.column_stack([X_train_scaled, y_train])
    fname = os.path.join(".", "Data", f"stellar_training_{i}.csv")
    np.savetxt(fname, training, delimiter=",")

    testing = np.column_stack([X_test_scaled, y_test])
    fname = os.path.join(".", "Data", f"stellar_testing_{i}.csv")
    np.savetxt(fname, testing, delimiter=",")

    #both result files are re-written after every model
    #(presumably as a checkpoint for interrupted runs - confirm)

    #KNearestNeighbors Algorithm
    knn_clf, algo = KNearestNeighbors_modelling()
    training_results = append_predictions(knn_clf, training_results, X_train_scaled, algo)
    training_results.to_csv(f"training_results_{i}.csv")
    TEST_RESULTS = append_predictions(knn_clf, TEST_RESULTS, X_test_scaled, algo)
    TEST_RESULTS.to_csv(TEST_RESULTS_FILE_NAME)

    #LogisticRegression Algorithm
    lr_clf, algo = LogisticRegression_modelling()
    training_results = append_predictions(lr_clf, training_results, X_train_scaled, algo)
    training_results.to_csv(f"training_results_{i}.csv")
    TEST_RESULTS = append_predictions(lr_clf, TEST_RESULTS, X_test_scaled, algo)
    TEST_RESULTS.to_csv(TEST_RESULTS_FILE_NAME)

    #SupportVectorMachine Algorithm
    svc_clf, algo = SVC_modelling()
    training_results = append_predictions(svc_clf, training_results, X_train_scaled, algo)
    training_results.to_csv(f"training_results_{i}.csv")
    TEST_RESULTS = append_predictions(svc_clf, TEST_RESULTS, X_test_scaled, algo)
    TEST_RESULTS.to_csv(TEST_RESULTS_FILE_NAME)

    #Neural Network Algorithm
    mlp_clf, algo = MLP_modelling()
    training_results = append_predictions(mlp_clf, training_results, X_train_scaled, algo)
    training_results.to_csv(f"training_results_{i}.csv")
    TEST_RESULTS = append_predictions(mlp_clf, TEST_RESULTS, X_test_scaled, algo)
    TEST_RESULTS.to_csv(TEST_RESULTS_FILE_NAME)

    #put the true training labels in the first column and write the final file for this size
    y_train = pd.DataFrame(y_train, columns=["y_train"])
    training_results = pd.concat([y_train, training_results], axis=1)
    training_results.to_csv(f"training_results_{i}.csv")

#put the true test labels in the first column; y_test is the value left over from
#the last iteration, and every iteration is given the same test_read
y_test = pd.DataFrame(y_test, columns=["y_test"])
TEST_RESULTS = pd.concat([y_test, TEST_RESULTS], axis=1)
TEST_RESULTS.to_csv(TEST_RESULTS_FILE_NAME)