In [3]:
import pandas as pd
import numpy as np
import pickle

from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, cohen_kappa_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

from imblearn.keras import BalancedBatchGenerator
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler

class ClaimClassifier:
    """Binary claim classifier: standardise, resample, then train a small dense network.

    The pipeline is:
      1. ``StandardScaler`` fitted on the training features and reused for
         every later prediction / evaluation call;
      2. optional resampling of the (scaled) training set with ``sampler``;
      3. a Keras ``Sequential`` network with two hidden ReLU layers and a
         single sigmoid output unit, trained with binary cross-entropy.
    """

    def __init__(self, epochs=50, batch_size=16, sampler=ADASYN, hlayer_size=6, dropout=True):
        """Store the hyper-parameters; no model is built until ``fit`` is called.

        Parameters
        ----------
        epochs : int
            Number of epochs passed to ``model.fit``.
        batch_size : int
            Batch size passed to ``model.fit``.
        sampler : class, callable, sampler instance or None
            Resampling strategy applied to the training data only.
            A class / factory is called as ``sampler(random_state=42)``;
            an already-built object exposing ``fit_resample`` is used as is;
            ``None`` disables resampling.
        hlayer_size : int
            Number of units in each of the two hidden layers.
        dropout : bool
            When true, a ``Dropout(0.2)`` layer follows each hidden layer.
        """
        self.epochs = epochs
        self.batch_size = batch_size
        self.sampler = sampler
        self.hlayer_size = hlayer_size
        self.dropout = dropout
        # Populated by fit(); kept as None until then so that "not fitted"
        # can be detected explicitly.
        self.model = None
        self.scaler = None
        # Order matters: model.evaluate() returns [loss] followed by these
        # metrics in this order, so index 1 of its result is the accuracy.
        self.metrics = [
            keras.metrics.BinaryAccuracy(name='accuracy'),
            keras.metrics.SensitivityAtSpecificity(1),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
            keras.metrics.AUC(name='auc'),
        ]

    def _make_sampler(self):
        """Return a ready-to-use resampler built from ``self.sampler`` (or None)."""
        sampler = self.sampler
        if sampler is None:
            return None
        # Classes and factories are instantiated with a fixed seed; anything
        # that already looks like a built sampler is used unchanged.
        if isinstance(sampler, type) or not hasattr(sampler, "fit_resample"):
            return sampler(random_state=42)
        return sampler

    def _require_model(self):
        """Return the trained model, or raise if ``fit`` has not been run.

        Raises
        ------
        AttributeError
            If no model has been stored on this instance yet.
        """
        model = getattr(self, "model", None)
        if model is None:
            raise AttributeError(
                "There is no model saved on this class, please run ClaimClassifier.fit() first."
            )
        return model

    def _predict_labels(self, X_clean):
        """Threshold the sigmoid output at 0.5 to obtain 0/1 labels.

        Equivalent to the deprecated ``Sequential.predict_classes`` for a
        single-unit sigmoid output, without depending on that API.
        """
        probabilities = self._require_model().predict(X_clean)
        return (probabilities > 0.5).astype("int32")

    def _preprocessor(self, X_raw, y_raw=None):
        """Scale the features and, for training data, resample them.

        Parameters
        ----------
        X_raw : numpy.ndarray
            Raw feature matrix.
        y_raw : numpy.ndarray, optional
            Target vector. Supplying it marks the call as the *training*
            path: the scaler is (re)fitted on ``X_raw`` and the data is
            resampled with the configured sampler.

        Returns
        -------
        (X, y) : tuple of numpy.ndarray
            Scaled and resampled training data, when ``y_raw`` is given.
        X : numpy.ndarray
            Scaled features only, when ``y_raw`` is omitted.
        """
        if y_raw is not None:
            # Training path: learn the scaling statistics here and keep them
            # so that unseen data is transformed consistently later on.
            self.scaler = preprocessing.StandardScaler()
            scaled_data = self.scaler.fit_transform(X_raw)

            sampler = self._make_sampler()
            if sampler is None:
                return scaled_data, y_raw
            X_res, y_res = sampler.fit_resample(scaled_data, y_raw)
            return X_res, y_res

        scaler = getattr(self, "scaler", None)
        if scaler is None:
            # No training statistics available (fit() not run yet): fall back
            # to standardising the given data on its own.
            return preprocessing.StandardScaler().fit_transform(X_raw)
        return scaler.transform(X_raw)

    def fit(self, X_raw, y_raw):
        """Build and train the network on the given data.

        Parameters
        ----------
        X_raw : numpy.ndarray
            Raw feature matrix.
        y_raw : numpy.ndarray
            One dimensional binary target vector.

        Returns
        -------
        ClaimClassifier
            ``self``, with the trained network stored in ``self.model``.
        """
        X_clean, y_clean = self._preprocessor(X_raw, y_raw)

        input_dim = X_clean.shape[1]

        model = Sequential()
        model.add(Dense(self.hlayer_size, input_dim=input_dim, activation='relu'))
        if self.dropout:
            model.add(Dropout(0.2))
        model.add(Dense(self.hlayer_size, kernel_initializer='glorot_uniform', activation='relu'))
        if self.dropout:
            model.add(Dropout(0.2))
        # Binary cross-entropy with a sigmoid needs exactly one output unit,
        # independent of how many distinct labels happen to be in y.
        model.add(Dense(1, kernel_initializer='glorot_uniform', activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=self.metrics)

        model.fit(X_clean, y_clean, batch_size=self.batch_size,
                  epochs=self.epochs, verbose=0)
        self.model = model
        return self

    def predict(self, X_raw):
        """Predict the probability of the positive class.

        Parameters
        ----------
        X_raw : numpy.ndarray
            Raw feature matrix.

        Returns
        -------
        numpy.ndarray
            The output of ``model.predict`` on the scaled features, one row
            per input row.
            NOTE(review): with the single-unit output layer this is
            presumably shaped (n_samples, 1) rather than 1-D -- flatten at
            the call site if a 1-D array is required.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called.
        """
        model = self._require_model()
        X_clean = self._preprocessor(X_raw)
        return model.predict(X_clean)

    def predict_classes(self, X_raw):
        """Predict hard 0/1 labels (sigmoid output thresholded at 0.5).

        Raises
        ------
        AttributeError
            If ``fit`` has not been called.
        """
        self._require_model()
        X_clean = self._preprocessor(X_raw)
        return self._predict_labels(X_clean)

    def evaluate_architecture(self, X_raw, y_raw):
        """Evaluate the trained network on held-out data and print a summary.

        Parameters
        ----------
        X_raw : numpy.ndarray
            Raw feature matrix.
        y_raw : numpy.ndarray
            True binary labels.

        Returns
        -------
        tuple
            ``(roc_auc, scores, cm)`` where ``scores`` is the list returned
            by ``model.evaluate`` (loss first, then the metrics configured in
            ``__init__``) and ``cm`` is the confusion matrix.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called.
        """
        model = self._require_model()
        X_clean = self._preprocessor(X_raw)
        predictions = self._predict_labels(X_clean)
        cm = confusion_matrix(y_raw, predictions)
        scores = model.evaluate(X_clean, y_raw)
        # NOTE(review): the ROC AUC is computed from hard labels, not from
        # probabilities; scores[-1] holds the Keras probability-based AUC.
        roc_auc = roc_auc_score(y_raw, predictions)
        print(f"roc_auc: {roc_auc}")
        # scores[0] is the loss; the accuracy metric is at index 1.
        print(f"accuracy: {scores[1]}")
        print(f"Confusion matrix \n {cm}")
        print(f"Evaluation scores {scores}\n\n")
        return roc_auc, scores, cm

    def save_model(self):
        """Pickle this whole object to ``part2_claim_classifier.pickle``.

        NOTE(review): this serialises ``self.model`` as well; whether a Keras
        model can be pickled depends on the installed version -- confirm.
        """
        with open("part2_claim_classifier.pickle", "wb") as target:
            pickle.dump(self, target)


def ClaimClassifierHyperParameterSearch(data_path='part2_data.csv', hyperparameters=None):
    """Exhaustive grid search over ``ClaimClassifier`` hyper-parameters.

    Every combination in the grid is trained on a 90% split of the data and
    evaluated on the remaining 10% (fixed ``random_state=12``).

    Parameters
    ----------
    data_path : str
        Comma separated file to load. The first row is dropped, the last
        column is used as the label and the last three columns are excluded
        from the features.
    hyperparameters : dict, optional
        Mapping of ``ClaimClassifier`` constructor argument -> list of values
        to try. Defaults to the grid defined below.

    Returns
    -------
    tuple of lists
        ``(roc_auc, all_params, scores, accuracy, cm)``; the lists are
        aligned, i.e. index ``i`` of each one belongs to ``all_params[i]``.
    """
    data = np.genfromtxt(data_path, delimiter=',')
    # Row 0 is skipped (presumably the CSV header, which genfromtxt reads as NaN).
    features = data[1:, :-3]
    labels = data[1:, -1]

    x_train, x_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=.1,
                                                        random_state=12)

    if hyperparameters is None:
        hyperparameters = {
            'epochs': [100, 200],
            'batch_size': [8, 16, 32],
            'sampler': [SMOTE],
            'hlayer_size': [4, 8, 12],
            'dropout': [True, False],
        }

    roc_auc = []
    accuracy = []
    scores = []
    cm = []
    all_params = list(ParameterGrid(hyperparameters))
    for params in all_params:
        print(f"Parameters used:{params}")
        cc = ClaimClassifier(**params)
        cc.fit(x_train, y_train)
        run_auc, run_scores, run_cm = cc.evaluate_architecture(x_test, y_test)
        roc_auc.append(run_auc)
        scores.append(run_scores)
        # model.evaluate() returns [loss, *metrics]; ClaimClassifier lists
        # BinaryAccuracy first, so the accuracy is at index 1 (index 0 is
        # the loss).
        accuracy.append(run_scores[1])
        cm.append(run_cm)
    return roc_auc, all_params, scores, accuracy, cm

In [None]:
# Run the full grid search (one network is trained per parameter combination).
# `results` is the tuple (roc_auc, all_params, scores, accuracy, cm).
results = ClaimClassifierHyperParameterSearch()

Parameters used:{'batch_size': 8, 'dropout': True, 'epochs': 100, 'hlayer_size': 4, 'sampler': <class 'imblearn.over_sampling._smote.SMOTE'>}


In [None]:
##### FOR ALVARO
# Train and evaluate one configuration on the same 90/10 split
# (random_state=12) that ClaimClassifierHyperParameterSearch uses.
data = np.genfromtxt('part2_data.csv', delimiter=',')
features = data[1:, :-3]
labels = data[1:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, labels,
                                                  test_size = .1,
                                                  random_state=12)

# `params` only exists as a loop variable inside the search function, so it
# must be defined here for this cell to run on a fresh kernel.
# NOTE(review): these values are the first grid combination; replace them
# with the configuration you actually want to evaluate.
params = {
    'batch_size': 8,
    'dropout': True,
    'epochs': 100,
    'hlayer_size': 4,
    'sampler': SMOTE,
}
cc = ClaimClassifier(**params)
cc.fit(x_train, y_train)
results = cc.evaluate_architecture(x_test,y_test)