In [17]:
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

test_size = 0.20

processed_data = None
categorical = None
label_encoders = {}

def preprocessing(dataset, data, test_size):
    """
    Preprocess dataset

    Parameters
    ----------
    data: DataFrame
        Pandas dataframe containing German dataset.
    """
    
    global processed_data
    global categorical
    global label_encoders

    # Reset global variables
    
    processed_data = None
    categorical = None
    label_encoders = {}


    if dataset == "German":
        # Drop savings account and checkings account columns as they contain a lot
        # of NaN values and may not always be available in real life scenarios
        data = data.drop(columns = ['Saving accounts', 'Checking account'])
        
    dat_dict = data.to_dict()
    new_dat_dict = {}

    # rename columns(Make them lowercase and snakecase)
    for key, value in dat_dict.items():
        newKey = key
        if type(key) == str:
            newKey = newKey.lower().replace(' ', '_')
        # if newKey != key:
        new_dat_dict[newKey] = dat_dict[key]
    del dat_dict

    data = pd.DataFrame.from_dict(new_dat_dict)
    del new_dat_dict

    # print(data.describe())
    # print(data.describe(include='O'))
    
    if dataset == "German":
        one_hot_columns = ['sex', 'housing', 'purpose']
    else:
        one_hot_columns = []
    
    # Drop null rows
    data = data.dropna()

    # if the column is not One-hot encoded, we will use categorical labelling
    for column in data.columns:
        if column not in one_hot_columns and data[column].dtype == 'object':
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
    if one_hot_columns:
        data = pd.get_dummies(data, columns=one_hot_columns)
    
    # We dont need to normalise here as we are normalising after spliting into training and testing sets
    
    # for col in data.columns:
    #     if(col not in categorical):
    #         data[col] = (data[col].astype('float') - np.mean(data[col].astype('float')))/np.std(data[col].astype('float'))

    # print(data.describe())
    # print(data.describe(include='O'))

    processed_data = data

    # Get Training parameters
    if dataset == "German":
        target_col = data.columns[-1]
        X = data.drop(columns=target_col, axis=1)
        y = data[target_col].astype('int')
    elif dataset == "Australian":
        X = data.drop(columns=14, axis=1)
        y = data[14].astype('int')
    elif dataset == "Japanese":
        X = data.drop(columns=15, axis=1)
        y = data[15].astype('int')
    elif dataset == "Taiwan":
        X = data.drop(columns='default_payment_next_month', axis=1)
        y = data['default_payment_next_month'].astype('int')
    elif dataset == "Polish":
        X = data.drop(columns='class', axis=1)
        y = data['class'].astype('int')

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = test_size)
    x_train = pd.DataFrame(x_train)
    y_train = pd.DataFrame(y_train)

    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    return (x_train, x_test, y_train, y_test)


In [18]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

class Model(object):
    """
    Basic Scorecard Model

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    def __init__(self,
                 classifier=None,
                 test_size=test_size,
                 n_splits=1,
                 random_state=None,
                 n_jobs=None,
                 params=None):
                 
        self.classifier = classifier
        self.params = params
        self.random_state = random_state
        self.test_size = test_size
        self.n_splits = n_splits
        self.n_jobs = n_jobs

        self.model = GridSearchCV(estimator=classifier,
                                  param_grid=params,
                                  n_jobs=n_jobs,
                                  cv=ShuffleSplit(test_size=test_size,
                                  n_splits=n_splits,
                                  random_state=0))
    
    def __str__(self):
        return f"""
        Model Object
        ----------------------------------------------------------------

        Classifier: {self.classifier.__class__.__name__}
        Test Size: {self.test_size}
        Random State: {self.random_state}
        Number of Splits: {self.n_splits}
        Parameter Grid: {self.params}

        {self.model}
        """
    
    def __repr__(self):
        return self.__str__()

    def train(self, x_train, y_train):
        """
        Train scorecard model
        
        Args:
            x_train:
                array of training parameters
            y_train:
                pandas dataframe with training labels
        """

        self.model = self.model.fit(x_train, y_train.values.ravel())
        return self

    def predict(self, data):
        """
        Predict scorecard model

        Args:
            data: array
                Data to perform prediction on.
        """

        return self.model.predict(data)

    def accuracy(self, x_test, y_test):
        """
        Compute scorecard model accuracy

        Args:
            x_test: array
                The test parameters.
            y_test: array
                The labels
        """

        y_pred = self.predict(x_test)
        return accuracy_score(y_test, y_pred, normalize=False)

    def metrics(self, x_test, y_test):
        """
        Comput scorecard model metrics
        
        Args:
            x_test: array
                The test parameters.
            y_test: array
                The labels
        """

        y_pred = self.predict(x_test)
        
        cm = confusion_matrix(y_pred, y_test)
        accuracy = accuracy_score(y_test, y_pred, normalize=True)
        f1 = f1_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        precision = precision_score(y_test, y_pred, average="macro")

        return {"accuracy" : accuracy,
                "f1_score" : f1,
                "recall_score" : recall,
                "precision_score": precision}

class RandomForest(Model):
    def __init__(self,
                 classifier=RandomForestClassifier(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=None,
                 params={'n_estimators' : [20, 30, 40], 'random_state' : [0]}):        
        super(RandomForest, self).__init__(classifier,
                                           test_size,
                                           n_splits,
                                           random_state,
                                           n_jobs,
                                           params)

class SVC(Model):
    def __init__(self,
                 classifier=SVC(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=None,
                 params={'kernel' : ['poly'], 'degree' : [2, 3, 4]}):
        super(SVC, self).__init__(classifier,
                                  test_size,
                                  n_splits,
                                  random_state,
                                  n_jobs,
                                  params)

class MLP(Model):
    def __init__(self,
                 classifier=MLPClassifier(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=-1,
                 params={'hidden_layer_sizes' : [(100, 50 ,10)],
                         'max_iter' : [500],
                         'activation' : ['relu'],
                         'solver' : ['adam'],
                         'random_state' : [1]}):
        super(MLP, self).__init__(classifier,
                                  test_size,
                                  n_splits,
                                  random_state,
                                  n_jobs,
                                  params)

class GradientBoost(Model):
    def __init__(self,
                 classifier=GradientBoostingClassifier(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=None,
                 params={'n_estimators' : [100, 200, 50],
                         'random_state' : [0],
                         'learning_rate' : [1.0],
                         'max_depth' : [1, 2, 3]}):
        super(GradientBoost, self).__init__(classifier,
                                            test_size,
                                            n_splits,
                                            random_state,
                                            n_jobs,
                                            params)



In [19]:
# GERMAN DATASET
german = pd.read_csv('../zoo/data/german.csv', index_col=0)
x_train, x_test, y_train, y_test = preprocessing("German", german, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/german/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/german/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/german/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/german/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/german/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/german/gb_classifier.joblib", compress=True)

None
{}

RF


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest: {'accuracy': 0.995, 'f1_score': 0.49874686716791977, 'recall_score': 0.5, 'precision_score': 0.4975}

SVC
SVM: {'accuracy': 0.995, 'f1_score': 0.49874686716791977, 'recall_score': 0.5, 'precision_score': 0.4975}

MLP


  _warn_prf(average, modifier, msg_start, len(result))


MLP: {'accuracy': 0.995, 'f1_score': 0.49874686716791977, 'recall_score': 0.5, 'precision_score': 0.4975}

GB
Gradient Boost: {'accuracy': 0.975, 'f1_score': 0.6364958197019266, 'recall_score': 0.9874371859296482, 'precision_score': 0.5833333333333334}


['../zoo/models/german/gb_classifier.joblib']

In [20]:
# Australian DATASET
australian = [i.strip().split() for i in open("../zoo/data/australian.dat").readlines()]
australian = pd.DataFrame(australian)
x_train, x_test, y_train, y_test = preprocessing("Australian", australian, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/australian/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/australian/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/australian/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/australian/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/australian/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/australian/gb_classifier.joblib", compress=True)

None
{}

RF
Random Forest: {'accuracy': 0.8840579710144928, 'f1_score': 0.8828522920203734, 'recall_score': 0.8841813923781137, 'precision_score': 0.8819047619047619}

SVC
SVM: {'accuracy': 0.8478260869565217, 'f1_score': 0.8454812050119969, 'recall_score': 0.8449010006387055, 'precision_score': 0.8461538461538461}

MLP
MLP: {'accuracy': 0.8768115942028986, 'f1_score': 0.8753387533875339, 'recall_score': 0.8759846710666382, 'precision_score': 0.8747877758913413}

GB
Gradient Boost: {'accuracy': 0.8695652173913043, 'f1_score': 0.8667667882428662, 'recall_score': 0.8643815201192251, 'precision_score': 0.8706952566601689}


['../zoo/models/australian/gb_classifier.joblib']

In [21]:
# Japanese DATASET
japanese = [i.strip().split(",") for i in open("../zoo/data/japanese/japanese.data").readlines()]
japanese = pd.DataFrame(japanese)
x_train, x_test, y_train, y_test = preprocessing("Japanese", japanese, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/japanese/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/japanese/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/japanese/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/japanese/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/japanese/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/japanese/gb_classifier.joblib", compress=True)

None
{}

RF


Random Forest: {'accuracy': 0.8478260869565217, 'f1_score': 0.8460066953610712, 'recall_score': 0.8480769230769231, 'precision_score': 0.8447619047619048}

SVC
SVM: {'accuracy': 0.8333333333333334, 'f1_score': 0.831841059602649, 'recall_score': 0.8352564102564102, 'precision_score': 0.8306638566912539}

MLP
MLP: {'accuracy': 0.8043478260869565, 'f1_score': 0.799709724238026, 'recall_score': 0.7980769230769231, 'precision_score': 0.8021442495126705}

GB
Gradient Boost: {'accuracy': 0.8478260869565217, 'f1_score': 0.8464635761589403, 'recall_score': 0.8500000000000001, 'precision_score': 0.8452054794520548}


['../zoo/models/japanese/gb_classifier.joblib']

In [22]:
# Taiwan DATASET
taiwan = pd.read_excel('../zoo/data/taiwan.xls', index_col=0, header=1)
x_train, x_test, y_train, y_test = preprocessing("Taiwan", taiwan, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/taiwan/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/taiwan/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/taiwan/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/taiwan/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/taiwan/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/taiwan/gb_classifier.joblib", compress=True)

None
{}

RF
Random Forest: {'accuracy': 0.8206666666666667, 'f1_score': 0.6795023907255062, 'recall_score': 0.6548772504091653, 'precision_score': 0.7495203661143801}

SVC
SVM: {'accuracy': 0.8055, 'f1_score': 0.6079709000415936, 'recall_score': 0.5940016366612111, 'precision_score': 0.7342833157727924}

MLP
MLP: {'accuracy': 0.7665, 'f1_score': 0.6386418349497656, 'recall_score': 0.6325450081833061, 'precision_score': 0.6470625889008195}

GB
Gradient Boost: {'accuracy': 0.8195, 'f1_score': 0.6696545694047971, 'recall_score': 0.6449509001636661, 'precision_score': 0.7516252007617654}


['../zoo/models/taiwan/gb_classifier.joblib']

In [23]:
# Polish DATASET
from scipy.io import arff

year_1 = pd.DataFrame(arff.loadarff('../zoo/data/polish/1year.arff')[0])
year_2 = pd.DataFrame(arff.loadarff('../zoo/data/polish/2year.arff')[0])
year_3 = pd.DataFrame(arff.loadarff('../zoo/data/polish/3year.arff')[0])
year_4 = pd.DataFrame(arff.loadarff('../zoo/data/polish/4year.arff')[0])
year_5 = pd.DataFrame(arff.loadarff('../zoo/data/polish/5year.arff')[0])
polish = pd.concat([year_1, year_2, year_3, year_4, year_5], ignore_index=True)
x_train, x_test, y_train, y_test = preprocessing("Polish", polish, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/polish/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/polish/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/polish/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/polish/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/polish/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/polish/gb_classifier.joblib", compress=True)

None
{}

RF
Random Forest: {'accuracy': 0.9782173259889835, 'f1_score': 0.5550920537610866, 'recall_score': 0.5345633905736208, 'precision_score': 0.6901985423473235}

SVC


  _warn_prf(average, modifier, msg_start, len(result))


SVM: {'accuracy': 0.9789684526790186, 'f1_score': 0.49468623481781376, 'recall_score': 0.5, 'precision_score': 0.4894842263395093}

MLP
MLP: {'accuracy': 0.9802203304957436, 'f1_score': 0.7148243851605213, 'recall_score': 0.6811990013396663, 'precision_score': 0.7651988110993766}

GB
Gradient Boost: {'accuracy': 0.9822233350025038, 'f1_score': 0.6574014002404209, 'recall_score': 0.6006789672390696, 'precision_score': 0.8963299893327581}


['../zoo/models/polish/gb_classifier.joblib']