In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

test_size = 0.20

processed_data = None
categorical = None
label_encoders = {}

def preprocessing(dataset, data, test_size):
    """
    Preprocess dataset

    Parameters
    ----------
    data: DataFrame
        Pandas dataframe containing German dataset.
    """
    
    global processed_data
    global categorical
    global label_encoders

    # Reset global variables
    
    processed_data = None
    categorical = None
    label_encoders = {}


    if dataset == "German":
        # Drop savings account and checkings account columns as they contain a lot
        # of NaN values and may not always be available in real life scenarios
        data = data.drop(columns = ['Saving accounts', 'Checking account'])
        
    dat_dict = data.to_dict()
    new_dat_dict = {}

    # rename columns(Make them lowercase and snakecase)
    for key, value in dat_dict.items():
        newKey = key
        if type(key) == str:
            newKey = newKey.lower().replace(' ', '_')
        # if newKey != key:
        new_dat_dict[newKey] = dat_dict[key]
    del dat_dict

    data = pd.DataFrame.from_dict(new_dat_dict)
    del new_dat_dict


    # print(data.describe())
    # print(data.describe(include='O'))

    cols = data.columns
    num_cols = data._get_numeric_data().columns
    categorical = list(set(cols) - set(num_cols))

    # Drop null rows
    data = data.dropna()

    # Encode text columns to number values
    for category in categorical:
        le = LabelEncoder()
        data[category] = le.fit_transform(data[category])
        label_encoders[category] = le

    for col in data.columns:
        if(col not in categorical):
            data[col] = (data[col].astype('float') - np.mean(data[col].astype('float')))/np.std(data[col].astype('float'))

    # print(data.describe())
    # print(data.describe(include='O'))

    processed_data = data

    # Get Training parameters
    if dataset == "German":
        target_col = data.columns[-1]
        x = data.drop(columns=target_col, axis=1)
        y = data[target_col].astype('int')
    elif dataset == "Australian":
        x = data.drop(14, axis=1)
        y = data[14].astype('int')
    elif dataset == "Japanese":
        x = data.drop(15, axis=1)
        y = data[15].astype('int')
    elif dataset == "Taiwan":
        x = data.drop('default_payment_next_month', axis=1)
        y = data['default_payment_next_month'].astype('int')
    elif dataset == "Polish":
        x = data.drop('class', axis=1)
        y = data['class'].astype('int')


    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size)
    x_train = pd.DataFrame(x_train)
    y_train = pd.DataFrame(y_train)

    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    return (x_train, x_test, y_train, y_test)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

class Model(object):
    """
    Basic Scorecard Model

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    def __init__(self,
                 classifier=None,
                 test_size=test_size,
                 n_splits=1,
                 random_state=None,
                 n_jobs=None,
                 params=None):
                 
        self.classifier = classifier
        self.params = params
        self.random_state = random_state
        self.test_size = test_size
        self.n_splits = n_splits
        self.n_jobs = n_jobs

        self.model = GridSearchCV(estimator=classifier,
                                  param_grid=params,
                                  n_jobs=n_jobs,
                                  cv=ShuffleSplit(test_size=test_size,
                                  n_splits=n_splits,
                                  random_state=0))
    
    def __str__(self):
        return f"""
        Model Object
        ----------------------------------------------------------------

        Classifier: {self.classifier.__class__.__name__}
        Test Size: {self.test_size}
        Random State: {self.random_state}
        Number of Splits: {self.n_splits}
        Parameter Grid: {self.params}

        {self.model}
        """
    
    def __repr__(self):
        return self.__str__()

    def train(self, x_train, y_train):
        """
        Train scorecard model
        
        Args:
            x_train:
                array of training parameters
            y_train:
                pandas dataframe with training labels
        """

        self.model = self.model.fit(x_train, y_train.values.ravel())
        return self

    def predict(self, data):
        """
        Predict scorecard model

        Args:
            data: array
                Data to perform prediction on.
        """

        return self.model.predict(data)

    def accuracy(self, x_test, y_test):
        """
        Compute scorecard model accuracy

        Args:
            x_test: array
                The test parameters.
            y_test: array
                The labels
        """

        y_pred = self.predict(x_test)
        return accuracy_score(y_test, y_pred, normalize=False)

    def metrics(self, x_test, y_test):
        """
        Comput scorecard model metrics
        
        Args:
            x_test: array
                The test parameters.
            y_test: array
                The labels
        """

        y_pred = self.predict(x_test)
        
        cm = confusion_matrix(y_pred, y_test)
        accuracy = accuracy_score(y_test, y_pred, normalize=True)
        f1 = f1_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        precision = precision_score(y_test, y_pred, average="macro")

        return {"accuracy" : accuracy,
                "f1_score" : f1,
                "recall_score" : recall,
                "precision_score": precision}

class RandomForest(Model):
    def __init__(self,
                 classifier=RandomForestClassifier(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=None,
                 params={'n_estimators' : [20, 30, 40], 'random_state' : [0]}):        
        super(RandomForest, self).__init__(classifier,
                                           test_size,
                                           n_splits,
                                           random_state,
                                           n_jobs,
                                           params)

class SVC(Model):
    def __init__(self,
                 classifier=SVC(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=None,
                 params={'kernel' : ['poly'], 'degree' : [2, 3, 4]}):
        super(SVC, self).__init__(classifier,
                                  test_size,
                                  n_splits,
                                  random_state,
                                  n_jobs,
                                  params)

class MLP(Model):
    def __init__(self,
                 classifier=MLPClassifier(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=-1,
                 params={'hidden_layer_sizes' : [(100, 50 ,10)],
                         'max_iter' : [500],
                         'activation' : ['relu'],
                         'solver' : ['adam'],
                         'random_state' : [1]}):
        super(MLP, self).__init__(classifier,
                                  test_size,
                                  n_splits,
                                  random_state,
                                  n_jobs,
                                  params)

class GradientBoost(Model):
    def __init__(self,
                 classifier=GradientBoostingClassifier(),
                 test_size=test_size,
                 n_splits=1,
                 random_state=0,
                 n_jobs=None,
                 params={'n_estimators' : [100, 200, 50],
                         'random_state' : [0],
                         'learning_rate' : [1.0],
                         'max_depth' : [1, 2, 3]}):
        super(GradientBoost, self).__init__(classifier,
                                            test_size,
                                            n_splits,
                                            random_state,
                                            n_jobs,
                                            params)



In [3]:
# GERMAN DATASET
german = pd.read_csv('../zoo/data/german.csv', index_col=0)
x_train, x_test, y_train, y_test = preprocessing("German", german, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/german/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/german/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/german/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/german/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/german/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/german/gb_classifier.joblib", compress=True)

['risk', 'purpose', 'sex', 'housing']
{'risk': LabelEncoder(), 'purpose': LabelEncoder(), 'sex': LabelEncoder(), 'housing': LabelEncoder()}

RF
Random Forest: {'accuracy': 0.73, 'f1_score': 0.64, 'recall_score': 0.6361323155216285, 'precision_score': 0.7348730673792709}

SVC
SVM: {'accuracy': 0.685, 'f1_score': 0.5211128425373418, 'recall_score': 0.5571965925434229, 'precision_score': 0.6985407066052227}

MLP
MLP: {'accuracy': 0.72, 'f1_score': 0.6697723788182568, 'recall_score': 0.6627945569200133, 'precision_score': 0.6888717751251443}

GB
Gradient Boost: {'accuracy': 0.67, 'f1_score': 0.5268817204301075, 'recall_score': 0.5526053767009624, 'precision_score': 0.6264964086193137}


['../zoo/models/german/gb_classifier.joblib']

In [4]:
# Australian DATASET
australian = [i.strip().split() for i in open("../zoo/data/australian.dat").readlines()]
australian = pd.DataFrame(australian)
x_train, x_test, y_train, y_test = preprocessing("Australian", australian, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/australian/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/australian/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/australian/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/australian/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/australian/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/australian/gb_classifier.joblib", compress=True)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
{0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder()}

RF
Random Forest: {'accuracy': 0.8260869565217391, 'f1_score': 0.8237172663402172, 'recall_score': 0.8227928692699491, 'precision_score': 0.825}

SVC
SVM: {'accuracy': 0.8188405797101449, 'f1_score': 0.8166746373346085, 'recall_score': 0.816213921901528, 'precision_score': 0.8172237598467107}

MLP
MLP: {'accuracy': 0.8405797101449275, 'f1_score': 0.8389219015280136, 'recall_score': 0.8389219015280136, 'precision_score': 0.8389219015280136}

GB
Gradient Boost: {'accuracy': 0.8115942028985508, 'f1_score': 0.8083333333333333, 'recall_score': 0.8066638370118846, 'precision_score': 0.8114224137931034}


['../zoo/models/australian/gb_classifier.joblib']

In [5]:
# Japanese DATASET
japanese = [i.strip().split(",") for i in open("../zoo/data/japanese/japanese.data").readlines()]
japanese = pd.DataFrame(japanese)
x_train, x_test, y_train, y_test = preprocessing("Japanese", japanese, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/japanese/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/japanese/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/japanese/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/japanese/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/japanese/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/japanese/gb_classifier.joblib", compress=True)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


{0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder(), 15: LabelEncoder()}

RF
Random Forest: {'accuracy': 0.855072463768116, 'f1_score': 0.8468368479467259, 'recall_score': 0.8468368479467259, 'precision_score': 0.8468368479467259}

SVC
SVM: {'accuracy': 0.855072463768116, 'f1_score': 0.8488499452354874, 'recall_score': 0.8539400665926749, 'precision_score': 0.8453541260558805}

MLP
MLP: {'accuracy': 0.8333333333333334, 'f1_score': 0.8276779412563114, 'recall_score': 0.8362930077691454, 'precision_score': 0.8237179487179487}

GB
Gradient Boost: {'accuracy': 0.8188405797101449, 'f1_score': 0.8078529657477026, 'recall_score': 0.80677025527192, 'precision_score': 0.8090339892665475}


['../zoo/models/japanese/gb_classifier.joblib']

In [7]:
# Taiwan DATASET
taiwan = pd.read_excel('../zoo/data/taiwan.xls', index_col=0, header=1)
x_train, x_test, y_train, y_test = preprocessing("Taiwan", taiwan, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/taiwan/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/taiwan/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/taiwan/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/taiwan/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/taiwan/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/taiwan/gb_classifier.joblib", compress=True)

[]
{}

RF
Random Forest: {'accuracy': 0.8068333333333333, 'f1_score': 0.658943758531004, 'recall_score': 0.6383870540247156, 'precision_score': 0.7201949725759249}

SVC
SVM: {'accuracy': 0.8028333333333333, 'f1_score': 0.60854756089228, 'recall_score': 0.5947537306306359, 'precision_score': 0.729399851139397}

MLP
MLP: {'accuracy': 0.7668333333333334, 'f1_score': 0.6373733544584739, 'recall_score': 0.6300293110538142, 'precision_score': 0.6484376781559713}

GB
Gradient Boost: {'accuracy': 0.81, 'f1_score': 0.65551412099996, 'recall_score': 0.6338427688032164, 'precision_score': 0.7313800755578117}


['../zoo/models/taiwan/gb_classifier.joblib']

In [None]:
# Polish DATASET
from scipy.io import arff

year_1 = pd.DataFrame(arff.loadarff('../zoo/data/polish/1year.arff')[0])
year_2 = pd.DataFrame(arff.loadarff('../zoo/data/polish/2year.arff')[0])
year_3 = pd.DataFrame(arff.loadarff('../zoo/data/polish/3year.arff')[0])
year_4 = pd.DataFrame(arff.loadarff('../zoo/data/polish/4year.arff')[0])
year_5 = pd.DataFrame(arff.loadarff('../zoo/data/polish/5year.arff')[0])
polish = pd.concat([year_1, year_2, year_3, year_4, year_5], ignore_index=True)
x_train, x_test, y_train, y_test = preprocessing("Polish", polish, test_size)

# Print Encoders
print(categorical)
print(label_encoders)

# Set and Train the models
print('\nRF')
RFmodel = RandomForest().train(x_train, y_train)
print(f"Random Forest: {RFmodel.metrics(x_test, y_test)}")

print("\nSVC")
SVCmodel = SVC().train(x_train, y_train)
print(f"SVM: {SVCmodel.metrics(x_test, y_test)}")

print("\nMLP")
MLPmodel = MLP().train(x_train, y_train)
print(f"MLP: {MLPmodel.metrics(x_test, y_test)}")

print("\nGB")
GBmodel = GradientBoost().train(x_train, y_train)
print(f"Gradient Boost: {GBmodel.metrics(x_test, y_test)}")

# Save Training Data
joblib.dump(categorical, "../zoo/models/polish/categorical.joblib", compress=True)
joblib.dump(label_encoders, "../zoo/models/polish/label_encoders.joblib", compress=True)

joblib.dump(RFmodel.model, "../zoo/models/polish/rf_classifier.joblib", compress=True)
joblib.dump(SVCmodel.model, "../zoo/models/polish/svc_classifier.joblib", compress=True)
joblib.dump(MLPmodel.model, "../zoo/models/polish/mlp_classifier.joblib", compress=True)
joblib.dump(GBmodel.model, "../zoo/models/polish/gb_classifier.joblib", compress=True)

['class']
{'sex': LabelEncoder(), 'risk': LabelEncoder(), 'housing': LabelEncoder(), 'purpose': LabelEncoder(), 0: LabelEncoder(), 1: LabelEncoder(), 2: LabelEncoder(), 3: LabelEncoder(), 4: LabelEncoder(), 5: LabelEncoder(), 6: LabelEncoder(), 7: LabelEncoder(), 8: LabelEncoder(), 9: LabelEncoder(), 10: LabelEncoder(), 11: LabelEncoder(), 12: LabelEncoder(), 13: LabelEncoder(), 14: LabelEncoder(), 15: LabelEncoder(), 'class': LabelEncoder()}
Random Forest: {'accuracy': 0.9802203304957436, 'f1_score': 0.5073501054699149, 'recall_score': 0.5059245566321038, 'precision_score': 0.5466471490310888}
SVM: {'accuracy': 0.9817225838758137, 'f1_score': 0.49538850284270375, 'recall_score': 0.4998725140234574, 'precision_score': 0.49098422238918105}
MLP: {'accuracy': 0.9799699549323986, 'f1_score': 0.71321171997573, 'recall_score': 0.7103058246926172, 'precision_score': 0.7162043104703655}
Gradient Boost: {'accuracy': 0.9819729594391587, 'f1_score': 0.566873915558126, 'recall_score': 0.5409017508

['zoo/polish/gb_classifier.joblib']