In [1]:
# Load libraries
import pandas as pd
import numpy as np

from matplotlib import pyplot
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import (
    classification_report, 
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix)

from sklearn.feature_selection import RFECV

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from azureml.core.workspace import Workspace

In [2]:
# Connect to the Azure ML workspace. from_config() is called with no arguments,
# so the workspace details come from the SDK's configuration file lookup.
ws = Workspace.from_config()

In [3]:
# Pull the registered dataset out of the workspace as a pandas DataFrame.
dataset = ws.datasets["job-leaver-aug-small"].to_pandas_dataframe()

# change objects to category to impute: the preprocessing step further down
# routes columns to the numeric or categorical branch by the "category" dtype.
object_columns = dataset.select_dtypes(object).columns
dataset = dataset.astype({column: 'category' for column in object_columns})

In [4]:
# Drop the `enrollee_id` and `city` columns before the feature matrix is built.
dataset = dataset.drop(columns=['enrollee_id', 'city'])

In [5]:
# Features are every remaining column except the label; the label is kept
# as a plain NumPy array.
X = dataset.drop(columns=['target'])
y = np.array(dataset['target'])

# Hold out 30% of the rows as a test split; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Evaluate Algorithms

# Test options and evaluation metric
scoring = 'f1'    # scorer name handed to cross_val_score
seed = 7          # random_state used when shuffling the KFold splits
num_folds = 10    # number of cross-validation splits


In [7]:
# Hyper-parameters consumed by the estimators configured below
args_max_iter = 100
args_min_features = 3
args_C = 0.25

# Setting up the sklearn pipeline

# transformer: non-categorical columns are median-imputed, then standardised
numeric_transformer = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# transformer: categorical columns are imputed with their most frequent value,
# then one-hot encoded into a dense array; categories not seen during fit are
# ignored at transform time.
# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn
# releases - confirm the pinned version still accepts this keyword.
categorical_transformer = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# preprocessor: columns are routed to a branch by dtype ("category" or not)
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category")),
])

# model
logreg = LogisticRegression(
    solver="liblinear",
    C=args_C,
    max_iter=args_max_iter,
    class_weight='balanced',
    random_state=42,
)

# RFE: recursive feature elimination driven by a linear SVC, removing one
# feature per step and scored by accuracy over 2 stratified folds
min_features_to_select = args_min_features
svc = SVC(kernel="linear")
rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=StratifiedKFold(n_splits=2),
    scoring='accuracy',
    min_features_to_select=min_features_to_select,
)

# pipeline: preprocessing followed by the feature-elimination step
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rfecv', rfecv),
])

In [None]:
# Spot Check Algorithms
#
# Every candidate estimator is placed behind the same `preprocessor` object,
# scored with shuffled k-fold cross-validation on the training split, then
# refit on the whole training split and evaluated once on the test split.
# Estimators that draw random numbers are seeded with `seed` so that a
# Restart & Run All reproduces the same numbers.
pipelines = []
pipelines.append(('LR', Pipeline([('prep', preprocessor), ('LR', logreg)])))
pipelines.append(('LDA', Pipeline([('prep', preprocessor), ('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('KNN', Pipeline([('prep', preprocessor), ('KNN', KNeighborsClassifier())])))
pipelines.append(('CART', Pipeline([('prep', preprocessor), ('CART', DecisionTreeClassifier(random_state=seed))])))
pipelines.append(('NB', Pipeline([('prep', preprocessor), ('NB', GaussianNB())])))
pipelines.append(('SVM', Pipeline([('prep', preprocessor), ('SVM', SVC())])))

# ensembles (each step is named after its estimator rather than 'SVM')
pipelines.append(('AdaBoost', Pipeline([('prep', preprocessor), ('AdaBoost', AdaBoostClassifier(random_state=seed))])))
pipelines.append(('GBM', Pipeline([('prep', preprocessor), ('GBM', GradientBoostingClassifier(random_state=seed))])))
pipelines.append(('RF', Pipeline([('prep', preprocessor), ('RF', RandomForestClassifier(random_state=seed))])))
pipelines.append(('ET', Pipeline([('prep', preprocessor), ('ET', ExtraTreesClassifier(random_state=seed))])))

metrics = []   # rows of [model name, metric name, value on the test split]
results = []   # one array of cross-validation scores per model
names = []     # model names, in the same order as `results`

# Metrics computed from the hard class predictions on the test split.
# NOTE(review): roc_auc_score is given predicted labels here, not scores or
# probabilities - confirm that is the intended comparison.
test_metrics = [
    balanced_accuracy_score,
    accuracy_score,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
]

# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

for name, model in pipelines:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    for metric in test_metrics:
        # Builtin float() instead of the np.float alias, which recent NumPy
        # releases no longer provide.
        metrics.append([name, metric.__name__, float(metric(y_test, y_pred))])

    print(msg)

# Compare Algorithms: one box per model over its cross-validation scores
fig = pyplot.figure(figsize=(15,7.5))
fig.suptitle('Scaled Algorithm Comparison using F1 score')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

In [None]:
# Print out the compared metrics
# Column 0 holds the model name, column 1 the metric name and column 2 the
# value; the table has one row per metric and one column per model,
# rounded to two decimals.
metrics_frame = pd.DataFrame(data=metrics)
metrics_frame.pivot(index=1, columns=0, values=2).round(2)

Check how the averaging variants of a metric that are meant for multi-class problems (macro, micro and weighted) compare with the default on this data

In [8]:
# Fit the logistic-regression pipeline on the training split and keep its
# class predictions for the test split.
model = Pipeline(steps=[('prep', preprocessor), ('LR', logreg)])
y_pred = model.fit(X_train, y_train).predict(X_test)

In [18]:
# Compare the variants of one metric (recall): the default call against the
# macro-, micro- and weighted-averaged versions.
# Details can be found here:  https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#classification-metrics
print("recall_score")

# Label printed for each variant -> extra keyword arguments for recall_score
recall_variants = {
    "default": {},
    "macro": {"average": "macro"},
    "micro": {"average": "micro"},
    "weighted": {"average": "weighted"},
}
for label, extra_kwargs in recall_variants.items():
    print(f"{label}: {recall_score(y_test, y_pred, **extra_kwargs)}")

print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

recall_score
default: 0.7195121951219512
macro: 0.685444170955471
micro: 0.67
weighted: 0.67

Confusion Matrix:
[[142  76]
 [ 23  59]]
