In [19]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold

## Main functions

In [2]:
def compare_confusion_matrix_metrics(model1, model2, y_pred1, y_pred2, y_test):
    """
    """
    
    # compare confusion matrix
    # TP, FN
    # FP, TN
    
    cm_1 = confusion_matrix(y_test, y_pred1, labels=model1.classes_)
    cm_2 = confusion_matrix(y_test, y_pred2, labels=model2.classes_)

    print(cm_1)
    print('\n', cm_2)
    
    difference = np.subtract(cm_1, cm_2)
    tp, fn = difference[0][0], difference[0][1]
    fp, tn = difference[1][0], difference[1][1]
    
    print(tp, fn)
    print(fp, tn)
    
    print('\n', difference, '\n')
    
    c1, c2 = fn < 0, fp < 0
    c3, c4 = tp < 0, tn < 0
    
    results = {'tp':0,
              'fp':0,
              'fn':0,
              'tn':0}
    
    # check if overall decrease
    if all([c1, c2]):
        print('there is an overall decrease in confusion matrix')
        print(f'model 2 has a FN increase of {abs(fn)}')
        print(f'model 2 has a FP increase of {abs(fp)}')
        
        results['fn'] = abs(fn)
        results['fp'] = abs(fp)
    
    # check if overal increase    
    elif all([c3, c4]):
        print('there is an overall increase in confusion matrix')
        print(f'model 2 has a TP increase of {abs(tp)}')
        print(f'model 2 has a TN increase of {abs(tn)}')
        
        results['tp'] = abs(tp)
        results['tn'] = abs(tn)
    
    # check if one decrease area 
    elif any([c1, c2]):
        if c1:
            print(f'model 2 has a FN increase of: {abs(fn)}')
            results['fn'] = abs(fn)
        else:
            print(f'model 2 has a FP increase of: {abs(fp)}')
            results['fp'] = abs(fp)
    
    # check if one increase area
    elif any([c3, c4]):
        if c3:
            print(f'model 2 has a TP increase of: {abs(tp)}')
            results['tp'] = abs(tp)
        else:
            print(f'model 2 has a TN increase of: {abs(tn)}')
            results['tn'] = abs(tn)
    else:
        print('there was no confusion matrix improvement') 
        return results
    
    return results


def compare_auc(model1, model2, X_test, y_test):
    
    ras1 = roc_auc_score(y_test, model1.predict(X_test))
    ras2 = roc_auc_score(y_test, model2.predict(X_test))
    
    return ras1 - ras2
    
        
def compare_classification_report(y_pred1, y_pred2, y_test):
    
    cr1 = classification_report(y_test, y_pred1, output_dict=True)
    cr2 = classification_report(y_test, y_pred2, output_dict=True)
    
    results = {'0':{key: cr1['0'][key] - cr2['0'][key] for key in cr1['0'].keys()},
               '1':{key: cr1['1'][key] - cr2['1'][key] for key in cr1['1'].keys()}
    }
    
    return results
        
    
def model_comparison_helper(model1, model2, y_pred1, y_pred2, X_test, y_test):
    
    confusion_results = compare_confusion_matrix_metrics(model1, model2, y_pred1, y_pred2, y_test)
    
    auc_results = compare_auc(model1, model2, X_test, y_test)
    
    cr_results = compare_classification_report(y_pred1, y_pred2, y_test)
    
    print(confusion_results)
    print(auc_results)
    print(cr_results)

## Get data

In [3]:
# load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'target']
df = pd.read_csv(url, names=names)
df.shape

(768, 9)

In [4]:
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Split into features and target

In [5]:
X = df.iloc[:,: -1].values
y = df.loc[:,'target'].values

# Split into train and test

In [16]:
import shap
import mlflow

In [11]:
# train XGBoost model
X, y = shap.datasets.adult()

In [18]:
num_examples = len(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = xgboost.XGBClassifier().fit(X_train, y_train)

eval_data = X_test
eval_data["label"] = y_test

with mlflow.start_run() as run:
    mlflow.sklearn.log_model(model, "model")
    model_uri = mlflow.get_artifact_uri("model")
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        dataset_name="adult",
        evaluators=["default"],
    )

print(f"metrics:\n{result.metrics}")
print(f"artifacts:\n{result.artifacts}")

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022/04/15 08:42:22 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
2022/04/15 08:42:22 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is True, negative label is False.
2022/04/15 08:42:23 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization


PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\leopa\\AppData\\Local\\Temp\\tmpv_c8soa0\\confusion_matrix_on_data_adult.png'

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
import xgboost

model = xgboost.XGBClassifier().fit(X_train, y_train)

eval_data = X_test
eval_data["label"] = y_test

with mlflow.start_run() as run:
    mlflow.sklearn.log_model(model, "model")
    model_uri = mlflow.get_artifact_uri("model")
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        dataset_name="adult",
        evaluators=["default"],
    )

print(f"metrics:\n{result.metrics}")
print(f"artifacts:\n{result.artifacts}")





IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## Train classifiers for testing

### Classifier 1

In [15]:
params = {'n_estimators':[64, 128, 256, 500, 1000],
#           'max_depth': [4,5,6,7,8,9,10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [3, 4, 5],
         'criterion' :['gini', 'entropy']}

clf_rf = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1),param_grid=params, cv=5, refit = True)
clf_rf.fit(X_train, y_train)

print(f'Best parameters: \n{clf_rf.best_params_}')
print(f'Average cross-validation score for best model: {clf_rf.best_score_}')

rf_classifier = clf_rf.best_estimator_

y_pred = rf_classifier.predict(X_test)

Best parameters: 
{'criterion': 'entropy', 'max_features': 'auto', 'min_samples_leaf': 4, 'n_estimators': 256}
Average cross-validation score for best model: 0.7899240303878449


### Classifier 2

In [16]:
rf_classifier_2 = RandomForestClassifier(n_estimators=1000, random_state=42)
#Train the model using the training sets y_pred=clf.predict(X_test)
rf_classifier_2.fit(X_train,y_train)

rf2_scores = cross_val_score(rf_classifier_2, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold(n_splits=5), n_jobs=-1, error_score='raise')

# report performance
print(f'Mean ROC AUC: {round(np.mean(rf2_scores),3)} with a standard deviation of {round(np.std(rf2_scores),4)}')

y_pred_2 = rf_classifier_2.predict(X_test)

Mean ROC AUC: 0.833 with a standard deviation of 0.0225


## Compare classifier metrics

In [17]:
model_comparison_helper(rf_classifier, rf_classifier_2, y_pred, y_pred_2, X_test, y_test)

[[80 19]
 [20 35]]

 [[78 21]
 [20 35]]
2 -2
0 0

 [[ 2 -2]
 [ 0  0]] 

model 2 has a FN increase of: 2
{'tp': 0, 'fp': 0, 'fn': 2, 'tn': 0}
0.010101010101010055
{'0': {'precision': 0.004081632653061273, 'recall': 0.02020202020202022, 'f1-score': 0.012141927913680184, 'support': 0}, '1': {'precision': 0.02314814814814814, 'recall': 0.0, 'f1-score': 0.011571204231754617, 'support': 0}}


In [None]:
## Extra stuff

In [None]:
# use matplotlib.pyplot.matshow() to represent an correlation matrix in a new figure window
plt.matshow(df.corr())

# set the ticks
plt.xticks(range(len(df.columns)), df.columns, rotation=90)
plt.yticks(range(len(df.columns)), df.columns)

# set the color bar
plt.colorbar()

# draw
plt.show()

In [None]:
from scipy.stats import chi2_contingency
columns = df.columns[:-1]
for col in columns:
    sub_df = df[['target', col]]
    csq = chi2_contingency(pd.crosstab(sub_df['target'], sub_df[col]))
    print(f'col: {col}, P-value: {csq[1]}')