# 1. IMPORTS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import model_selection
from sklearn import datasets
from sklearn import tree
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer
from matplotlib import pyplot as plt

# 2. Classification

In [2]:
# Load the Iris dataset as pandas objects (as_frame=True)
data = datasets.load_iris(as_frame=True)

# Separate predictor and target variables
X = data.data
y = data.target

# Train/test split (80% / 20%).
# random_state pins the shuffle so every downstream number and plot is
# reproducible on Restart & Run All; stratify=y keeps the class proportions
# equal in both splits, so each class is present in the test set used for the
# one-vs-rest ROC curves.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 2.1 Iris Dataset

### 2.1.1 KNN

In [4]:
# Build a classifier with n_neighbors=5 and train it on the training split
# (fit returns the estimator itself, so construction and fitting are chained)
k_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted class probabilities for the test split
# (this is predict_proba, not hard class predictions)
yhat_test = k_classifier.predict_proba(X_test)

# Name of the first class, used as the positive label
class_of_interest = data.target_names[0]
print(f'our positive label is: {class_of_interest}')

our positive label is: setosa


Since we have a multiclass problem, we will use the one-vs-rest strategy: we create one ROC curve per class, binarizing the labels so that the target class is the positive class and all the other classes are grouped together as the negative class.

In [5]:
# Map each encoded class label to its human-readable name
target_dict = {
    label: name
    for label, name in zip(k_classifier.classes_, data.target_names)
}
target_dict

{0: 'setosa', 1: 'versicolor', 2: 'virginica'}

In [6]:
# Auxiliary frame for the one-vs-rest view of class 0
df_aux = X_test.copy()

# Binary target: 1 for class 0 (the positive class), 0 for every other class
df_aux['class'] = y_test.apply(lambda x: 1 if x == 0 else 0)

# Predicted probability of the positive class. The columns of predict_proba
# follow k_classifier.classes_, so column 0 is class 0.
# (The original line here was an incomplete list comprehension and did not parse.)
df_aux['prob'] = yhat_test[:, 0]

In [None]:
# Map each encoded class label to its human-readable name
target_dict = dict(zip(k_classifier.classes_, data.target_names))

# Set figure size
plt.figure(figsize=(20, 12))

# Histogram bin edges from 0 to 1 in steps of 0.05
bins = [i / 20 for i in range(21)]

# Class labels, in the same order as the columns of yhat_test
classes = k_classifier.classes_

# Best decision threshold found for each class (one-vs-rest).
# NOTE: despite its name, this list holds thresholds, not ROC AUC scores.
roc_auc_ovr = []

for i, c in enumerate(classes):

  # Auxiliary dataframe: binary one-vs-rest target plus the predicted
  # probability of the current class
  df_aux = X_test.copy()
  df_aux['class'] = [1 if label == c else 0 for label in y_test]
  df_aux['proba'] = yhat_test[:, i]

  # Top row: probability distribution of the current class vs. the rest
  ax = plt.subplot(2, 3, i + 1)
  sns.histplot(x='proba', data=df_aux, ax=ax, hue='class', bins=bins, color='b')
  ax.set_title(target_dict[c])
  ax.legend([f'class: {target_dict[c]}', 'rest'])

  # Bottom row: ROC curve. roc_curve returns (fpr, tpr, thresholds) in that
  # order; unpacking them the other way round silently swaps the two rates.
  ax_bottom = plt.subplot(2, 3, i + 4)
  fpr, tpr, th = metrics.roc_curve(df_aux['class'], df_aux['proba'])
  ax_bottom.plot(fpr, tpr)
  ax_bottom.set_title(f'ROC curve: {target_dict[c]} vs rest')
  ax_bottom.set_xlabel('False positive rate')
  ax_bottom.set_ylabel('True positive rate')

  # Euclidean distance from each ROC point to the ideal corner (FPR=0, TPR=1)
  distances = np.sqrt(fpr ** 2 + (1 - tpr) ** 2)

  # The best threshold is the one whose ROC point is closest to that corner
  best_th = th[np.argmin(distances)]

  roc_auc_ovr.append(best_th)

In [None]:
# Display the per-class values collected by the loop (it appends best_th,
# i.e. one threshold per class, in the order of k_classifier.classes_)
roc_auc_ovr