## Classification Metrics I

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

### Create dataset

In [2]:
# Load scikit-learn's bundled breast cancer dataset. The returned object exposes
# .data, .feature_names, .target and .target_names, all of which are used below.
data = load_breast_cancer()

In [3]:
# Feature table: one row per sample, one column per measurement,
# with columns named after the dataset's own feature names.
df = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)

In [4]:
# Attach the numeric class label as the target column (translated to names below).
df['tumor'] = data.target

In [5]:
# Inspect the label names: the entry at position i is the name of target value i.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
# Translate the numeric target into readable labels by indexing the dataset's own
# name array (target_names[i] is the name of class i) instead of hard-coding the
# 0/1 mapping. Reading from data.target rather than df['tumor'] means the result
# is the same no matter how many times this cell is run.
df['tumor'] = data.target_names[data.target]

In [7]:
# Swap spaces for underscores in every column name (literal match, not a regex).
df.columns = df.columns.str.replace(' ', '_', regex=False)

In [8]:
# Preview the first rows to confirm the renamed columns and the labelled target.
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,tumor
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [9]:
# Class balance of the target: how many samples fall under each label.
df['tumor'].value_counts()

benign       357
malignant    212
Name: tumor, dtype: int64

### X and y

In [10]:
# Separate the label column (y) from the feature columns (X).
target = 'tumor'
X = df.drop(columns=[target])
y = df[target]

In [11]:
# Hold out a test set. Stratifying on y keeps the class proportions the same in
# both splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [12]:
# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then applied to both splits.
# NOTE: y_train / y_test are overwritten with their encoded versions, so running
# this cell a second time would re-fit the encoder on integers and lose the label
# names. Restart the kernel and run all cells instead of re-running it.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [13]:
# Check which original label the encoder assigned to the integer 0.
le.inverse_transform([0])

array(['benign'], dtype=object)

In [14]:
# Full encoding order: the label at position i is encoded as the integer i.
le.classes_

array(['benign', 'malignant'], dtype=object)

### Scaling features

In [15]:
# Standardise the features. The scaler learns its statistics from the training
# split only, and those same statistics are then applied to both splits so no
# information from the test set leaks into preprocessing.
ss = StandardScaler().fit(X_train)
Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

### Instantiate and fit

In [16]:
# k-nearest-neighbours classifier with library defaults (the fitted repr below
# shows n_neighbors=5, uniform weights, Minkowski metric with p=2).
knn = KNeighborsClassifier()

In [17]:
# Fit on the scaled training features and the integer-encoded training labels.
knn.fit(Z_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Predictions

In [18]:
# Predicted (integer-encoded) class for every sample in the scaled test set.
y_test_hat = knn.predict(Z_test)

In [19]:
# Show the raw predictions (0/1 codes as produced by the label encoder).
y_test_hat

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0])

In [20]:
# Side-by-side look at the true and predicted codes for the first few test samples.
pd.DataFrame(dict(y_true=y_test, y_hat=y_test_hat)).head()

Unnamed: 0,y_true,y_hat
0,1,1
1,0,0
2,1,1
3,0,0
4,0,0


### Confusion Matrix

In [21]:
# Confusion matrix: rows are the actual classes and columns the predicted
# classes, both in encoded order (0 then 1).
cm = confusion_matrix(y_true=y_test, y_pred=y_test_hat)
cm

array([[89,  1],
       [ 5, 48]])

### Confusion Matrix DataFrame

In [22]:
# Label the matrix from the encoder's class order instead of hard-coding the
# names: row/column i of `cm` corresponds to le.classes_[i], so the labels can
# never drift out of sync with the encoding.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual ' + label for label in le.classes_],
    columns=['predicted ' + label for label in le.classes_],
)
cm_df

Unnamed: 0,predicted benign,predicted malignant
actual benign,89,1
actual malignant,5,48


In [23]:
# Re-derive the class predictions from the predicted probabilities using an
# explicit, named cut-off. Column 1 of predict_proba is the probability of
# encoded class 1. With n_neighbors=5 the probabilities are multiples of 0.2,
# so a 0.50 cut-off reproduces knn.predict exactly; change `threshold` to trade
# false negatives against false positives.
threshold = 0.50
positive_proba = knn.predict_proba(Z_test)[:, 1]
y_test_hat = (positive_proba > threshold).astype(int)

# Rebuild the confusion matrix for the thresholded predictions, labelling
# rows/columns from the encoder's class order rather than hard-coded names.
cm = confusion_matrix(y_test, y_test_hat)
cm_df = pd.DataFrame(
    data=cm,
    index=['actual ' + label for label in le.classes_],
    columns=['predicted ' + label for label in le.classes_],
)
cm_df

Unnamed: 0,predicted benign,predicted malignant
actual benign,89,1
actual malignant,5,48


### Calculate recall

Recall = Sensitivity = TP / P = TP / (TP + FN). Mnemonic: there are no p's in the word "sensitivity", so the p's go in the formula.

In [24]:
# Compute recall from the confusion matrix instead of retyping the numbers.
# Row 1 of `cm` holds the actual positives (encoded class 1): [FN, TP].
true_positives = cm[1, 1]
actual_positives = cm[1].sum()  # TP + FN
true_positives / actual_positives

0.9056603773584906

### How many Type I errors?

Type I = False positive

In [25]:
# False positives: actual class 0 (row 0) predicted as class 1 (column 1).
# Read from `cm` rather than hard-coded; int() keeps the display a plain integer.
int(cm[0, 1])

1

### How many Type II errors are there?

Type II = False negative

In [26]:
# False negatives: actual class 1 (row 1) predicted as class 0 (column 0).
# Read from `cm` rather than hard-coded; int() keeps the display a plain integer.
int(cm[1, 0])

5

### Which error is worse (Type I vs Type II)?

Type II (a false negative), because the patient has a malignant tumor and we told them they didn't.

### Calculate the sensitivity

Mnemonic: there are no p's in the word "sensitivity", so the p's go in the calculation: TP / P = TP / (TP + FN).

In [28]:
# Sensitivity = TP / (TP + FN), computed from the confusion matrix instead of
# retyped numbers. Row 1 of `cm` holds the actual positives: [FN, TP].
true_positives = cm[1, 1]
actual_positives = cm[1].sum()  # TP + FN
true_positives / actual_positives

0.9056603773584906

### Calculate the specificity

Mnemonic: there is a p in the word "specificity", so there are no p's in the calculation: TN / N = TN / (TN + FP).

In [29]:
# Specificity = TN / (TN + FP), computed from the confusion matrix instead of
# retyped numbers. Row 0 of `cm` holds the actual negatives: [TN, FP].
true_negatives = cm[0, 0]
actual_negatives = cm[0].sum()  # TN + FP
true_negatives / actual_negatives

0.9888888888888889