In [1]:
# Understanding Confusion Matrices and Classification Reports


# __________Terms__________
# Accuracy: This measures the proportion of correctly classified instances over the total number of instances.
# Accuracy = (TP + TN) / (TP + TN + FP + FN)

# Precision: This measures the proportion of correctly classified positive instances over the total number of positive predictions.
# Precision = TP / (TP + FP)

# Recall (Sensitivity): This measures the proportion of correctly classified positive instances over the total number of actual positive instances.
# Recall = TP / (TP + FN)

# Specificity: This measures the proportion of actual negative instances that are correctly identified as negative by the model. It is also known as the true negative rate (TNR).
# Specificity = TN / (TN + FP)

# F1 Score: This is a harmonic mean of precision and recall, which balances the trade-off between them.
# F1 Score = 2 * (Precision * Recall) / (Precision + Recall)

# ROC Curve: Receiver Operating Characteristic (ROC) Curve is a graphical representation of the performance of a binary classifier. The ROC curve plots the true positive rate (sensitivity) against the false positive rate (1-specificity) at various threshold settings.
# AUC: Area Under the ROC Curve (AUC) is a measure of the overall performance of a binary classifier.


# Importing requirements
# pip3 install pandas
# pip3 install scikit-learn
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Read the cleaned diabetes dataset (path is relative to this notebook) into a DataFrame
diabetes_df = pd.read_csv(filepath_or_buffer='../../_datasets/diabetes_clean.csv')

# Print count / mean / std / min / quartiles / max for each numeric column
print(diabetes_df.describe())

       pregnancies     glucose   diastolic     triceps     insulin  \
count   768.000000  768.000000  768.000000  768.000000  768.000000   
mean      3.845052  120.894531   69.105469   20.536458   79.799479   
std       3.369578   31.972618   19.355807   15.952218  115.244002   
min       0.000000    0.000000    0.000000    0.000000    0.000000   
25%       1.000000   99.000000   62.000000    0.000000    0.000000   
50%       3.000000  117.000000   72.000000   23.000000   30.500000   
75%       6.000000  140.250000   80.000000   32.000000  127.250000   
max      17.000000  199.000000  122.000000   99.000000  846.000000   

              bmi         dpf         age    diabetes  
count  768.000000  768.000000  768.000000  768.000000  
mean    31.992578    0.471876   33.240885    0.348958  
std      7.884160    0.331329   11.760232    0.476951  
min      0.000000    0.078000   21.000000    0.000000  
25%     27.300000    0.243750   24.000000    0.000000  
50%     32.000000    0.372500   2

In [2]:
# Displaying more information
# DataFrame.info() writes its summary (index range, per-column non-null counts,
# dtypes, memory usage) straight to stdout and returns None, so it is called
# bare: wrapping it in print() would append a stray "None" line to the output.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [3]:
# Creating features and target

# Target: the 'diabetes' label column, kept as a pandas Series
y = diabetes_df['diabetes']

# Features: every remaining column, converted to a NumPy array
X = diabetes_df.drop(columns='diabetes').values

In [4]:
# Creating | splitting | fitting KNN model

# Split the data: 40% of the rows are used for training, the remainder is held out.
# random_state pins the shuffle so the split is reproducible between runs.
# NOTE(review): train_size=0.4 gives the model only 40% of the rows to learn from
# — confirm that test_size=0.4 was not what was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.4, random_state=42
)

# Instantiate a 6-nearest-neighbours classifier and train it on the training split
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Predict labels for the held-out features
y_pred = knn.predict(X_test)


In [5]:
# Generate Confusion Matrix

# scikit-learn convention: rows are the ACTUAL class, columns are the PREDICTED class.
# With labels 0 (negative) and 1 (positive), confusion_matrix returns
#   [[TN, FP],
#    [FN, TP]]

# Actual / Predicted | Negative | Positive
# Negative           | TN       | FP
# Positive           | FN       | TP



print(confusion_matrix(y_test, y_pred))


[[256  43]
 [102  60]]


In [6]:
# Generating Classification Report

# Prints precision, recall, f1-score and support for each class,
# followed by overall accuracy and the macro / weighted averages
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       299
           1       0.58      0.37      0.45       162

    accuracy                           0.69       461
   macro avg       0.65      0.61      0.62       461
weighted avg       0.67      0.69      0.66       461

