# KNN_Model: Excercises

<hr style="border:2px solid black">

# Imports:

In [33]:
## Imports:

#standard DS imports
import pandas as pd
import numpy as np

#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

#metrics import
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


#custom modules
import acquire
import prepare

## Acquire
Plan --> **Acquire** --> Prepare --> Explore --> Model --> Deliver

In [34]:
#get my data
df = acquire.get_titanic_data('titanic_db')

## Prepare
Plan --> Acquire --> **Prepare** --> Explore --> Model --> Deliver

In [35]:
#clean my data
df = prepare.prep_titanic(df)

In [36]:
#split my data into train, validate, test
train, validate, test = prepare.split_function(df, 'survived')

In [37]:
# further split by x and y:
target = 'survived'

# train
x_train = train.drop(columns=[target, 'sex', 'embark_town'])
y_train = train[target]

# validate
x_validate = validate.drop(columns=[target, 'sex', 'embark_town'])
y_validate = validate[target]

# test
x_test = test.drop(columns=[target, 'sex', 'embark_town'])
y_test = test[target]

## 1.) Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)



## Model
Plan --> Acquire --> Prepare --> Explore --> **Model** --> Deliver

### Create Object:

In [38]:
# make the object using default settings: weights = ['uniform', 'distance']
knn = KNeighborsClassifier()
knn

### Fit Object:

In [39]:
knn.fit(x_train, y_train)

### Transform/Predict:

In [40]:
y_pred = knn.predict(x_train)
y_pred[:5]

array([0, 1, 0, 1, 0])

## 2.) Evaluate your results using the model score, confusion matrix, and classification report.



### Evaluate:

In [41]:
# Accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.74


In [42]:
#find labels in our dataset & sort
labels = sorted(y_train.unique())
labels

[0, 1]

In [43]:
# Confusion Matrix
conf_array = confusion_matrix(y_train,y_pred)
conf = pd.DataFrame(conf_array,
            index=[str(label) + '_actual'for label in labels],
            columns=[str(label) + '_predict'for label in labels])
conf


Unnamed: 0,0_predict,1_predict
0_actual,287,42
1_actual,97,108


In [44]:
# Classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.81       329
           1       0.72      0.53      0.61       205

    accuracy                           0.74       534
   macro avg       0.73      0.70      0.71       534
weighted avg       0.74      0.74      0.73       534



## 3.) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [45]:
# Create variables for TN, FP, FN, TP
TN, FP, FN, TP = conf_array.ravel()

In [46]:
# Label and Print: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
all_ = (TP + TN + FP + FN)
accuracy = (TP + TN) / all_
TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)
TNR = TN / (FP + TN)
FNR = FN / (FN + TP)
precision =  TP / (TP + FP)
f1 =  2 * ((precision * recall) / ( precision + recall))
support_pos = TP + FN
support_neg = FP + TN
print(f"Accuracy: {accuracy: .2%}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR: .2%}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR: .2%}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR: .2%}")
print(f"False Negative Rate/Miss Rate: {FNR: .2%}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy:  73.97%

True Positive Rate/Sensitivity/Recall/Power:  52.68%
False Positive Rate/False Alarm Ratio/Fall-out:  12.77%
True Negative Rate/Specificity/Selectivity:  87.23%
False Negative Rate/Miss Rate:  47.32%

Precision/PPV: 0.72
F1 Score: 0.6084507042253521

Support (0): 205
Support (1): 329


## 4.) Run through steps 1-3 setting k to 10



## 4.1) Create, Fit, Transform/Predict


### Create Object:

In [47]:
# make the object setting k to 10:
knn2 = KNeighborsClassifier(n_neighbors=10)
knn2

### Fit Object:

In [48]:
# fit object
knn2.fit(x_train, y_train)

### Transform/Predict:

In [49]:
# Make predictions
y2_pred = knn2.predict(x_train)

## 4.2) Evaluate


In [50]:
# Accuracy
print('Accuracy of KNN2 classifier on training set: {:.2f}'
     .format(knn2.score(x_train, y_train)))

Accuracy of KNN2 classifier on training set: 0.70


In [51]:
# Confusion Matrix
conf_array_2 = confusion_matrix(y_train,y2_pred)
conf_2 = pd.DataFrame(conf_array_2,
            index=[str(label) + '_actual'for label in labels],
            columns=[str(label) + '_predict'for label in labels])
conf_2


Unnamed: 0,0_predict,1_predict
0_actual,308,21
1_actual,137,68


In [52]:
# Classification report
print(classification_report(y_train, y2_pred))

              precision    recall  f1-score   support

           0       0.69      0.94      0.80       329
           1       0.76      0.33      0.46       205

    accuracy                           0.70       534
   macro avg       0.73      0.63      0.63       534
weighted avg       0.72      0.70      0.67       534



## 4.3) Label and print Metrics

In [53]:
# Create variables for TN, FP, FN, TP
TN, FP, FN, TP = conf_array_2.ravel()

In [54]:
# Label and Print: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
all_ = (TP + TN + FP + FN)
accuracy = (TP + TN) / all_
TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)
TNR = TN / (FP + TN)
FNR = FN / (FN + TP)
precision =  TP / (TP + FP)
f1 =  2 * ((precision * recall) / ( precision + recall))
support_pos = TP + FN
support_neg = FP + TN
print(f"Accuracy: {accuracy: .2%}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR: .2%}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR: .2%}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR: .2%}")
print(f"False Negative Rate/Miss Rate: {FNR: .2%}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy:  70.41%

True Positive Rate/Sensitivity/Recall/Power:  33.17%
False Positive Rate/False Alarm Ratio/Fall-out:  6.38%
True Negative Rate/Specificity/Selectivity:  93.62%
False Negative Rate/Miss Rate:  66.83%

Precision/PPV: 0.7640449438202247
F1 Score: 0.4625850340136055

Support (0): 205
Support (1): 329


## 5.) Run through steps 1-3 setting k to 20



## 5.1) Create, Fit, Transform/Predict
### Create Object:

In [55]:
# make the object setting k to 20:
knn3 = KNeighborsClassifier(n_neighbors=20)
knn3

### Fit Object:

In [56]:
# fit object
knn3.fit(x_train, y_train)

### Transform/Predict:

In [57]:
# Make predictions
y3_pred = knn3.predict(x_train)

## 5.2) Evaluate

In [58]:
# Accuracy
print('Accuracy of KNN3 classifier on training set: {:.2f}'
     .format(knn3.score(x_train, y_train)))

Accuracy of KNN3 classifier on training set: 0.69


In [59]:
# Confusion Matrix
conf_array_3 = confusion_matrix(y_train,y3_pred)
conf_3 = pd.DataFrame(conf_array_3,
            index=[str(label) + '_actual'for label in labels],
            columns=[str(label) + '_predict'for label in labels])
conf_3


Unnamed: 0,0_predict,1_predict
0_actual,316,13
1_actual,152,53


In [60]:
# Classification report
print(classification_report(y_train, y3_pred))

              precision    recall  f1-score   support

           0       0.68      0.96      0.79       329
           1       0.80      0.26      0.39       205

    accuracy                           0.69       534
   macro avg       0.74      0.61      0.59       534
weighted avg       0.72      0.69      0.64       534



## 5.3) Label and print Metrics

In [61]:
# Create variables for TN, FP, FN, TP
TN, FP, FN, TP = conf_array_3.ravel()

In [62]:
# Label and Print: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
all_ = (TP + TN + FP + FN)
accuracy = (TP + TN) / all_
TPR = recall = TP / (TP + FN)
FPR = FP / (FP + TN)
TNR = TN / (FP + TN)
FNR = FN / (FN + TP)
precision =  TP / (TP + FP)
f1 =  2 * ((precision * recall) / ( precision + recall))
support_pos = TP + FN
support_neg = FP + TN
print(f"Accuracy: {accuracy: .2%}\n")
print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR: .2%}")
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR: .2%}")
print(f"True Negative Rate/Specificity/Selectivity: {TNR: .2%}")
print(f"False Negative Rate/Miss Rate: {FNR: .2%}\n")
print(f"Precision/PPV: {precision}")
print(f"F1 Score: {f1}\n")
print(f"Support (0): {support_pos}")
print(f"Support (1): {support_neg}")

Accuracy:  69.10%

True Positive Rate/Sensitivity/Recall/Power:  25.85%
False Positive Rate/False Alarm Ratio/Fall-out:  3.95%
True Negative Rate/Specificity/Selectivity:  96.05%
False Negative Rate/Miss Rate:  74.15%

Precision/PPV: 0.803030303030303
F1 Score: 0.3911439114391144

Support (0): 205
Support (1): 329


## 6.) What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
**ANSWER: As we increase our k= the accuracy and Recall of the model decrease. However, with it so does the rate of false positives. We also have improvements in our models in terms of Specificity and Miss Rate as we increase k. Depending on our appetite for false positives I would say that the model with k=5 is performs best on in-sample data.**

In [63]:
knn_metrics = 'Accuracy:  73.97% \n\
\n\
True Positive Rate/Sensitivity/Recall/Power:  52.68%\n\
False Positive Rate/False Alarm Ratio/Fall-out:  12.77%\n\
True Negative Rate/Specificity/Selectivity:  87.23%\n\
False Negative Rate/Miss Rate:  47.32%\n\
\n\
Precision/PPV: 0.72\n\
F1 Score: 0.6084507042253521\n\
\n\
Support (0): 205\n\
Support (1): 329'

knn2_metrics = 'Accuracy:  70.41%\n\
\n\
True Positive Rate/Sensitivity/Recall/Power:  33.17%\n\
False Positive Rate/False Alarm Ratio/Fall-out:  6.38%\n\
True Negative Rate/Specificity/Selectivity:  93.62%\n\
False Negative Rate/Miss Rate:  66.83%\n\
\n\
Precision/PPV: 0.7640449438202247\n\
F1 Score: 0.4625850340136055\n\
\n\
Support (0): 205\n\
Support (1): 329'

knn3_metrics = 'Accuracy:  69.10%\n\
\n\
True Positive Rate/Sensitivity/Recall/Power:  25.85%\n\
False Positive Rate/False Alarm Ratio/Fall-out:  3.95%\n\
True Negative Rate/Specificity/Selectivity:  96.05%\n\
False Negative Rate/Miss Rate:  74.15%\n\
\n\
Precision/PPV: 0.803030303030303\n\
F1 Score: 0.3911439114391144\n\
\n\
Support (0): 205\n\
Support (1): 329'
print(f'knn Metrics: {knn_metrics}')
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(f'knn2 Metrics: {knn2_metrics}')
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(f'knn3 Metrics: {knn3_metrics}')


knn Metrics: Accuracy:  73.97% 

True Positive Rate/Sensitivity/Recall/Power:  52.68%
False Positive Rate/False Alarm Ratio/Fall-out:  12.77%
True Negative Rate/Specificity/Selectivity:  87.23%
False Negative Rate/Miss Rate:  47.32%

Precision/PPV: 0.72
F1 Score: 0.6084507042253521

Support (0): 205
Support (1): 329
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
knn2 Metrics: Accuracy:  70.41%

True Positive Rate/Sensitivity/Recall/Power:  33.17%
False Positive Rate/False Alarm Ratio/Fall-out:  6.38%
True Negative Rate/Specificity/Selectivity:  93.62%
False Negative Rate/Miss Rate:  66.83%

Precision/PPV: 0.7640449438202247
F1 Score: 0.4625850340136055

Support (0): 205
Support (1): 329
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
knn3 Metrics: Accuracy:  69.10%

True Positive Rate/Sensitivity/Recall/Power:  25.85%
False Positive Rate/False Alarm Ratio/Fall-out:  3.95%
True Negative Rate/Specificity/Selectivity:  96.05%
False Negative Rate/Miss Rate:  74.15%

Precisio

## 7.) Which model performs best on our out-of-sample data from validate?
**ANSWER: knn3, the model with k=20 performed best on our out-of-sample data from validate.**


In [64]:
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(x_validate, y_validate)))
print('Accuracy of KNN2 classifier on validate set: {:.2f}'
     .format(knn2.score(x_validate, y_validate)))
print('Accuracy of KNN3 classifier on validate set: {:.2f}'
     .format(knn3.score(x_validate, y_validate)))

Accuracy of KNN classifier on validate set: 0.61
Accuracy of KNN2 classifier on validate set: 0.67
Accuracy of KNN3 classifier on validate set: 0.70
