# KNN Model Exercises

Create a new notebook, knn_model, and work with the titanic dataset to answer the following:

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from env import user, password, hostname, get_db_url
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

# acquire
import acquire
import prepare

# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")

In [13]:
titanic = acquire.get_titanic_data()

In [14]:
clean_df = prepare.prep_titanic(titanic)
clean_df

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


In [15]:
train, validate, test = prepare.my_train_test_split(clean_df, target= 'survived')
train.shape, validate.shape, test.shape

((534, 12), (178, 12), (179, 12))

In [None]:
print(train.shape, validate.shape, test.shape)

## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [16]:
x_train = train.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_train = train.survived

x_val = validate.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_val = validate.survived

x_test = test.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_test = test.survived

knn5 = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn5.fit(x_train,y_train)

In [17]:
y_pred = knn5.predict(x_train)

In [18]:
y_pred_proba = knn5.predict_proba(x_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [19]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn5.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.81


In [20]:
print(confusion_matrix(y_train, y_pred))

[[280  49]
 [ 50 155]]


In [21]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       329
           1       0.76      0.76      0.76       205

    accuracy                           0.81       534
   macro avg       0.80      0.80      0.80       534
weighted avg       0.81      0.81      0.81       534



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [22]:
print(accuracy_score(y_train, y_pred))

0.8146067415730337


In [23]:
print(precision_score(y_train, y_pred))

0.7598039215686274


In [24]:
print(recall_score(y_train, y_pred))

0.7560975609756098


In [25]:
print(f1_score(y_train, y_pred))

0.7579462102689486


In [26]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       329
           1       0.76      0.76      0.76       205

    accuracy                           0.81       534
   macro avg       0.80      0.80      0.80       534
weighted avg       0.81      0.81      0.81       534



In [27]:
# Instructor's way:
def print_cm_metrics(cm):
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn)/(tn + fp + fn + tp)

    true_positive_rate = tp/(tp + fn)
    false_positive_rate = fp/(fp + tn)
    true_negative_rate = tn/(tn + fp)
    false_negative_rate = fn/(fn + tp)

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1_score = 2*(precision*recall)/(precision+recall)

    support_pos = tp + fn
    support_neg = fp + tn

    dict = {
        'metric' : ['accuracy'
                    ,'true_positive_rate'
                    ,'false_positive_rate'
                    ,'true_negative_rate'
                    ,'false_negative_rate'
                    ,'precision'
                    ,'recall'
                    ,'f1_score'
                    ,'support_pos'
                    ,'support_neg']
        ,'score' : [accuracy
                    ,true_positive_rate
                    ,false_positive_rate
                    ,true_negative_rate
                    ,false_negative_rate
                    ,precision
                    ,recall
                    ,f1_score
                    ,support_pos
                    ,support_neg]
    }

    return pd.DataFrame(dict)

## 4. Run through steps 1-3 setting k to 10

In [28]:
knn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn10.fit(x_train,y_train)

In [29]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn10.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.78


In [30]:
print(confusion_matrix(y_train, y_pred))

[[280  49]
 [ 50 155]]


In [31]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       329
           1       0.76      0.76      0.76       205

    accuracy                           0.81       534
   macro avg       0.80      0.80      0.80       534
weighted avg       0.81      0.81      0.81       534



## 5. Run through steps 1-3 setting k to 20

In [34]:
knn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn20.fit(x_train,y_train)

In [35]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn20.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.74


In [36]:
print(confusion_matrix(y_train, y_pred))

[[280  49]
 [ 50 155]]


In [37]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       329
           1       0.76      0.76      0.76       205

    accuracy                           0.81       534
   macro avg       0.80      0.80      0.80       534
weighted avg       0.81      0.81      0.81       534



## 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The first model, k=5, runs the best for train data with an accuracy of 0.80.

## 7. Which model performs best on our out-of-sample data from validate?

In [38]:
y_val_pred1 = knn5.predict(x_val)
y_val_pred2 = knn10.predict(x_val)
y_val_pred3 = knn20.predict(x_val)

In [39]:
print(classification_report(y_val, y_val_pred1))

print(classification_report(y_val, y_val_pred2))

print(classification_report(y_val, y_val_pred3))

              precision    recall  f1-score   support

           0       0.79      0.76      0.78       110
           1       0.64      0.68      0.66        68

    accuracy                           0.73       178
   macro avg       0.72      0.72      0.72       178
weighted avg       0.73      0.73      0.73       178

              precision    recall  f1-score   support

           0       0.75      0.79      0.77       110
           1       0.63      0.57      0.60        68

    accuracy                           0.71       178
   macro avg       0.69      0.68      0.68       178
weighted avg       0.70      0.71      0.71       178

              precision    recall  f1-score   support

           0       0.70      0.79      0.74       110
           1       0.57      0.46      0.51        68

    accuracy                           0.66       178
   macro avg       0.64      0.62      0.63       178
weighted avg       0.65      0.66      0.65       178



Model 1, knn=5, performs the best on the validate model.