In [4]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from pydataset import data


import env
import os
import acquire
import prepare

# acquire data

In [6]:
titanic = acquire.get_titanic_data(acquire.get_connection)

In [9]:
titanic = prepare.prep_titanic(titanic)

In [13]:
titanic.head(1)

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1


In [12]:
titanic = titanic.drop(columns=['sex','embark_town'])

# split data

In [14]:
train,validate,test = prepare.split_data(titanic,'survived')

In [15]:
train.shape, validate.shape,test.shape

((498, 9), (214, 9), (179, 9))

# x and y data

In [19]:
x_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

## train model

In [22]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn

KNeighborsClassifier()

## fit model

In [23]:
knn.fit(x_train,y_train)

KNeighborsClassifier()

## make predictions

In [None]:
y_pred = knn.predict(x_train)

## estimate probability

In [25]:
y_pred_proba = knn.predict_proba(x_train)


# Evaluate your results using the model score, confusion matrix, and classification report.

## score

In [27]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.82


## confusion matrix


In [41]:
cm = confusion_matrix(y_train, y_pred)
cm = pd.DataFrame(cm,index = ['actual 0','actual 1'] , columns = ['pred 0','pred 1'])
cm

Unnamed: 0,pred 0,pred 1
actual 0,274,33
actual 1,58,133


In [30]:
tp = 274
tn = 133
fp = 33
fn = 58
comb = tp + tn + fp + fn

# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [34]:
accuracy = (tp + tn)/ comb
print(f"Accuracy: {accuracy}")

true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate}")

precision = tp / (tp + fp)
print(f"Precision: {precision}")

recall = tp / (tp + fn)
print(f"Recall: {recall}")

f1_score = 2 * (precision * recall) / (precision + recall)
print(f"F1 Score: {f1_score}")

support_pos = tp + fn
print(f"Support (0): {support_pos}")

support_neg = fp + tn
print(f"Support (1): {support_neg}")

Accuracy: 0.8172690763052208
True Positive Rate: 0.8253012048192772
False Positive Rate: 0.19879518072289157
True Negative Rate: 0.8012048192771084
False Negative Rate: 0.1746987951807229
Precision: 0.8925081433224755
Recall: 0.8253012048192772
F1 Score: 0.8575899843505477
Support (0): 332
Support (1): 166


## classification report

In [44]:
pd.DataFrame(classification_report(y_train,y_pred,output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.825301,0.801205,0.817269,0.813253,0.816059
recall,0.892508,0.696335,0.817269,0.794422,0.817269
f1-score,0.85759,0.745098,0.817269,0.801344,0.814445
support,307.0,191.0,0.817269,498.0,498.0


# Run through steps 1-3 setting k to 10

In [54]:
knn10 = KNeighborsClassifier(n_neighbors=10)

In [55]:
knn10.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [61]:
y10_pred = knn10.predict(x_train)

In [57]:
y10_pred_proba = knn10.predict_proba(x_train)

In [59]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn10.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.77


In [62]:
confusion_matrix(y_train, y10_pred)

array([[279,  28],
       [ 88, 103]])

In [63]:
tp10 = 279
tn10 = 103
fp10 = 28
fn10 = 88


In [67]:
comb10 = tp10 + tn10 + fp10 + fn10

In [69]:
print(classification_report(y_train, y10_pred))

              precision    recall  f1-score   support

           0       0.76      0.91      0.83       307
           1       0.79      0.54      0.64       191

    accuracy                           0.77       498
   macro avg       0.77      0.72      0.73       498
weighted avg       0.77      0.77      0.76       498



In [71]:
accuracy10 = (tp10 + tn10)/ comb10
print(f"Accuracy: {accuracy10}")

true_positive_rate10 = tp10/(tp10+fn10)
print(f"True Positive Rate: {true_positive_rate10}")

false_positive_rate10 = fp10/(fp10+tn10)
print(f"False Positive Rate: {false_positive_rate10}")

true_negative_rate10 = tn10/(tn10+fp10)
print(f"True Negative Rate: {true_negative_rate10}")

false_negative_rate10 = fn10/(fn10+tp10)
print(f"False Negative Rate: {false_negative_rate10}")

precision10 = tp10 / (tp10 + fp10)
print(f"Precision: {precision10}")

recall10 = tp10 / (tp10 + fn10)
print(f"Recall: {recall10}")

f1_score10 = 2 * (precision10 * recall10) / (precision10 + recall10)
print(f"F1 Score: {f1_score10}")

support_pos10 = tp10 + fn10
print(f"Support (0): {support_pos10}")

support_neg10 = fp10 + tn10
print(f"Support (1): {support_neg10}")

Accuracy: 0.7670682730923695
True Positive Rate: 0.7602179836512262
False Positive Rate: 0.21374045801526717
True Negative Rate: 0.7862595419847328
False Negative Rate: 0.23978201634877383
Precision: 0.9087947882736156
Recall: 0.7602179836512262
F1 Score: 0.827893175074184
Support (0): 367
Support (1): 131


## CLASS FUNCTION

In [97]:
def knn_fit_predict(k, x_train, y_train, x_validate):
   
    knn = KNeighborsClassifier(n_neighbors=k)

    # create the model
    knn.fit(x_train, y_train)

    # run the model
    y_train_pred = knn.predict(x_train)
    y_validate_pred = knn.predict(x_validate)
    
    return knn, y_train_pred, y_validate_pred

In [104]:
def evaluate_clf(model, x, y, y_pred):
    
    # model score
    
    accuracy = model.score(x, y)
    print(f'Accuracy: {accuracy}')

    # confusion matrix
    
    cm = confusion_matrix(y, y_pred)
    print('Confusion Matrix')
    print(pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], 
                       columns=['Pred 0', 'Pred 1']))

    # classification report
    print('Classification Report')
    print(pd.DataFrame(classification_report(y, y_pred,
                                             output_dict=True)))
    
    
    return accuracy

# k = 10

In [105]:
k = 10


knn, y_train_pred, y_validate_pred = knn_fit_predict(k, 
                                                     x_train, 
                                                     y_train, 
                                                     x_validate)

In [106]:
print('Train Evaluation\n')
train_accuracy = evaluate_clf(knn, x_train, y_train, y_train_pred)

print('\nValidate Evaluation\n')
validate_accuracy = evaluate_clf(knn, x_validate, 
                                 y_validate, y_validate_pred)

Train Evaluation

Accuracy: 0.7670682730923695
Confusion Matrix
          Pred 0  Pred 1
Actual 0     279      28
Actual 1      88     103
Classification Report
                    0           1  accuracy   macro avg  weighted avg
precision    0.760218    0.786260  0.767068    0.773239      0.770206
recall       0.908795    0.539267  0.767068    0.724031      0.767068
f1-score     0.827893    0.639752  0.767068    0.733822      0.755734
support    307.000000  191.000000  0.767068  498.000000    498.000000

Validate Evaluation

Accuracy: 0.7523364485981309
Confusion Matrix
          Pred 0  Pred 1
Actual 0     115      17
Actual 1      36      46
Classification Report
                    0          1  accuracy   macro avg  weighted avg
precision    0.761589   0.730159  0.752336    0.745874      0.749546
recall       0.871212   0.560976  0.752336    0.716094      0.752336
f1-score     0.812721   0.634483  0.752336    0.723602      0.744424
support    132.000000  82.000000  0.752336  214.

# Run through steps 1-3 setting k to 20

In [108]:
k = 20

knn, y_train_pred, y_validate_pred = knn_fit_predict(k, 
                                                     x_train, 
                                                     y_train, 
                                                     x_validate)

print('Train Evaluation\n')
train_accuracy = evaluate_clf(knn, x_train, y_train, y_train_pred)

print('\nValidate Evaluation\n')
validate_accuracy = evaluate_clf(knn, x_validate, 
                                 y_validate, y_validate_pred)

Train Evaluation

Accuracy: 0.7188755020080321
Confusion Matrix
          Pred 0  Pred 1
Actual 0     267      40
Actual 1     100      91
Classification Report
                    0           1  accuracy   macro avg  weighted avg
precision    0.727520    0.694656  0.718876    0.711088      0.714916
recall       0.869707    0.476440  0.718876    0.673073      0.718876
f1-score     0.792285    0.565217  0.718876    0.678751      0.705197
support    307.000000  191.000000  0.718876  498.000000    498.000000

Validate Evaluation

Accuracy: 0.6869158878504673
Confusion Matrix
          Pred 0  Pred 1
Actual 0     110      22
Actual 1      45      37
Classification Report
                    0          1  accuracy   macro avg  weighted avg
precision    0.709677   0.627119  0.686916    0.668398      0.678043
recall       0.833333   0.451220  0.686916    0.642276      0.686916
f1-score     0.766551   0.524823  0.686916    0.645687      0.673926
support    132.000000  82.000000  0.686916  214.

In [83]:
accuracy20 = (tp20 + tn20)/ comb20
print(f"Accuracy: {accuracy10}")

true_positive_rate20 = tp20/(tp20+fn20)
print(f"True Positive Rate: {true_positive_rate20}")

false_positive_rate20 = fp20/(fp20+tn20)
print(f"False Positive Rate: {false_positive_rate20}")

true_negative_rate20 = tn20/(tn20+fp20)
print(f"True Negative Rate: {true_negative_rate20}")

false_negative_rate20 = fn20/(fn20+tp20)
print(f"False Negative Rate: {false_negative_rate20}")

precision20 = tp20 / (tp20 + fp20)
print(f"Precision: {precision20}")

recall20 = tp20 / (tp20 + fn20)
print(f"Recall: {recall20}")

f1_score20 = 2 * (precision20 * recall20) / (precision20 + recall20)
print(f"F1 Score: {f1_score10}")

support_pos20 = tp20 + fn20
print(f"Support (0): {support_pos20}")

support_neg20 = fp20 + tn20
print(f"Support (1): {support_neg20}")

Accuracy: 0.7670682730923695
True Positive Rate: 0.7275204359673024
False Positive Rate: 0.3053435114503817
True Negative Rate: 0.6946564885496184
False Negative Rate: 0.2724795640326976
Precision: 0.8697068403908795
Recall: 0.7275204359673024
F1 Score: 0.827893175074184
Support (0): 367
Support (1): 131


# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
# the f1 score is better where k = 20

# Which model performs best on our out-of-sample data from validate?

In [None]:
# the k = 10 works best all around in and out of sample