In [11]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from pydataset import data


import env
import os
import acquire
import prepare

# acquire data

In [12]:
# Fetch the dataset through the project-local `acquire` module.
# NOTE(review): `acquire.get_connection` is passed as a function object, not called —
# confirm get_titanic_data expects a callable rather than a connection string.
titanic = acquire.get_titanic_data(acquire.get_connection)

In [13]:
# Run the project-local preparation step. The result replaces the raw frame under the
# same name, so re-running this cell alone feeds already-prepared data back into prep_titanic.
titanic = prepare.prep_titanic(titanic)

In [14]:
# Show the first row to check which columns the prepared frame carries.
titanic.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,Southampton,0,1,0,1


In [5]:
# Drop the original string columns; the head() output above shows their encoded
# counterparts (sex_male, embark_town_*) are already present in the frame.
# errors='ignore' keeps the cell idempotent: re-running it once the columns are
# gone is a no-op instead of a KeyError.
titanic = titanic.drop(columns=['sex', 'embark_town'], errors='ignore')

# split data

In [15]:
# Three-way split via the project-local helper, passing 'survived' as the target column.
# NOTE(review): presumably the split is stratified on the target — confirm in prepare.split_data.
train,validate,test = prepare.split_data(titanic,'survived')

In [16]:
# (rows, columns) of each split, as a quick sanity check on the partition sizes.
train.shape, validate.shape,test.shape

((498, 12), (214, 12), (179, 12))

# x and y data

In [17]:
# Separate the feature columns (x_*) from the 'survived' target (y_*) for every split.
x_train, y_train = train.drop(columns='survived'), train['survived']

x_validate, y_validate = validate.drop(columns='survived'), validate['survived']

x_test, y_test = test.drop(columns='survived'), test['survived']

# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

## create model

In [9]:
# Instantiate the classifier; nothing is fitted yet.
# n_neighbors=5 and weights='uniform' match the estimator defaults, which is why
# the repr shown below this cell lists no arguments.
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn

KNeighborsClassifier()

## fit model

In [10]:
# Fit on the training split only.
# NOTE(review): the recorded output of this cell is a ValueError about NaN/inf in the
# input, so x_train contained missing values when it last ran — confirm the prepare
# step imputes or drops them before relying on any cell below.
knn.fit(x_train,y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## make predictions

In [None]:
# In-sample predictions: the model predicts the same rows it was fitted on.
y_pred = knn.predict(x_train)

## estimate probability

In [None]:
# Per-class probability estimates for the training rows
# (not referenced again in the cells shown here).
y_pred_proba = knn.predict_proba(x_train)


# Evaluate your results using the model score, confusion matrix, and classification report.

## score

In [None]:
# Report the model's score on the training split, formatted to two decimals.
print(f'Accuracy of KNN classifier on training set: {knn.score(x_train, y_train):.2f}')

## confusion matrix


In [None]:
# Wrap the raw confusion matrix in a labelled frame so rows (actual) and
# columns (predicted) are readable when displayed.
cm = pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    index=['actual 0', 'actual 1'],
    columns=['pred 0', 'pred 1'],
)
cm

In [None]:
# Derive the four counts from the confusion matrix instead of hard-coding them,
# so they always describe the model that was just fitted.
# sklearn lays the matrix out as rows = actual class, columns = predicted class,
# in sorted label order. Class 0 is treated as the "positive" class here, which
# is what the "Support (0)" label on tp + fn in the metrics cell below assumes:
#   row 0 (actual 0): [tp, fn]
#   row 1 (actual 1): [fp, tn]
(tp, fn), (fp, tn) = confusion_matrix(y_train, y_pred)
comb = tp + tn + fp + fn

# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Hand-computed classification metrics from the four confusion-matrix counts.
# The labels printed below attach tp + fn to class 0, i.e. class 0 plays the
# "positive" role in these formulas.
accuracy = (tp + tn) / comb
true_positive_rate = tp / (tp + fn)
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)  # same quantity as the true positive rate
f1_score = 2 * (precision * recall) / (precision + recall)
support_pos = tp + fn
support_neg = fp + tn

print(
    f"Accuracy: {accuracy}",
    f"True Positive Rate: {true_positive_rate}",
    f"False Positive Rate: {false_positive_rate}",
    f"True Negative Rate: {true_negative_rate}",
    f"False Negative Rate: {false_negative_rate}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1_score}",
    f"Support (0): {support_pos}",
    f"Support (1): {support_neg}",
    sep="\n",
)

## classification report

In [None]:
# output_dict=True returns the report as a nested dict, so it can be displayed as a DataFrame.
pd.DataFrame(classification_report(y_train,y_pred,output_dict=True))

# Run through steps 1-3 setting k to 10

In [None]:
# Same estimator with k = 10; every other setting is left at its default.
knn10 = KNeighborsClassifier(n_neighbors=10)

In [None]:
# Fit the k = 10 model on the training split.
knn10.fit(x_train, y_train)

In [None]:
# In-sample predictions from the k = 10 model.
y10_pred = knn10.predict(x_train)

In [None]:
# Per-class probability estimates from the k = 10 model
# (not referenced again in the cells shown here).
y10_pred_proba = knn10.predict_proba(x_train)

In [None]:
# Report the k = 10 model's score on the training split, formatted to two decimals.
print(f'Accuracy of KNN classifier on training set: {knn10.score(x_train, y_train):.2f}')

In [None]:
# Raw confusion matrix for the k = 10 model (rows = actual class, columns = predicted class).
confusion_matrix(y_train, y10_pred)

In [None]:
# Derive the k = 10 counts from the confusion matrix instead of hard-coding them.
# sklearn lays the matrix out as rows = actual class, columns = predicted class,
# in sorted label order. Class 0 is treated as the "positive" class, which is
# what the "Support (0)" label on tp10 + fn10 in the metrics cell below assumes:
#   row 0 (actual 0): [tp10, fn10]
#   row 1 (actual 1): [fp10, tn10]
(tp10, fn10), (fp10, tn10) = confusion_matrix(y_train, y10_pred)


In [None]:
# Total number of observations covered by the four k = 10 counts; used as the accuracy denominator.
comb10 = tp10 + tn10 + fp10 + fn10

In [None]:
# Plain-text classification report for the k = 10 model on the training split.
print(classification_report(y_train, y10_pred))

In [None]:
# Hand-computed classification metrics for the k = 10 model.
# The labels printed below attach tp10 + fn10 to class 0, i.e. class 0 plays the
# "positive" role in these formulas.
accuracy10 = (tp10 + tn10) / comb10
true_positive_rate10 = tp10 / (tp10 + fn10)
false_positive_rate10 = fp10 / (fp10 + tn10)
true_negative_rate10 = tn10 / (tn10 + fp10)
false_negative_rate10 = fn10 / (fn10 + tp10)
precision10 = tp10 / (tp10 + fp10)
recall10 = tp10 / (tp10 + fn10)  # same quantity as the true positive rate
f1_score10 = 2 * (precision10 * recall10) / (precision10 + recall10)
support_pos10 = tp10 + fn10
support_neg10 = fp10 + tn10

print(
    f"Accuracy: {accuracy10}",
    f"True Positive Rate: {true_positive_rate10}",
    f"False Positive Rate: {false_positive_rate10}",
    f"True Negative Rate: {true_negative_rate10}",
    f"False Negative Rate: {false_negative_rate10}",
    f"Precision: {precision10}",
    f"Recall: {recall10}",
    f"F1 Score: {f1_score10}",
    f"Support (0): {support_pos10}",
    f"Support (1): {support_neg10}",
    sep="\n",
)

## CLASS FUNCTION

In [None]:
def knn_fit_predict(k, x_train, y_train, x_validate):
    """Fit a k-nearest-neighbours classifier and predict on train and validate.

    Parameters
    ----------
    k : number of neighbours passed to ``KNeighborsClassifier(n_neighbors=k)``
    x_train, y_train : features and labels the model is fitted on
    x_validate : features of the validation split to predict

    Returns
    -------
    tuple of (fitted model, predictions for x_train, predictions for x_validate)
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)

    # Tuple elements are evaluated left to right: train first, then validate.
    return model, model.predict(x_train), model.predict(x_validate)

In [None]:
def evaluate_clf(model, x, y, y_pred):
    """Print an evaluation summary for a fitted classifier and return its score.

    Parameters
    ----------
    model : fitted estimator exposing ``score(x, y)``
    x : features the predictions were made on
    y : true labels for ``x``
    y_pred : predictions for ``x``; the confusion matrix is labelled with
        exactly two rows and two columns, so a binary target is expected

    Returns
    -------
    The value of ``model.score(x, y)``.
    """
    # Model score
    accuracy = model.score(x, y)
    print(f'Accuracy: {accuracy}')

    # Confusion matrix, labelled for display
    counts = confusion_matrix(y, y_pred)
    print('Confusion Matrix')
    labelled_counts = pd.DataFrame(
        counts,
        index=['Actual 0', 'Actual 1'],
        columns=['Pred 0', 'Pred 1'],
    )
    print(labelled_counts)

    # Classification report, shown as a frame
    print('Classification Report')
    report = classification_report(y, y_pred, output_dict=True)
    print(pd.DataFrame(report))

    return accuracy

# k = 10

In [None]:
# Refit with k = 10 through the helper. This rebinds `knn`, replacing whatever
# model was previously held under that name.
k = 10

knn, y_train_pred, y_validate_pred = knn_fit_predict(
    k=k, x_train=x_train, y_train=y_train, x_validate=x_validate
)

In [None]:
# Evaluate the current `knn` model in-sample, then on the validation split.
print('Train Evaluation\n')
train_accuracy = evaluate_clf(model=knn, x=x_train, y=y_train, y_pred=y_train_pred)

print('\nValidate Evaluation\n')
validate_accuracy = evaluate_clf(model=knn, x=x_validate, y=y_validate, y_pred=y_validate_pred)

# Run through steps 1-3 setting k to 20

In [None]:
# Refit with k = 20 through the helper (rebinding `knn` and both prediction
# arrays), then evaluate in-sample and on the validation split.
k = 20

knn, y_train_pred, y_validate_pred = knn_fit_predict(
    k=k, x_train=x_train, y_train=y_train, x_validate=x_validate
)

print('Train Evaluation\n')
train_accuracy = evaluate_clf(model=knn, x=x_train, y=y_train, y_pred=y_train_pred)

print('\nValidate Evaluation\n')
validate_accuracy = evaluate_clf(model=knn, x=x_validate, y=y_validate, y_pred=y_validate_pred)

In [None]:
# Hand-computed classification metrics for the k = 20 model on the training split.
# The four counts are derived from the confusion matrix of the k = 20 predictions
# (`y_train_pred`, produced by the knn_fit_predict call in the previous cell).
# sklearn lays the matrix out as rows = actual class, columns = predicted class,
# in sorted label order. Class 0 is treated as the "positive" class so that
# tp20 + fn20 is the class-0 support, matching the labels printed below:
#   row 0 (actual 0): [tp20, fn20]
#   row 1 (actual 1): [fp20, tn20]
(tp20, fn20), (fp20, tn20) = confusion_matrix(y_train, y_train_pred)
comb20 = tp20 + tn20 + fp20 + fn20

accuracy20 = (tp20 + tn20)/ comb20
print(f"Accuracy: {accuracy20}")

true_positive_rate20 = tp20/(tp20+fn20)
print(f"True Positive Rate: {true_positive_rate20}")

false_positive_rate20 = fp20/(fp20+tn20)
print(f"False Positive Rate: {false_positive_rate20}")

true_negative_rate20 = tn20/(tn20+fp20)
print(f"True Negative Rate: {true_negative_rate20}")

false_negative_rate20 = fn20/(fn20+tp20)
print(f"False Negative Rate: {false_negative_rate20}")

precision20 = tp20 / (tp20 + fp20)
print(f"Precision: {precision20}")

# Same quantity as the true positive rate.
recall20 = tp20 / (tp20 + fn20)
print(f"Recall: {recall20}")

f1_score20 = 2 * (precision20 * recall20) / (precision20 + recall20)
print(f"F1 Score: {f1_score20}")

support_pos20 = tp20 + fn20
print(f"Support (0): {support_pos20}")

support_neg20 = fp20 + tn20
print(f"Support (1): {support_neg20}")

# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
# the f1 score is better where k = 20

# Which model performs best on our out-of-sample data from validate?

In [None]:
# k = 10 works best overall, both in-sample and out-of-sample