In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn as sk

import warnings
warnings.filterwarnings('ignore')

from env import get_db_url, user, password, host

import acquire
import prepare
import explore

# pandas display preferences
pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 3)
#pd.option_context('display.max_rows', None)

line_break = ('-' * 50)
line_break_2 = ('\n' + '=' * 50 + '\n')

# KNN Titanic Data

In [2]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)

Reading from local CSV...


In [3]:
target = 'survived'
positive = 1

In [4]:
train, test, validate = prepare.train_test_validate_split(df, target)

train	 n = 498
test	 n = 179
validate n = 214


In [5]:
# separate each sample into x and y
x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

    


In [6]:
# create the classifer
clf = KNeighborsClassifier(n_neighbors=5)

# fit the classifier to the training data
clf = clf.fit(x_train, y_train)

# make predictions to evaluate classifier performance on in-sample data
y_pred = clf.predict(x_train)

# establish baseline predictions and create dataframe to calculate performance
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
train_results['predicted'] = y_pred

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

    

In [7]:
class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Confusion Matrix: \n', confusion_matrix(y_train, y_pred))
print(line_break)
print('Classification Report: \n', class_report)

Model Score: 0.84
--------------------------------------------------
Confusion Matrix: 
 [[270  37]
 [ 45 146]]
--------------------------------------------------
Classification Report: 
                  0        1  accuracy  macro avg  weighted avg
precision    0.857    0.798     0.835      0.827         0.834
recall       0.879    0.764     0.835      0.822         0.835
f1-score     0.868    0.781     0.835      0.824         0.835
support    307.000  191.000     0.835    498.000       498.000


### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [8]:
accuracy = clf.score(x_train, y_train)
precision = sk.metrics.precision_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
support_1 = int(y_train[y_train == 1].count())
support_0 = int(y_train[y_train == 0].count())

n=len(train)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1_score:.2f}')
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Accuracy: 0.84
Precision: 0.80
Recall: 0.76
F1 Score: 0.78
Support 1: 191
Support 0: 307

True Postive Rate:	0.29
False Positive Rate:	0.07
True Negative Rate:	0.54
False Negative Rate:	0.09


### 4. Run through steps 2-4 setting k to 10



In [9]:
# create the classifer
clf = KNeighborsClassifier(n_neighbors=10)

# fit the classifier to the training data
clf = clf.fit(x_train, y_train)

# make predictions to evaluate classifier performance on in-sample data
y_pred = clf.predict(x_train)

# establish baseline predictions and create dataframe to calculate performance
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
train_results['predicted'] = y_pred



class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Confusion Matrix: \n', confusion_matrix(y_train, y_pred))
print(line_break)
print('Classification Report: \n', class_report)



accuracy = clf.score(x_train, y_train)
precision = sk.metrics.precision_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
support_1 = int(y_train[y_train == 1].count())
support_0 = int(y_train[y_train == 0].count())

n=len(train)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1_score:.2f}')
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Model Score: 0.79
--------------------------------------------------
Confusion Matrix: 
 [[271  36]
 [ 69 122]]
--------------------------------------------------
Classification Report: 
                  0        1  accuracy  macro avg  weighted avg
precision    0.797    0.772     0.789      0.785         0.788
recall       0.883    0.639     0.789      0.761         0.789
f1-score     0.838    0.699     0.789      0.768         0.785
support    307.000  191.000     0.789    498.000       498.000
Accuracy: 0.79
Precision: 0.77
Recall: 0.64
F1 Score: 0.70
Support 1: 191
Support 0: 307

True Postive Rate:	0.24
False Positive Rate:	0.07
True Negative Rate:	0.54
False Negative Rate:	0.14


### 5. Run through setps 2-4 setting k to 20

    

In [10]:
# create the classifer
clf = KNeighborsClassifier(n_neighbors=20)

# fit the classifier to the training data
clf = clf.fit(x_train, y_train)

# make predictions to evaluate classifier performance on in-sample data
y_pred = clf.predict(x_train)

# establish baseline predictions and create dataframe to calculate performance
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]
train_results['predicted'] = y_pred



class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(f'Model Score: {clf.score(x_train, y_train):.2f}')
print(line_break)
print('Confusion Matrix: \n', confusion_matrix(y_train, y_pred))
print(line_break)
print('Classification Report: \n', class_report)



accuracy = clf.score(x_train, y_train)
precision = sk.metrics.precision_score(y_train, y_pred, pos_label=positive)
recall = sk.metrics.recall_score(y_train, y_pred, pos_label=positive)
f1_score = sk.metrics.f1_score(y_train, y_pred, pos_label=positive)
support_1 = int(y_train[y_train == 1].count())
support_0 = int(y_train[y_train == 0].count())

n=len(train)
tp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual == positive)]) / n
fp_rate = len(train_results[(train_results.predicted == positive) & (train_results.actual != positive)]) / n
tn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual != positive)]) / n
fn_rate = len(train_results[(train_results.predicted != positive) & (train_results.actual == positive)]) / n

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1_score:.2f}')
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print()
print(f'True Postive Rate:\t{tp_rate:.2f}')
print(f'False Positive Rate:\t{fp_rate:.2f}')
print(f'True Negative Rate:\t{tn_rate:.2f}')
print(f'False Negative Rate:\t{fn_rate:.2f}')

Model Score: 0.73
--------------------------------------------------
Confusion Matrix: 
 [[269  38]
 [ 94  97]]
--------------------------------------------------
Classification Report: 
                  0        1  accuracy  macro avg  weighted avg
precision    0.741    0.719     0.735      0.730         0.732
recall       0.876    0.508     0.735      0.692         0.735
f1-score     0.803    0.595     0.735      0.699         0.723
support    307.000  191.000     0.735    498.000       498.000
Accuracy: 0.73
Precision: 0.72
Recall: 0.51
F1 Score: 0.60
Support 1: 191
Support 0: 307

True Postive Rate:	0.19
False Positive Rate:	0.08
True Negative Rate:	0.54
False Negative Rate:	0.19


### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

   

In [11]:
# create empty dataframe to store model results
model_results = pd.DataFrame(columns=['model_number', 'metric_type', 'sample_type', 'score'])

# empty dataframe to store information about the model itself
model_info = pd.DataFrame(columns=['model_number', 'K_neighbors'])
######################################################################################
# store baseline metrics

model_number = 'baseline'

# store info about the model
dct = {'model_number': model_number,
       'K_neighbors': 'N/A'}
model_info = model_info.append(dct, ignore_index=True)

# establish baseline predictions for train sample
y_pred = baseline_pred = pd.Series([train[target].mode()[0]]).repeat(len(train))

# get metrics
dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'accuracy',
       'score': sk.metrics.accuracy_score(y_train, y_pred)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'precision',
       'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'recall',
       'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'f1_score',
       'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

# establish baseline predictions for validate sample
y_pred = baseline_pred = pd.Series([train[target].mode()[0]]).repeat(len(validate))

# get metrics
dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'f1_score',
       'score': sk.metrics.f1_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'accuracy',
       'score': sk.metrics.accuracy_score(y_validate, y_pred)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'precision',
       'score': sk.metrics.precision_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'recall',
       'score': sk.metrics.recall_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)


#######################################################################################
# create models

model_number = 1
k_values = [5, 10, 20]

for k in k_values:
    
    # store info about the model
    dct = {'model_number': model_number,
           'K_neighbors': k}
    model_info = model_info.append(dct, ignore_index=True)
    
    # fit the classifier to the training sample and transform
    clf = KNeighborsClassifier(n_neighbors=k)
    clf = clf.fit(x_train, y_train)    

    # results for train sample
    y_pred = clf.predict(x_train)
    
    
    # get metrics
    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_train, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)


    # results for validate sample
    y_pred = clf.predict(x_validate)
    
    # get metrics
    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'f1_score',
           'score': sk.metrics.f1_score(y_validate, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'accuracy',
           'score': sk.metrics.accuracy_score(y_validate, y_pred)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'precision',
           'score': sk.metrics.precision_score(y_validate, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)

    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'recall',
           'score': sk.metrics.recall_score(y_validate, y_pred, pos_label=positive)}
    model_results = model_results.append(dct, ignore_index=True)
    
    model_number += 1

In [12]:
def display_model_results():
    return model_results.pivot_table(columns='model_number', 
                                     index=('metric_type', 'sample_type'), 
                                     values='score',
                                     aggfunc=lambda x: x)

In [13]:
model_info

Unnamed: 0,model_number,K_neighbors
0,baseline,
1,1,5.0
2,2,10.0
3,3,20.0


In [14]:
model_results[model_results.sample_type == 'train'].pivot_table(columns='model_number', 
                                                                 index=('metric_type', 'sample_type'), 
                                                                 values='score',
                                                                 aggfunc=lambda x: x)

Unnamed: 0_level_0,model_number,1,2,3,baseline
metric_type,sample_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accuracy,train,0.835,0.789,0.735,0.616
f1_score,train,0.781,0.699,0.595,0.0
precision,train,0.798,0.772,0.719,0.0
recall,train,0.764,0.639,0.508,0.0


Each of the metrics tends to go down as we increase the value for K. Model #1, with K=5, performs best on in-sample data. As we incrase the number of neighbors compared to, those neighbors become less likely to be similar to our tested value. 

### 7.  Which model performs best on our out-of-sample data from validate?

In [15]:
model_results[model_results.sample_type == 'validate'].pivot_table(columns='model_number', 
                                                                 index=('metric_type', 'sample_type'), 
                                                                 values='score',
                                                                 aggfunc=lambda x: x)

Unnamed: 0_level_0,model_number,1,2,3,baseline
metric_type,sample_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accuracy,validate,0.748,0.743,0.71,0.617
f1_score,validate,0.667,0.631,0.581,0.0
precision,validate,0.675,0.701,0.652,0.0
recall,validate,0.659,0.573,0.524,0.0


For out of sample data, overall performance is still best for Model 1, though Model 2 (K=10) does slightly better on precision. 

In [16]:
display_model_results()

Unnamed: 0_level_0,model_number,1,2,3,baseline
metric_type,sample_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accuracy,train,0.835,0.789,0.735,0.616
accuracy,validate,0.748,0.743,0.71,0.617
f1_score,train,0.781,0.699,0.595,0.0
f1_score,validate,0.667,0.631,0.581,0.0
precision,train,0.798,0.772,0.719,0.0
precision,validate,0.675,0.701,0.652,0.0
recall,train,0.764,0.639,0.508,0.0
recall,validate,0.659,0.573,0.524,0.0


But when we compare performance on train vs validate samples, Model 3, with K=20, tends to have the smallest difference in perforamnce between the two samples, and even performs slightly better on recall on the validate sample. 