In [2]:
import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import acquire as a
import prepare as p

In [5]:
df = a.get_titanic_data()
df.head()

this file exists, reading csv


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [6]:
# acquire the raw titanic data, clean it, and split it three ways
# (train / validate / test); 'survived' is passed through to
# p.splitting_data as its second argument
(titanic_train, titanic_val, titanic_test) = p.splitting_data(
    p.clean_titanic(a.get_titanic_data()),
    'survived',
)

this file exists, reading csv


In [7]:
titanic_train.shape, titanic_val.shape, titanic_test.shape

((534, 9), (178, 9), (179, 9))

In [8]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 534 entries, 776 to 768
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  534 non-null    int64  
 1   survived      534 non-null    int64  
 2   pclass        534 non-null    object 
 3   sex           534 non-null    object 
 4   sibsp         534 non-null    int64  
 5   parch         534 non-null    int64  
 6   fare          534 non-null    float64
 7   embark_town   534 non-null    object 
 8   alone         534 non-null    int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 41.7+ KB


In [9]:
# tidy up pclass by casting it to int.
# passenger_id is deliberately left on titanic_train: a bare
# `titanic_train.drop(columns='passenger_id')` returns a new frame and
# changes nothing unless its result is assigned, and preprocess_titanic
# (defined below) drops passenger_id itself, so the column must still be
# present when titanic_train is handed to that function.
titanic_train['pclass'] = titanic_train['pclass'].astype(int)

In [10]:
# encoding sex and embark_town, 
# the only object types we have left:

In [11]:
# one-hot encode embark_town and sex into their own dataframe,
# dropping the first level of each and storing the indicators as ints:
titanic_train_encoded_cats = (
    pd.get_dummies(titanic_train[['embark_town', 'sex']], drop_first=True)
    .astype(int)
)

In [12]:
# put the indicator columns next to the rest of the training columns,
# leaving out the two string columns they were built from:
titanic_preprocessed = pd.concat(
    [titanic_train.drop(columns=['sex', 'embark_town']),
     titanic_train_encoded_cats],
    axis=1,
)

In [13]:
titanic_preprocessed.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,776,0,3,0,0,7.75,1,1,0,1
829,829,1,1,0,0,80.0,1,0,1,0
215,215,1,1,1,0,113.275,0,0,0,0
258,258,1,1,0,0,512.3292,1,0,0,0
129,129,0,3,0,0,6.975,1,0,1,1


In [14]:
def preprocess_titanic(train_df, val_df, test_df):
    '''
    preprocess_titanic will take in three pandas dataframes
    of our titanic data, expected as cleaned versions of this
    titanic data set (see documentation on acquire.py and prepare.py)

    each input must contain the columns passenger_id, pclass, sex
    and embark_town. the inputs themselves are not modified.

    for every split this function:
      - drops passenger_id
      - casts pclass to int
      - replaces sex and embark_town with one-hot indicator columns
        (first level of each dropped, indicators stored as int)

    the indicator columns are determined from train_df only and then
    applied to all three splits, so train, validate and test always come
    back with the same columns in the same order. a category that is
    missing from a split yields an all-zero column for that split; a
    category that appears only in validate/test (never in train) has no
    column and is encoded as all zeros.

    output:
    encoded, ML-ready versions of our clean data, with
    columns sex and embark_town encoded in the one-hot fashion
    return: list of three pd.DataFrame, ordered [train, validate, test]
    '''
    encoding_vars = ['embark_town', 'sex']
    # learn the indicator columns from train alone so validate/test cannot
    # change which columns (or which dropped baseline level) the model sees
    dummy_columns = pd.get_dummies(
        train_df[encoding_vars],
        drop_first=True).columns
    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        # drop() returns a new frame, so the caller's dataframe is untouched
        tidy_df = df.drop(columns='passenger_id')
        tidy_df['pclass'] = tidy_df['pclass'].astype(int)
        # encode every level present in this split, then align to the
        # columns learned from train: missing levels are filled with 0,
        # the baseline level and any unseen levels are discarded
        df_encoded_cats = pd.get_dummies(
            tidy_df[encoding_vars]).reindex(
            columns=dummy_columns,
            fill_value=0).astype(int)
        encoded_dfs.append(pd.concat(
            [tidy_df.drop(columns=encoding_vars),
            df_encoded_cats],
            axis=1))
    return encoded_dfs


In [15]:
ttrain_encoded, tval_encoded, ttest_encoded = preprocess_titanic(
    titanic_train, 
    titanic_val, 
    titanic_test)

In [17]:
ttrain_encoded.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,0,3,0,0,7.75,1,1,0,1
829,1,1,0,0,80.0,1,0,1,0
215,1,1,1,0,113.275,0,0,0,0
258,1,1,0,0,512.3292,1,0,0,0
129,0,3,0,0,6.975,1,0,1,1


In [19]:
#isolate my target variable
y_train = ttrain_encoded.survived
y_train.head()

776    0
829    1
215    1
258    1
129    0
Name: survived, dtype: int64

In [21]:
#repeat for validate and test
y_validate = tval_encoded.survived
y_test = ttest_encoded.survived

In [174]:
#isolate our features, using all of them for now
X_train = ttrain_encoded.drop(columns='survived')
X_train.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,3,0,0,7.75,1,1,0,1
829,1,0,0,80.0,1,0,1,0
215,1,1,0,113.275,0,0,0,0
258,1,0,0,512.3292,1,0,0,0
129,3,0,0,6.975,1,0,1,1


In [183]:
X_validate = tval_encoded.drop(columns='survived')
X_test = ttest_encoded.drop(columns='survived')
X_validate.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
541,3,4,2,31.275,0,0,1,0
204,3,0,0,8.05,1,0,1,1
108,3,0,0,7.8958,1,0,1,1
88,1,3,2,263.0,0,0,1,0
677,3,0,0,9.8417,1,0,1,0


In [175]:
#create baseline
y_train.value_counts()

survived
0    329
1    205
Name: count, dtype: int64

In [176]:
#baseline accuracy: the share of the most frequent class in train.
# .max() is used instead of the label lookup [0] so the value is still
# the majority-class share if class 0 is ever not the majority.
y_train.value_counts(normalize=True).max()

0.6161048689138576

#### sklearn modeling process

1. create the object
2. fit the object
3. use the object 

## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [177]:
knn = KNeighborsClassifier(n_neighbors=5)
knn

In [178]:
knn.fit(X_train, y_train)

In [179]:
knn.classes_

array([0, 1])

In [180]:
#predicted values
predicted = knn.predict(X_train)
predicted[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [181]:
y_train.head()

776    0
829    1
215    1
258    1
129    0
Name: survived, dtype: int64

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [182]:
#accuracy
knn.score(X_train,y_train)

0.8258426966292135

In [136]:
#survived (rows) = Actual
#col_0 (columns) = Predicted
counts = pd.crosstab(y_train, predicted)
counts

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,286,43
1,50,155


In [137]:
#y_true = y_train
#y_pred = predicted
print(classification_report(y_train, predicted))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       329
           1       0.78      0.76      0.77       205

    accuracy                           0.83       534
   macro avg       0.82      0.81      0.81       534
weighted avg       0.82      0.83      0.83       534



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Accuracy evaluates how many correct predictions (both positive and negative) were made over the total number of predictions.

$accuracy = {TP + TN \over TP + TN + FP + FN }$

In [138]:
# counts has the actual values on the rows and the predictions on the
# columns, so reading it row by row gives:
# (actual 0, pred 0), (actual 0, pred 1), (actual 1, pred 0), (actual 1, pred 1)
# NOTE(review): this labelling treats class 0 (did not survive) as the
# "positive" class -- confirm that is intended; with survived == 1 as the
# positive class, tp/tn and fn/fp would swap.
tp, fn, fp, tn = counts.to_numpy().ravel()

In [139]:
tp, fn, fp, tn

(286, 43, 50, 155)

In [140]:
(tp + tn) / (tp + tn + fp + fn)

0.8258426966292135

Precision evaluates how many of the positive predictions were correct.

$precision = {TP \over TP + FP}$

In [141]:
precision = tp / (tp + fp)
precision

0.8511904761904762

Recall evaluates how many of the actual positive outcomes the model correctly identified.

$recall = {TP \over TP + FN}$

In [142]:
recall = tp / (tp+fn)
recall

0.8693009118541033

F1 Score is the harmonic mean of precision and recall

$F1 = 2 * {{precision * recall}\over precision + recall}$

In [144]:
2*((precision*recall)/(precision+recall))

0.8601503759398496

## 4. Run through steps 1-3 setting k to 10

In [145]:
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10

In [146]:
knn_10.fit(X_train, y_train)

In [147]:
knn_10.classes_

array([0, 1])

In [148]:
#predicted values
predicted_10 = knn_10.predict(X_train)
predicted_10[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [149]:
y_train.head()

776    0
829    1
215    1
258    1
129    0
Name: survived, dtype: int64

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [150]:
#accuracy
knn_10.score(X_train,y_train)

0.7883895131086143

In [153]:
#survived (rows) = Actual
#col_0 (columns) = Predicted
counts_10 = pd.crosstab(y_train, predicted_10)
counts_10

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,287,42
1,71,134


In [152]:
#y_true = y_train
#y_pred = predicted_10
print(classification_report(y_train, predicted_10))

              precision    recall  f1-score   support

           0       0.80      0.87      0.84       329
           1       0.76      0.65      0.70       205

    accuracy                           0.79       534
   macro avg       0.78      0.76      0.77       534
weighted avg       0.79      0.79      0.78       534



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Accuracy evaluates how many correct predictions (both positive and negative) were made over the total number of predictions.

$accuracy = {TP + TN \over TP + TN + FP + FN }$

In [154]:
# counts_10 has the actual values on the rows and the predictions on the
# columns, so reading it row by row gives:
# (actual 0, pred 0), (actual 0, pred 1), (actual 1, pred 0), (actual 1, pred 1)
# NOTE(review): this labelling treats class 0 (did not survive) as the
# "positive" class -- confirm that is intended; with survived == 1 as the
# positive class, tp/tn and fn/fp would swap.
tp, fn, fp, tn = counts_10.to_numpy().ravel()

In [155]:
tp, fn, fp, tn

(287, 42, 71, 134)

In [156]:
(tp + tn) / (tp + tn + fp + fn)

0.7883895131086143

Precision evaluates how many of the positive predictions were correct.

$precision = {TP \over TP + FP}$

In [157]:
precision = tp / (tp + fp)
precision

0.8016759776536313

Recall evaluates how many of the actual positive outcomes the model correctly identified.

$recall = {TP \over TP + FN}$

In [158]:
recall = tp / (tp+fn)
recall

0.8723404255319149

F1 Score is the harmonic mean of precision and recall

$F1 = 2 * {{precision * recall}\over precision + recall}$

In [159]:
2*((precision*recall)/(precision+recall))

0.8355167394468704

## 5. Run through steps 1-3 setting k to 20

In [200]:
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_20

In [201]:
knn_20.fit(X_train, y_train)

In [202]:
knn_20.classes_

array([0, 1])

In [203]:
#predicted values
predicted_20 = knn_20.predict(X_train)
predicted_20[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [204]:
y_train.head()

776    0
829    1
215    1
258    1
129    0
Name: survived, dtype: int64

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [205]:
#accuracy
knn_20.score(X_train,y_train)

0.7340823970037453

In [166]:
#survived (rows) = Actual
#col_0 (columns) = Predicted
counts_20 = pd.crosstab(y_train, predicted_20)
counts_20

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,283,46
1,96,109


In [167]:
#y_true = y_train
#y_pred = predicted_20
print(classification_report(y_train, predicted_20))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       329
           1       0.70      0.53      0.61       205

    accuracy                           0.73       534
   macro avg       0.72      0.70      0.70       534
weighted avg       0.73      0.73      0.73       534



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Accuracy evaluates how many correct predictions (both positive and negative) were made over the total number of predictions.

$accuracy = {TP + TN \over TP + TN + FP + FN }$

In [168]:
# counts_20 has the actual values on the rows and the predictions on the
# columns, so reading it row by row gives:
# (actual 0, pred 0), (actual 0, pred 1), (actual 1, pred 0), (actual 1, pred 1)
# NOTE(review): this labelling treats class 0 (did not survive) as the
# "positive" class -- confirm that is intended; with survived == 1 as the
# positive class, tp/tn and fn/fp would swap.
tp, fn, fp, tn = counts_20.to_numpy().ravel()

In [169]:
tp, fn, fp, tn

(283, 46, 96, 109)

In [170]:
(tp + tn) / (tp + tn + fp + fn)

0.7340823970037453

Precision evaluates how many of the positive predictions were correct.

$precision = {TP \over TP + FP}$

In [171]:
precision = tp / (tp + fp)
precision

0.7467018469656992

Recall evaluates how many of the actual positive outcomes the model correctly identified.

$recall = {TP \over TP + FN}$

In [172]:
recall = tp / (tp+fn)
recall

0.8601823708206687

F1 Score is the harmonic mean of precision and recall

$F1 = 2 * {{precision * recall}\over precision + recall}$

In [173]:
2*((precision*recall)/(precision+recall))

0.7994350282485876

#### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [189]:
knn.score(X_train,y_train)

0.8258426966292135

In [190]:
knn_10.score(X_train,y_train)

0.7883895131086143

In [191]:
knn_20.score(X_train,y_train)

0.7340823970037453

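> k=5 performs the best on the in-sample (train) data: its accuracy is about 0.83, versus 0.79 for k=10 and 0.73 for k=20. With a smaller k each prediction depends only on its closest training neighbors, so the model follows the training data more tightly and scores higher on it.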

#### 7. Which model performs best on our out-of-sample data from validate?

In [185]:
knn.score(X_validate,y_validate)

0.7078651685393258

In [187]:
knn_10.score(X_validate,y_validate)

0.6685393258426966

In [188]:
knn_20.score(X_validate,y_validate)

0.6629213483146067