In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn as sk

import acquire
import prepare

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

# Titanic Logistic Regression

In [2]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)

Reading from local CSV...


In [3]:
target = 'survived'
positive = 1

In [4]:
# create empty dataframe to store model results
model_results = pd.DataFrame(columns=['model_number', 'metric_type', 'sample_type', 'score'])

# empty dataframe to store information about the model itself
model_info = pd.DataFrame(columns=['model_number', 'features'])

In [5]:
# split the samples
train, test, validate = prepare.train_test_validate_split(df, target)

train	 n = 399
test	 n = 143
validate n = 172


In [6]:
# establish baseline predictions and create dataframe to calculate performance
train_results = pd.DataFrame()
train_results['actual'] = train[target]
train_results['baseline'] = train[target].mode()[0]

In [7]:
# separate each sample into x and y
x_train = train.drop(columns=target)
y_train = train[target]

x_validate = validate.drop(columns=target)
y_validate = validate[target]

x_test = test.drop(columns=target)
y_test = test[target]

In [8]:
# store baseline metrics

model_number = 'baseline'

# store info about the model
dct = {'model_number': model_number,
       'features': 'N/A',
       'C_value': 'N/A'}
model_info = model_info.append(dct, ignore_index=True)

# establish baseline predictions for train sample
y_pred = baseline_pred = pd.Series([train[target].mode()[0]]).repeat(len(train))

# get metrics
dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'accuracy',
       'score': sk.metrics.accuracy_score(y_train, y_pred)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'precision',
       'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'recall',
       'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'train', 
       'metric_type': 'f1_score',
       'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

# establish baseline predictions for validate sample
y_pred = baseline_pred = pd.Series([train[target].mode()[0]]).repeat(len(validate))

# get metrics
dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'f1_score',
       'score': sk.metrics.f1_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'accuracy',
       'score': sk.metrics.accuracy_score(y_validate, y_pred)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'precision',
       'score': sk.metrics.precision_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

dct = {'model_number': model_number, 
       'sample_type': 'validate', 
       'metric_type': 'recall',
       'score': sk.metrics.recall_score(y_validate, y_pred, pos_label=positive)}
model_results = model_results.append(dct, ignore_index=True)

In [9]:
train.head()

Unnamed: 0,survived,pclass,age,n_sibs_and_spouse,n_parents_and_children,fare,alone,family_size,sex_male,embark_town_Queenstown,embark_town_Southampton
312,0,2,26.0,1,1,26.0,0,2,0,0,1
376,1,3,22.0,0,0,7.25,1,0,0,0,1
41,0,2,27.0,1,0,21.0,0,1,0,0,1
278,0,3,7.0,4,1,29.125,0,5,1,1,0
675,0,3,18.0,0,0,7.775,1,0,1,0,1


### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

### 3. Try out other combinations of features and models.


### 4. Use you best 3 models to predict and evaluate on your validate sample.


In [10]:
features1 = ['age', 'fare', 'pclass']
features2 = ['age', 'fare', 'pclass', 'sex_male']
features3 = ['age', 'fare', 'pclass', 'sex_male', 'alone']
features4 = ['age', 'fare', 'pclass', 'sex_male', 'family_size']
feature_combos = [features1, features2, features3, features4]
C_values = [1, .5, .1]
model_number = 0

for features in feature_combos:
    for C in C_values:
                
        model_number += 1
        
        # store info about the model
        dct = {'model_number': model_number,
               'features': features,
               'C_value': C}
        model_info = model_info.append(dct, ignore_index=True)
        
        # separate each sample into x and y
        x_train = train.drop(columns=target)
        x_train = x_train[features]
        y_train = train[target]

        x_validate = validate.drop(columns=target)
        x_validate = x_validate[features]
        y_validate = validate[target]
        
        
        # create the classifer
        clf = LogisticRegression(C=C, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

        # fit the classifier to the training data
        clf = clf.fit(x_train, y_train)

        
        # results for train sample
        y_pred = clf.predict(x_train)
        
        # get metrics
        dct = {'model_number': model_number, 
               'sample_type': 'train', 
               'metric_type': 'accuracy',
               'score': sk.metrics.accuracy_score(y_train, y_pred)}
        model_results = model_results.append(dct, ignore_index=True)

        dct = {'model_number': model_number, 
               'sample_type': 'train', 
               'metric_type': 'precision',
               'score': sk.metrics.precision_score(y_train, y_pred, pos_label=positive)}
        model_results = model_results.append(dct, ignore_index=True)

        dct = {'model_number': model_number, 
               'sample_type': 'train', 
               'metric_type': 'recall',
               'score': sk.metrics.recall_score(y_train, y_pred, pos_label=positive)}
        model_results = model_results.append(dct, ignore_index=True)

        dct = {'model_number': model_number, 
               'sample_type': 'train', 
               'metric_type': 'f1_score',
               'score': sk.metrics.f1_score(y_train, y_pred, pos_label=positive)}
        model_results = model_results.append(dct, ignore_index=True)
        
        
        # results for validate sample
        y_pred = clf.predict(x_validate)

        # get metrics
        dct = {'model_number': model_number, 
               'sample_type': 'validate', 
               'metric_type': 'f1_score',
               'score': sk.metrics.f1_score(y_validate, y_pred, pos_label=positive)}
        model_results = model_results.append(dct, ignore_index=True)

        dct = {'model_number': model_number, 
               'sample_type': 'validate', 
               'metric_type': 'accuracy',
               'score': sk.metrics.accuracy_score(y_validate, y_pred)}
        model_results = model_results.append(dct, ignore_index=True)

        dct = {'model_number': model_number, 
               'sample_type': 'validate', 
               'metric_type': 'precision',
               'score': sk.metrics.precision_score(y_validate, y_pred, pos_label=positive)}
        model_results = model_results.append(dct, ignore_index=True)

        dct = {'model_number': model_number, 
               'sample_type': 'validate', 
               'metric_type': 'recall',
               'score': sk.metrics.recall_score(y_validate, y_pred, pos_label=positive)}
        model_results = model_results.append(dct, ignore_index=True)

        

In [11]:
def display_model_results():
    return model_results.pivot_table(columns='model_number', 
                                     index=('metric_type', 'sample_type'), 
                                     values='score',
                                     aggfunc=lambda x: x)

In [12]:
model_info

Unnamed: 0,model_number,features,C_value
0,baseline,,
1,1,"[age, fare, pclass]",1.0
2,2,"[age, fare, pclass]",0.5
3,3,"[age, fare, pclass]",0.1
4,4,"[age, fare, pclass, sex_male]",1.0
5,5,"[age, fare, pclass, sex_male]",0.5
6,6,"[age, fare, pclass, sex_male]",0.1
7,7,"[age, fare, pclass, sex_male, alone]",1.0
8,8,"[age, fare, pclass, sex_male, alone]",0.5
9,9,"[age, fare, pclass, sex_male, alone]",0.1


In [13]:
display_model_results()

###### BUT WHY ARE THEY ALL THE SAME??? ########

Unnamed: 0_level_0,model_number,1,2,3,4,5,6,7,8,9,10,11,12,baseline
metric_type,sample_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
accuracy,train,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.593985
accuracy,validate,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.593023
f1_score,train,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.57754,0.0
f1_score,validate,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.578512,0.0
precision,train,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.406015,0.0
precision,validate,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.406977,0.0
recall,train,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
recall,validate,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?
