In [None]:
# Set up the Environment

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import acquire, prepare

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [55]:
def model_metrics(alg, X, y):
    alg.fit(X, y)
    score = alg.score(X, y)
    y_pred = alg.predict(X)
    report = classification_report(y, y_pred, output_dict=True)
    report = pd.DataFrame(report).drop(columns=['accuracy', 'macro avg', 'weighted avg']).T
    report['FPR/FNR'] = 1 - report.recall
    report['accuracy'] = score
    report.rename(columns={'recall':'TNR/TPR'}, inplace=True)
    return report

### In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Througout this exercise, be sure you are training, evaluation, and comparing models on the train and validate dataset. The test dataset should be only used for your final model. 

### For all of the models you create, choose a threshold that optimizes for accuracy. 

### Do your work for these exercises in either a notebook or a python script named model within your classification-exercises repository. Add, commit, and push your work.

**Takeways**:
1. Build logistic regression models for titanic dataset.
2. Several models need to be build. 
3. Accuray is the evaluation metrics. 
4. Target varibale: the survivied (categorical)
5. The positive case is predicting the survivied
    - TP: predicting survived actually survivied
    - FP: predicting survived actually being a victim
    - TN: predicting being a victim acturally was a victim
    - FN: predicting being a victim acturally survived

### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one? 

In [None]:
# Acquire titanic data.

titanic = acquire.get_titanic_data()
titanic.head()

In [None]:
# Prepare titanic dataset

train, validate, test = prepare.prep_titanic(titanic)
train.head()

In [None]:
train.shape, validate.shape, test.shape

In [None]:
# Double check if there is any missing values

train.isnull().sum()

In [None]:
# Compute the baseline accuracy

train.survived.value_counts(normalize=True)

### BL_Model: X = ['fare', 'pclass'], y = 'survived'
1. fare: continuous
2. pclass: categotical

In [None]:
# fare and pclass are the X in model1.

X_train_model1 = train[['fare', 'pclass']]
y_train_model1 = train[['survived']]

X_train_model1.shape, y_train_model1.shape

In [None]:
# Create the logistic regression object

logit1 = LogisticRegression(C=1)

# Fit the model to the training data

logit1.fit(X_train_model1, y_train_model1)

# Print the coefficients and intercept of the model

print('Coefficient: \n', logit1.coef_)
print('Intercept: \n', logit1.intercept_)

In [None]:
# Estimate whether or not a passenger would survive, using the training data

y_pred_model1 = logit1.predict(X_train_model1)
y_pred_model1

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model1 = logit1.predict_proba(X_train_model1)

**Evalute model on train**

In [None]:
# Compute the accuracy

print(logit1.score(X_train_model1, y_train_model1))

# Create a confusion matrix

print(confusion_matrix(y_train_model1, y_pred_model1))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train_model1, y_pred_model1))

### Model 2: X = ['fare', 'pclass', 'age'], y = 'survived'

In [None]:
# fare, pclass, age are the X in model2.

X_train_model2 = train[['fare', 'pclass', 'age']]
y_train_model2 = train[['survived']]

X_train_model2.shape, y_train_model2.shape

**Create, Fit & Predict**

In [None]:
# Create the logistic regression object

logit2 = LogisticRegression(C=1)

# Fit the model to the training data

logit2.fit(X_train_model2, y_train_model2)

# Print the coefficients and intercept of the model

print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

# Estimate whether or not a passenger would survive, using the training data

y_pred_model2 = logit2.predict(X_train_model2)
y_pred_model2

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model2 = logit2.predict_proba(X_train_model2)

**Evalute model on train**

In [None]:
# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit2.score(X_train_model2, y_train_model2)))

# Create a confusion matrix

print('Confusion matrix: \n', confusion_matrix(y_train_model2, y_pred_model2))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train_model2, y_pred_model2))

### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [None]:
train.head()

In [None]:
sex_dummy = pd.get_dummies(train.sex)
train = pd.concat([train, sex_dummy], axis=1)
train.head()

In [None]:
train.info()

### Model 3: X = ['fare', 'pclass', 'age', 'male'], y = ['survivied']

In [None]:
# fare, pclass, age, and male are the X in model2.

X_train_model3 = train[['fare', 'pclass', 'age', 'male']]
y_train_model3 = train[['survived']]

X_train_model3.shape, y_train_model3.shape

In [None]:
# Create the logistic regression object

logit3 = LogisticRegression(C=1)

# Fit the model to the training data

logit3.fit(X_train_model3, y_train_model3)

# Print the coefficients and intercept of the model

print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

# Estimate whether or not a passenger would survive, using the training data

y_pred_model3 = logit3.predict(X_train_model3)
y_pred_model3

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model3 = logit3.predict_proba(X_train_model3)

In [None]:
# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit3.score(X_train_model3, y_train_model3)))

# Create a confusion matrix

print('Confusion matrix: ', confusion_matrix(y_train_model3, y_pred_model3))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train_model3, y_pred_model3))

**Notes**
1. Previous model only contains fare and pclass as the X. 
2. No missing values in the train dataset. 

### 3. Try out other combinations of features and models.
* Model 4: X = ['pcalss', 'male'], y = 'survived'
* Create, fit and predict
* Accuracy, Confustion matrix, and Report

In [None]:
# pclass and male are the X in model 4.

X_train_model4 = train[['pclass', 'male']]
y_train_model4 = train[['survived']]

# Create the logistic regression object

logit4 = LogisticRegression(C=1)

# Fit the model to the training data

logit4.fit(X_train_model4, y_train_model4)

# Print the coefficients and intercept of the model

print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)

# Estimate whether or not a passenger would survive, using the training data

y_pred_model4 = logit4.predict(X_train_model4)
y_pred_model4

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model4 = logit4.predict_proba(X_train_model4)

In [None]:
# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit4.score(X_train_model4, y_train_model4)))

# Create a confusion matrix

print('Confusion matrix: \n', confusion_matrix(y_train_model4, y_pred_model4))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train_model4, y_pred_model4))

### 4. Use best 3 models to predict and evaluate your validate sample
* Best 3: model 2, 3, 4

In [None]:
# Load validate dataset

validate.head()

In [None]:
sex_dummy = pd.get_dummies(validate.sex)
validate = pd.concat([validate, sex_dummy], axis=1)
validate.head()

In [None]:
validate.shape

In [None]:
# Load validate dataset for Model 2

X_validate_model2 = validate[['fare', 'pclass', 'age']]
y_validate_model2 = validate[['survived']]

# Estimate whether or not a passenger would survive, using the training data

y_pred_model2 = logit2.predict(X_validate_model2)
y_pred_model2

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model2 = logit2.predict_proba(X_validate_model2)

# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit2.score(X_validate_model2, y_validate_model2)))

# Create a confusion matrix

print(confusion_matrix(y_validate_model2, y_pred_model2))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_validate_model2, y_pred_model2))

In [None]:
# Load validate dataset for Model 3

X_validate_model3 = validate[['fare', 'pclass', 'age', 'male']]
y_validate_model3 = validate[['survived']]

# Estimate whether or not a passenger would survive, using the training data

y_pred_model3 = logit3.predict(X_validate_model3)
y_pred_model3

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model3 = logit3.predict_proba(X_validate_model3)

# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit3.score(X_validate_model3, y_validate_model3)))

# Create a confusion matrix

print(confusion_matrix(y_validate_model3, y_pred_model3))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_validate_model3, y_pred_model3))

In [None]:
# Load validate dataset for Model 4

X_validate_model4 = validate[['pclass', 'male']]
y_validate_model4 = validate[['survived']]

# Estimate whether or not a passenger would survive, using the training data

y_pred_model4 = logit4.predict(X_validate_model4)
y_pred_model4

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model4 = logit4.predict_proba(X_validate_model4)

# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit4.score(X_validate_model4, y_validate_model4)))

# Create a confusion matrix

print(confusion_matrix(y_validate_model4, y_pred_model4))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_validate_model4, y_pred_model4))

### 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?
* Best model from the validation: Model 3

In [None]:
test.head()

In [None]:
test.shape

In [None]:
sex_dummy = pd.get_dummies(test.sex)
test = pd.concat([test, sex_dummy], axis=1)
test.head()

In [None]:
# Load test dataset for Model 3

X_test_model3 = test[['fare', 'pclass', 'age', 'male']]
y_test_model3 = test[['survived']]

# Estimate whether or not a passenger would survive, using the training data

y_pred_model3 = logit3.predict(X_test_model3)
y_pred_model3

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model3 = logit3.predict_proba(X_test_model3)

# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit3.score(X_test_model3, y_test_model3)))

# Create a confusion matrix

print(confusion_matrix(y_test_model3, y_pred_model3))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_test_model3, y_pred_model3))

**Notes**: The accuracy from test dataset(0.80) is a little better than the validate(0.78) and train(0.79).

### Bonus 1. How do different strategies for handling the missing values in the age column affect model performance? 

**Notes**
1. In the current titanic dataset, the stragegy for handling the missing values in the age column is SimpleImpute = 'most_frequent')
2. There are four strategies in the SimpleImpute:
    - mean
    - median
    - most_frequent
    - constant
3. The best model for now is Model 3

**My Plan**

1. I will use a different strategy in SimpleImpute and then compare the performance for model 3. 
2. Which stragegy I am gonna use? mean or median. 

In [None]:
raw_titanic = acquire.get_titanic_data()
raw_titanic.head()

In [None]:
# age columns has 177 null values

null_age = raw_titanic.age.isnull().sum()
null_age

In [None]:
# The percentage of null values in age column

null_age/raw_titanic.age.size

In [None]:
# Most frequent age

# raw_titanic.age.value_counts().head(1)
raw_titanic.age.mode()

In [None]:
raw_titanic.age.plot.hist()

In [None]:
raw_titanic.age.agg(['mean', 'median'])

In [None]:
# Who are missing the age values?

mask = raw_titanic.age.isnull()
raw_titanic[mask].alone.value_counts()

In [None]:
raw_titanic.head()

In [None]:
train, validate, test = prepare.prep_titanic_mean(raw_titanic)
train.head()

In [None]:
# Visualization of age distrubtion after replacing the missing values with mean. 

train.age.plot.hist(alpha=0.5)
raw_titanic.age.plot.hist(alpha=0.5)

In [None]:
train.shape, validate.shape, test.shape

In [None]:
# Create dummy variables of sex in train dataset. 

sex_dummy = pd.get_dummies(train.sex)
train = pd.concat([train, sex_dummy], axis=1)
train.head()

In [None]:
# fare, pclass, age, and male are the X in model 3. 

X_train_model3 = train[['fare', 'pclass', 'age', 'male']]
y_train_model3 = train[['survived']]

X_train_model3.shape, y_train_model3.shape

In [None]:
# Create the logistic regression object

logit3 = LogisticRegression(C=1)

# Fit the model to the training data

logit3.fit(X_train_model3, y_train_model3)

# Print the coefficients and intercept of the model

print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

# Estimate whether or not a passenger would survive, using the training data

y_pred_model3 = logit3.predict(X_train_model3)
y_pred_model3

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model3 = logit3.predict_proba(X_train_model3)

In [None]:
# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit3.score(X_train_model3, y_train_model3)))

# Create a confusion matrix

print('Confusion matrix: \n', confusion_matrix(y_train_model3, y_pred_model3))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train_model3, y_pred_model3))

**Notes**
1. Age columns has 177 null values, about ~20% of all data. 
2. Most frequence age is 24. 
3. The mean age is 29.7.
4. The median age is 28.0.
5. Among whom are missing the age values, 133 are alone, and 44 had accompaniers. 

**Choice**
1. I am gonna use mean as the alternative strategy. Let's see how it affect the performance. 

**Results**
1. The accuracy is increased slightly from 0.79 to 0.80. 
2. Since the coefficient of age is small (0.027), it doesn't weigh that much in the model, which may explain the reason for such small change in accuracy. 

### Bonus 2: How do different strategies for encoding sex affect model performance.

In [None]:
# Acquire titianic dataset. 

titanic = acquire.get_titanic_data()
titanic.head()

In [None]:
# Prepare titianic dataset

train, validate, test = prepare.prep_titanic(titanic)
train.head()

In [None]:
# Create dummy variables for column sex

sex_dummy = pd.get_dummies(train.sex)
train = pd.concat([train, sex_dummy], axis=1)
train.head()

In [None]:
# X = ['fare', 'pclass', 'age', 'female']
# y = 'survived'

X_train_model3 = train[['fare', 'pclass', 'age', 'female']]
y_train_model3 = train[['survived']]

X_train_model3.shape, y_train_model3.shape

In [None]:
# Create the logistic regression object

logit3 = LogisticRegression(C=1)

# Fit the model to the training data

logit3.fit(X_train_model3, y_train_model3)

# Print the coefficients and intercept of the model

print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

# Estimate whether or not a passenger would survive, using the training data

y_pred_model3 = logit3.predict(X_train_model3)
y_pred_model3

# Estimate the probablity of a passenger surviving, using the training data
y_pred_proba_model3 = logit3.predict_proba(X_train_model3)

In [None]:
# Compute the accuracy

print('Accuracy: {: .2f}'.format(logit3.score(X_train_model3, y_train_model3)))

# Create a confusion matrix

print('Confusion matrix: \n', confusion_matrix(y_train_model3, y_pred_model3))

# Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train_model3, y_pred_model3))

**Notes**
1. Sex columns has zero null values.
2. Sex columns contain 577 males and 312 females.
3. In the model 3, the male is 1 and the female is 0.

**Alternative stragety for encoding sex**
1. The male is 0 and the female is 1. 
2. My hypothesis is there is no change in the model performance. 

**Results:**
Such encoding doesn't change performance of the model 3. 

### Bonus 3: `scikit-learn`'s `LogisticRegression` classifier is actually applying a regularization penalty to the coefficients by default. 

* This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. 
* This value can be modified with the `C` hyperparameter.
* Small values of `C` correspond to a larger penalty, and large values of `C` correspond to a smaller penalty.

### Try out the following values for `C` and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected. 
* C = 0.01, 0.1, 1, 10, 100, 1000
* Use model 3:
 - X: fare, pclass, age, male
 - y: survived

In [None]:
# Load titanic dataset

titanic = acquire.get_titanic_data()
titanic.head()

In [None]:
# Prepare the titanic dataset

train, validate, test = prepare.prep_titanic(titanic)
train.head()

In [None]:
# Create dummy variable for column 'sex'

sex_dummy = pd.get_dummies(train.sex)
train = pd.concat([train, sex_dummy], axis=1)
train.head()

In [None]:
X_train = train[['fare', 'pclass', 'age', 'male']]
y_train = train[['survived']]

X_train.shape, y_train.shape

In [None]:
# Define a function that return coefficients given the C value. 

def logit_model_coefficient(c_value, X, y):
    logit = LogisticRegression(C=c_value)
    logit.fit(X, y)
    coefficient = logit.coef_
    return pd.DataFrame(coefficient)

In [None]:
# Calcualte the coefficient according to a list of C values

list_C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
df = pd.DataFrame()
for i in list_C:
    df = pd.concat([df, logit_model_coefficient(i, X_train, y_train)])
df.index = list_C
df.columns = ['fare', 'pclass', 'age', 'male']
df

In [None]:
import math

x = [math.log10(i) for i in df.index]
y = df.male
plt.scatter(x, y)

In [None]:
def logit_model_accuracy(c_value, X, y):
    logit = LogisticRegression(C=c_value)
    logit.fit(X, y)
    accuracy = logit.score(X, y)
    return accuracy

In [None]:
logit_model_accuracy(1, X_train, y_train)

In [None]:
list_C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

accuracy_list = [logit_model_accuracy(i, X_train, y_train) for i in list_C]
df = pd.DataFrame(accuracy_list)
df.columns = ['Accuracy']
df.index = list_C
df

In [None]:
x = [math.log10(i) for i in df.index]
y = df.Accuracy
plt.scatter(x, y)

### Decision Tree Exercises

### In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.
* Continue working in your model file. Add, commit, and push your changes.


In [44]:
# Acquire titanic dataset

titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [45]:
# Prepare titanic dataset

train, validate, test = prepare.prep_titanic(titanic)
train.tail()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
313,0,3,28.0,0,0,7.8958,1,1,0,1
636,0,3,32.0,0,0,7.925,1,1,0,1
222,0,3,51.0,0,0,8.05,1,1,0,1
485,0,3,24.0,3,1,25.4667,0,0,0,1
553,1,3,22.0,0,0,7.225,1,1,0,0


In [46]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 583 to 553
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   survived    497 non-null    int64  
 1   pclass      497 non-null    int64  
 2   age         497 non-null    float64
 3   sibsp       497 non-null    int64  
 4   parch       497 non-null    int64  
 5   fare        497 non-null    float64
 6   alone       497 non-null    int64  
 7   sex_male    497 non-null    uint8  
 8   embarked_Q  497 non-null    uint8  
 9   embarked_S  497 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 32.5 KB


In [47]:
train.shape, validate.shape, test.shape

((497, 10), (214, 10), (178, 10))

### 1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)
### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [48]:
# X_train and y_train

X_train_BL = train[['fare', 'pclass']]
y_train_BL = train['survived']

X_train_1 = train[['fare', 'pclass', 'age']]
y_train_1 = train['survived']

X_train_2 = train[['fare', 'pclass', 'age', 'sex_male']]
y_train_2 = train['survived']

X_train_3 = train[['pclass', 'sex_male']]
y_train_3 = train['survived']

In [49]:
X_train_BL.shape, X_train_1.shape

((497, 2), (497, 3))

In [None]:
models = [['baseline', X_train_BL, y_train_BL], 
          ['model1', X_train_1, y_train_1], 
          ['model2', X_train_2, y_train_2],
          ['model3', X_train_3, y_train_3]
         ]

In [51]:
models[0][0]

'baseline'

In [50]:
# Create the Decision Tree Object

clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [52]:
# Evalute the models using model score

output = []

def tree_accuracy(X, y):
    clf.fit(X, y)
    accuracy = clf.score(X, y)
    return accuracy

for model in models:
    output.append({'model': model[0],
                   "accuracy": tree_accuracy(model[1], model[2]).round(2)
                  })

matrics = pd.DataFrame(output)
matrics.sort_values(by='accuracy', ascending=False, ignore_index=True)

Unnamed: 0,model,accuracy
0,model2,0.82
1,model3,0.79
2,baseline,0.69
3,model1,0.69


In [107]:
def tree_y_pred(X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    return y_pred

# tree_y_pred(X_train_1, y_train_1)

output = pd.DataFrame(train['survived']).reset_index(drop=True)
output.rename(columns = {'survived':'actual'}, inplace=True)

for model in models:
    output[model[0]] = tree_y_pred(model[1], model[2])

output.head()

Unnamed: 0,actual,baseline,model1,model2,model3
0,0,0,0,0,0
1,1,1,1,1,1
2,0,0,0,0,0
3,1,1,1,1,1
4,1,1,1,1,1


In [106]:
labels = ['victim', 'survived']

for model in models:
    df = pd.crosstab(output.actual, output[model[0]])
    df.index = df.columns = labels
    print(f'The confusion matrix of {model[0]} is:\n {df}\n')

The confusion matrix of baseline is:
           victim  survived
victim       279        28
survived     126        64

The confusion matrix of model1 is:
           victim  survived
victim       279        28
survived     126        64

The confusion matrix of model2 is:
           victim  survived
victim       279        28
survived      62       128

The confusion matrix of model3 is:
           victim  survived
victim       303         4
survived     100        90



In [30]:
# Evaluate models by confusion matrix

def tree_matrix(X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    matrix = confusion_matrix(y, y_pred)
    return matrix

print('Confusion matrix of Baseline Model:\n', tree_matrix(X_train_BL, y_train_BL))
print('Confusion matrix of Model 1:\n', tree_matrix(X_train_1, y_train_1))
print('Confusion matrix of Model 2:\n', tree_matrix(X_train_2, y_train_2))
print('Confusion matrix of Model 3:\n', tree_matrix(X_train_3, y_train_3))

Confusion matrix of Baseline Model:
 [[279  28]
 [126  64]]
Confusion matrix of Model 1:
 [[279  28]
 [126  64]]
Confusion matrix of Model 2:
 [[279  28]
 [ 62 128]]
Confusion matrix of Model 3:
 [[303   4]
 [100  90]]


In [59]:
# Evaluate models by classification report

def tree_report(X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    report = classification_report(y, y_pred)
    return report

print('Classification report of Baseline Model:\n', tree_report(X_train_BL, y_train_BL))
print('Classification report of Model 1:\n', tree_report(X_train_1, y_train_1))
print('Classification report of Model 2:\n', tree_report(X_train_2, y_train_2))
print('Classification report of Model 3:\n', tree_report(X_train_3, y_train_3))

Classification report of Baseline Model:
               precision    recall  f1-score   support

           0       0.69      0.91      0.78       307
           1       0.70      0.34      0.45       190

    accuracy                           0.69       497
   macro avg       0.69      0.62      0.62       497
weighted avg       0.69      0.69      0.66       497

Classification report of Model 1:
               precision    recall  f1-score   support

           0       0.69      0.91      0.78       307
           1       0.70      0.34      0.45       190

    accuracy                           0.69       497
   macro avg       0.69      0.62      0.62       497
weighted avg       0.69      0.69      0.66       497

Classification report of Model 2:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg    

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
* Accuracy
* TPR = Recall(1)
* TNR = Recall(0)
* FPR = 1 - Recall(0)
* FNR = 1 - Recall(1)

In [86]:
def tree_report_dataframe(X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    accuracy = clf.score(X,y)
    report = classification_report(y, y_pred, output_dict=True)
    report = pd.DataFrame(report)
    report.drop(columns=['macro avg', 'weighted avg', 'accuracy'], inplace=True)
    report = report.T
    report['FPR/FNR'] = 1 - report.recall
    report['accuracy'] = accuracy
    report.rename(columns={'recall':'TNR/TPR'}, inplace=True)
    return report

print('Performance of Baseline Model:\n', tree_report_dataframe(X_train_BL, y_train_BL), '\n')
print('Performance of Model 1:\n', tree_report_dataframe(X_train_1, y_train_1), '\n')
print('Performance of Model 2:\n', tree_report_dataframe(X_train_2, y_train_2), '\n')
print('Performance of Model 3:\n', tree_report_dataframe(X_train_3, y_train_3), '\n')

Performance of Baseline Model:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.688889  0.908795  0.783708    307.0  0.091205  0.690141
1   0.695652  0.336842  0.453901    190.0  0.663158  0.690141 

Performance of Model 1:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.688889  0.908795  0.783708    307.0  0.091205  0.690141
1   0.695652  0.336842  0.453901    190.0  0.663158  0.690141 

Performance of Model 2:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.818182  0.908795  0.861111    307.0  0.091205  0.818913
1   0.820513  0.673684  0.739884    190.0  0.326316  0.818913 

Performance of Model 3:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.751861  0.986971  0.853521    307.0  0.013029  0.790744
1   0.957447  0.473684  0.633803    190.0  0.526316  0.790744 



### 4. Run through steps 2-4 using a different max_depth value.

In [89]:
# Create the Decision Tree Object with different max_depth value

clf5 = DecisionTreeClassifier(max_depth=5, random_state=123)
clf10 = DecisionTreeClassifier(max_depth=10, random_state=123)
clf20 = DecisionTreeClassifier(max_depth=20, random_state=123)

In [90]:
def tree_report_dataframe_max5(X, y):
    clf5.fit(X, y)
    y_pred = clf5.predict(X)
    accuracy = clf5.score(X,y)
    report = classification_report(y, y_pred, output_dict=True)
    report = pd.DataFrame(report)
    report.drop(columns=['macro avg', 'weighted avg', 'accuracy'], inplace=True)
    report = report.T
    report['FPR/FNR'] = 1 - report.recall
    report['accuracy'] = accuracy
    report.rename(columns={'recall':'TNR/TPR'}, inplace=True)
    return report

print('Performance of Baseline Model:\n', tree_report_dataframe_max5(X_train_BL, y_train_BL), '\n')
print('Performance of Model 1:\n', tree_report_dataframe_max5(X_train_1, y_train_1), '\n')
print('Performance of Model 2:\n', tree_report_dataframe_max5(X_train_2, y_train_2), '\n')
print('Performance of Model 3:\n', tree_report_dataframe_max5(X_train_3, y_train_3), '\n')

Performance of Baseline Model:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.709756  0.947883  0.811715    307.0  0.052117   0.72837
1   0.816092  0.373684  0.512635    190.0  0.626316   0.72837 

Performance of Model 1:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.722922  0.934853  0.815341    307.0  0.065147  0.738431
1   0.800000  0.421053  0.551724    190.0  0.578947  0.738431 

Performance of Model 2:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.871473  0.905537  0.888179    307.0  0.094463  0.859155
1   0.837079  0.784211  0.809783    190.0  0.215789  0.859155 

Performance of Model 3:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.751861  0.986971  0.853521    307.0  0.013029  0.790744
1   0.957447  0.473684  0.633803    190.0  0.526316  0.790744 



In [91]:
def tree_report_dataframe_max10(X, y):
    clf10.fit(X, y)
    y_pred = clf10.predict(X)
    accuracy = clf10.score(X,y)
    report = classification_report(y, y_pred, output_dict=True)
    report = pd.DataFrame(report)
    report.drop(columns=['macro avg', 'weighted avg', 'accuracy'], inplace=True)
    report = report.T
    report['FPR/FNR'] = 1 - report.recall
    report['accuracy'] = accuracy
    report.rename(columns={'recall':'TNR/TPR'}, inplace=True)
    return report

print('Performance of Baseline Model:\n', tree_report_dataframe_max10(X_train_BL, y_train_BL), '\n')
print('Performance of Model 1:\n', tree_report_dataframe_max10(X_train_1, y_train_1), '\n')
print('Performance of Model 2:\n', tree_report_dataframe_max10(X_train_2, y_train_2), '\n')
print('Performance of Model 3:\n', tree_report_dataframe_max10(X_train_3, y_train_3), '\n')

Performance of Baseline Model:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.830619  0.830619  0.830619    307.0  0.169381  0.790744
1   0.726316  0.726316  0.726316    190.0  0.273684  0.790744 

Performance of Model 1:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.935361  0.801303  0.863158    307.0  0.198697  0.843058
1   0.739316  0.910526  0.816038    190.0  0.089474  0.843058 

Performance of Model 2:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.923313  0.980456  0.951027    307.0  0.019544  0.937626
1   0.964912  0.868421  0.914127    190.0  0.131579  0.937626 

Performance of Model 3:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.751861  0.986971  0.853521    307.0  0.013029  0.790744
1   0.957447  0.473684  0.633803    190.0  0.526316  0.790744 



In [92]:
def tree_report_dataframe_max20(X, y):
    clf20.fit(X, y)
    y_pred = clf20.predict(X)
    accuracy = clf20.score(X,y)
    report = classification_report(y, y_pred, output_dict=True)
    report = pd.DataFrame(report)
    report.drop(columns=['macro avg', 'weighted avg', 'accuracy'], inplace=True)
    report = report.T
    report['FPR/FNR'] = 1 - report.recall
    report['accuracy'] = accuracy
    report.rename(columns={'recall':'TNR/TPR'}, inplace=True)
    return report

print('Performance of Baseline Model:\n', tree_report_dataframe_max20(X_train_BL, y_train_BL), '\n')
print('Performance of Model 1:\n', tree_report_dataframe_max20(X_train_1, y_train_1), '\n')
print('Performance of Model 2:\n', tree_report_dataframe_max20(X_train_2, y_train_2), '\n')
print('Performance of Model 3:\n', tree_report_dataframe_max20(X_train_3, y_train_3), '\n')

Performance of Baseline Model:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.804813  0.980456  0.883994    307.0  0.019544  0.841046
1   0.951220  0.615789  0.747604    190.0  0.384211  0.841046 

Performance of Model 1:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.964516  0.973941  0.969206    307.0  0.026059  0.961771
1   0.957219  0.942105  0.949602    190.0  0.057895  0.961771 

Performance of Model 2:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.977707  1.000000  0.988728    307.0  0.000000  0.985915
1   1.000000  0.963158  0.981233    190.0  0.036842  0.985915 

Performance of Model 3:
    precision   TNR/TPR  f1-score  support   FPR/FNR  accuracy
0   0.751861  0.986971  0.853521    307.0  0.013029  0.790744
1   0.957447  0.473684  0.633803    190.0  0.526316  0.790744 



### 5. Which performs better on your in-sample data?
* Model 2 with max_indepth = 20

**Notes**
1. The baseline accuracy is 0.618
2. My baseline model is X = ['fare', 'pclass'], y = 'survived'
3. Model 1: X = ['fare', 'pclass', 'age']
4. Model 2: X = ['fare', 'pclass', 'age', 'male']
5. Model 3: X = ['pclass', 'male']

## Random Forest Exercises
* Continue working in your model file. Be sure to add, commit, and push your changes.

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.
### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [2]:
# Acquire titanic dataset

titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
# Prepare titanic dataset

train, validate, test = prepare.prep_titanic(titanic)
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
583,0,1,36.0,0,0,40.125,1,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,1,0,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,24.0,1,0,146.5208,0,0,0,0


In [32]:
# X_train and y_train

X_train = train[['fare', 'pclass', 'age', 'sex_male']]
y_train = train['survived']

X_validate = validate[['fare', 'pclass', 'age', 'sex_male']]
y_validate = validate['survived']

X_test = test[['fare', 'pclass', 'age', 'sex_male']]
y_test = test['survived']

X_train.shape, X_validate.shape, X_test.shape

((497, 4), (214, 4), (178, 4))

In [33]:
# Creat the Random Forest Object

rf = RandomForestClassifier(n_estimators=100, 
                            max_depth=20, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            random_state=433)

In [6]:
# Fit to the training dataset.

rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=433,
                       verbose=0, warm_start=False)

In [7]:
# Compute the model score:

accuracy_rf = rf.score(X_train, y_train)
accuracy_rf.round(2)

0.99

In [8]:
# Compute the predicted y

y_pred_rf = rf.predict(X_train)
y_pred_rf[0:5]

array([0, 1, 0, 1, 1])

In [9]:
# Compute the confusion matrix

confusion_matrix(y_train, y_pred_rf)

array([[305,   2],
       [  5, 185]])

In [10]:
# Compute the classification report

print(classification_report(y_train, y_pred_rf))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       307
           1       0.99      0.97      0.98       190

    accuracy                           0.99       497
   macro avg       0.99      0.98      0.99       497
weighted avg       0.99      0.99      0.99       497



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [54]:
model_metrics(rf, X_train, y_train).round(3)

Unnamed: 0,precision,TNR/TPR,f1-score,support,FPR/FNR,accuracy
0,0.984,0.993,0.989,307.0,0.007,0.986
1,0.989,0.974,0.981,190.0,0.026,0.986


### 4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [45]:
# Create a new Random Forest Object

rf1 = RandomForestClassifier(n_estimators=100, 
                            max_depth=3, 
                            min_samples_split=2, 
                            min_samples_leaf=5, 
                            random_state=433)

In [49]:
# Evaluate the new algorithms

model_metrics(rf1, X_train, y_train).round(3)

Unnamed: 0,precision,TNR/TPR,f1-score,support,FPR/FNR,accuracy
0,0.811,0.948,0.874,307.0,0.052,0.831
1,0.884,0.642,0.744,190.0,0.358,0.831


### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

1. The accuracy drop from 0.99 to 0.83
2. The rf performas better on the in-sample data, due to high max_depth and low min_sample_leaf. 

### Bonus: How about out-of-sample data?

In [50]:
model_metrics(rf, X_validate, y_validate).round(3)

Unnamed: 0,precision,TNR/TPR,f1-score,support,FPR/FNR,accuracy
0,0.992,0.992,0.992,132.0,0.008,0.991
1,0.988,0.988,0.988,82.0,0.012,0.991


In [51]:
model_metrics(rf, X_test, y_test).round(3)

Unnamed: 0,precision,TNR/TPR,f1-score,support,FPR/FNR,accuracy
0,0.991,0.991,0.991,110.0,0.009,0.989
1,0.985,0.985,0.985,68.0,0.015,0.989


In [52]:
model_metrics(rf1, X_validate, y_validate).round(3)

Unnamed: 0,precision,TNR/TPR,f1-score,support,FPR/FNR,accuracy
0,0.804,0.962,0.876,132.0,0.038,0.832
1,0.911,0.622,0.739,82.0,0.378,0.832


In [53]:
model_metrics(rf1, X_test, y_test).round(3)

Unnamed: 0,precision,TNR/TPR,f1-score,support,FPR/FNR,accuracy
0,0.879,0.855,0.866,110.0,0.145,0.837
1,0.775,0.809,0.791,68.0,0.191,0.837


**Notes**
1. The dataset I am gonna use is titanic
    * baseline accuracy: 
    * baseline model: X = ['fare','pclass']
    * best performance model so far: X = ['fare', 'pclass', 'age', 'sex_male']
2. rf: [100, 20, 2, 1, 433]
    * accuracy is 0.99, much higher than the baseline and lr and clf.
    * hightly possiblely overfit
3. rf: [100, 3, 2, 5, 433]
4. I am surprise to find no overfitting issue. 