In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [12]:
def confusion_heat_map(confusion_matrix):
    """Draw an annotated seaborn heat map of a binary (2x2) confusion matrix.

    Every cell is annotated with three lines: its group name, its raw count
    and its share of all observations. The group names assume the layout
    ``[[TN, FP], [FN, TP]]``, i.e. rows are the actual class and columns the
    predicted class.

    Parameters
    ----------
    confusion_matrix : array-like of shape (2, 2)
        Cell counts. Anything ``np.asarray`` accepts (ndarray, nested lists,
        DataFrame, ...) is allowed.

    Raises
    ------
    ValueError
        If the input is not 2x2 (for example when the evaluated labels
        contain a single class, or for a multi-class matrix).
    """
    matrix = np.asarray(confusion_matrix)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"confusion_heat_map expects a 2x2 confusion matrix, got shape {matrix.shape}"
        )

    counts = matrix.flatten()
    total = counts.sum()
    # Guard the all-zero matrix: dividing by a zero total would yield NaN labels.
    shares = counts / total if total else np.zeros(counts.shape)

    group_names = ['True Neg','False Pos','False Neg','True Pos']
    labels = [
        f"{name}\n{count:0.0f}\n{share:.2%}"
        for name, count, share in zip(group_names, counts, shares)
    ]
    labels = np.asarray(labels).reshape(2, 2)

    # fmt='' because the annotations are pre-formatted strings, not numbers.
    ax = sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')
    # Axis labels follow the row/column meaning implied by the group names.
    ax.set(xlabel='Predicted label', ylabel='True label')

In [13]:
# Execute the data-loading/cleaning notebook in this kernel; later cells read
# `data_df`, which is presumably defined there — confirm.
%run "./load_clean.ipynb"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PassengerId          8693 non-null   object 
 1   HomePlanet           8492 non-null   object 
 2   CryoSleep            8476 non-null   object 
 3   Cabin                8494 non-null   object 
 4   Destination          8511 non-null   object 
 5   Age                  8514 non-null   float64
 6   VIP                  8490 non-null   object 
 7   spend_room_service   8512 non-null   float64
 8   spend_food_court     8510 non-null   float64
 9   spend_shopping_mall  8485 non-null   float64
 10  spend_spa            8510 non-null   float64
 11  spend_vr_deck        8505 non-null   float64
 12  Name                 8493 non-null   object 
 13  Transported          8693 non-null   bool   
 14  group_id             8693 non-null   object 
 15  group_num            8693 non-null   o

In [14]:
# Typed alias (same object, not a copy) for the frame used by the cells below.
# NOTE(review): `data_df` is presumably created by the `%run` of
# load_clean.ipynb — confirm.
df: pd.DataFrame = data_df

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


# Best Models

In [16]:
# Show the column labels of `df`.
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'spend_room_service', 'spend_food_court', 'spend_shopping_mall',
       'spend_spa', 'spend_vr_deck', 'Name', 'Transported', 'group_id',
       'group_num', 'cabin_deck', 'cabin_num', 'cabin_side', 'spend_total',
       'spend_high_total', 'spend_low_total'],
      dtype='object')

In [17]:
# Count the missing values in each column of `df`.
df.isna().sum()

PassengerId              0
HomePlanet             201
CryoSleep              217
Cabin                  199
Destination            182
Age                    179
VIP                    203
spend_room_service       0
spend_food_court         0
spend_shopping_mall      0
spend_spa                0
spend_vr_deck            0
Name                   200
Transported              0
group_id                 0
group_num                0
cabin_deck             199
cabin_num              199
cabin_side             199
spend_total              0
spend_high_total         0
spend_low_total          0
dtype: int64

In [18]:
# NOTE(review): `read_clean_titanic` is not defined in the cells visible in this
# notebook (the recorded run of this cell ends in a NameError) — presumably it
# comes from another notebook/module; confirm and load it before this cell.
train_df:pd.DataFrame = read_clean_titanic('./data/train.csv')
test_df:pd.DataFrame = read_clean_titanic('./data/test.csv')

# #
# # TRAIN TEST SPLIT PREP
# #
# NOTE(review): none of these feature names appear in the `df.columns` output
# shown earlier — verify that the cleaning helper really produces them.
feature_cols = [
   'is_female',
   'Pclass',
   'is_age_mister',
   'is_age_miss',
   'is_age_misses',
   'is_age_master',
   'Embarked_C', 'Embarked_Q', 'Embarked_S',
   'is_parent',
   'is_child',
   ]
X = train_df[feature_cols]
y = train_df['Transported']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# #
# # MODELING
# #
model = RandomForestClassifier(n_estimators=500,
                               max_depth=5,
                               random_state=1,
                               )
# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows and inflate every metric reported below.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# #
# # METRIC REPORTING
# #
cnf_matrix = confusion_matrix(y_test, y_pred)
# The report holds precision, recall, F1 and support, so it is printed without
# a "Recall:" prefix.
print(classification_report(y_test, y_pred))
plt.figure(figsize=(10,5),dpi=100)
confusion_heat_map(cnf_matrix)
# Importances are listed in the same order as `feature_cols`.
print(model.feature_importances_)

NameError: name 'read_clean_titanic' is not defined

In [None]:
# NOTE(review): `read_clean_titanic` is not defined in the cells visible in this
# notebook — presumably it comes from another notebook/module; confirm.
train_df = read_clean_titanic('./data/train.csv')
test_df = read_clean_titanic('./data/test.csv')

# #
# # TRAIN TEST SPLIT PREP
# #
feature_cols = [
   'Pclass', 'is_female',
   'Embarked_C', 'Embarked_Q', 'Embarked_S',
   'is_family', 'is_parent', 'is_child',
   'is_age_master', 'is_age_mister', 'is_age_miss', 'is_age_misses'
   ]

X = train_df[feature_cols]
# NOTE(review): the frame loaded earlier in this notebook exposes a
# `Transported` target rather than `Survived` — confirm the intended dataset.
y = train_df['Survived']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# #
# # MODELING
# #
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows and inflate every metric reported below.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# #
# # METRIC REPORTING
# #
cnf_matrix = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
plt.figure(figsize=(10,5),dpi=100)
confusion_heat_map(cnf_matrix)

# Model Testing

In [None]:
# NOTE(review): `read_clean_titanic` is not defined in the cells visible in this
# notebook — presumably it comes from another notebook/module; confirm.
train_df = read_clean_titanic('./data/train.csv')
test_df = read_clean_titanic('./data/test.csv')

# #
# # TRAIN TEST SPLIT PREP
# #
feature_cols = [
   'Pclass', 'is_female',
   'Embarked_C', 'Embarked_Q', 'Embarked_S',
   'is_family', 'is_parent', 'is_child',
   'is_age_master', 'is_age_mister', 'is_age_miss', 'is_age_misses'
   ]

X = train_df[feature_cols]
# NOTE(review): the frame loaded earlier in this notebook exposes a
# `Transported` target rather than `Survived` — confirm the intended dataset.
y = train_df['Survived']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# #
# # MODELING
# #
# bootstrap=False: every tree is built from the whole training sample instead
# of a bootstrap resample.
model = RandomForestClassifier(n_estimators=100,
                               max_depth=5,
                               random_state=1,
                               bootstrap=False
                               )
# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows and inflate every metric reported below.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# #
# # METRIC REPORTING
# #
cnf_matrix = confusion_matrix(y_test, y_pred)
# The report holds precision, recall, F1 and support, so it is printed without
# a "Recall:" prefix.
print(classification_report(y_test, y_pred))
plt.figure(figsize=(10,5),dpi=100)
confusion_heat_map(cnf_matrix)

In [None]:
# NOTE(review): this cell repeats the previous cell's experiment (same
# features, same hyper-parameters) — consider deleting one of the two.
# NOTE(review): `read_clean_titanic` is not defined in the cells visible in this
# notebook — presumably it comes from another notebook/module; confirm.
train_df = read_clean_titanic('./data/train.csv')
test_df = read_clean_titanic('./data/test.csv')

# #
# # TRAIN TEST SPLIT PREP
# #
feature_cols = [
   'Pclass', 'is_female',
   'Embarked_C', 'Embarked_Q', 'Embarked_S',
   'is_family', 'is_parent', 'is_child',
   'is_age_master', 'is_age_mister', 'is_age_miss', 'is_age_misses'
   ]

X = train_df[feature_cols]
# NOTE(review): the frame loaded earlier in this notebook exposes a
# `Transported` target rather than `Survived` — confirm the intended dataset.
y = train_df['Survived']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# #
# # MODELING
# #
# bootstrap=False: every tree is built from the whole training sample instead
# of a bootstrap resample.
model = RandomForestClassifier(n_estimators=100,
                               max_depth=5,
                               random_state=1,
                               bootstrap=False
                               )
# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows and inflate every metric reported below.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# #
# # METRIC REPORTING
# #
cnf_matrix = confusion_matrix(y_test, y_pred)
# The report holds precision, recall, F1 and support, so it is printed without
# a "Recall:" prefix.
print(classification_report(y_test, y_pred))
plt.figure(figsize=(10,5),dpi=100)
confusion_heat_map(cnf_matrix)

In [None]:
# Load the training data with use_dummies=False, then display the result of
# `survival_in_feature_group` for each of the two columns below.
# NOTE(review): both helpers are defined outside the visible cells — confirm
# they are loaded before this cell runs.
no_dummy_df = read_clean_titanic('./data/train.csv', use_dummies=False)
for feature_name in ('Embarked', 'Sex'):
    display(survival_in_feature_group(no_dummy_df, feature_name))

## Good, but produces more false negatives

In [None]:
# NOTE(review): `read_clean_titanic` is not defined in the cells visible in this
# notebook — presumably it comes from another notebook/module; confirm.
train_df = read_clean_titanic('./data/train.csv')
test_df = read_clean_titanic('./data/test.csv')

# #
# # TRAIN TEST SPLIT PREP
# #
feature_cols = [
   'Pclass',
   'is_female',
   'is_age_miss', 'is_age_misses',
   'is_age_master', 'is_age_mister',
   'Embarked_C', 'Embarked_Q', 'Embarked_S',
   'is_parent',
   'is_child',
   ]
X = train_df[feature_cols]
# NOTE(review): the frame loaded earlier in this notebook exposes a
# `Transported` target rather than `Survived` — confirm the intended dataset.
y = train_df['Survived']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# #
# # MODELING
# #
# Deeper trees (max_depth=7), entropy split criterion, no bootstrap resampling.
model = RandomForestClassifier(n_estimators=500,
                               max_depth=7,
                               random_state=1,
                               bootstrap=False,
                               criterion="entropy"
                               )
# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows and inflate every metric reported below.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# #
# # METRIC REPORTING
# #
cnf_matrix = confusion_matrix(y_test, y_pred)
# The report holds precision, recall, F1 and support, so it is printed without
# a "Recall:" prefix.
print(classification_report(y_test, y_pred))
plt.figure(figsize=(10,5),dpi=100)
confusion_heat_map(cnf_matrix)

In [None]:
# NOTE(review): `read_clean_titanic` is not defined in the cells visible in this
# notebook — presumably it comes from another notebook/module; confirm.
train_df = read_clean_titanic('./data/train.csv')
test_df = read_clean_titanic('./data/test.csv')

# #
# # TRAIN TEST SPLIT PREP
# #
feature_cols = [
   'is_female',
   'Pclass',
   'is_age_mister',
   'is_age_miss',
   'is_age_misses',
   'is_age_master',
   'Embarked_C', 'Embarked_Q', 'Embarked_S',
   'is_parent',
   'is_child',
   ]
X = train_df[feature_cols]
# NOTE(review): the frame loaded earlier in this notebook exposes a
# `Transported` target rather than `Survived` — confirm the intended dataset.
y = train_df['Survived']
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# #
# # MODELING
# #
# min_samples_split given as a float is a fraction of the training samples
# (here 1%), not an absolute count.
model = RandomForestClassifier(n_estimators=500,
                               max_depth=5,
                               random_state=1,
                               bootstrap=False,
                               min_samples_split=0.01
                               )
# Fit on the training split only. Fitting on the full X/y would let the model
# see the held-out rows and inflate every metric reported below.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# #
# # METRIC REPORTING
# #
cnf_matrix = confusion_matrix(y_test, y_pred)
# The report holds precision, recall, F1 and support, so it is printed without
# a "Recall:" prefix.
print(classification_report(y_test, y_pred))
plt.figure(figsize=(10,5),dpi=100)
confusion_heat_map(cnf_matrix)
# Importances are listed in the same order as `feature_cols`.
print(model.feature_importances_)