In [30]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Data
Titanic = sns.load_dataset('titanic')
Titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Checking for redundancies and data that doesn't effect the output and dropping it

In [4]:
Titanic["who"].value_counts()   
#Therefore I am going to say this column will be dropped, as well as "adult_male", 
    # as they are redundant with sex and age


man      537
woman    271
child     83
Name: who, dtype: int64

In [5]:
Titanic["survived"].value_counts() 
# Reduntant with "alive"

0    549
1    342
Name: survived, dtype: int64

In [6]:
Titanic["alive"].value_counts() 
#Since these are the same thing as survived, it appears, I will drop "alive"

no     549
yes    342
Name: alive, dtype: int64

In [7]:
# Drop redundant columns
Titanic.drop(['who', 'adult_male', 'alive'], axis = 1, inplace = True)

Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   deck         203 non-null    category
 10  embark_town  889 non-null    object  
 11  alone        891 non-null    bool    
dtypes: bool(1), category(2), float64(2), int64(4), object(3)
memory usage: 65.9+ KB


In [8]:
Titanic.drop(['class'], axis = 1, inplace = True)
# Redundant with pclass and class
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   deck         203 non-null    category
 9   embark_town  889 non-null    object  
 10  alone        891 non-null    bool    
dtypes: bool(1), category(1), float64(2), int64(4), object(3)
memory usage: 64.9+ KB


In [9]:
Titanic.drop(['embark_town'], axis = 1, inplace = True)
# Redundant with embarked
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  891 non-null    int64   
 1   pclass    891 non-null    int64   
 2   sex       891 non-null    object  
 3   age       714 non-null    float64 
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   fare      891 non-null    float64 
 7   embarked  889 non-null    object  
 8   deck      203 non-null    category
 9   alone     891 non-null    bool    
dtypes: bool(1), category(1), float64(2), int64(4), object(2)
memory usage: 57.9+ KB


In [10]:
# Information related only to after the wreck, therefore not part of survival. 
Titanic.drop(['alone'], axis = 1, inplace = True)
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  891 non-null    int64   
 1   pclass    891 non-null    int64   
 2   sex       891 non-null    object  
 3   age       714 non-null    float64 
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   fare      891 non-null    float64 
 7   embarked  889 non-null    object  
 8   deck      203 non-null    category
dtypes: category(1), float64(2), int64(4), object(2)
memory usage: 57.0+ KB


## Drop NaN

In [11]:
# Drop NaN
Titanic.dropna(inplace = True)
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  182 non-null    int64   
 1   pclass    182 non-null    int64   
 2   sex       182 non-null    object  
 3   age       182 non-null    float64 
 4   sibsp     182 non-null    int64   
 5   parch     182 non-null    int64   
 6   fare      182 non-null    float64 
 7   embarked  182 non-null    object  
 8   deck      182 non-null    category
dtypes: category(1), float64(2), int64(4), object(2)
memory usage: 13.3+ KB


## Recode Everything into Integers

In [12]:
def Gender (series):
    if series == 'male':
        return 0
    if series == 'female':
        return 1
    
Titanic['sexR'] = Titanic['sex'].apply(Gender)
Titanic.drop('sex', axis = 1, inplace = True)
Titanic.sexR = Titanic.sexR.astype(int)

In [13]:
Titanic.age = Titanic.age.astype(int)

In [14]:
Titanic.fare = Titanic.fare.astype(int)

In [15]:
def Embark (series):
    if series == 'S':
        return 0
    if series == 'C':
        return 1
    if series == 'Q':
        return 2
    
Titanic['embarkedR'] = Titanic['embarked'].apply(Embark)
Titanic.drop('embarked', axis = 1, inplace = True)
Titanic.embarkedR = Titanic.embarkedR.astype(int)

In [16]:
def Deck (series):
    if series == 'A':
        return 0
    if series == 'B':
        return 1
    if series == 'C':
        return 2
    if series == 'D':
        return 3
    if series == 'E':
        return 4
    if series == 'F':
        return 5
    if series == 'G':
        return 6

Titanic['deckR'] = Titanic['deck'].apply(Deck)
Titanic.drop('deck', axis = 1, inplace = True)
Titanic.deckR = Titanic.deckR.astype(int)

In [17]:
# All are indeed integers! 
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   survived   182 non-null    int64
 1   pclass     182 non-null    int64
 2   age        182 non-null    int64
 3   sibsp      182 non-null    int64
 4   parch      182 non-null    int64
 5   fare       182 non-null    int64
 6   sexR       182 non-null    int64
 7   embarkedR  182 non-null    int64
 8   deckR      182 non-null    int64
dtypes: int64(9)
memory usage: 14.2 KB


In [22]:
Titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sexR,embarkedR,deckR
1,1,1,38,1,0,71,1,1,2
3,1,1,35,1,0,53,1,0,2
6,0,1,54,0,0,51,0,0,4
10,1,3,4,1,1,16,1,0,6
11,1,1,58,0,0,26,1,0,2


## Split into X/y and Training and Test Groups

In [26]:
# Split into X/y
X = Titanic.drop('survived', axis = 1)
y = Titanic['survived']

# Split into training and test groups
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.3)

# Decision Tree (Part 1)

In [27]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(X_train, y_train)

In [28]:
# Bulid Prediction Tree Model

treePredictions = decisionTree.predict(X_test)
print(confusion_matrix(y_test, treePredictions))
print(classification_report(y_test, treePredictions))

[[10  3]
 [ 6 36]]
              precision    recall  f1-score   support

           0       0.62      0.77      0.69        13
           1       0.92      0.86      0.89        42

    accuracy                           0.84        55
   macro avg       0.77      0.81      0.79        55
weighted avg       0.85      0.84      0.84        55



##### Model has a 85% precision. This model would predict if a person would die 62% of the time, and survived 92% of the time

# Random Forest (Part 2)

In [32]:
forest = RandomForestClassifier(n_estimators = 500)
forest.fit(X_train, y_train)

In [33]:
forestPredictions = forest.predict(X_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[ 7  6]
 [ 3 39]]
              precision    recall  f1-score   support

           0       0.70      0.54      0.61        13
           1       0.87      0.93      0.90        42

    accuracy                           0.84        55
   macro avg       0.78      0.73      0.75        55
weighted avg       0.83      0.84      0.83        55



# The Random Forest model is not as good of a fit with 83% precision. The Decision Tree is the better model. 