# Titanic
# Week 2
### Week 1 was about exploring the data
### Week 2 is about building a simple Machine Learning model to predict survivors

Import Python libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split

# Import the Data

In [2]:
train_df = pd.read_csv('train.csv')  # Dataset to Train and Validate on

# Some Data Exploration

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Prepare Data for Analysis

* We believe we only need 'Sex' to do a pretty good Model
* Drop unnecessary fields

In [4]:
X = train_df

X = X.drop(['Name','Ticket','Cabin','Embarked','Sex',
           'SibSp','Age','Parch','Fare','PassengerId','Pclass'], axis=1)   # Drop Features we don't want

X.loc[train_df['Sex']=='female', 'Sex'] = 1     # Enumerate 'Sex'
X.loc[train_df['Sex']=='male', 'Sex'] = 0     # Enumerate 'Sex'

X.dropna(axis=0, inplace=True)   # Drop any NaNs

y = X['Survived']   # Prepare the training labels
X.drop(['Survived'], axis=1, inplace=True)  # Remove Labels from the Training Set

# Train / Test Split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     712 non-null    float64
dtypes: float64(1)
memory usage: 11.1 KB


# Machine Learning Model

### A machine learning model can be used to predict if a person would survive the Titanic disaster

### We train the model on a portion of the data (Training set)

### We test (or validate) the model on data that has not been used for training, to see how well it predicts survival on data that it has not seen before


* Build a Model based on our Training Set
* Validate on our Test / Validation set

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate a Classifier and fit with Xtrain and ytrain
clf = DecisionTreeClassifier()
clf = clf.fit(Xtrain, ytrain)

# Validate with the Test / Validation set
y_pred = clf.predict(Xtest)

In [7]:
# check the training accuracy (showing two different ways to calculate)
print(f'Validation Set Accuracy : {clf.score(Xtest, ytest)}\n')
print(f'Validation Set Accuracy : {np.sum(ytest==y_pred)/ytest.shape[0]}\n')

Validation Set Accuracy : 0.7821229050279329

Validation Set Accuracy : 0.7821229050279329



# Cross-Validation
https://scikit-learn.org/stable/modules/cross_validation.html



In [8]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.80446927, 0.80337079, 0.78651685, 0.75280899, 0.78651685])

In [9]:
scores.mean()

0.7867365513778168

# Results

* We got 78% accuracy on our Validation Set based only on the Male/Female feature
* Can you do better ?



## Prepare Submission to Kaggle

In [10]:
test_df = pd.read_csv('test.csv')

test_df['Sex'].replace(['female','male'],[1,0], inplace=True)        # Enumerate 'Sex'

Xtest = test_df.drop(['Name','Ticket','Cabin','Embarked','Pclass',
           'SibSp','Age','Parch','Fare','PassengerId'], axis=1)

Xtest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Sex     418 non-null    int64
dtypes: int64(1)
memory usage: 3.4 KB


In [11]:
predict = clf.predict(Xtest).astype(int)

submit = pd.DataFrame({'PassengerId' : [], 'Survived' : []})
submit['PassengerId'] = test_df.PassengerId
submit['Survived'] = predict
submit.to_csv('./submit.csv', index=False)