# Titanic
# Week 2 - Example Solution
### Week 1 was about exploring the data
### Week 2 is about building a simple Machine Learning model to predict survivors

Import Python libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split

# Import the Data

In [2]:
train_df = pd.read_csv('train.csv')  # Dataset to Train and Validate on

# Some Data Exploration

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Enumerate 'Sex' feature (make it a number)

In [5]:
train_df['Sex'].replace(['female','male'],[1,0], inplace=True)        # Enumerate 'Sex'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Sex'].replace(['female','male'],[1,0], inplace=True)        # Enumerate 'Sex'
  train_df['Sex'].replace(['female','male'],[1,0], inplace=True)        # Enumerate 'Sex'


### One Hot Encode Pclass
* Makes 3 features for Pclass 1,2,3

In [6]:
pclass_onehot = pd.get_dummies(train_df['Pclass'], prefix='Pclass')    # one-hot encode Pclass
train_df = pd.concat([train_df, pclass_onehot], axis=1)
train_df.drop('Pclass', axis=1, inplace=True)                          # don't need Pclass now

### One Hot Encode Embarked (Port where they boarded the Titanic)
* Makes 3 features for Q, S, C

In [7]:
pclass_embark = pd.get_dummies(train_df['Embarked'], prefix='Emb')    # one-hot encode Embarked
train_df = pd.concat([train_df, pclass_embark], axis=1)
train_df.drop('Embarked', axis=1, inplace=True)                       # don't need Embarked now

### Make a 'Child' Feature

In [8]:
train_df['Child'] = 0
train_df.loc[train_df['Age'] < 8, 'Child'] = 1

In [40]:
train_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S,Child
0,1,0,0,22.0,1,0,7.2500,False,False,True,False,False,True,0
1,2,1,1,38.0,1,0,71.2833,True,False,False,True,False,False,0
2,3,1,1,26.0,0,0,7.9250,False,False,True,False,False,True,0
3,4,1,1,35.0,1,0,53.1000,True,False,False,False,False,True,0
4,5,0,0,35.0,0,0,8.0500,False,False,True,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,0,27.0,0,0,13.0000,False,True,False,False,False,True,0
887,888,1,1,19.0,0,0,30.0000,True,False,False,False,False,True,0
888,889,0,1,,1,2,23.4500,False,False,True,False,False,True,0
889,890,1,0,26.0,0,0,30.0000,True,False,False,True,False,False,0


In [44]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S,Child
0,1,0,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,False,False,True,False,False,True,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,True,False,False,True,False,False,0
2,3,1,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,False,False,True,False,False,True,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,True,False,False,False,False,True,0
4,5,0,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,False,False,True,False,False,True,0


## Check Correlations

In [35]:
corr = train_df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True)
plt.title('Correlations between features')
plt.show()

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

#### Observations
* Sex has the highest correlation with Survived
* Pclass is the second most important

# Prepare Data for Analysis

* Use Sex and Pclass features
* Drop unnecessary fields

In [45]:
X = train_df

X = X.drop(['Name','Ticket','Cabin',
           'SibSp','Age','Parch','Fare','PassengerId'], axis=1)   # Drop Features we don't want

#### Any NaNs ?

In [46]:
print(X.isnull().sum())
X.dropna(axis=0, inplace=True)   # Drop any NaNs

Survived    0
Sex         0
Pclass_1    0
Pclass_2    0
Pclass_3    0
Emb_C       0
Emb_Q       0
Emb_S       0
Child       0
dtype: int64


#### Prepare the X, y data for ML, split into Train / Test data

In [47]:
y = X['Survived']   # Prepare the training labels
X.drop(['Survived'], axis=1, inplace=True)  # Remove Labels from the Training Set

# Train / Test Split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Sex       712 non-null    int64
 1   Pclass_1  712 non-null    bool 
 2   Pclass_2  712 non-null    bool 
 3   Pclass_3  712 non-null    bool 
 4   Emb_C     712 non-null    bool 
 5   Emb_Q     712 non-null    bool 
 6   Emb_S     712 non-null    bool 
 7   Child     712 non-null    int64
dtypes: bool(6), int64(2)
memory usage: 20.9 KB


# Machine Learning Model

### A machine learning model can be used to predict if a person would survive the Titanic disaster

### We train the model on a portion of the data (Training set)

### We test (or validate) the model on data that has not been used for training, to see how well it predicts survival on data that it has not seen before


* Build a Model based on our Training Set
* Validate on our Test / Validation set

In [73]:
Xtrain.shape

(712, 8)

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Instantiate a Classifier and fit with Xtrain and ytrain
clf = LogisticRegression()
clf = clf.fit(Xtrain, ytrain)

# Validate with the Test / Validation set
y_pred = clf.predict(Xtest)

In [59]:
def sigmoid(a):
  return 1/(1+np.exp(-a))



In [60]:
def survivedproba(x, b, c):
    ##a = b + x[0,0]*c1 + x[0,1]*c2
    a=b
    n=x.shape[1]
    for i in range(0,n):
      a=a+x[0,i]*c[0,i]

    p=sigmoid(a)
    return p


In [89]:
def roctraining(xtrain, ytrain, a):
  n=x.shape[712,8]
  a == 0.5
  for i in range(0,n):
    survivedproba(xtrain)
    if survivedproba > a :
      survivedproba == 1
    if survivedproba < a :
      survivedproba == 0
    n = n+1
  return survivedproba()



In [None]:
def get_all_sens_spec():
  for all thresholds.
    determine TP, TN, FP, FN.
    calculate sens and spec.
  return threshold, sens and spect for all thresholds

In [77]:
output = dict()
output[0.1]=[0.3, 0.4]
output[0.2]=[0.4, 0.3]

output

{0.1: [0.3, 0.4], 0.2: [0.4, 0.3]}

In [71]:
np.array([1,2,3,4]).reshape(-1,1).shape

(4, 1)

In [65]:
survivedproba(x= np.array([5]).reshape(1,1), b = 2, c = np.array([5]).reshape(1,1))

0.9999999999981204

In [50]:
# check the training accuracy (showing two different ways to calculate)
print(f'Validation Set Accuracy : {clf.score(Xtest, ytest)}\n')

Validation Set Accuracy : 0.7877094972067039



# Cross-Validation
https://scikit-learn.org/stable/modules/cross_validation.html



In [51]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.79329609, 0.82022472, 0.78089888, 0.75280899, 0.79213483])

In [53]:
scores.mean()

0.7878727010231623

## Prepare Submission to Kaggle

In [None]:
test_df = pd.read_csv('test.csv')

test_df['Sex'].replace(['female','male'],[1,0], inplace=True)        # Enumerate 'Sex'

pclass_onehot = pd.get_dummies(test_df['Pclass'], prefix='Pclass')   # one-hot encode Pclass
test_df = pd.concat([test_df, pclass_onehot], axis=1)

pclass_embark = pd.get_dummies(test_df['Embarked'], prefix='Emb')    # one-hot encode Embarked
test_df = pd.concat([test_df, pclass_embark], axis=1)

test_df['Child'] = 0
test_df.loc[test_df['Age'] < 8, 'Child'] = 1                         # child feature

test_df.drop('Pclass', axis=1, inplace=True)                         # don't need Pclass now
test_df.drop('Embarked', axis=1, inplace=True)                       # don't need Embarked now

In [None]:
Xtest = test_df.drop(['Name','Ticket','Cabin',
           'SibSp','Age','Parch','Fare','PassengerId'], axis=1)

Xtest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Sex       418 non-null    int64
 1   Pclass_1  418 non-null    uint8
 2   Pclass_2  418 non-null    uint8
 3   Pclass_3  418 non-null    uint8
 4   Emb_C     418 non-null    uint8
 5   Emb_Q     418 non-null    uint8
 6   Emb_S     418 non-null    uint8
 7   Child     418 non-null    int64
dtypes: int64(2), uint8(6)
memory usage: 9.1 KB


In [None]:
predict = clf.predict(Xtest).astype(int)

submit = pd.DataFrame({'PassengerId' : [], 'Survived' : []})
submit['PassengerId'] = test_df.PassengerId
submit['Survived'] = predict
submit.to_csv('./submit.csv', index=False)

In [None]:
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
