In [40]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [41]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_submission_example = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

# EDA

In [42]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [44]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [45]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [46]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [47]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [48]:
df_submission_example

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


# Feature selection

In [49]:
# Potentially interesting variables
# - Alphabetic order of name
# - Age range: baby, minor, adult, elder
# - If the person was alone or accompanied

In [50]:
# Look into categorical columns
print(df.columns)

for c in df.columns:
    print(df[c].value_counts())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId
1      1
599    1
588    1
589    1
590    1
      ..
301    1
302    1
303    1
304    1
891    1
Name: count, Length: 891, dtype: int64
Survived
0    549
1    342
Name: count, dtype: int64
Pclass
3    491
1    216
2    184
Name: count, dtype: int64
Name
Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: count, Length: 891, dtype: int64
Sex
male      577
female    314
Na

In [51]:
# Remove some columns
print(df.columns)
cols = ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
df_model = df[cols].copy()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Column transformations:
- Pclass: one-hot encoding
- Name: numbers from the first letter
- Sex: one-hot encoding
- Age: age ranges and one-hot encoding
- SibSp: maybe ranges and one-hot encoding
- Fare: maybe ranges and one-hot encoding
- Parch: maybe ranges and one-hot encoding
- Embarked: one-hot encoding

In [52]:
def getAgeRanges(ages):
    '''Get the age ranges for the given ages'''
    AgeRange = []
    for i in ages:
        if i < 2:
            AgeRange.append('baby')
        elif i < 18:
            AgeRange.append('minor')
        elif i < 65:
            AgeRange.append('adult')
        elif i < 120:
            AgeRange.append('elder')
        else: 
            AgeRange.append('adult') # null values are considered adults
    return AgeRange

In [53]:
df_model['AgeRange'] = getAgeRanges(df_model['Age'])
df_model.drop('Age', axis=1, inplace=True)

In [54]:
df_model.groupby('AgeRange')['Survived'].mean()

AgeRange
adult    0.365059
baby     0.857143
elder    0.090909
minor    0.494949
Name: Survived, dtype: float64

In [55]:
# Check other columns
df_model.groupby('Embarked')['Survived'].mean()
df_model['Embarked_C'] = df_model['Embarked'].apply(
    lambda x: True if x == 'C' else False)
df_model.drop('Embarked', axis=1, inplace=True)

In [56]:
# Transform name column into cardinal of its first letter
FirstLetter = [ord(n[0]) - 65 for n in df_model['Name']]
print(FirstLetter[:10])
df_model.drop('Name', axis=1, inplace=True)
df_model['FirstLetter'] = FirstLetter

[1, 2, 7, 5, 0, 12, 12, 15, 9, 13]


In [57]:
# Create a column to check who was alone
df_model['Accompanied'] = (
    df_model['SibSp'].apply(lambda x: False if x == 0 else True) 
    + df_model['Parch'].apply(lambda x: False if x == 0 else True))
print(df_model['Accompanied'].value_counts())

Accompanied
False    537
True     354
Name: count, dtype: int64


In [58]:
# Convert categorical variables into binary variables, for machine learning
df_dummies = pd.get_dummies(df_model[['Sex', 'AgeRange']], drop_first=True)
df_dummies.head()

Unnamed: 0,Sex_male,AgeRange_baby,AgeRange_elder,AgeRange_minor
0,True,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,True,False,False,False


In [59]:
df_model.drop(['AgeRange', 'Sex'], axis=1, inplace=True)
df_model = df_model.join(df_dummies)
df_model.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Embarked_C,FirstLetter,Accompanied,Sex_male,AgeRange_baby,AgeRange_elder,AgeRange_minor
0,0,3,1,0,7.25,False,1,True,True,False,False,False
1,1,1,1,0,71.2833,True,2,True,False,False,False,False
2,1,3,0,0,7.925,False,7,False,False,False,False,False
3,1,1,1,0,53.1,False,5,True,False,False,False,False
4,0,3,0,0,8.05,False,0,False,True,False,False,False


In [60]:
# Display correlation matrix
df_model.corr()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Embarked_C,FirstLetter,Accompanied,Sex_male,AgeRange_baby,AgeRange_elder,AgeRange_minor
Survived,1.0,-0.338481,-0.035322,0.081629,0.257307,0.16824,-0.065673,0.203367,-0.543351,0.122966,-0.067344,0.080778
Pclass,-0.338481,1.0,0.083081,0.018443,-0.5495,-0.243292,0.067473,-0.135207,0.1319,0.028934,-0.077787,0.121568
SibSp,-0.035322,0.083081,1.0,0.414838,0.159651,-0.059528,-0.006731,0.584471,-0.114631,0.103813,-0.043834,0.302071
Parch,0.081629,0.018443,0.414838,1.0,0.216225,-0.011069,-0.033814,0.583398,-0.245489,0.164202,-0.027725,0.271398
Fare,0.257307,-0.5495,0.159651,0.216225,1.0,0.269335,-0.047025,0.271832,-0.182333,0.003637,-0.007425,-0.00943
Embarked_C,0.16824,-0.243292,-0.059528,-0.011069,0.269335,1.0,0.005552,0.095298,-0.082853,0.054457,0.02406,0.003043
FirstLetter,-0.065673,0.067473,-0.006731,-0.033814,-0.047025,0.005552,1.0,-0.057544,0.024359,-0.044815,-0.027411,-0.021505
Accompanied,0.203367,-0.135207,0.584471,0.583398,0.271832,0.095298,-0.057544,1.0,-0.303646,0.155614,-0.049234,0.267596
Sex_male,-0.543351,0.1319,-0.114631,-0.245489,-0.182333,-0.082853,0.024359,-0.303646,1.0,0.01764,0.082477,-0.12044
AgeRange_baby,0.122966,0.028934,0.103813,0.164202,0.003637,0.054457,-0.044815,0.155614,0.01764,1.0,-0.014126,-0.04467


In [61]:
# Drop uncorrelated columns
df_model.drop(['SibSp', 'Parch', 'FirstLetter', 'Fare'], axis=1, inplace=True)

# Machine Learning

In [62]:
RANDOM_SEED = 2010

## Test some models

In [63]:
# Prepare data for training. Split data into training and validation
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df_model, 
                                    test_size=0.2, 
                                    random_state=RANDOM_SEED)

print(f'Training dataset: {len(df_model)} samples.')
print(f'Validation dataset: {len(df_val)} samples.')

Training dataset: 891 samples.
Validation dataset: 179 samples.


In [64]:
# Split data into features and target variable
target = 'Survived'

X_train = df_train.iloc[:, 1:]
X_val = df_val.iloc[:, 1:]

y_train = df_train[target]
y_val = df_val[target]

In [65]:
# Create datasets without validation split
X_train_full = df_model.iloc[:, 1:]
y_train_full = df_model[target]

In [66]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Random Forest

In [67]:
# Standard Random Forest, non-linear classifier
from sklearn.ensemble import RandomForestClassifier

model_forest = make_pipeline(StandardScaler(),
                             RandomForestClassifier(
                                 n_estimators=1000, 
                                 random_state=RANDOM_SEED))
model_forest.fit(X_train, y_train)

In [68]:
y_pred = model_forest.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(accuracy)

0.8435754189944135


## Gradient Boost

In [81]:
# Ensemble method that trains multiple classifiers
from sklearn.ensemble import GradientBoostingClassifier

model_GB = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=RANDOM_SEED, 
                                      n_estimators=100, 
                                      subsample=0.5, 
                                      validation_fraction=0.2,
                                      verbose=1, 
                                      n_iter_no_change=20))
model_GB.fit(X_train_full, y_train_full)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2612           0.0758            0.19s
         2           1.1906           0.0593            0.18s
         3           1.0866           0.0545            0.19s
         4           1.0977           0.0453            0.17s
         5           0.9957           0.0311            0.17s
         6           1.0297           0.0268            0.16s
         7           0.9754           0.0234            0.16s
         8           0.9498           0.0217            0.15s
         9           0.9690           0.0173            0.15s
        10           0.9135           0.0076            0.15s
        20           0.8496           0.0019            0.12s
        30           0.8301          -0.0015            0.10s


In [82]:
y_pred = model_GB.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(accuracy)

0.8379888268156425


## Logistic Regression

In [71]:
# Logistic Regression, a linear classification model
from sklearn.linear_model import LogisticRegression

model_log = LogisticRegression(random_state=RANDOM_SEED, 
                               max_iter=1000)

model_log.fit(X_train, y_train)

In [72]:
y_pred = model_log.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(accuracy)

0.8212290502793296


# Make prediction

In [73]:
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [74]:
# Prepare the data for submission with the same transformations as the training data.
# This is a step needed to pass the data to the model.
test_ids = df_test['PassengerId']

df_test['AgeRange'] = getAgeRanges(df_test['Age'])
df_test['Embarked_C'] = df_test['Embarked'].apply(
    lambda x: True if x == 'C' else False)
df_test['Accompanied'] = (
    df_test['SibSp'].apply(lambda x: False if x == 0 else True) 
    + df_test['Parch'].apply(lambda x: False if x == 0 else True))

df_dummies = pd.get_dummies(df_test[['Sex', 'AgeRange']], 
                            drop_first=True)
df_test.drop(
    ['PassengerId', 'Age', 'AgeRange', 'Sex', 'Ticket', 
     'Cabin', 'Embarked', 'Name', 'Fare', 'SibSp', 'Parch'], 
    axis=1, inplace=True)
df_test = df_test.join(df_dummies)

In [75]:
# Fill fare null value
#df_test.loc[df_test['Fare'].isnull(), 'Fare'] = df_test['Fare'].mean()
#df_test[df_test['Fare'].isnull()]

In [76]:
df_test.head()

Unnamed: 0,Pclass,Embarked_C,Accompanied,Sex_male,AgeRange_baby,AgeRange_elder,AgeRange_minor
0,3,False,False,True,False,False,False
1,3,False,True,False,False,False,False
2,2,False,False,True,False,False,False
3,3,False,False,True,False,False,False
4,3,False,True,False,False,False,False


In [77]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Pclass          418 non-null    int64
 1   Embarked_C      418 non-null    bool 
 2   Accompanied     418 non-null    bool 
 3   Sex_male        418 non-null    bool 
 4   AgeRange_baby   418 non-null    bool 
 5   AgeRange_elder  418 non-null    bool 
 6   AgeRange_minor  418 non-null    bool 
dtypes: bool(6), int64(1)
memory usage: 5.8 KB


In [83]:
y_test_preds = model_GB.predict(df_test)
y_test_preds

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [84]:
df_submission_example.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [80]:
df_submission = pd.DataFrame({'PassengerID': test_ids, 
                              'Survived': y_test_preds})
df_submission.to_csv("/kaggle/working/submission.csv", index=False)

# Conclussions

* The best model was obtained by using an Ensemble method: Gradient Boosting Classifier, achieving a consistent accuracy in the survival predictions for the test data of 0.78.
* Many features had to be removed and transformed to achieve the best solution.
* Initial variance was too high due to overfitting. Thanks to tuning the hyperparameters of the model and applying a Stochastic Gradient Boost, the overfitting problem was solved.