### Load Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re #Regular Expressions
%matplotlib inline

### Import Data

In [24]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [25]:
#Combine train & test
titanic = pd.concat([train,test], sort = False)
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


### Treat Missing Values

In [26]:
#Check for missing values
titanic.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [27]:
#Treat Numericals
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

In [28]:
#Treat Embarked
#titanic['Embarked'].value_counts().idxmax()
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].value_counts().idxmax())

In [29]:
#Treat Cabin
titanic['Cabin'] = titanic['Cabin'].fillna('Missing')

In [30]:
titanic['Cabin'] = titanic['Cabin'].str[0]
titanic['Cabin'].value_counts()

M    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Cabin, dtype: int64

In [31]:
#Encode Sex 
titanic['Sex'] = titanic['Sex'].map({'male': 1, 'female': 0})
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null int64
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
dtypes: float64(3), int64(5), object(4)
memory usage: 132.9+ KB


### New Features

#### Create Family Size and IsAlone

In [18]:
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

titanic['IsAlone'] = 0
titanic.loc[titanic['FamilySize'] == 1, 'IsAlone'] = 1

In [33]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       891 non-null float64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null int64
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
dtypes: float64(3), int64(5), object(4)
memory usage: 132.9+ KB


#### Extract Title

In [35]:
#Extract Title
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [36]:
titanic['Title'] = titanic['Name'].apply(get_title)

In [38]:
titanic.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,M,S,Mr


In [39]:
#Reduce Categories
titanic['Title'] = titanic['Title'].replace(['Capt', 'Col','Dr', 'Major', 'Rev'], 'Officer')
titanic['Title'] = titanic['Title'].replace(['Lady', 'Countess','Don', 'Sir', 'Jonkheer', 'Dona'], 'Royalty')
titanic['Title'] = titanic['Title'].replace(['Mlle', 'Ms'], 'Miss')
titanic['Title'] = titanic['Title'].replace('Mme', 'Mrs')

In [40]:
titanic['Title'].value_counts()

Mr         757
Miss       264
Mrs        198
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64

#### Extract Ticket

In [85]:
#Binning Fare
titanic['Fare'] = pd.qcut(titanic['Fare'], 13)

In [88]:
#Binning Age
titanic['Age'] = pd.qcut(titanic['Age'], 10, duplicates='drop')

In [42]:
#Extract Ticket
def get_ticket(ticket):
    txt = ticket.replace("/","")
    txt = txt.replace(".","")
    txt = txt.upper()
    
    if txt.isdigit():
        return 'xxx'
    else:
        title_search = re.search(r'[A-Z]+[A-Z0-9]+|[A-Z]|[A-Za-z]', txt)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(0)
        return ""

In [43]:
titanic['Ticket'] = titanic['Ticket'].apply(get_ticket)

In [44]:
titanic['Ticket'].value_counts()

xxx        957
PC          92
CA          68
A5          28
SOTONOQ     24
SCPARIS     19
WC          15
STONO       14
A4          10
FCC          9
C            8
SOC          8
SOPP         7
STONO2       7
SCAH         5
WEP          4
LINE         4
PP           4
SOTONO2      3
FC           3
PPP          2
SC           2
SCA4         2
SWPP         2
AQ3          1
AS           1
CASOTON      1
SCA3         1
A            1
AQ4          1
LP           1
STONOQ       1
SOP          1
SCOW         1
FA           1
SP           1
Name: Ticket, dtype: int64

### Prepare Data for Modelling

In [89]:
titanic_all = titanic.drop(['PassengerId','Name','SibSp','Parch'],axis=1)

In [90]:
titanic_dummies.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Ticket_A4,Ticket_A5,Ticket_AQ3,Ticket_AQ4,Ticket_AS,...,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0.0,3,1,22.0,7.25,0,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
1,1.0,1,0,38.0,71.2833,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1.0,3,0,26.0,7.925,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
3,1.0,1,0,35.0,53.1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0.0,3,1,35.0,8.05,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0


In [91]:
titanic_dummies=pd.get_dummies(titanic_all, drop_first=True)
titanic_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 72 columns):
Survived                  891 non-null float64
Pclass                    1309 non-null int64
Sex                       1309 non-null int64
Age_(16.0, 21.0]          1309 non-null uint8
Age_(21.0, 24.0]          1309 non-null uint8
Age_(24.0, 28.0]          1309 non-null uint8
Age_(28.0, 32.0]          1309 non-null uint8
Age_(32.0, 39.0]          1309 non-null uint8
Age_(39.0, 48.0]          1309 non-null uint8
Age_(48.0, 80.0]          1309 non-null uint8
Ticket_A4                 1309 non-null uint8
Ticket_A5                 1309 non-null uint8
Ticket_AQ3                1309 non-null uint8
Ticket_AQ4                1309 non-null uint8
Ticket_AS                 1309 non-null uint8
Ticket_C                  1309 non-null uint8
Ticket_CA                 1309 non-null uint8
Ticket_CASOTON            1309 non-null uint8
Ticket_FA                 1309 non-null uint8
Ticket_FC          

In [92]:
#Get Train
t_train = titanic_dummies[titanic_dummies['Survived'].notnull()]
t_train.describe()

Unnamed: 0,Survived,Pclass,Sex,"Age_(16.0, 21.0]","Age_(21.0, 24.0]","Age_(24.0, 28.0]","Age_(28.0, 32.0]","Age_(32.0, 39.0]","Age_(39.0, 48.0]","Age_(48.0, 80.0]",...,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,0.116723,0.08193,0.294052,0.094276,0.117845,0.093154,0.089787,...,0.004489,0.771044,0.001122,0.08642,0.725028,0.207632,0.580247,0.141414,0.020202,0.005612
std,0.486592,0.836071,0.47799,0.32127,0.274413,0.455871,0.292377,0.322606,0.290811,0.286037,...,0.06689,0.420397,0.033501,0.281141,0.446751,0.40584,0.493796,0.348644,0.14077,0.074743
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
max,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [93]:
#Get Test
t_test = titanic_dummies[titanic_dummies['Survived'].isnull()]
t_test.describe()

Unnamed: 0,Survived,Pclass,Sex,"Age_(16.0, 21.0]","Age_(21.0, 24.0]","Age_(24.0, 28.0]","Age_(28.0, 32.0]","Age_(32.0, 39.0]","Age_(39.0, 48.0]","Age_(48.0, 80.0]",...,Cabin_G,Cabin_M,Cabin_T,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
count,0.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,...,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,,2.26555,0.636364,0.124402,0.107656,0.308612,0.090909,0.090909,0.102871,0.093301,...,0.002392,0.782297,0.0,0.110048,0.645933,0.188995,0.574163,0.172249,0.011962,0.002392
std,,0.841838,0.481622,0.330435,0.310316,0.462474,0.287824,0.287824,0.304154,0.291203,...,0.048912,0.413179,0.0,0.313324,0.478803,0.391974,0.495062,0.378049,0.108844,0.048912
min,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
max,,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [94]:
X = t_train.drop('Survived', axis = 1)
y = t_train['Survived']

In [95]:
import sklearn.model_selection as model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2, random_state = 200)

### Modelling

#### Logistic Regression (Kaggle : 0.770)

In [54]:
#Import libraries
from sklearn.linear_model import LogisticRegression

In [57]:
#Build model
logreg = LogisticRegression(solver='lbfgs', max_iter= 1000)
logreg.fit( X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
print(f'Train : {logreg.score(X_train, y_train):.2f}')
print(f'Test : {logreg.score(X_test, y_test):.2f}')

Train : 0.83
Test : 0.77


In [59]:
sub_test = t_test.drop(['Survived'], axis = 1)
sub_test_pred = logreg.predict(sub_test).astype(int)

In [60]:
AllSub = pd.DataFrame({ 'PassengerId': test['PassengerId'],
                       'Survived' : sub_test_pred
    
})

AllSub.to_csv("Basic_Log_model.csv", index = False)

#### Bagging Classifier (Kaggle : 0.698)

In [62]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [63]:
bclf=BaggingClassifier(oob_score=False,n_jobs=-1,n_estimators=20,random_state=200,
                      base_estimator=DecisionTreeClassifier())

In [64]:
from sklearn.model_selection import GridSearchCV

In [65]:
parameters = {'n_estimators' : (5,10,15,25,30,35,40,35,50)}

In [66]:
Bag_grid  = GridSearchCV(bclf, param_grid = parameters, cv = 3)

In [67]:
Bag_model = Bag_grid.fit(X_train, y_train)

In [68]:
Bag_grid.best_estimator_

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [69]:
print(f'Train : {Bag_model.score(X_train, y_train):.2f}')
print(f'Test : {Bag_model.score(X_test, y_test):.2f}')

Train : 0.98
Test : 0.77


In [70]:
sub_test = t_test.drop(['Survived'], axis = 1)
sub_test_pred = Bag_model.predict(sub_test).astype(int)

In [71]:
AllSub = pd.DataFrame({ 'PassengerId': test['PassengerId'],
                       'Survived' : sub_test_pred
    
})

AllSub.to_csv("GRID_CV_Bagging_model.csv", index = False)

#### Random Forest (Kaggle : 0.779)

In [96]:
from sklearn.ensemble import RandomForestClassifier

In [97]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 300, num = 30)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 20)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [98]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 15, 20, 25, 31, 36, 41, 46, 52, 57, 62, 67, 73, 78, 83, 88, 94, 99, 104, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [99]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   21.6s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [100]:
rf_random.best_params_

{'n_estimators': 80,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 25,
 'bootstrap': False}

In [101]:
print(f'Train : {rf_random.score(X_train, y_train):.2f}')
print(f'Test : {rf_random.score(X_test, y_test):.2f}')

Train : 0.87
Test : 0.78


In [102]:
sub_test = t_test.drop(['Survived'], axis = 1)
sub_test_pred = rf_random.predict(sub_test).astype(int)

In [103]:
AllSub = pd.DataFrame({ 'PassengerId': test['PassengerId'],
                       'Survived' : sub_test_pred
    
})

AllSub.to_csv("Random_CV_RF_model2.csv", index = False)