In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import re as re
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

### Data Exploration

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = [train,test]
print("Shape of training data:", train.shape)
print("Shape of test data:", test.shape)

Shape of training data: (891, 12)
Shape of test data: (418, 11)


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


#### Checking for Missing Values

In [5]:
for dataset in all_data:
    print(dataset.isnull().sum().sort_values(ascending=False))

Cabin          687
Age            177
Embarked         2
Fare             0
Ticket           0
Parch            0
SibSp            0
Sex              0
Name             0
Pclass           0
Survived         0
PassengerId      0
dtype: int64
Cabin          327
Age             86
Fare             1
Embarked         0
Ticket           0
Parch            0
SibSp            0
Sex              0
Name             0
Pclass           0
PassengerId      0
dtype: int64


### Feature Engineering

#### Sex

In [6]:
print (train[['Sex','Survived']].groupby(['Sex'], as_index=False).mean())

      Sex  Survived
0  female  0.742038
1    male  0.188908


#### Pclass

In [7]:
print (train[['Pclass','Survived']].groupby(['Pclass'], as_index=False).mean())

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


#### Age
Age has some missing values so we will do some data cleaning and fill missing values as needed

In [8]:
for dataset in all_data:
    age_mean = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_count_null = dataset['Age'].isnull().sum()

    age_random_list = np.random.randint(age_mean - age_std, age_mean + age_std, size=age_count_null)
    dataset['Age'][np.isnan(dataset['Age'])] = age_random_list
    dataset['Age'] = dataset['Age'].astype(int)
    
train['Age_Category'] = pd.cut(train['Age'],5)
print(train[['Age_Category','Survived']].groupby(['Age_Category'],as_index=False).mean())

    Age_Category  Survived
0  (-0.08, 16.0]  0.522936
1   (16.0, 32.0]  0.336980
2   (32.0, 48.0]  0.408163
3   (48.0, 64.0]  0.434783
4   (64.0, 80.0]  0.090909


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


#### Name
Feature engineering with title of people

In [9]:
def get_title(name):
    title = re.search(' ([A-Za-z]+)\.', name)
    if title:
        return title.group(1)
    return ""

for dataset in all_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
                                                   'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

print(train[['Title','Survived']].groupby(['Title'],as_index=False).mean())

    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826


#### SibSp & Parch
SibSp and Parch can be combined to make family size feature and also check number of people who were travelling alone

In [10]:
for dataset in all_data:
    dataset['FamilySize'] = dataset['SibSp']+dataset['Parch']+1
print(train[['FamilySize','Survived']].groupby(['FamilySize'],as_index=False).mean())

   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


In [11]:
for dataset in all_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize']==1,'IsAlone'] = 1
print(train[['IsAlone','Survived']].groupby(['IsAlone'],as_index=False).mean())

   IsAlone  Survived
0        0  0.505650
1        1  0.303538


#### Embarked
This feature has some missing values and we fill them with the most common value

In [12]:
for dataset in all_data:
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
print(train[['Embarked','Survived']].groupby(['Embarked'],as_index=False).mean())

  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


#### Fare
This feature has some missing values and we fill them with the median

In [13]:
for dataset in all_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['Fare_Category'] = pd.qcut(train['Fare'],4)
print(train[['Fare_Category','Survived']].groupby(['Fare_Category'],as_index=False).mean())

     Fare_Category  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081


#### Data Cleaning

In [14]:
for dataset in all_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']                           = 4

# Feature Selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp',\
                 'Parch', 'FamilySize']
train = train.drop(drop_elements, axis = 1)
train = train.drop(['Age_Category', 'Fare_Category'], axis = 1)

test  = test.drop(drop_elements, axis = 1)

print (train.head(10))

train_d = train.drop(['Survived'],axis=1).values
test_d  = test.values
y = train['Survived'].values

   Survived  Pclass  Sex  Age  Fare  Embarked  Title  IsAlone
0         0       3    1    1     0         0      1        0
1         1       1    0    2     3         1      3        0
2         1       3    0    1     1         0      2        1
3         1       1    0    2     3         0      3        0
4         0       3    1    2     1         0      1        1
5         0       3    1    1     1         2      1        1
6         0       1    1    3     3         0      1        1
7         0       3    1    0     2         0      4        0
8         1       3    0    1     1         0      3        0
9         1       2    0    0     2         1      3        0


### Models

In [74]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn import grid_search
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone

#### Support Vector Classifier

In [29]:
cs = [0.001, 0.01, 0.1, 1, 10, 100]
gammas = [0.001, 0.01, 0.1, 1]
kernels = ['linear', 'rbf']
param_grid = {'kernel':kernels, 'C':cs, 'gamma':gammas}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(train_d, y)
print(grid_search.best_params_)

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}


In [36]:
svc = SVC(C=100, kernel='rbf', gamma=0.01)
svc_score = cross_val_score(svc, train_d, y, cv=5)
print("CV score is:", svc_score.mean())

CV score is: 0.819299791084


#### K Neighbours Classifier

In [37]:
knn = KNeighborsClassifier(3)
knn_score = cross_val_score(knn, train_d, y, cv=5)
print("CV score is:", knn_score.mean())

CV score is: 0.763302192683


#### Decision Tree Classifier

In [41]:
param_grid = {'min_samples_split' : np.arange(2,100),'max_depth': np.arange(1,30)}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(train_d,y)
print(grid_search.best_params_)

{'max_depth': 9, 'min_samples_split': 9}


In [43]:
dtree = DecisionTreeClassifier(max_depth=9, min_samples_split=9)
dtree_score = cross_val_score(dtree, train_d, y, cv=5)
print("CV score is:", dtree_score.mean())

CV score is: 0.818232689207


#### Random Forest

In [48]:
param_grid = {"max_depth": np.arange(2,20)
            ,"min_samples_split" : np.arange(2,10)
            ,"n_estimators" : [10]
            ,"min_samples_leaf": np.arange(1,10)}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, cv=5)
grid_search.fit(train_d, y)
print(grid_search.best_params_)

{'max_depth': 17, 'min_samples_leaf': 3, 'min_samples_split': 9, 'n_estimators': 10}


In [50]:
rfr = RandomForestClassifier(n_estimators=5000, max_depth=17, min_samples_leaf=3, min_samples_split=9)
rfr_score = cross_val_score(rfr, train_d, y, cv=5)
print("CV score is:", rfr_score.mean())

CV score is: 0.80815813393


#### AdaBoost

In [52]:
ada = AdaBoostClassifier()
ada_score = cross_val_score(ada, train_d, y, cv=5)
print("CV score is:", ada_score.mean())

CV score is: 0.796903276737


#### Gradient Boosting

In [55]:
gbc = GradientBoostingClassifier()
gbc_score = cross_val_score(gbc, train_d, y, cv=5)
print("CV score is:", gbc_score.mean())

CV score is: 0.809307121424


#### XGBoost

In [62]:
param_grid = {
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
        'n_estimators': [4, 5, 6, 7, 8, 9, 10],
        'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
    }
grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_d, y)
print(grid_search.best_params_)

{'learning_rate': 0.4, 'max_depth': 2, 'n_estimators': 8}


In [65]:
xgb = XGBClassifier(max_depth=2, learning_rate=0.4, n_estimators=8)
xgb_score = cross_val_score(xgb, train_d, y, cv=5)
print("CV score is:", xgb_score.mean())

CV score is: 0.818176124651


#### Logistic Regression

In [67]:
logr = LogisticRegression()
logr_score = cross_val_score(logr, train_d, y, cv=5)
print("CV score is:", logr_score.mean())

CV score is: 0.794554943334


#### Stacking Algorithms

In [75]:
class StackingAveragedModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [78]:
stacked_average_models = StackingAveragedModels(base_models=(svc, dtree, rfr, gbc, xgb), meta_model=logr)
stack_score = cross_val_score(stacked_average_models, train_d, y, cv=5)
print("CV score is:", stack_score.mean())

CV score is: 0.812633826598


#### Submission

In [81]:
stacked_average_models.fit(train_d, y)
y_pred = stacked_average_models.predict(test_d)

t = pd.read_csv("test.csv")
sub = pd.DataFrame()
sub['PassengerId'] = t['PassengerId']
sub['Survived'] = y_pred
sub.to_csv("submission.csv",index=False)