In [1]:
import pandas as pandas


from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble    import RandomForestClassifier
from sklearn.ensemble    import AdaBoostClassifier
from sklearn.neighbors   import KNeighborsClassifier
from sklearn.svm         import SVC

from sklearn.metrics import accuracy_score

%matplotlib inline
import matplotlib.pyplot as plot

In [2]:
dataRaw     = pandas.read_csv("data/train.csv")
dataTestRaw = pandas.read_csv("data/test.csv")

# take a peak into data
dataRaw.head()

# some initial (logical) considerations:
# 1. 'Survived' does obviously not depend on 'Name' or 'PassengerId'
# 2. Same for 'Embarked' (at least I can't see any reasonable connection), but cross-check to be sure
# 3. 'Sex' and 'Age' might have strong influence on survival ("Women and children first!")
# 4. There might be a strong correlation between 'Fare' and 'Pclass',
#    which might also be strongly correlated to 'Survived' -> feature extraction?

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
dataRaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
dataRaw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Insights:
# - 'Ticket' contains useless information, same for 'Cabin' which holds majorly NaNs -> drop
# - 'Age' needs to be imputed (mean? median?)
# - Missing 'Embarked' need to be added (only 2 values missing, just add most frequent category)

In [6]:
# prepare data for further inspection
dataDirty = dataRaw.copy()

# convert 'Sex', and 'Embarked' to 'category' and encode with integers
dataDirty['Sex']      = dataRaw['Sex'].astype('category').cat.codes
dataDirty['Embarked'] = dataRaw['Embarked'].astype('category').cat.codes

# drop unneeded features
#dataDirty.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# drop labels 'Survived'
#dataDirty.drop('Survived', axis=1, inplace=True)

In [7]:
# calculate correlation matrix
corrMatrix = dataDirty.corr()

In [8]:
corrMatrix['Survived'].sort_values(ascending=False)

# Insights:
# - 'Survived' has strong correlation with 'Fare',
#   and strong negative correlation with 'Pclass', and 'Sex'.
# - Correlation with 'Age' is not as strong as expected
# - 'Parch' and 'SibSp' are apparently not very relevant for 'Survived'
# - 'Embarked' has higher importance than expected, worth to dig deeper

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Embarked      -0.176509
Pclass        -0.338481
Sex           -0.543351
Name: Survived, dtype: float64

In [9]:
corrMatrix['Embarked'].sort_values(ascending=False)

# Insights:
# - 'Embarked' is somehow correlated to 'Pclass', and 'Sex'
# - Relatively strong negative correlation with 'Fare' (meaning 
#   in some ports tickets were probably more expensive, but is
#   this relevant for this specific problem?)

Embarked       1.000000
Pclass         0.173511
Sex            0.118492
SibSp          0.071480
Parch          0.043351
PassengerId    0.012985
Age           -0.044830
Survived      -0.176509
Fare          -0.230365
Name: Embarked, dtype: float64

In [10]:
# prepare data for training

# set up pipelines for:
# - converting pandas dataframe in numpy ndarray
# - dropping bad data ('Tickets', 'Cabin', 'PassengerId', 'Name')
# - impute missing data ('Age', 'Embarked')
# - Normalize 'Age' and 'Fare' values
# - one-hot encode 'Embarked' and 'Sex'
        
class DataFrameSelector(TransformerMixin):
    def __init__(self, attributeIds):
        self.attributeIds = attributeIds
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attributeIds]
    
class DataFrameConverter(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.values
    
class OneHotCatEncoder(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        catEncoder    = LabelEncoder()
        oneHotEncoder = OneHotEncoder()
        
        catEncoded = catEncoder.fit_transform(X.values.ravel())
        
        encoded = oneHotEncoder.fit_transform(catEncoded.reshape(-1,1))
        
        return pandas.DataFrame(oneHotEncoder.fit_transform(catEncoded.reshape(-1,1)).toarray())
    
class CatImputer(TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        mostFreqCat = X.iloc[0].value_counts().idxmax()
        
        return X.fillna(mostFreqCat)
        

# one-hot encode and impute 'Embarked'
embarkedPipeline = Pipeline([
    ('selector' , DataFrameSelector(['Embarked'])),
    ('imputer'  , CatImputer()),
    ('encoder'  , OneHotCatEncoder()),
    ('converter', DataFrameConverter())
])

# one-hot encode 'Sex'
sexPipeline = Pipeline([
    ('selector' , DataFrameSelector(['Sex'])),
    ('encoder'  , OneHotCatEncoder()),
    ('converter', DataFrameConverter())
])

# impute and normalize 'Age'
agePipeline = Pipeline([
    ('selector', DataFrameSelector(['Age'])),
    ('imputor' , Imputer(strategy='mean')),
    ('scaler'  , StandardScaler())
])

# normalize 'Fare'
farePipeline = Pipeline([
    ('selector', DataFrameSelector(['Fare'])),
    ('imputor' , Imputer(strategy='mean')),
    ('scaler'  , StandardScaler())
])

# select remaining attributes
remPipeline = Pipeline([
    ('selector', DataFrameSelector(['Pclass']))
])

pipeline = FeatureUnion(transformer_list=[
    ('embarkedPipeline', embarkedPipeline),
    ('sexPipeline'     , sexPipeline),
    ('agePipeline'     , agePipeline),
    ('farePipeline'    , farePipeline),
    ('remPipeline'     , remPipeline)
])

In [11]:
# split features and labels
dataTrain  = dataRaw.drop('Survived', axis=1)
dataLabels = dataRaw['Survived'].copy()

# run data through pipeline
dataTrainPrepared = pipeline.fit_transform(dataTrain)

In [12]:
# model selection

clsfRandomForest = RandomForestClassifier()
clsfSVC          = SVC()
clsfNaiveBayes   = GaussianNB()
clsfKNN          = KNeighborsClassifier()

In [28]:
# evaluate RandomForestClassifier

paramGrid = [
    {'n_estimators': [10, 20, 25, 30, 35]
   , 'max_depth'   : [10, 20, 30]
   , 'max_features': [1, 2, 3, 4, 5, 6]
   , 'min_samples_split': [2, 3, 4, 5, 6]
   , 'min_samples_leaf': [1, 2, 3, 4, 5, 6]
   , 'random_state': [42]}
]

gridSearch = GridSearchCV(clsfRandomForest, paramGrid, cv=5, scoring='neg_mean_squared_error')

gridSearch.fit(dataTrainPrepared, dataLabels)

gridSearch.best_params_

{'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 20,
 'random_state': 42}

In [26]:
clsfRandomForest = RandomForestClassifier(max_depth=10, max_features=2, n_estimators=20, random_state=42)

cross_val_score(clsfRandomForest, dataTrainPrepared, dataLabels)

array([ 0.79461279,  0.82154882,  0.82491582])

In [15]:
# check over-/underfitting

clsfRandomForest.fit(dataTrainPrepared, dataLabels)

predictions = clsfRandomForest.predict(dataTrainPrepared)

accuracy_score(dataLabels, predictions)

0.93378226711560042

In [16]:
# evaluate SVC

paramGrid = [
    { 'C': [0.5, 1.0, 2.0, 4.0, 5.0]
    , 'kernel': ['linear', 'rbf', 'poly']}
]

gridSearch = GridSearchCV(clsfSVC, paramGrid, cv=5, scoring='neg_mean_squared_error')

gridSearch.fit(dataTrainPrepared, dataLabels)

gridSearch.best_params_

{'C': 2.0, 'kernel': 'rbf'}

In [17]:
clsfSVC = SVC(C=2.0, kernel='rbf')

cross_val_score(clsfSVC, dataTrainPrepared, dataLabels)

array([ 0.79461279,  0.84175084,  0.81818182])

In [18]:
# check over-/underfitting

clsfSVC.fit(dataTrainPrepared, dataLabels)

predictions = clsfSVC.predict(dataTrainPrepared)

accuracy_score(dataLabels, predictions)

0.82828282828282829

In [19]:
# evaluate Naive Bayes

cross_val_score(clsfNaiveBayes, dataTrainPrepared, dataLabels)

array([ 0.73400673,  0.77777778,  0.77104377])

In [20]:
# evaluate KNearestNeighbors

paramGrid = [
    {'n_neighbors': [1, 3, 5, 10, 15, 20, 25, 30], 'weights': ['uniform', 'distance']},
]

gridSearch = GridSearchCV(clsfKNN, paramGrid, cv=5, scoring='neg_mean_squared_error')

gridSearch.fit(dataTrainPrepared, dataLabels)

gridSearch.best_params_

{'n_neighbors': 5, 'weights': 'uniform'}

In [21]:
clsfKNN = KNeighborsClassifier(n_neighbors=5, weights='uniform')

cross_val_score(clsfKNN, dataTrainPrepared, dataLabels)

array([ 0.78451178,  0.81818182,  0.81481481])

In [22]:
# SVC is the most promising model,
# so train it with whole training set
# and validate against test set

# prepare test data
dataTestPrepared = pipeline.fit_transform(dataTestRaw)

# train model and do predictions
clsfSVC.fit(dataTrainPrepared, dataLabels)

predictions = clsfSVC.predict(dataTestPrepared)

In [23]:
# create submission file
submission = pandas.DataFrame({
    'PassengerId': dataTestRaw['PassengerId'],
    'Survived'   : predictions
})

submission.to_csv("data/submission.csv", index=False)