In [283]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
%matplotlib inline
sns.set()

train_df = pd.read_csv('titanic/train.csv')
train_rows = train_df.shape[0]
test_df = pd.read_csv('titanic/test.csv')
survived_train = train_df.Survived.copy()
dataset = pd.concat([train_df.drop(['Survived'],axis = 1),test_df])
print(dataset.info())
train_df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Van Impe, Mr. Jean Baptiste",male,,,,1601.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [284]:
dataset.groupby('Pclass').Age.mean()

Pclass
1    39.159930
2    29.506705
3    24.816367
Name: Age, dtype: float64

In [285]:
dataset['Age'] = dataset.Age.fillna(dataset.Age.median())
dataset['Fare'] = dataset.Fare.fillna(dataset.Fare.median())
dataset['Embarked'] = dataset.Embarked.fillna(dataset.Embarked.mode()[0])


In [286]:
# Feature Engineering
dataset['Family'] = dataset['SibSp']+dataset['Parch']+1
dataset['isAlone'] = 1
dataset['isAlone'].loc[dataset['Family']>1] = 0
dataset['Title'] = dataset['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0]
dataset['FareBin'] = pd.qcut(dataset['Fare'],4)
dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int),5)

dataset['Title'].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


 Mr              757
 Miss            260
 Mrs             197
 Master           61
 Rev               8
 Dr                8
 Col               4
 Major             2
 Mlle              2
 Ms                2
 Lady              1
 Dona              1
 Jonkheer          1
 the Countess      1
 Mme               1
 Don               1
 Sir               1
 Capt              1
Name: Title, dtype: int64

In [287]:
title_count = (dataset['Title'].value_counts() < 10)
dataset['Title'] = dataset['Title'].apply(lambda x:'Misc' if title_count.loc[x] == True else x)
dataset['Title'].value_counts()

 Mr        757
 Miss      260
 Mrs       197
 Master     61
Misc        34
Name: Title, dtype: int64

In [288]:
drop_col = ['PassengerId','Name','Ticket','Cabin']
dataset = dataset.drop(drop_col,axis = 1)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1309 non-null float64
Embarked    1309 non-null object
Family      1309 non-null int64
isAlone     1309 non-null int64
Title       1309 non-null object
FareBin     1309 non-null category
AgeBin      1309 non-null category
dtypes: category(2), float64(2), int64(5), object(3)
memory usage: 115.1+ KB


In [289]:
label = LabelEncoder()
dataset['Sex_code'] = label.fit_transform(dataset['Sex'])
dataset['Pclass_code'] = label.fit_transform(dataset['Pclass'])
dataset['Embarked_code'] = label.fit_transform(dataset['Embarked'])
dataset['Title_code'] = label.fit_transform(dataset['Title'])
dataset['FareBin_code'] = label.fit_transform(dataset['FareBin'])
dataset['AgeBin_code'] = label.fit_transform(dataset['AgeBin'])

In [290]:
dataset_code = ['Sex_code','Pclass_code','Embarked_code','Fare','Age','Family']
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 18 columns):
Pclass           1309 non-null int64
Sex              1309 non-null object
Age              1309 non-null float64
SibSp            1309 non-null int64
Parch            1309 non-null int64
Fare             1309 non-null float64
Embarked         1309 non-null object
Family           1309 non-null int64
isAlone          1309 non-null int64
Title            1309 non-null object
FareBin          1309 non-null category
AgeBin           1309 non-null category
Sex_code         1309 non-null int64
Pclass_code      1309 non-null int64
Embarked_code    1309 non-null int64
Title_code       1309 non-null int64
FareBin_code     1309 non-null int64
AgeBin_code      1309 non-null int64
dtypes: category(2), float64(2), int64(11), object(3)
memory usage: 176.5+ KB


In [291]:
data_train = dataset[dataset_code].iloc[:891]
data_test = dataset[dataset_code].iloc[891:]
X = data_train.values
test = data_test.values
y = survived_train.values

In [294]:
treeClassifier = tree.DecisionTreeClassifier(max_depth = 6)
treeClassifier.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [280]:
clf = AdaBoostClassifier(n_estimators = 300,random_state = 0,learning_rate = 0.1)
clf.fit(X,y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.1, n_estimators=300, random_state=0)

In [295]:
y_pred = treeClassifier.predict(test)
test_df['Survived'] = y_pred
test_df[['PassengerId','Survived']].to_csv('titanic/DT.csv',index = False)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [281]:
y_pred = clf.predict(test)
test_df['Survived'] = y_pred
test_df[['PassengerId','Survived']].to_csv('titanic/ada.csv',index = False)