In [190]:
import numpy as np
import pandas as pd
from collections import Counter
import seaborn as sb
import re
import matplotlib.pyplot as plt
# Read the train/test CSV files and drop the PassengerId and Ticket columns right away.
data_train = pd.read_csv("./data/train.csv").drop(['PassengerId', 'Ticket'], axis=1)
data_test = pd.read_csv("./data/test.csv").drop(['Ticket', 'PassengerId'], axis=1)
# Column overview of the training frame: dtypes and non-null counts.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


# Visualize feature 'Embarked'
- factorplot
- countplot
- barplot

In [191]:
# Embarked
# Fill missing ports with 'S'. The filled column is assigned back to the frame
# rather than calling fillna(inplace=True) on the column selection: the chained
# in-place form may operate on a temporary object (depending on the pandas
# version / copy-on-write mode) and leave data_train unchanged.
data_train['Embarked'] = data_train['Embarked'].fillna('S')

# Survived against Embarked as a factor plot (legacy seaborn API, kept as in the notebook).
sb.factorplot('Embarked', 'Survived', data=data_train, size=4, aspect=3)

# Three panels: passengers per port, passengers per port split by Survived,
# and the mean of Survived per port.
fig, [axis1, axis2, axis3] = plt.subplots(1, 3, figsize=(15, 3))
sb.countplot('Embarked', data=data_train, ax=axis1)
sb.countplot(x='Embarked', hue='Survived', data=data_train, ax=axis2)
mean_emb = data_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
sb.barplot(x='Embarked', y='Survived', data=mean_emb, order=['S', 'C', 'Q'], ax=axis3)
plt.show()

# One-hot encode 'Embarked'

In [192]:
# Replace 'Embarked' by one indicator column per distinct value, in both frames.
data_train = data_train.join(pd.get_dummies(data_train['Embarked'])).drop(['Embarked'], axis=1)
data_test = data_test.join(pd.get_dummies(data_test['Embarked'])).drop(['Embarked'], axis=1)
# Confirm the new columns and that 'Embarked' is gone.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       204 non-null object
C           891 non-null uint8
Q           891 non-null uint8
S           891 non-null uint8
dtypes: float64(2), int64(4), object(3), uint8(3)
memory usage: 65.3+ KB


# Visualize feature 'Age'
- fill the missing (NaN) values
- compare the distribution before and after filling

In [193]:
# Age: impute the missing values and compare the training distribution
# before (left panel) and after (right panel).
fig, [axis1, axis2] = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('origin age')
axis2.set_title('modified Age')

# Left panel: ages as loaded.
data_train['Age'].hist(bins=70, ax=axis1)

# Seeded, local generator so the imputed ages (and everything derived from
# them) are identical on every Restart & Run All, without touching NumPy's
# global random state.
age_rng = np.random.RandomState(0)

# Same procedure for both frames, each using its OWN mean/std:
# every missing age becomes a random integer in [mean - std, mean + std).
for frame in (data_train, data_test):
    missing_age = frame['Age'].isnull()
    mean_age = frame['Age'].mean()
    std_age = frame['Age'].std()
    # randint needs integer bounds; truncate explicitly instead of relying on
    # an implicit float -> int conversion.
    low = int(mean_age - std_age)
    high = int(mean_age + std_age)
    frame.loc[missing_age, 'Age'] = age_rng.randint(low, high, size=missing_age.sum())

# Right panel: ages after imputation.
data_train['Age'].hist(bins=70, ax=axis2)
plt.show()

In [194]:
# Age, continued: cast to whole numbers, then plot survival against age.
for frame in (data_train, data_test):
    frame['Age'] = frame['Age'].astype('int')

# Age density, one curve per value of Survived.
age_grid = sb.FacetGrid(data_train, hue="Survived", aspect=4)
age_grid.map(sb.kdeplot, 'Age', shade=True)
age_grid.set(xlim=(0, data_train['Age'].max()))
age_grid.add_legend()

# Mean of Survived for every integer age, drawn on a wide single-axes figure.
fig, axis1 = plt.subplots(1, 1, figsize=(22, 4))
age_and_outcome = data_train[["Age", "Survived"]]
average_age = age_and_outcome.groupby(['Age'], as_index=False).mean()
sb.barplot(x='Age', y='Survived', data=average_age, ax=axis1)
plt.show()

In [195]:
# Pclass: passenger counts per class, then the same counts coloured by Survived.
sb.countplot('Pclass', data=data_train)
pclass_grid = sb.FacetGrid(data_train, hue='Survived', aspect=4)
pclass_grid.map(sb.countplot, 'Pclass')
pclass_grid.add_legend()
plt.show()

# Swap 'Pclass' for indicator columns carrying the 'pclass' prefix, in both frames.
data_train = data_train.join(pd.get_dummies(data_train['Pclass'], prefix='pclass')).drop('Pclass', axis=1)
data_test = data_test.join(pd.get_dummies(data_test['Pclass'], prefix='pclass')).drop('Pclass', axis=1)

data_train.head()

Unnamed: 0,Survived,Name,Sex,Age,SibSp,Parch,Fare,Cabin,C,Q,S,pclass_1,pclass_2,pclass_3
0,0,"Braund, Mr. Owen Harris",male,22,1,0,7.25,,0,0,1,0,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71.2833,C85,1,0,0,1,0,0
2,1,"Heikkinen, Miss. Laina",female,26,0,0,7.925,,0,0,1,0,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53.1,C123,0,0,1,1,0,0
4,0,"Allen, Mr. William Henry",male,35,0,0,8.05,,0,0,1,0,0,1


In [196]:
# Sex: passengers younger than 16 are relabelled 'child', then the column is
# one-hot encoded.
# Each frame must be masked with its OWN Age column. The test frame was
# previously masked with data_train['Age'], i.e. with a boolean Series built
# from a different frame, so the wrong test rows (or none) were relabelled.
data_train.loc[data_train['Age'] < 16, 'Sex'] = 'child'
data_test.loc[data_test['Age'] < 16, 'Sex'] = 'child'

sb.countplot(x='Sex', data=data_train)
plt.show()

# One indicator column per distinct value of 'Sex'; the text column is then dropped.
dumy_train = pd.get_dummies(data_train['Sex'])
dumy_test = pd.get_dummies(data_test['Sex'])

data_train = data_train.join(dumy_train).drop('Sex', axis=1)
data_test = data_test.join(dumy_test).drop('Sex', axis=1)



In [197]:
# Cabin: removed from both frames.
for frame in (data_train, data_test):
    frame.drop('Cabin', axis=1, inplace=True)

In [198]:
# Fare (test frame only): missing fares are replaced by the mean test fare.
mean_test_fare = data_test['Fare'].mean()
data_test['Fare'] = data_test['Fare'].fillna(mean_test_fare)

In [199]:
# Family: SibSp + Parch collapsed to a flag -- any positive total becomes 1,
# a total of 0 stays 0. Capping the integer sum at 1 does exactly that.
data_train['Family'] = (data_train['SibSp'] + data_train['Parch']).clip(upper=1)
data_test['Family'] = (data_test['SibSp'] + data_test['Parch']).clip(upper=1)

# The two source columns are removed once the flag exists.
for frame in (data_train, data_test):
    frame.drop(['SibSp', 'Parch'], axis=1, inplace=True)



In [200]:
# Family flag: row counts per value (left) and mean of Survived per value (right).
fig, [axis1, axis2] = plt.subplots(1, 2, figsize=(15, 4))
sb.countplot('Family', data=data_train, ax=axis1)
family_and_outcome = data_train[['Family', 'Survived']]
faav = family_and_outcome.groupby(['Family'], as_index=False).mean()
sb.barplot('Family', 'Survived', data=faav, ax=axis2)
plt.show()

In [201]:
# 'Name' is dropped from both frames.
for frame in (data_train, data_test):
    frame.drop('Name', axis=1, inplace=True)

# Take the 'Survived' column out of the training frame and keep it as the target.
result = data_train.pop('Survived')

In [209]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier


def _both_agree(first, second):
    """Combine two arrays of 0/1 predictions into one integer array.

    The two arrays are averaged and thresholded at 0.5, so the result is 1
    only where both inputs are 1 and 0 everywhere else (ties count as 0).
    """
    return ((first + second) / 2.0 > .5).astype('int')


# Two models fitted on the full training frame.
# (RandomForestClassifier with the same hyper-parameters is the alternative
# that was tried in place of ExtraTreesClassifier.)
# random_state is fixed so the tree ensemble gives the same result on every run.
clf = ExtraTreesClassifier(n_estimators=100, min_samples_split=4, min_samples_leaf=2,
                           random_state=0)
rfc = svm.SVC()  # note: despite the name, this is the support vector classifier
rfc.fit(data_train, result)
clf.fit(data_train, result)

# Training-set predictions, computed once and reused for every score below.
# These accuracies are measured on the data the models were fitted on.
train_pd = clf.predict(data_train)
train_pd2 = rfc.predict(data_train)
train_pd3 = _both_agree(train_pd, train_pd2)
accuracy = (train_pd == result).mean()
accuracy1 = (train_pd2 == result).mean()

# Test-set predictions of each model and of the combined vote.
predictions = clf.predict(data_test)
predictions1 = rfc.predict(data_test)
predictions2 = _both_agree(predictions, predictions1)

# NOTE(review): the submission file is built from the SVC predictions
# (predictions1) only; the combined predictions2 is computed but never
# written -- confirm which one is meant to be submitted.
submisson = pd.DataFrame({
    'PassengerId': pd.read_csv('data/test.csv')['PassengerId'],
    'Survived': predictions1,
})
submisson.to_csv('titanic-predictions.csv', index=False)

# Training accuracy of: combined vote, ExtraTrees, SVC (in that order).
print((train_pd3 == result).mean())
print(accuracy)
print(accuracy1)

0.859708193042
0.874298540965
0.877665544332
