In [293]:
# To plot inline
%matplotlib inline

# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [294]:
data_train = pd.read_csv('train.csv')
print('Number Of Rows: {}'.format(len(data_train)))
print('--------------------------')
data_train.head()

Number Of Rows: 891
--------------------------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [295]:
for x in ['Name','Ticket','Cabin','Embarked','Fare']: del data_train[x]
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,male,22.0,1,0
1,2,1,1,female,38.0,1,0
2,3,1,3,female,26.0,0,0
3,4,1,1,female,35.0,1,0
4,5,0,3,male,35.0,0,0


In [296]:
data_train.Sex = data_train.Sex.astype('category').cat.codes
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,1,22.0,1,0
1,2,1,1,0,38.0,1,0
2,3,1,3,0,26.0,0,0
3,4,1,1,0,35.0,1,0
4,5,0,3,1,35.0,0,0


In [297]:
data_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex               int8
Age            float64
SibSp            int64
Parch            int64
dtype: object

In [298]:
data_train.dropna(inplace=True)
print(data_train.isnull().values.any())
print('----------------------------------')
print('Number Of Rows: {}'.format(len(data_train)))

False
----------------------------------
Number Of Rows: 714


In [299]:
nmbr = data_train.groupby('Survived').count()['PassengerId']
nmbr

Survived
0    424
1    290
Name: PassengerId, dtype: int64

In [300]:
print('%.2f percent of total people survided.\n%.2f percent people died.'%(nmbr[1]*100/(nmbr[1] + nmbr[0]),nmbr[0]*100/(nmbr[1] + nmbr[0])))

40.62 percent of total people survided.
59.38 percent people died.


In [301]:
data_train['FamilySize'] = data_train.SibSp + data_train.Parch
del data_train['SibSp']
del data_train['Parch']
target = data_train['Survived'].copy() 
data_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,FamilySize
0,1,0,3,1,22.0,1
1,2,1,1,0,38.0,1
2,3,1,3,0,26.0,0
3,4,1,1,0,35.0,1
4,5,0,3,1,35.0,0
6,7,0,1,1,54.0,0
7,8,0,3,1,2.0,4
8,9,1,3,0,27.0,2
9,10,1,2,0,14.0,1
10,11,1,3,0,4.0,2


In [302]:
gender_survival = data_train.groupby(['Survived','Sex']).count()['PassengerId']
gender_survival

Survived  Sex
0         0       64
          1      360
1         0      197
          1       93
Name: PassengerId, dtype: int64

In [303]:
pw_survided = gender_survival.iloc[1]*100/(gender_survival.iloc[1] + gender_survival.iloc[0])
pm_died = gender_survival.iloc[2]*100/(gender_survival.iloc[2] + gender_survival.iloc[3])
print('%.2f percent of the total survived are women and %.2f of the total died are men'%(pw_survided,pm_died))

84.91 percent of the total survived are women and 67.93 of the total died are men


In [304]:
del data_train['PassengerId']
del data_train['Survived']
data_train

Unnamed: 0,Pclass,Sex,Age,FamilySize
0,3,1,22.0,1
1,1,0,38.0,1
2,3,0,26.0,0
3,1,0,35.0,1
4,3,1,35.0,0
6,1,1,54.0,0
7,3,1,2.0,4
8,3,0,27.0,2
9,2,0,14.0,1
10,3,0,4.0,2


In [305]:
X = data_train.as_matrix()
print(X)
y = target.as_matrix()
print(y)

[[  3.   1.  22.   1.]
 [  1.   0.  38.   1.]
 [  3.   0.  26.   0.]
 ..., 
 [  1.   0.  19.   0.]
 [  1.   1.  26.   0.]
 [  3.   1.  32.   0.]]
[0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0
 0 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0
 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1
 1 0 0 1 0 1 1 1 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1
 0 0 1 0 0 0 1 0 1 0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0
 1 1 1 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1
 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1
 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1 0 1 1 0
 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1
 1 1 0 0 1 1 0 1 0 1 0 1 0 0 

In [321]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,StratifiedKFold,train_test_split,KFold

In [314]:
n_splits = 4

svc_params = {'C':[0.9,1,1.5,10]}

rfc_params = {'n_estimators':[30,40],
              'min_samples_split':[10,20],
              'random_state':[10,15],
              'warm_start':(True,False)
             }

etc_params = {'n_estimators':[20,40],
              'min_samples_split':[10,20,50],
              'random_state':[10,15,30],
              'warm_start':(True,False),
              'criterion' : ('gini','entropy')
             }

gbc_params = {  'learning_rate':[0.1,0.15],
              'n_estimators':[10,50,80],
              'max_depth':[3,8,10],
              'warm_start':(True,False),
              'subsample' : [0.5,0.8,1.0]
             }
dtc_params = {'criterion':('gini','entropy'),
             'splitter':('best','random'),
              'max_depth':[5,10,15,None],
              'min_samples_split':[2,5,10]
             }
vtc_params = {'voting': ('hard','soft'),
             
      }


In [315]:
skf = StratifiedKFold(n_splits=n_splits)

clf1 = SVC()
clf1 = GridSearchCV(clf1,svc_params,cv=skf,verbose=1,n_jobs=2)
clf1.fit(X,y)

clf2 = RandomForestClassifier()
clf2 = GridSearchCV(clf2,rfc_params,cv=skf,verbose=1,n_jobs=2)
clf2.fit(X,y)

clf3 = ExtraTreesClassifier()
clf3 = GridSearchCV(clf3,etc_params,cv=skf,verbose=1,n_jobs=2)
clf3.fit(X,y)

clf4 = GradientBoostingClassifier()
clf4 = GridSearchCV(clf4,gbc_params,cv=skf,verbose=1,n_jobs=2)
clf4.fit(X,y)

clf5 = DecisionTreeClassifier()
clf5 = GridSearchCV(clf5,dtc_params,cv=skf,verbose=1,n_jobs=2)
clf5.fit(X,y)

# models = [x.best_estimator_ for x in [clf1,clf2,clf3,clf4,clf5,clf6]]
# scores = [x.best_score_ for x in [clf1,clf2,clf3,clf4,clf5,clf6]]

    

Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Done  64 out of  64 | elapsed:    2.5s finished


Fitting 4 folds for each of 72 candidates, totalling 288 fits


[Parallel(n_jobs=2)]: Done 256 tasks      | elapsed:    7.5s
[Parallel(n_jobs=2)]: Done 288 out of 288 | elapsed:    8.5s finished


Fitting 4 folds for each of 108 candidates, totalling 432 fits


[Parallel(n_jobs=2)]: Done 342 tasks      | elapsed:   22.7s
[Parallel(n_jobs=2)]: Done 432 out of 432 | elapsed:   35.8s finished


Fitting 4 folds for each of 48 candidates, totalling 192 fits


[Parallel(n_jobs=2)]: Done 192 out of 192 | elapsed:    0.6s finished


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'criterion': ('gini', 'entropy'), 'splitter': ('best', 'random'), 'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [325]:
kf = KFold(n_splits=n_splits)
for tr_index,ts_index in kf.split(X,y):
    clf6 = VotingClassifier(estimators = [('svc',clf1.best_estimator_),('rfc',clf2.best_estimator_),('etc',clf3.best_estimator_),('gbc',clf4.best_estimator_),('dtc',clf5.best_estimator_)])
    X_train = X[tr_index]
    X_test = X[ts_index]
    y_train = y[tr_index]
    y_test = y[ts_index]
    
    clf6.fit(X_train,y_train)
    print('Training Score : %0.2f'%clf6.score(X_train,y_train))
    print('Testing Score : %0.2f'%clf6.score(X_test,y_test))

Training Score : 0.89
Testing Score : 0.79
Training Score : 0.88
Testing Score : 0.83
Training Score : 0.88
Testing Score : 0.80
Training Score : 0.87
Testing Score : 0.87


In [327]:
data_test = pd.read_csv('test.csv')
print('Number Of Rows: {}'.format(len(data_test)))
print('--------------------------')
data_test.head()

Number Of Rows: 418
--------------------------


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
