In [1]:
# model building and submission

In [2]:
import numpy as np
import pandas as pd

In [3]:
train = pd.read_csv('train_1.csv')
test = pd.read_csv('test_1.csv')

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,0,0,3,0,1.0,0.0,2.0,0,0,0.4
1,1,1,1,1,3.0,2.0,0.8,1,2,0.4
2,2,1,3,1,1.0,0.0,2.0,0,1,0.0
3,3,1,1,1,2.0,2.0,0.8,0,2,0.4
4,4,0,3,0,2.0,0.0,2.0,0,0,0.0


In [5]:
test.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,0,892,3,0,2.0,0.0,2.0,2,0,0.0
1,1,893,3,1,3.0,0.0,2.0,0,2,0.4
2,2,894,2,0,3.0,0.0,2.0,2,0,0.0
3,3,895,3,0,2.0,0.0,2.0,0,0,0.0
4,4,896,3,1,1.0,0.0,2.0,0,2,0.8


In [6]:
from sklearn.model_selection import train_test_split
X = train.drop(['Survived'], axis = 1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state =1)

In [7]:
# Model Building 

In [8]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis                        

In [9]:
# cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits = 10, shuffle = True, random_state =0)

In [10]:
classifier = [
      KNeighborsClassifier(),
      SVC(kernel = 'rbf', C=0.025, probability = True),
      DecisionTreeClassifier(),
      RandomForestClassifier(),
      AdaBoostClassifier(),
      GaussianNB(),
      GradientBoostingClassifier() 
      ]

for clf in classifier:
    clf.fit(X_train,y_train)
    name = clf.__class__.__name__
    
    print(name)
    pred= clf.predict(X_test)
    accuracy = accuracy_score(pred, y_test)
    print('Accuracy: {:.4%}'.format(accuracy))
    print('-'*30)

KNeighborsClassifier
Accuracy: 60.4478%
------------------------------
SVC
Accuracy: 57.0896%
------------------------------
DecisionTreeClassifier
Accuracy: 70.8955%
------------------------------
RandomForestClassifier
Accuracy: 75.0000%
------------------------------
AdaBoostClassifier
Accuracy: 76.8657%
------------------------------
GaussianNB
Accuracy: 77.2388%
------------------------------
GradientBoostingClassifier
Accuracy: 77.9851%
------------------------------


In [11]:
# XGBoost Classifier

In [12]:
import xgboost as xgb
xg_clf = xgb.XGBClassifier(objective ='binary:logistic', 
                           colsample_bytree=0.2,
                           learning_rate = 0.3,
                          max_depth = 4,  
                          n_estimators = 30
                          )
xg_clf.fit(X_train,y_train)
pred= xg_clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
print(accuracy)

0.7686567164179104


In [13]:
# 

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

RFC = RandomForestClassifier()

parameters = {
 'max_depth': [100,110, 120, 130, 135],
 'max_features': ['auto'],
 'min_samples_leaf': [2, 3],
 'min_samples_split': [12, 14],
 'n_estimators': [5, 10, 20, 30,35, 40]
             }

RFC_random = RandomizedSearchCV(estimator=RFC, param_distributions=parameters, n_iter=100, 
                               cv=5, verbose=2, random_state=42, n_jobs = -1)

RFC_random.fit(X_train, y_train)
%time RFC_random.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.3s


In [None]:
parameters = {
 'max_depth': [110,120,123,125,127,129, 130, 131, 132,],
 'max_features': ['auto'],
 'min_samples_leaf': [1,2],
 'min_samples_split': [11, 12, 13],
 'n_estimators': [8,9, 10,11,12,13]
             }

grid_search = GridSearchCV(estimator = RFC, param_grid = parameters, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
%time grid_search.best_params_

In [None]:
# Best cross validation score
print('Cross Validation Score:', grid_search.best_score_, '\n')
# Best parameters which resulted in the best score
print('Best Parameters:', grid_search.best_params_)
best_grid = grid_search.best_estimator_
best_grid

# validate the model

In [None]:
from sklearn.model_selection import cross_val_score
validation = cross_val_score(best_grid, X, y, cv=5)
print(validation, '\n')
print('Cross validation mean:', np.mean(validation))

In [None]:
# Prediction and Submission 
id1 = test['PassengerId']
predictions = best_grid.predict(test.drop('PassengerId', axis=1))
output = pd.DataFrame({'PassengerId': id1, 'Survived': predictions})
output.head()

In [None]:
#set the output as a dataframe and convert to csv file named submission.csv
output.to_csv('submission2.csv', index=False)