In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import PIL

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

%matplotlib inline

## Data preprocessing

In [2]:
train_df = pd.read_csv('./train.csv')
train_df = train_df.drop('PassengerId', 1)
train_df = train_df.drop('Name', 1)
train_df = train_df.drop('Ticket', 1)
train_df = train_df.drop('Cabin', 1)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
encode_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for enc in encode_features:
    one_hot = pd.get_dummies(train_df[enc], prefix=enc)
    train_df = train_df.drop(enc,axis = 1)
    train_df = train_df.join(one_hot)
train_df.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [4]:
train_df.describe()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
count,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,29.699118,32.204208,0.242424,0.20651,0.551066,0.352413,0.647587,0.682379,0.234568,...,0.760943,0.132435,0.089787,0.005612,0.004489,0.005612,0.001122,0.188552,0.08642,0.722783
std,0.486592,14.526497,49.693429,0.42879,0.405028,0.497665,0.47799,0.47799,0.465813,0.423966,...,0.426747,0.339154,0.286037,0.074743,0.06689,0.074743,0.033501,0.391372,0.281141,0.447876
min,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,20.125,7.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,28.0,14.4542,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,38.0,31.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,80.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
real_features = ['Age', 'Fare']
cat_features = list(set(train_df.columns.values.tolist()) - set(real_features) )
cat_features.remove('Survived')
print (cat_features)

['SibSp_3', 'Embarked_C', 'Pclass_1', 'SibSp_4', 'Parch_4', 'Parch_5', 'Pclass_2', 'Embarked_S', 'Parch_0', 'SibSp_0', 'Pclass_3', 'SibSp_8', 'Parch_3', 'Sex_female', 'Sex_male', 'Parch_2', 'SibSp_5', 'Embarked_Q', 'Parch_6', 'Parch_1', 'SibSp_2', 'SibSp_1']


In [6]:
print (train_df.shape)

(891, 25)


In [7]:
train_df['Age'] = train_df['Age'].fillna(0)

In [8]:
y = train_df['Survived'].to_numpy()
train_df = train_df.drop('Survived', 1)

X_real = train_df[real_features].to_numpy()
X_cat = train_df[cat_features].to_numpy()

print ("X_real: {} ".format(X_real[0]))
print ("X_cat: {} ".format(X_cat[0]))
print ("y: {} ".format(y[0]))

X_real: [22.    7.25] 
X_cat: [0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1] 
y: 0 


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(X_real)

In [10]:
#selected_features = (0, 1, 2, 5, 7, 9, 10, 12, 14, 16, 17, 18, 19, 21, 22)

In [11]:
X = np.hstack((X_real, X_cat))
#X = X[:,selected_features]

print (X.shape, y.shape)

(891, 24) (891,)


## Modeling

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier  

### L1

In [14]:
param_grid = {
    'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 10.0, 20.0, 30.0],
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'max_iter': [50, 100, 1000, 2000, 10000]
}

estimator = LogisticRegression()

lr1_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
lr1_model.fit(X, y)


Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   20.0s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5,
                               1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 10.0, 20.0, 30.0],
                         'max_iter': [50, 100, 1000, 2000, 10000],
                         'penalty': ['l1'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=Tru

In [15]:
print (lr1_model.best_estimator_)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


In [16]:
print (lr1_model.best_score_)
print (lr1_model.best_estimator_.coef_)

0.7901234567901234
[[-0.01601113  0.00274347 -0.75214644  0.          0.74085418  0.
  -2.69947826 -1.10404975  0.         -0.29484047  0.          1.26242164
  -1.05508902 -2.53926933  0.          0.74102498 -1.92075348  0.21770523
  -1.24239366 -0.12023573 -1.60535026  0.50694448  1.11024461  1.38745057]]


In [17]:
c = 0
not_null_idx = []
for cf in lr1_model.best_estimator_.coef_[0]:
    if cf != 0:
        not_null_idx.append(c)
    c += 1
print (len(not_null_idx))    
print (not_null_idx)    

19
[0, 1, 2, 4, 6, 7, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [18]:
X = X[:, not_null_idx]
print (X.shape)

(891, 19)


### L2

In [19]:
param_grid = {
    'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 5.0, 10.0, 20.0, 30.0],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['l2'],
    'max_iter': [50, 100, 1000]
}

estimator = LogisticRegression()

lr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
lr2_model.fit(X, y)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 270 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 347 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 1285 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2002 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2662 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:  3.4min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.02, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5,
                               0.6, 0.7, 0.8, 0.9, 1.0, 5.0, 10.0, 20.0, 30.0],
                         'max_iter': [50, 100, 1000], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                 

In [20]:
print (lr2_model.best_estimator_)

LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [21]:
print (lr2_model.best_score_)

0.797979797979798


## SVM

In [22]:
from sklearn.svm import SVC

In [23]:
svm_model = SVC(gamma='auto')

score = cross_val_score(svm_model, X, y, cv=10)
score_mean = score.mean()

In [24]:
print (score_mean)

0.7252258540460786


In [25]:
param_grid = {
    'kernel': ['linear'],
    'degree': [1],
    'C': [50.0, 100.0, 150.0, 200.0, 250.0 ]
    
}
estimator = SVC()

svm2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
svm2_model.fit(X, y)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
print (svm2_model.best_estimator_)

In [None]:
print (svm2_model.best_score_)

## Decision tree

In [None]:
tree = DecisionTreeClassifier()

score = cross_val_score(tree, X, y, cv=10)
score_mean = score.mean()


In [None]:
print (score_mean)

In [None]:
param_grid = {
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 10], 
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 10],
}
estimator = DecisionTreeClassifier()

tr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
tr2_model.fit(X, y)

In [None]:
print (tr2_model.best_estimator_)

In [None]:
print (tr2_model.best_score_)

## Random Forest

In [None]:
rf_model = RandomForestClassifier(n_estimators=100)

score = cross_val_score(rf_model, X, y, cv=10)
score_mean = score.mean()


In [None]:
print (score_mean)

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 400], 
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [3, 4],
}
estimator = RandomForestClassifier()

rf2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
rf2_model.fit(X, y)

In [None]:
print (rf2_model.best_estimator_)

In [None]:
print (rf2_model.best_score_)

### Bagging

In [None]:
bc1_model = BaggingClassifier(n_estimators=100)

score = cross_val_score(bc1_model, X, y, cv=10)
score_mean = score.mean()


In [None]:
print (score_mean)

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],    
}
estimator = BaggingClassifier()

bc2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
bc2_model.fit(X, y)

In [None]:
print (bc2_model.best_estimator_)

In [None]:
print (bc2_model.best_score_)

In [None]:
mf = int(np.sqrt(X.shape[1]))
print (X.shape[1], mf)

tree = DecisionTreeClassifier(max_features=mf)

param_grid = {
    'base_estimator': [tree],
    'n_estimators': [100, 200, 300, 400],    
}
estimator = BaggingClassifier()

bc3_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
bc3_model.fit(X, y)

In [None]:
print (bc3_model.best_estimator_)

In [None]:
print (bc3_model.best_score_)

## ExtraTrees

In [None]:
etr_model = ExtraTreesClassifier()

score = cross_val_score(etr_model, X, y, cv=10)
score_mean = score.mean()

In [None]:
print (score_mean)

In [None]:
param_grid = {    
    'n_estimators': [20, 30, 40, 50, 100],    
    'max_depth': [2, 3, 4, 5, 10, 15],
    'min_samples_split': [2, 3, 4, 5],
}
estimator = ExtraTreesClassifier()

etr2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
etr2_model.fit(X, y)

In [None]:
print (etr2_model.best_estimator_)

In [None]:
print (etr2_model.best_score_)

## AdaBoost

In [None]:
ada_model = AdaBoostClassifier()

score = cross_val_score(ada_model, X, y, cv=10)
score_mean = score.mean()

In [None]:
print (score_mean)

In [None]:
param_grid = {        
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 0.6, 0.7, 0.8, 1.],
    'n_estimators': [ 40, 50, 60, 70, 80, 90, 100]
}
estimator = AdaBoostClassifier()

ada2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
ada2_model.fit(X, y)

In [None]:
print (ada2_model.best_estimator_)

In [None]:
print (ada2_model.best_score_)

## GradientBoosting

In [None]:
gb_model = GradientBoostingClassifier()

score = cross_val_score(gb_model, X, y, cv=10)
score_mean = score.mean()

In [None]:
print (score_mean)

In [None]:
param_grid = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [20, 22, 25, 27, 28, 30],
    'n_estimators': [400, 450, 500, 550, 600, 650, 700]
}
estimator = GradientBoostingClassifier()

gb2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
gb2_model.fit(X, y) 

In [None]:
print (gb2_model.best_estimator_)

In [None]:
print (gb2_model.best_score_)

## XGBoost

In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

In [None]:
xgb_model = XGBClassifier()

score = cross_val_score(xgb_model, X, y, cv=10)
score_mean = score.mean()


In [None]:
print (score_mean)

In [None]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [2, 3, 4, 5],
    'min_child_weight': [2, 3],
    'gamma': [0.01, 0.06, 0.1, 0.2],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'n_estimators': [400, 500, 600, 700, 800, 900]
}
estimator = XGBClassifier()

xgb2_model = GridSearchCV(estimator, param_grid, cv=10, n_jobs=-1, verbose=3)
xgb2_model.fit(X, y)

In [None]:
print (xgb2_model.best_estimator_)

In [None]:
print (xgb2_model.best_score_)

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
best_models = [
    #('lr1', lr1_model), 
    ('lr2', lr2_model),
    ('svm', svm2_model),  
    ('tr2', tr2_model), 
    ('rf2', rf2_model), 
    ('bc2', bc2_model),  
    #('bc3', bc3_model),
    ('etr', etr2_model),
    ('ada', ada2_model),
    ('gb', gb2_model),
    ('xgb', xgb2_model),
]
best_estimators = []
for md in best_models:
    print ("{}_model: {}".format(md[0], md[1].best_score_))
    best_estimators.append((md[0], md[1].best_estimator_))

In [None]:
vt_model = VotingClassifier(best_estimators, n_jobs=-1)

print (vt_model)

In [None]:
score = cross_val_score(vt_model, X, y, cv=10)
score_mean = score.mean()

In [None]:
print (score_mean)

In [None]:
vt_model.fit(X, y)

In [None]:
print (len(vt_model.estimators))

## VT submission

In [None]:
df = pd.read_csv('./test.csv')
passenger_ids = df['PassengerId'].to_numpy()
df = df.drop('PassengerId', 1)
df = df.drop('Name', 1)
df = df.drop('Ticket', 1)
df = df.drop('Cabin', 1)
df.head()

In [None]:
encode_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for enc in encode_features:
    one_hot = pd.get_dummies(df[enc], prefix=enc)
    df = df.drop(enc,axis = 1)
    df = df.join(one_hot)
df.head()

In [None]:
df.describe()

In [None]:
df['Age'] = df['Age'].fillna(0)
df['Fare'] = df['Fare'].fillna(0)

In [None]:
X_testing_real_zeros = df[real_features].to_numpy()
X_testing_cat_zeros = df[cat_features].to_numpy()

X_testing_real_zeros_scaled = scaler.transform(X_testing_real_zeros)

print ("X_real: {} ".format(X_testing_real_zeros[0]))
print ("X_cat: {} ".format(X_testing_cat_zeros[0]))
print ("y: {} ".format(y[0]))

In [None]:
print (df.shape)

In [None]:
X_testing = np.hstack((X_testing_real_zeros_scaled, X_testing_cat_zeros))
#X_testing = X_testing[:,selected_features]
X_testing = X_testing[:, not_null_idx]
print (X.shape)

predicted = vt_model.predict(X_testing)

print (predicted)
print (passenger_ids)

In [None]:
with open("submission_vt4.txt", "w") as fout:
    fout.write("PassengerId,Survived\n")
    for val in zip(passenger_ids, predicted):
        fout.write("{},{}\n".format(str(int(val[0])), str(int(val[1]))))