## Ensembling

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, KFold, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
import xgboost as xgb

SEED = 42
NFOLDS = 5

train_ = pd.read_csv("assets/train.csv")
test_ = pd.read_csv("assets/test.csv")

In [102]:
num_features = set([c for c in train.columns if train[c].dtype != "object"])
cat_features = set([c for c in train.columns if c not in num_features])
print("Numerical Features: {}\nCategorical Features: {}".format(num_features, cat_features))

Numerical Features: {'SibSp', 'Survived', 'PassengerId', 'Parch', 'Pclass', 'Fare', 'Age'}
Categorical Features: {'Ticket', 'Embarked', 'Name', 'Cabin', 'Sex'}


In [135]:
def replace_rare_titles(df):
    title = df["Title"]
    if title in ["Capt", "Col", "Don", "Jonkheer", "Major", "Sir", "Rev"]:
        return "Mr"
    elif title in ["Mme", "th", "Lady", "Dona"]:
        return 'Mrs'
    elif title in ["Mlle", "Ms"]:
        return 'Miss'
    elif title =='Dr':
        if df['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

def substrings_in_string(big_string, substrings):
    if pd.isna(big_string):
        return "Unknown"
    for substring in substrings:
        if big_string.find(substring) != -1:
            if substring == "T":
                return "A"
            else:
                return substring
            
def engineer_numerical_features(df):
    temp = df.copy()
    
    ## AGE - FILL MISSING VALS
    age_avg = temp["Age"].mean()
    age_std = temp["Age"].std()
    age_null_count = temp["Age"].isnull().sum()
    rand_age = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    ages = temp["Age"].copy()
    ages[np.isnan(ages)] = rand_age
    temp["Age"] = ages
    
    ## PCLASS
    age_class = temp["Age"] * temp["Pclass"]
    temp["AgeClass"] = age_class

    ## AGE - BINNED
    temp["Age"] = pd.cut(temp["Age"], 5, labels=[0,1,2,3,4])
    temp["Age"] = temp["Age"].astype(int)

    ## FARE - FILL MISSING VALS
    temp["Fare"].fillna(temp["Fare"].median(), inplace=True)
    
    ## FAMILY SIZE
    family_size = temp["SibSp"] + temp["Parch"]
    temp["FamilySize"] = family_size
    
    ## FARE PER PERSON
    fare_per_person = temp["Fare"] / (temp["FamilySize"] + 1)
    temp["FarePerPerson"] = fare_per_person
    
    ## FARE - BINNED
    temp["Fare"] = pd.qcut(temp["Fare"], 4, labels=[0,1,2,3])
    temp["Fare"] = temp["Fare"].astype(int)
    
    return temp

def engineer_categorical_features(df):
    temp = df.copy()
    
    ## EMBARKED
    temp["Embarked"].fillna(temp["Embarked"].mode()[0], inplace=True)
    
    ## TITLES
    titles = temp["Name"].str.split(', ', expand=True)[1].str.split(". ", expand=True)[0]
    temp["Title"] = titles
    temp["Title"] = temp.apply(replace_rare_titles, axis=1)
    
    ## CABIN
    temp["Cabin"] = temp["Cabin"].fillna("U0")
    deck = temp["Cabin"].map(lambda x: x[0])
    temp["Deck"] = deck
    decks = {"A": 1, "T": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    temp["Deck"] = temp["Deck"].map(decks)
    temp["Deck"] = temp["Deck"].astype(int)
    
    ## ONE HOT ENCODING
    one_hot_cols = ["Title", "Sex", "Embarked"]
    
    for o in one_hot_cols:
        dummies = pd.get_dummies(temp[o], prefix=o)
        temp = pd.concat([temp, dummies], axis=1)
    
    temp.drop(columns=list(cat_features) + ["Title", "Sex", "Embarked", "PassengerId"], inplace=True)
    
    return temp

In [218]:
train = engineer_categorical_features(engineer_numerical_features(train_))
test = engineer_categorical_features(engineer_numerical_features(test_))

In [138]:
X = train.copy()
y = X.pop("Survived")

In [140]:
kfold = StratifiedKFold(n_splits=NFOLDS, random_state=SEED)

### Random Forest

**RandomizedSearch**

In [141]:
rf = RandomForestClassifier(random_state=SEED)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

rf_random_param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_random_param, n_iter=100, cv=kfold, random_state=SEED, n_jobs=-1)

rf_random.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
          error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          retu

In [142]:
# rf_random.best_params_
rf_random.best_score_ # 0.8361391694725028

0.8361391694725028

In [160]:
#rf_random.best_params_

rf_random_p = {'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': True}

**GridSearch**

In [147]:
rf = RandomForestClassifier(random_state=SEED)    

rf_grid_param = {'n_estimators': [1000, 1250, 1500, 1750],
     'min_samples_split': [8, 9, 10],
     'min_samples_leaf': [2, 3, 4],
     'max_features': ['auto'],
     'max_depth': [30, 40, 50, 60],
     'bootstrap': [True]}

rf_grid = GridSearchCV(estimator=rf, param_grid=rf_grid_param, cv=kfold, n_jobs=-1)

rf_grid.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [1000, 1250, 1500, 1750], 'min_samples_split': [8, 9, 10], 'min_samples_leaf': [2, 3, 4], 'max_features': ['auto'], 'max_depth': [30, 40, 50, 60], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [148]:
# rf_grid.best_params_
rf_random.best_score_ # 0.8361391694725028

0.8361391694725028

In [158]:
# rf_grid.best_params_

rf_grid_p = {'bootstrap': True,
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 9,
 'n_estimators': 1750}

**Comparing Performance**

In [152]:
rf_baseline = RandomForestClassifier(random_state=SEED)
rf_baseline.fit(X, y)
rf_baseline_results = cross_validate(rf_baseline, X, y, cv=kfold, return_train_score=True)

rf_random = RandomForestClassifier(random_state=SEED, **rf_random_p)
rf_random.fit(X, y)
rf_random_results = cross_validate(rf_random, X, y, cv=kfold, return_train_score=True)

rf_grid = RandomForestClassifier(random_state=SEED, **rf_grid_p)
rf_grid.fit(X, y)
rf_grid_results = cross_validate(rf_grid, X, y, cv=kfold, return_train_score=True)



In [153]:
rf_baseline_score = rf_baseline_results["test_score"].mean()
rf_random_score = rf_random_results["test_score"].mean()
rf_grid_score = rf_grid_results["test_score"].mean()
print("Baseline: {}\nRandom: {}\nGrid: {}".format(rf_baseline_score, rf_random_score, rf_grid_score))

Baseline: 0.8160614613798843
Random: 0.8361919535057082
Grid: 0.8362045785727787


### Gradient Boosting Classifier

**RandomizedSearch**

In [154]:
gbc = GradientBoostingClassifier(random_state=SEED)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

gbc_random_param = {'loss' : ["deviance"],
              'n_estimators' : n_estimators,
              'learning_rate': [0.15, 0.1, 0.05, 0.01],
              'max_depth': [2, 4, 6, 8],
              'min_samples_leaf': [75, 100, 125, 150],
              'max_features': [0.5, 0.3, 0.1] 
              }

gbc_random = RandomizedSearchCV(estimator=gbc, param_distributions=gbc_random_param, n_iter=100, cv=kfold, random_state=SEED, n_jobs=-1)

gbc_random.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
          error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'loss': ['deviance'], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'learning_rate': [0.15, 0.1, 0.05, 0.01], 'max_depth': [2, 4, 6, 8], 'min_samples_leaf': [75, 100, 125, 150], 'max_features': [0.5, 0.3, 0.1]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [155]:
# gbc_random.best_params_
gbc_random.best_score_ # 0.8372615039281706

0.8372615039281706

In [157]:
#gbc_random.best_params_

gbc_random_p = {'n_estimators': 2000,
 'min_samples_leaf': 125,
 'max_features': 0.3,
 'max_depth': 8,
 'loss': 'deviance',
 'learning_rate': 0.15}

**GridSearch**

In [161]:
gbc = GradientBoostingClassifier(random_state=SEED)
gbc_grid_param = {'loss' : ["deviance"],
              'n_estimators' : [1600, 1800, 2000, 2200],
              'learning_rate': [0.2, 0.15, 0.1, 0.05],
              'max_depth': [6, 8, 10],
              'min_samples_leaf': [100, 125, 150],
              'max_features': [0.2, 0.3, 0.4]
              }

gbc_grid = GridSearchCV(gbc, param_grid=gbc_grid_param, cv=kfold, scoring="accuracy", n_jobs=-1)

gbc_grid.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'loss': ['deviance'], 'n_estimators': [1600, 1800, 2000, 2200], 'learning_rate': [0.2, 0.15, 0.1, 0.05], 'max_depth': [6, 8, 10], 'min_samples_leaf': [100, 125, 150], 'max_features': [0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [163]:
# gbc_grid.best_params_
gbc_grid.best_score_ # 0.8406285072951739

0.8406285072951739

In [165]:
# gbc_grid.best_params_

gbc_grid_p = {'learning_rate': 0.15,
 'loss': 'deviance',
 'max_depth': 6,
 'max_features': 0.2,
 'min_samples_leaf': 125,
 'n_estimators': 2000}

In [166]:
gbc_baseline = GradientBoostingClassifier(random_state=SEED)
gbc_baseline.fit(X, y)
gbc_baseline_results = cross_validate(gbc_baseline, X, y, cv=kfold, return_train_score=True)

gbc_random = GradientBoostingClassifier(random_state=SEED, **gbc_random_p)
gbc_random.fit(X, y)
gbc_random_results = cross_validate(gbc_random, X, y, cv=kfold, return_train_score=True)

gbc_grid = GradientBoostingClassifier(random_state=SEED, **gbc_grid_p)
gbc_grid.fit(X, y)
gbc_grid_results = cross_validate(gbc_grid, X, y, cv=kfold, return_train_score=True)

In [167]:
gbc_baseline_score = gbc_baseline_results["test_score"].mean()
gbc_random_score = gbc_random_results["test_score"].mean()
gbc_grid_score = gbc_grid_results["test_score"].mean()
print("Baseline: {}\nRandom: {}\nGrid: {}".format(gbc_baseline_score, gbc_random_score, gbc_grid_score))

Baseline: 0.8283959036622269
Random: 0.8373847386345139
Grid: 0.8407304168719127


### ExtraTrees

**RandomizedSearch**

In [190]:
etc = ExtraTreesClassifier(random_state=SEED)

etc_random_param = {"max_depth": [None],
              "max_features": [0.5, 2, 3],
              "min_samples_split": [2, 3, 4],
              "min_samples_leaf": [1, 2, 3],
              "bootstrap": [False],
              "n_estimators" :[100, 300, 500, 700, 900],
              "criterion": ["gini"]}

etc_random = RandomizedSearchCV(estimator=etc, param_distributions=etc_random_param, n_iter=100, cv=kfold, n_jobs=-1, random_state=SEED)

etc_random.fit(X,y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
          error_score='raise-deprecating',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'max_depth': [None], 'max_features': [0.5, 2, 3], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3], 'bootstrap': [False], 'n_estimators': [100, 300, 500, 700, 900], 'criterion': ['gini']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [191]:
# etc_random.best_params_
etc_random.best_score_ # 0.835016835016835

0.8316498316498316

In [192]:
# etc_random.best_params_

etc_random_p = {'n_estimators': 900,
 'min_samples_split': 2,
 'min_samples_leaf': 3,
 'max_features': 2,
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

**GridSearch**

In [193]:
etc = ExtraTreesClassifier(random_state=SEED)

etc_grid_param = {"max_depth": [None],
              "max_features": [2, 3, 4],
              "min_samples_split": [2, 3, 4],
              "min_samples_leaf": [2, 3, 4],
              "bootstrap": [False],
              "n_estimators" :[800, 900, 1000, 1100],
              "criterion": ["gini"]}

etc_grid = GridSearchCV(etc, param_grid=etc_grid_param, scoring="accuracy", cv=kfold, n_jobs=-1)

etc_grid.fit(X,y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [None], 'max_features': [2, 3, 4], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2, 3, 4], 'bootstrap': [False], 'n_estimators': [800, 900, 1000, 1100], 'criterion': ['gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [195]:
# etc_grid.best_params_
etc_grid.best_score_ # 0.8316498316498316

0.8316498316498316

In [197]:
# etc_grid.best_params_

etc_grid_p = {'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 900}

In [198]:
etc_baseline = ExtraTreesClassifier(random_state=SEED)
etc_baseline.fit(X, y)
etc_baseline_results = cross_validate(gbc_baseline, X, y, cv=kfold, return_train_score=True)

etc_random = ExtraTreesClassifier(random_state=SEED, **etc_random_p)
etc_random.fit(X, y)
etc_random_results = cross_validate(etc_random, X, y, cv=kfold, return_train_score=True)

etc_grid = ExtraTreesClassifier(random_state=SEED, **etc_grid_p)
etc_grid.fit(X, y)
etc_grid_results = cross_validate(etc_grid, X, y, cv=kfold, return_train_score=True)



In [199]:
etc_baseline_score = etc_baseline_results["test_score"].mean()
etc_random_score = etc_random_results["test_score"].mean()
etc_grid_score = etc_grid_results["test_score"].mean()
print("Baseline: {}\nRandom: {}\nGrid: {}".format(etc_baseline_score, etc_random_score, etc_grid_score))

Baseline: 0.8283959036622269
Random: 0.831672392276438
Grid: 0.831672392276438


### SVC

**RandomizedSearch**

In [200]:
svm = SVC(random_state=SEED, probability=True)

svm_random_param = {'kernel': ["rbf", "linear"], 
                  'gamma': [0.001, 0.01, 0.1, 1],
                  'C': [0.25, 1, 10, 50, 100, 500, 1000]}

svm_random = RandomizedSearchCV(estimator=svm, param_distributions=svm_random_param, n_iter=100, cv=kfold, random_state=SEED, n_jobs=-1)

svm_random.fit(X, y)



RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
          error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'kernel': ['rbf', 'linear'], 'gamma': [0.001, 0.01, 0.1, 1], 'C': [0.25, 1, 10, 50, 100, 500, 1000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [203]:
# svm_random.best_params_
svm_random.best_score_ # 0.8237934904601572

0.8237934904601572

In [225]:
# svm_random.best_params_

svm_random_p = {'kernel': 'linear', 'gamma': 0.001, 'C': 100, "probability":True}

**GridSearch**

In [207]:
svm = SVC(random_state=SEED)

svm_grid_param = {'kernel': ["linear"], 
                  'gamma': [0.001, 0.005, 0.01],
                  'C': [90, 100, 110, 120]}

svm_grid = GridSearchCV(svm, param_grid=svm_grid_param, scoring="accuracy", cv=kfold, n_jobs=-1)

svm_grid.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ['linear'], 'gamma': [0.001, 0.005, 0.01], 'C': [90, 100, 110, 120]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [210]:
# svm_grid.best_params_
svm_grid.best_score_ # 0.8294051627384961

0.8237934904601572

In [228]:
# svm_grid.best_params_

svm_grid_p = {'C': 100, 'gamma': 0.001, 'kernel': 'linear', "probability":True}

In [229]:
svm_baseline = SVC(random_state=SEED, gamma="scale")
svm_baseline.fit(X, y)
svm_baseline_results = cross_validate(svm_baseline, X, y, cv=kfold, return_train_score=True)

svm_random = SVC(random_state=SEED, **svm_random_p)
svm_random.fit(X, y)
svm_random_results = cross_validate(svm_random, X, y, cv=kfold, return_train_score=True)

svm_grid = SVC(random_state=SEED, **svm_grid_p)
svm_grid.fit(X, y)
svm_grid_results = cross_validate(svm_grid, X, y, cv=kfold, return_train_score=True)

In [230]:
svm_baseline_score = svm_baseline_results["test_score"].mean()
svm_random_score = svm_random_results["test_score"].mean()
svm_grid_score = svm_grid_results["test_score"].mean()
print("Baseline: {}\nRandom: {}\nGrid: {}".format(svm_baseline_score, svm_random_score, svm_grid_score))

Baseline: 0.7015660048081646
Random: 0.8237945277426982
Grid: 0.8237945277426982


### Voting Ensemble

In [231]:
voting_ensemble = VotingClassifier(estimators=[("rf", rf_grid), 
                                               ("gbc", gbc_grid), 
                                               ("etc", etc_grid),
                                               ("svm", svm_grid)], voting="soft", n_jobs=5)
voting_ensemble.fit(X, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=9,
            min_weig...ar',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=5, voting='soft', weights=None)

In [232]:
yhat = voting_ensemble.predict(test)

In [233]:
def generate_submission(filename, yhat, save=False):
    submission_df = pd.DataFrame(columns=["PassengerId", "Survived"])
    submission_df["PassengerId"] = test_["PassengerId"]
    submission_df["Survived"] = yhat
    if save:
        submission_df.to_csv("submissions/"+filename, header=True, index=False)
    return submission_df

In [234]:
filename = "voting_ensemble_v2.csv"
submission = generate_submission(filename, yhat, True)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


See https://www.kaggle.com/arthurtok/0-808-with-simple-stacking

In [235]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kfold.split(train, y)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [236]:
train_c = train.copy()
y_train = train_c.pop("Survived").ravel()
X_train = train_c.values
X_test = test.values

ntrain = train.shape[0]
ntest = test.shape[0]

In [237]:
et_oof_train, et_oof_test = get_oof(etc_grid, X_train, y_train, X_test)
rf_oof_train, rf_oof_test = get_oof(rf_grid, X_train, y_train, X_test)
gb_oof_train, gb_oof_test = get_oof(gbc_grid, X_train, y_train, X_test)
svc_oof_train, svc_oof_test = get_oof(svm_grid, X_train, y_train, X_test)

x_train = np.concatenate(( et_oof_train, rf_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, gb_oof_test, svc_oof_test), axis=1)
print("{},{}".format(x_train.shape, x_test.shape))

(891, 4),(418, 4)


### XGB

**RandomizedSearch**

In [238]:
gbm = xgb.XGBClassifier(random_state=SEED)

gbm_random_params = {
    "n_estimators": [500, 1000, 1500, 2000],
    "max_depth": [3, 4, 5],
    "min_child_weight": [2, 3, 4],
    "gamma": [0.9, 1, 1.1],
    "subsample": [0.8, 0.9, 1],
    "colsample_bytree": [0.8, 0.9, 1]
}

gbm_random = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_random_params, n_iter=100, cv=kfold, random_state=SEED, n_jobs=-1)

gbm_random.fit(X_train, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
          error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=100, n_jobs=5,
          param_distributions={'n_estimators': [500, 1000, 1500, 2000], 'max_depth': [3, 4, 5], 'min_child_weight': [2, 3, 4], 'gamma': [0.9, 1, 1.1], 'subsample': [0.8, 0.9, 1], 'colsample_bytree': [0.8, 0.9, 1]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [239]:
# gbm_random.best_params_
gbm_random.best_score_ # 0.8395061728395061

0.8395061728395061

In [241]:
# gbm_random.best_params_

gbm_random_p = {'subsample': 0.9,
 'n_estimators': 500,
 'min_child_weight': 4,
 'max_depth': 5,
 'gamma': 1,
 'colsample_bytree': 1}

**GridSearch**

In [244]:
gbm = xgb.XGBClassifier(random_state=SEED)

gbm_grid_params = {
    "n_estimators": [400, 500, 600, 700],
    "max_depth": [4, 5, 6],
    "min_child_weight": [3, 4, 5],
    "gamma": [0.9, 1, 1.1],
    "subsample": [0.8, 0.9, 1],
    "colsample_bytree": [0.9, 1]
}

gbm_grid = GridSearchCV(gbm, param_grid=gbm_grid_params, cv=kfold, scoring="accuracy", n_jobs=-1)

gbm_grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [400, 500, 600, 700], 'max_depth': [4, 5, 6], 'min_child_weight': [3, 4, 5], 'gamma': [0.9, 1, 1.1], 'subsample': [0.8, 0.9, 1], 'colsample_bytree': [0.9, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [245]:
# gbm_grid.best_params_
gbm_grid.best_score_ # 0.8428731762065096

0.8428731762065096

In [247]:
# gbm_grid.best_params_

gbm_grid_p = {'colsample_bytree': 0.9,
 'gamma': 1.1,
 'max_depth': 6,
 'min_child_weight': 5,
 'n_estimators': 700,
 'subsample': 0.8}

In [249]:
yhat = gbm_grid.predict(X_test)

filename = "stacked_model_v2.csv"
submission = generate_submission(filename, yhat, True)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
