## Ensembling

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score

sns.set()
plt.rcParams["figure.figsize"] = (10, 8)
SEED = 42

train_ = pd.read_csv("assets/train.csv")
test_ = pd.read_csv("assets/test.csv")

train = train_.copy()
test = test_.copy()

In [2]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [3]:
num_features = set([c for c in train.columns if train[c].dtype != "object"])
cat_features = set([c for c in train.columns if c not in num_features])
print("Numerical Features: {}\nCategorical Features: {}".format(num_features, cat_features))

Numerical Features: {'PassengerId', 'Age', 'Parch', 'Pclass', 'Survived', 'Fare', 'SibSp'}
Categorical Features: {'Sex', 'Name', 'Cabin', 'Ticket', 'Embarked'}


In [4]:
def replace_rare_titles(df):
    title = df["Title"]
    if title in ["Capt", "Col", "Don", "Jonkheer", "Major", "Sir", "Rev"]:
        return "Mr"
    elif title in ["Mme", "th", "Lady", "Dona"]:
        return 'Mrs'
    elif title in ["Mlle", "Ms"]:
        return 'Miss'
    elif title =='Dr':
        if df['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    
    
def substrings_in_string(big_string, substrings):
    if pd.isna(big_string):
        return "Unknown"
    for substring in substrings:
        if big_string.find(substring) != -1:
            if substring == "T":
                return "A"
            else:
                return substring
            
def engineer_numerical_features(df):
    temp = df.copy()
    
    # temp["Age"].fillna(temp["Age"].mean(), inplace=True)
    mean = temp["Age"].mean()
    std = temp["Age"].std()
    is_null = temp["Age"].isnull().sum()
    rand_age = np.random.randint(mean - std, mean + std, size=is_null)
    ages = temp["Age"].copy()
    ages[np.isnan(ages)] = rand_age
    temp["Age"] = ages
    temp["Age"] = temp["Age"].astype(int)

    temp.loc[temp["Age"] <= 18, "Age"] = 0
    temp.loc[(temp["Age"] > 18) & (temp["Age"] <= 23), "Age"] = 1
    temp.loc[(temp["Age"] > 23) & (temp["Age"] <= 28), "Age"] = 2
    temp.loc[(temp["Age"] > 28) & (temp["Age"] <= 34), "Age"] = 3
    temp.loc[(temp["Age"] > 34) & (temp["Age"] <= 44), "Age"] = 4
    temp.loc[(temp["Age"] > 44), "Age"] = 5

    #temp["Fare"].fillna(0, inplace=True)
    mean = temp["Fare"].mean()
    std = temp["Fare"].std()
    is_null = temp["Fare"].isnull().sum()
    rand_fare = np.random.randint(mean - std, mean + std, size=is_null)
    fares = temp["Fare"].copy()
    fares[np.isnan(fares)] = rand_fare
    temp["Fare"] = fares
    
    # Need to fix this binning:

    temp.loc[ temp["Fare"] <= 7.775, "Fare"] = 0
    temp.loc[(temp["Fare"] > 7.775) & (temp["Fare"] <= 8.662), "Fare"] = 1
    temp.loc[(temp["Fare"] > 8.662) & (temp["Fare"] <= 14.454), "Fare"] = 2
    temp.loc[(temp["Fare"] > 14.454) & (temp["Fare"] <= 26), "Fare"] = 3
    temp.loc[(temp["Fare"] > 26) & (temp["Fare"] <= 52.369), "Fare"] = 4
    temp.loc[ temp["Fare"] > 52.369, "Fare"] = 5
    temp["Fare"] = temp["Fare"].astype(int)
    
    family_size = temp["SibSp"] + temp["Parch"]
    temp["FamilySize"] = family_size
    
    fare_per_person = temp["Fare"] / (temp["FamilySize"] + 1)
    temp["FarePerPerson"] = fare_per_person
    
    age_class = temp["Age"] * temp["Pclass"]
    temp["AgeClass"] = age_class
    
    return temp

def engineer_categorical_features(df):
    temp = df.copy()
    
    temp["Embarked"].fillna(temp["Embarked"].mode()[0], inplace=True)
    
    titles = temp["Name"].str.split(', ', expand=True)[1].str.split(". ", expand=True)[0]
    temp["Title"] = titles
    temp["Title"] = temp.apply(replace_rare_titles, axis=1)
    
    #deck = temp["Cabin"].map(lambda x: substrings_in_string(x, cabin_list))
    temp["Cabin"] = temp["Cabin"].fillna("U0")
    deck = temp["Cabin"].map(lambda x: x[0])
    temp["Deck"] = deck
    decks = {"A": 1, "T": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    temp["Deck"] = temp["Deck"].map(decks)
    temp["Deck"] = temp["Deck"].astype(int)
    
#     genders = {"male": 0, "female": 1}
#     temp["Sex"] = temp["Sex"].map(genders)
    
#     ports = {"S": 0, "C": 1, "Q": 2}
#     temp["Embarked"] = temp["Embarked"].map(ports)
    
    one_hot_cols = ["Title", "Sex", "Embarked"]
    
    for o in one_hot_cols:
        dummies = pd.get_dummies(temp[o], prefix=o)
        temp = pd.concat([temp, dummies], axis=1)
    
    temp.drop(columns=list(cat_features) + ["Title", "Sex", "Embarked", "PassengerId"], inplace=True)
    
    return temp

In [5]:
train = engineer_numerical_features(train_)
test = engineer_numerical_features(test_)
train = engineer_categorical_features(train)
test = engineer_categorical_features(test)

In [6]:
X = train.copy()
y = X.pop("Survived")

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [8]:
kfold = StratifiedKFold(n_splits=5)

### Random Forest

**RandomizedSearch**

In [9]:
rf = RandomForestClassifier(random_state=SEED)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

rf_random_param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_random_param, n_iter=100, cv=kfold, random_state=SEED, n_jobs=4)

# rf_random.fit(X, y)

In [10]:
# rf_random.best_params_
# rf_random.best_score_ # 0.8305274971941639

In [11]:
# rf_random.best_params_

rf_random_p = {'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

**GridSearch**

In [12]:
rf_grid_param = {'n_estimators': [600, 800, 1000, 1200],
     'min_samples_split': [3, 5, 7],
     'min_samples_leaf': [3, 4, 5, 6],
     'max_features': ['auto'],
     'max_depth': [80, 90, 100, 110],
     'bootstrap': [True]}

rf = RandomForestClassifier(random_state=SEED)    
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_grid_param, cv=kfold, n_jobs=4)

# rf_grid.fit(X, y)

In [13]:
# rf_grid.best_params_
# rf_random.best_score_ # 0.8305274971941639

In [14]:
# rf_grid.best_params_

rf_grid_p = {'bootstrap': True,
 'max_depth': 80,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'n_estimators': 1000}

**Comparing Performance**

In [15]:
rf_baseline = RandomForestClassifier(random_state=SEED)
rf_baseline.fit(X, y)
rf_baseline_results = cross_validate(rf_baseline, X, y, cv=kfold, return_train_score=True)

rf_random = RandomForestClassifier(random_state=SEED, **rf_random_p)
rf_random.fit(X, y)
rf_random_results = cross_validate(rf_random, X, y, cv=kfold, return_train_score=True)

rf_grid = RandomForestClassifier(random_state=SEED, **rf_grid_p)
rf_grid.fit(X, y)
rf_grid_results = cross_validate(rf_grid, X, y, cv=kfold, return_train_score=True)



In [16]:
rf_baseline_score = rf_baseline_results["test_score"].mean()
rf_random_score = rf_random_results["test_score"].mean()
rf_grid_score = rf_grid_results["test_score"].mean()
print("Baseline: {}\nRandom: {}\nGrid: {}".format(rf_baseline_score, rf_random_score, rf_grid_score))

Baseline: 0.7958110665805609
Random: 0.8238825485754775
Grid: 0.8238825485754775


Baseline: 0.8100558659217877
Random: 0.8156424581005587
Grid: 0.8100558659217877

### Gradient Boosting Classifier

**RandomizedSearch**

In [33]:
gbc = GradientBoostingClassifier(random_state=SEED)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

gbc_random_param = {'loss' : ["deviance"],
              'n_estimators' : n_estimators,
              'learning_rate': [0.15, 0.1, 0.05, 0.01],
              'max_depth': [2, 4, 6, 8],
              'min_samples_leaf': [75, 100, 125, 150],
              'max_features': [0.5, 0.3, 0.1] 
              }

gbc_random = RandomizedSearchCV(estimator=gbc, param_distributions=gbc_random_param, n_iter=100, cv=kfold, random_state=SEED, n_jobs=4)

gbc_random.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
          error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=4,
          param_distributions={'loss': ['deviance'], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'learning_rate': [0.15, 0.1, 0.05, 0.01], 'max_depth': [2, 4, 6, 8], 'min_samples_leaf': [75, 100, 125, 150], 'max_features': [0.5, 0.3, 0.1]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [38]:
# gbc_random.best_params_
# gbc_random.best_score_ # 0.8338945005611672

In [36]:
#gbc_random.best_params_

gbc_random_p = {'n_estimators': 1800,
 'min_samples_leaf': 150,
 'max_features': 0.5,
 'max_depth': 8,
 'loss': 'deviance',
 'learning_rate': 0.15}

**GridSearch**

In [39]:
gbc = GradientBoostingClassifier(random_state=SEED)
gbc_grid_param = {'loss' : ["deviance"],
              'n_estimators' : [1600, 1800, 2000, 2200],
              'learning_rate': [0.2, 0.15, 0.1, 0.05],
              'max_depth': [6, 8, 10],
              'min_samples_leaf': [125, 150, 175],
              'max_features': [0.75, 0.5, 0.25] 
              }

gbc_grid = GridSearchCV(gbc, param_grid=gbc_grid_param, cv=kfold, scoring="accuracy", n_jobs=4)

# gbc_grid.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'loss': ['deviance'], 'n_estimators': [1600, 1800, 2000, 2200], 'learning_rate': [0.2, 0.15, 0.1, 0.05], 'max_depth': [6, 8, 10], 'min_samples_leaf': [125, 150, 175], 'max_features': [0.75, 0.5, 0.25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [41]:
# gbc_grid.best_params_
# gbc_grid.best_score_ # 0.8159371492704826 w/o randomizedsearch, 0.8361391694725028 after randomizedsearch

{'learning_rate': 0.15,
 'loss': 'deviance',
 'max_depth': 6,
 'max_features': 0.75,
 'min_samples_leaf': 150,
 'n_estimators': 1600}

In [42]:
# gbc_grid.best_params_

gbc_grid_p = {'learning_rate': 0.15,
 'loss': 'deviance',
 'max_depth': 6,
 'max_features': 0.75,
 'min_samples_leaf': 150,
 'n_estimators': 1600}

In [43]:
gbc_baseline = GradientBoostingClassifier(random_state=SEED)
gbc_baseline.fit(X, y)
gbc_baseline_results = cross_validate(gbc_baseline, X, y, cv=kfold, return_train_score=True)

gbc_random = GradientBoostingClassifier(random_state=SEED, **gbc_random_p)
gbc_random.fit(X, y)
gbc_random_results = cross_validate(gbc_random, X, y, cv=kfold, return_train_score=True)

gbc_grid = GradientBoostingClassifier(random_state=SEED, **gbc_grid_p)
gbc_grid.fit(X, y)
gbc_grid_results = cross_validate(gbc_grid, X, y, cv=kfold, return_train_score=True)

In [45]:
gbc_baseline_score = gbc_baseline_results["test_score"].mean()
gbc_random_score = gbc_random_results["test_score"].mean()
gbc_grid_score = gbc_grid_results["test_score"].mean()
print("Baseline: {}\nRandom: {}\nGrid: {}".format(gbc_baseline_score, gbc_random_score, gbc_grid_score))

Baseline: 0.8093196755641472
Random: 0.8339319955727152
Grid: 0.8361855345811581


### ExtraTrees

**RandomizedSearch**

In [47]:
etc = ExtraTreesClassifier(random_state=SEED)

etc_random_param = {"max_depth": [None],
              "max_features": [1, 3, 5, 7, 10],
              "min_samples_split": [2, 3, 5, 7, 10],
              "min_samples_leaf": [1, 3, 5, 10],
              "bootstrap": [False],
              "n_estimators" :[100, 300, 500, 700, 900],
              "criterion": ["gini"]}

etc_random = RandomizedSearchCV(estimator=etc, param_distributions=etc_random_param, n_iter=100, cv=kfold, n_jobs=4, random_state=SEED)

#etc_random.fit(X,y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
          error_score='raise-deprecating',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=4,
          param_distributions={'max_depth': [None], 'max_features': [1, 3, 5, 7, 10], 'min_samples_split': [2, 3, 5, 7, 10], 'min_samples_leaf': [1, 3, 5, 10], 'bootstrap': [False], 'n_estimators': [100, 300, 500, 700, 900], 'criterion': ['gini']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [49]:
# etc_random.best_params_
# etc_random.best_score_ # 0.8316498316498316

0.8316498316498316

In [48]:
# etc_random.best_params_

etc_random_p = {'n_estimators': 900,
 'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_features': 7,
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

{'n_estimators': 900,
 'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_features': 7,
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': False}

**GridSearch**

In [52]:
etc = ExtraTreesClassifier(random_state=SEED)

etc_grid_param = {"max_depth": [None],
              "max_features": [6, 8, 10],
              "min_samples_split": [6, 8, 10, 12],
              "min_samples_leaf": [3, 5, 7],
              "bootstrap": [False],
              "n_estimators" :[800, 1000, 1200],
              "criterion": ["gini"]}

etc_grid = GridSearchCV(etc, param_grid=etc_grid_param, scoring="accuracy", cv=kfold, n_jobs=4)

#etc_grid.fit(X,y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [None], 'max_features': [6, 8, 10], 'min_samples_split': [6, 8, 10, 12], 'min_samples_leaf': [3, 5, 7], 'bootstrap': [False], 'n_estimators': [800, 1000, 1200], 'criterion': ['gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [53]:
# etc_grid.best_params_
# etc_grid.best_score_ # 0.835016835016835

0.835016835016835

In [54]:
# etc_grid.best_params_

etc_grid_p = {'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 1000}

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 1000}

### Ensemble

In [55]:
voting_ensemble = VotingClassifier(estimators=[("rf", rf_grid), 
                                               ("gbc", gbc_grid), 
                                               ("etc", etc_grid)], voting="soft", n_jobs=-1)
# voting_ensemble.fit(X, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=3,
            min_weig..._dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [56]:
yhat = voting_ensemble.predict(test)

In [57]:
def generate_submission(filename, yhat, save=False):
    submission_df = pd.DataFrame(columns=["PassengerId", "Survived"])
    submission_df["PassengerId"] = test_["PassengerId"]
    submission_df["Survived"] = yhat
    if save:
        submission_df.to_csv("submissions/"+filename, header=True, index=False)
    return submission_df

filename = "ensemble_model_rf_gbc_etc.csv"
submission = generate_submission(filename, yhat, False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [59]:
yhat2 = gbc_grid.predict(test)

In [60]:
filename = "gbc_grid.csv"
submission = generate_submission(filename, yhat2, True)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
