## Ensembling

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score

sns.set()
plt.rcParams["figure.figsize"] = (10, 8)
SEED = 42

train_ = pd.read_csv("assets/train.csv")
test_ = pd.read_csv("assets/test.csv")

train = train_.copy()
test = test_.copy()

In [45]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [46]:
num_features = set([c for c in train.columns if train[c].dtype != "object"])
cat_features = set([c for c in train.columns if c not in num_features])
print("Numerical Features: {}\nCategorical Features: {}".format(num_features, cat_features))

Numerical Features: {'PassengerId', 'Parch', 'SibSp', 'Fare', 'Pclass', 'Survived', 'Age'}
Categorical Features: {'Name', 'Sex', 'Embarked', 'Ticket', 'Cabin'}


In [47]:
def replace_rare_titles(df):
    title = df["Title"]
    if title in ["Capt", "Col", "Don", "Jonkheer", "Major", "Sir", "Rev"]:
        return "Mr"
    elif title in ["Mme", "th", "Lady", "Dona"]:
        return 'Mrs'
    elif title in ["Mlle", "Ms"]:
        return 'Miss'
    elif title =='Dr':
        if df['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    
    
def substrings_in_string(big_string, substrings):
    if pd.isna(big_string):
        return "Unknown"
    for substring in substrings:
        if big_string.find(substring) != -1:
            if substring == "T":
                return "A"
            else:
                return substring
            
def engineer_numerical_features(df):
    temp = df.copy()
    
    # temp["Age"].fillna(temp["Age"].mean(), inplace=True)
    mean = temp["Age"].mean()
    std = temp["Age"].std()
    is_null = temp["Age"].isnull().sum()
    rand_age = np.random.randint(mean - std, mean + std, size=is_null)
    ages = temp["Age"].copy()
    ages[np.isnan(ages)] = rand_age
    temp["Age"] = ages
    temp["Age"] = temp["Age"].astype(int)

    temp.loc[temp["Age"] <= 18, "Age"] = 0
    temp.loc[(temp["Age"] > 18) & (temp["Age"] <= 23), "Age"] = 1
    temp.loc[(temp["Age"] > 23) & (temp["Age"] <= 28), "Age"] = 2
    temp.loc[(temp["Age"] > 28) & (temp["Age"] <= 34), "Age"] = 3
    temp.loc[(temp["Age"] > 34) & (temp["Age"] <= 44), "Age"] = 4
    temp.loc[(temp["Age"] > 44), "Age"] = 5

    #temp["Fare"].fillna(0, inplace=True)
    mean = temp["Fare"].mean()
    std = temp["Fare"].std()
    is_null = temp["Fare"].isnull().sum()
    rand_fare = np.random.randint(mean - std, mean + std, size=is_null)
    fares = temp["Fare"].copy()
    fares[np.isnan(fares)] = rand_fare
    temp["Fare"] = fares
    
    # Need to fix this binning:

    temp.loc[ temp["Fare"] <= 7.775, "Fare"] = 0
    temp.loc[(temp["Fare"] > 7.775) & (temp["Fare"] <= 8.662), "Fare"] = 1
    temp.loc[(temp["Fare"] > 8.662) & (temp["Fare"] <= 14.454), "Fare"] = 2
    temp.loc[(temp["Fare"] > 14.454) & (temp["Fare"] <= 26), "Fare"] = 3
    temp.loc[(temp["Fare"] > 26) & (temp["Fare"] <= 52.369), "Fare"] = 4
    temp.loc[ temp["Fare"] > 52.369, "Fare"] = 5
    temp["Fare"] = temp["Fare"].astype(int)
    
    family_size = temp["SibSp"] + temp["Parch"]
    temp["FamilySize"] = family_size
    
    fare_per_person = temp["Fare"] / (temp["FamilySize"] + 1)
    temp["FarePerPerson"] = fare_per_person
    
    age_class = temp["Age"] * temp["Pclass"]
    temp["AgeClass"] = age_class
    
    return temp

def engineer_categorical_features(df):
    temp = df.copy()
    
    temp["Embarked"].fillna(temp["Embarked"].mode()[0], inplace=True)
    
    titles = temp["Name"].str.split(', ', expand=True)[1].str.split(". ", expand=True)[0]
    temp["Title"] = titles
    temp["Title"] = temp.apply(replace_rare_titles, axis=1)
    
    #deck = temp["Cabin"].map(lambda x: substrings_in_string(x, cabin_list))
    temp["Cabin"] = temp["Cabin"].fillna("U0")
    deck = temp["Cabin"].map(lambda x: x[0])
    temp["Deck"] = deck
    decks = {"A": 1, "T": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    temp["Deck"] = temp["Deck"].map(decks)
    temp["Deck"] = temp["Deck"].astype(int)
    
#     genders = {"male": 0, "female": 1}
#     temp["Sex"] = temp["Sex"].map(genders)
    
#     ports = {"S": 0, "C": 1, "Q": 2}
#     temp["Embarked"] = temp["Embarked"].map(ports)
    
    one_hot_cols = ["Title", "Sex", "Embarked"]
    
    for o in one_hot_cols:
        dummies = pd.get_dummies(temp[o], prefix=o)
        temp = pd.concat([temp, dummies], axis=1)
    
    temp.drop(columns=list(cat_features) + ["Title", "Sex", "Embarked", "PassengerId"], inplace=True)
    
    return temp

In [49]:
train = engineer_numerical_features(train_)
test = engineer_numerical_features(test_)
train = engineer_categorical_features(train)
test = engineer_categorical_features(test)

In [6]:
X = train_z.copy()
y = X.pop("Survived")

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [39]:
kfold = StratifiedKFold(n_splits=10)

### Random Forest

In [8]:
rf_baseline = RandomForestClassifier(random_state=SEED)
rf_baseline.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [9]:
rf = RandomForestClassifier(random_state=SEED)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

rf_random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_random_grid, n_iter=100, cv=3, random_state=SEED, n_jobs=2)

# rf_random.fit(X_train, y_train)

In [10]:
# rf_random.best_params_

rf_random_p = {'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 80,
 'bootstrap': True}

In [18]:
rf_param_grid = {'n_estimators': [200, 300, 400, 500, 600],
     'min_samples_split': [3, 5, 7],
     'min_samples_leaf': [4, 5, 6, 7],
     'max_features': ['auto'],
     'max_depth': [40, 50, 60, 70, 80],
     'bootstrap': [True]}

rf = RandomForestClassifier(random_state=SEED)    
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=3, n_jobs=-1)

# rf_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [200, 300, 400, 500, 600], 'min_samples_split': [3, 5, 7], 'min_samples_leaf': [4, 5, 6, 7], 'max_features': ['auto'], 'max_depth': [40, 50, 60, 70, 80], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
# rf_grid.best_params_

# rf_grid_p = {'bootstrap': True,
#  'max_depth': 40,
#  'max_features': 'auto',
#  'min_samples_leaf': 6,
#  'min_samples_split': 3,
#  'n_estimators': 600}

rf_param_p = {'bootstrap': True,
 'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'n_estimators': 400}

**Comparing Performance**

In [21]:
rf_baseline = RandomForestClassifier(random_state=SEED)
rf_baseline.fit(X_train, y_train)

rf_random = RandomForestClassifier(random_state=SEED, **rf_random_p)
rf_random.fit(X_train, y_train)

rf_grid = RandomForestClassifier(random_state=SEED, **rf_grid_p)
rf_grid.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [32]:
rf_baseline_score = rf_baseline.score(X_test, y_test)
rf_random_score = rf_random.score(X_test, y_test)
rf_grid_score = rf_grid.score(X_test, y_test)
print("Baseline: {}\nRandom: {}\nGrid: {}".format(rf_baseline_score, rf_random_score, rf_grid_score))

Baseline: 0.8100558659217877
Random: 0.8156424581005587
Grid: 0.8100558659217877


### Gradient Boosting Classifier

In [36]:
gbc = GradientBoostingClassifier(random_state=SEED)
gbc_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100, 200, 300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100, 150],
              'max_features': [0.3, 0.1] 
              }

gbc_grid = GridSearchCV(gbc, param_grid = gbc_param_grid, cv=3, scoring="accuracy", n_jobs= -1, verbose = 1)

gbc_grid.fit(X_train, y_train)

gbc_grid_best_estimator = gbc_grid.best_estimator_
gbc_grid_best_score = gbc_grid.best_score_

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:    3.1s finished


In [25]:
# gbc_grid.best_params_

gbc_grid_p = {'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 4,
 'max_features': 0.3,
 'min_samples_leaf': 100,
 'n_estimators': 200}

In [31]:
gbc_baseline = GradientBoostingClassifier(random_state=SEED)
gbc_baseline.fit(X_train, y_train)

gbc_grid = GradientBoostingClassifier(random_state=SEED, **gbc_grid_p)
gbc_grid.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features=0.3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=100, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [34]:
gbc_baseline_score = gbc_baseline.score(X_test, y_test)
gbc_grid_score = gbc_grid.score(X_test, y_test)
print("Baseline: {}\nGrid: {}".format(gbc_baseline_score, gbc_grid_score))

Baseline: 0.8324022346368715
Grid: 0.8100558659217877


### ExtraTrees

In [None]:
etc = ExtraTreesClassifier(random_state=SEED)



### Ensemble

In [55]:
voting_ensemble = VotingClassifier(estimators=[("rf", rf_grid), ("gbc", gbc_grid)], voting="soft", n_jobs=-1)
voting_ensemble.fit(X, y)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=3,
            min_weig..._dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [56]:
yhat = voting_ensemble.predict(test)

In [57]:
def generate_submission(filename, yhat, save=False):
    submission_df = pd.DataFrame(columns=["PassengerId", "Survived"])
    submission_df["PassengerId"] = test_["PassengerId"]
    submission_df["Survived"] = yhat
    if save:
        submission_df.to_csv("submissions/"+filename, header=True, index=False)
    return submission_df

filename = "ensemble_model_2.csv"
submission = generate_submission(filename, yhat, True)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
