# Random forest


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib widget

import titanic.data.load
import titanic.data.wrangling as wrng


# Load the raw train/test frames through the project loader and run the
# project's wrangling step on each of them.
train_df_orig, test_df_orig = titanic.data.load.from_csv()

train_df = wrng.wrangling(train_df_orig)
test_df = wrng.wrangling(test_df_orig)

# The wrangled training frame still contains the `Survived` label (it shows up
# in the info() listing below). Keep the label out of the feature matrix so it
# can never leak into the model through the preprocessing, and so X_train /
# X_test do not carry a label column in their schema.
# errors="ignore" keeps this line harmless if wrangling ever drops the column.
features = train_df.drop(columns=["Survived"], errors="ignore")
target = train_df_orig.Survived

# Hold out 30% of the labelled rows; the fixed random_state keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=50
)

train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Sex           891 non-null    object 
 3   Age           891 non-null    float64
 4   SibSp         891 non-null    int64  
 5   Parch         891 non-null    int64  
 6   Fare          891 non-null    float64
 7   Embarked      889 non-null    object 
 8   Title         891 non-null    object 
 9   CabLet        891 non-null    object 
 10  Alone         891 non-null    int64  
 11  Familiars     891 non-null    int64  
 12  TicketLetter  891 non-null    object 
 13  LenName       891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  GridSearchCV
from titanic.data.preprocessing import preprocessor

# Fixed seed for the forest so repeated runs of the search are comparable.
RF_SEED = 50

n_est = [800, 900, 1000]
# None leaves the tree depth unlimited.
max_depth = [2, 5, 10, None]

# Hyper-parameter grid; keys are prefixed with the pipeline step name
# ("rand_for") so GridSearchCV routes them to the classifier.
param_grid = {
    # Number of trees in the random forest
    "rand_for__n_estimators": n_est,
    # Number of features to consider at every split.
    # 'auto' is intentionally not listed: for RandomForestClassifier it is the
    # same setting as 'sqrt' (and is deprecated / removed in newer scikit-learn),
    # so listing both only doubled the number of fits.
    "rand_for__max_features": ['sqrt'],
    # Maximum number of levels in tree
    "rand_for__max_depth": max_depth,
    # Minimum number of samples required to split a node
    "rand_for__min_samples_split": [2, 5, 10],
    # Minimum number of samples required at each leaf node
    "rand_for__min_samples_leaf": [2, 4, 8],
    # Method of selecting samples for training each tree
    "rand_for__bootstrap": [True, False],
}

# Reference configuration (classifier-level parameter names, without the
# pipeline prefix). It is not used by the search below.
best_params = {'bootstrap': True,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 800}

# Preprocessing and classifier are chained so the preprocessing is re-fitted
# inside every cross-validation fold.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("rand_for", RandomForestClassifier(random_state=RF_SEED)),
    ]
)

# Exhaustive search over param_grid with 10-fold cross-validation, run on
# 10 parallel workers; verbose=5 prints progress for each fit.
grid_search = GridSearchCV(clf, param_grid, cv=10, n_jobs=10, verbose=5)

grid_search.fit(X_train, y_train)

In [3]:
# Use the fitted grid search as the estimator: score() and predict() are
# called on it directly.
est = grid_search

# Report the winning hyper-parameters and the score on the held-out split.
print("best fit params:", est.best_params_)
holdout_score = est.score(X_test, y_test)
print("model score: %.3f" % holdout_score)

# Predictions for the wrangled competition test set, used for the submission.
y_pred = est.predict(test_df)

best fit params: {'rand_for__bootstrap': True, 'rand_for__max_depth': None, 'rand_for__max_features': 'sqrt', 'rand_for__min_samples_leaf': 2, 'rand_for__min_samples_split': 2, 'rand_for__n_estimators': 800}
model score: 0.840


In [4]:
import kaggle 

# Destination of the submission CSV, target competition and submission message.
file = r'../data/submission.csv'
competition = 'titanic'
message = ''

# Build the submission as its own frame instead of adding a `Survived` column
# to test_df_orig in place: the loaded test data stays exactly as it was read,
# so re-running the wrangling on it never sees an extra column.
# The written CSV is unchanged: PassengerId, Survived, no index.
submission = test_df_orig[['PassengerId']].assign(Survived=y_pred)
submission.to_csv(file, index=False)

# Upload the file through the Kaggle API client.
kaggle.api.competition_submit(file, message, competition)
 


100%|██████████| 2.77k/2.77k [00:02<00:00, 1.37kB/s]


Successfully submitted to Titanic - Machine Learning from Disaster

# Gradient boosting



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import  GridSearchCV
import titanic.data.preprocessing

# Fixed seed for the booster. The seed is deliberately NOT part of the grid:
# searching over seeds only picks the luckiest one (an optimistic CV score)
# and multiplies the number of fits without tuning anything meaningful.
GB_SEED = 42

n_est = [800, 900, 1000]
# None leaves the depth of the individual trees unlimited.
max_depth = [2, 5, 10, None]

# Hyper-parameter grid; keys are prefixed with the pipeline step name
# ("grad_boost") so GridSearchCV routes them to the classifier.
param_grid = {
    # Number of boosting stages to perform.
    "grad_boost__n_estimators": n_est,
    # Shrinks the contribution of each tree; trades off against n_estimators.
    "grad_boost__learning_rate": [0.5, 1, 2],
    # Maximum number of levels in each tree
    "grad_boost__max_depth": max_depth,
    # Loss function to optimise: 'deviance' is the logistic loss,
    # 'exponential' recovers AdaBoost.
    # NOTE(review): newer scikit-learn releases rename 'deviance' to
    # 'log_loss' -- switch the name if the installed version rejects it.
    "grad_boost__loss": ['deviance', 'exponential'],
    # Split-quality criterion. 'mse' is not listed because it is only the
    # deprecated spelling of 'squared_error'; 'mae' is not listed because it
    # is deprecated and removed in newer scikit-learn releases.
    "grad_boost__criterion": ['friedman_mse', 'squared_error'],
    # Number of features to consider at every split. 'auto' is not listed:
    # for the classifier it is the same setting as 'sqrt'.
    "grad_boost__max_features": ['sqrt', 'log2'],
}

# Build the preprocessing step from the project module.
# NOTE(review): this rebinds the name `preprocessor`, which the random forest
# cell imports directly from titanic.data.preprocessing -- confirm which of
# the two APIs (`preprocessor` object vs `preprocessing()` factory) is current.
preprocessor = titanic.data.preprocessing.preprocessing()

# Preprocessing and classifier are chained so the preprocessing is re-fitted
# inside every cross-validation fold.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("grad_boost", GradientBoostingClassifier(random_state=GB_SEED)),
    ]
)

# Exhaustive search over param_grid with 10-fold cross-validation, run on
# 10 parallel workers; verbose=5 prints progress for each fit.
grid_search = GridSearchCV(clf, param_grid, cv=10, n_jobs=10, verbose=5)

grid_search.fit(X_train, y_train)