In [1]:
import time
import re
from __future__ import print_function
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler, LabelBinarizer, \
                                  OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb
import lightgbm as lgb

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [2]:
df_train = pd.read_csv(r'C:\Users\adwiz\Documents\Courses\machine_learning\datasets\train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_test = pd.read_csv(r'C:\Users\adwiz\Documents\Courses\machine_learning\datasets\test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


In [4]:
# move target to the right
survived = df_train['Survived']
df_train.drop(labels=['Survived'], axis=1, inplace=True)
df_train['Survived'] = survived
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [5]:
class LabelEncoderPipelineFriendly(LabelEncoder):
    
    def fit(self, X, y=None):
        super(LabelEncoderPipelineFriendly, self).fit(X)
    
    def transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1)
    
    def fit_transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)

class FeaturesSum(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return np.sum(X, axis=1).reshape(-1, 1)
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

class AgeFeature(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['Initilal'] = 0
        for i in X:
            X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.')
        
        X['Initial'].replace(
            ['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
            ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],
            inplace=True
        )
        X.groupby('Initial')['Age'].mean()
                  
        X.loc[(X.Age.isnull()) & (X.Initial=='Mr'), 'Age'] = 33
        X.loc[(X.Age.isnull()) & (X.Initial=='Mrs'), 'Age'] = 36
        X.loc[(X.Age.isnull()) & (X.Initial=='Master'), 'Age'] = 5
        X.loc[(X.Age.isnull()) & (X.Initial=='Miss'), 'Age'] = 22
        X.loc[(X.Age.isnull()) & (X.Initial=='Other'), 'Age'] = 46
        return X['Age'].to_numpy().reshape(-1, 1)
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [6]:
def get_sex_col(df):
    return df[['Sex']]

def get_age_name_cols(df):
    return df[['Age', 'Name']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_sum_cols(df):
    return df[['Age', 'Fare']]

def get_num_cols(df):
    return df[['Fare', 'SibSp', 'Parch']]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False), OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False), LabelEncoderPipelineFriendly()),
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), SimpleImputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_age_name_cols, validate=False), AgeFeature()),
])

In [7]:
x_train = vec.fit_transform(df_train)
x_train.shape

  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initilal'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

(891, 8)

In [8]:
x_train

array([[ 0.        ,  0.        ,  1.        , ...,  0.125     ,
         0.        , 22.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.125     ,
         0.        , 38.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        , 26.        ],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.125     ,
         0.33333333, 22.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 26.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        , 32.        ]])

In [9]:
y_train = df_train['Survived']
y_train.shape

(891,)

In [10]:
lr = LogisticRegressionCV(cv=10)
lr.fit(x_train, y_train)
lr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV(cv=10)

In [11]:
accuracy_score(y_train, lr.predict(x_train))

0.8035914702581369

# Применение модели

In [12]:
def apply_model(model, submission_name):
    x_test = vec.fit_transform(df_test)
    print(f'shape of x_test is {x_test.shape}')
    y_test = model.predict(x_test)
    print(f'shape of y_test is {y_test.shape}')
    df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
    df_predicted.to_csv(submission_name + '.csv', sep=',', index=False)

In [13]:
apply_model(lr, 'log_regression_cv')

shape of x_test is (418, 8)
shape of y_test is (418,)


  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initilal'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

# Обучение ансамблей

In [14]:
def randomized_cv(model, param_grid, x_train=x_train, y_train=y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print(f'model {model.__class__.__name__} best accuracy score is {grid_search.best_score_}')
    print(f'time for training is {t_end - t_start} seconds')
    print(grid_search.best_score_)
    return grid_search.best_estimator_

# XGBoost

In [15]:
import xgboost as xgb
param_grid = {
    'max_depth': [2, 3, 4],
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.025],
    'eval_metric': ['logloss']
}
xgb = randomized_cv(xgb.XGBClassifier(use_label_encoder=False), param_grid)

model XGBClassifier best accuracy score is 0.8238089259933462
time for training is 3.1675965785980225 seconds
0.8238089259933462


In [16]:
apply_model(xgb, 'xgb_cv')

shape of x_test is (418, 8)
shape of y_test is (418,)


  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initilal'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

# LightGBM

In [17]:
import lightgbm as lgb

param_grid = {
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05],
#     'eval_metric': ['logloss']
}
model = randomized_cv(lgb.LGBMClassifier(), param_grid)

model LGBMClassifier best accuracy score is 0.8361621994852803
time for training is 2.2054688930511475 seconds
0.8361621994852803


In [18]:
apply_model(model, 'lgb_cv')

shape of x_test is (418, 8)
shape of y_test is (418,)


  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initilal'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

# H2O GBM

In [19]:
import h2o
import numpy as np
import math
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init(nthreads=-1, strict_version_check=True)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.281-b09, mixed mode)
  Starting server from C:\Users\adwiz\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\adwiz\AppData\Local\Temp\tmpmttecj5l
  JVM stdout: C:\Users\adwiz\AppData\Local\Temp\tmpmttecj5l\h2o_adwiz_started_from_python.out
  JVM stderr: C:\Users\adwiz\AppData\Local\Temp\tmpmttecj5l\h2o_adwiz_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.1
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_adwiz_j2wbyy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.332 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [20]:
data = np.c_[x_train, y_train]
data = pd.DataFrame(data, columns=['C{}'.format(idx + 1) for idx in range(data.shape[-1])])

train_df_h2o = h2o.H2OFrame(python_obj=data)
train_df_h2o['C9'] = train_df_h2o['C9'].asfactor()

train_df_h2o.show()

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3,C4,C5,C6,C7,C8,C9
0,0,1,1,0.0141511,0.125,0.0,22,0
1,0,0,0,0.139136,0.125,0.0,38,1
0,0,1,0,0.0154686,0.0,0.0,26,1
1,0,0,0,0.103644,0.125,0.0,35,1
0,0,1,1,0.0157126,0.0,0.0,35,0
0,0,1,1,0.0165095,0.0,0.0,33,0
1,0,0,1,0.101229,0.0,0.0,54,0
0,0,1,1,0.0411357,0.375,0.166667,2,0
0,0,1,0,0.0217308,0.0,0.333333,27,1
0,1,0,0,0.0586943,0.125,0.0,14,1


In [32]:
x_test = vec.fit_transform(df_test)
data_test = pd.DataFrame(x_test, columns=['C{}'.format(idx + 1) for idx in range(x_test.shape[-1])])

test_df_h2o = h2o.H2OFrame(python_obj=data_test)
test_df_h2o.show()

  return f(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initilal'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3,C4,C5,C6,C7,C8
0,0,1,1,0.0152816,0.0,0.0,34.5
0,0,1,0,0.0136631,0.125,0.0,47.0
0,1,0,1,0.0189087,0.0,0.0,62.0
0,0,1,1,0.0169081,0.0,0.0,27.0
0,0,1,0,0.0239836,0.125,0.111111,22.0
0,0,1,1,0.018006,0.0,0.0,14.0
0,0,1,0,0.0148912,0.0,0.0,30.0
0,1,0,1,0.0566042,0.125,0.111111,26.0
0,0,1,0,0.0141105,0.0,0.0,18.0
0,0,1,1,0.0471377,0.25,0.0,21.0


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C1      891 non-null    float64
 1   C2      891 non-null    float64
 2   C3      891 non-null    float64
 3   C4      891 non-null    float64
 4   C5      891 non-null    float64
 5   C6      891 non-null    float64
 6   C7      891 non-null    float64
 7   C8      891 non-null    float64
 8   C9      891 non-null    float64
dtypes: float64(9)
memory usage: 62.8 KB


In [22]:
gbm = H2OGradientBoostingEstimator()
gbm.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)
print(gbm)

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1618938482793_1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,13285.0,5.0,5.0,5.0,6.0,29.0,16.46




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.0893923322821615
RMSE: 0.2989855051372248
LogLoss: 0.3044905775157067
Mean Per-Class Error: 0.11830121752468603
AUC: 0.9412461785915912
AUCPR: 0.9298919513771351
Gini: 0.8824923571831824

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4790210246684239: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,520.0,29.0,0.0528,(29.0/549.0)
1,1,63.0,279.0,0.1842,(63.0/342.0)
2,Total,583.0,308.0,0.1033,(92.0/891.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.48,0.86,180.0
1,max f2,0.24,0.86,251.0
2,max f0point5,0.62,0.9,147.0
3,max accuracy,0.48,0.9,180.0
4,max precision,0.99,1.0,0.0
5,max recall,0.08,1.0,352.0
6,max specificity,0.99,1.0,0.0
7,max absolute_mcc,0.48,0.78,180.0
8,max min_per_class_accuracy,0.34,0.87,225.0
9,max mean_per_class_accuracy,0.45,0.88,193.0



Gains/Lift Table: Avg response rate: 38,38 %, avg score: 38,39 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01,0.97,2.61,2.61,1.0,0.98,1.0,0.98,0.03,0.03,160.53,160.53,0.03
1,2,0.03,0.97,2.61,2.61,1.0,0.97,1.0,0.97,0.04,0.07,160.53,160.53,0.07
2,3,0.03,0.97,2.61,2.61,1.0,0.97,1.0,0.97,0.02,0.08,160.53,160.53,0.08
3,4,0.04,0.97,2.61,2.61,1.0,0.97,1.0,0.97,0.03,0.11,160.53,160.53,0.11
4,5,0.05,0.97,2.61,2.61,1.0,0.97,1.0,0.97,0.02,0.13,160.53,160.53,0.13
5,6,0.1,0.96,2.61,2.61,1.0,0.96,1.0,0.97,0.13,0.26,160.53,160.53,0.26
6,7,0.15,0.93,2.61,2.61,1.0,0.94,1.0,0.96,0.13,0.39,160.53,160.53,0.39
7,8,0.2,0.84,2.55,2.59,0.98,0.89,0.99,0.94,0.13,0.52,154.74,159.07,0.52
8,9,0.3,0.61,2.2,2.46,0.84,0.71,0.94,0.86,0.22,0.74,119.54,145.94,0.71
9,10,0.4,0.38,1.13,2.12,0.43,0.48,0.82,0.77,0.11,0.85,12.89,112.5,0.73




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2021-04-20 20:17:18,0.040 sec,0.0,0.49,0.67,0.5,0.38,1.0,0.62
1,,2021-04-20 20:17:18,0.235 sec,1.0,0.46,0.62,0.9,0.89,2.61,0.18
2,,2021-04-20 20:17:18,0.275 sec,2.0,0.44,0.58,0.9,0.89,2.61,0.17
3,,2021-04-20 20:17:18,0.300 sec,3.0,0.42,0.55,0.9,0.89,2.61,0.17
4,,2021-04-20 20:17:18,0.330 sec,4.0,0.41,0.52,0.9,0.89,2.61,0.17
5,,2021-04-20 20:17:18,0.350 sec,5.0,0.39,0.49,0.9,0.89,2.61,0.17
6,,2021-04-20 20:17:18,0.370 sec,6.0,0.38,0.48,0.91,0.89,2.61,0.15
7,,2021-04-20 20:17:18,0.390 sec,7.0,0.37,0.46,0.91,0.89,2.61,0.14
8,,2021-04-20 20:17:18,0.410 sec,8.0,0.37,0.44,0.91,0.89,2.61,0.15
9,,2021-04-20 20:17:18,0.430 sec,9.0,0.36,0.43,0.91,0.9,2.61,0.14



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C4,330.4,1.0,0.48
1,C8,106.0,0.32,0.15
2,C5,91.32,0.28,0.13
3,C3,83.01,0.25,0.12
4,C6,46.73,0.14,0.07
5,C1,16.51,0.05,0.02
6,C2,8.27,0.03,0.01
7,C7,5.3,0.02,0.01





In [25]:
hyper_params = {'max_depth': list(range(1, 30, 2))}

gbm_grid = H2OGradientBoostingEstimator(
    ntrees = 10000,
    learn_rate = 0.05,
    learn_rate_annealing = 0.99,
    sample_rate = 0.8,
    col_sample_rate = 0.8,
    seed = 1234,
    score_tree_interval = 10,
    stopping_rounds = 5,
    stopping_metric = 'misclassification',
    stopping_tolerance = 1e-4)

grid = H2OGridSearch(gbm_grid, hyper_params,
                     grid_id = 'depth_grid',
                     search_criteria = {'strategy': 'RandomDiscrete'})

grid.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [26]:
print(grid)

     max_depth            model_ids              logloss
0           27   depth_grid_model_4  0.20299158583991428
1           29   depth_grid_model_6  0.20299158583991428
2           23  depth_grid_model_10   0.2039713947108158
3           25   depth_grid_model_7  0.20398177460482617
4           21  depth_grid_model_13  0.20431413668073337
5           19   depth_grid_model_8  0.20493492849948586
6           17   depth_grid_model_3    0.205886912454518
7           15   depth_grid_model_5   0.2077973029930745
8           13   depth_grid_model_9   0.2157334855478311
9           11  depth_grid_model_14  0.23454424301875623
10           9   depth_grid_model_1  0.23838085072065496
11           7  depth_grid_model_15   0.2554648180810368
12           5  depth_grid_model_11   0.2989661874465791
13           3   depth_grid_model_2  0.35871674057094916
14           1  depth_grid_model_12  0.46068523316875015



In [27]:
best_model = h2o.get_model(grid.sorted_metric_table()['model_ids'][0])
best_model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  depth_grid_model_4


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,500.0,500.0,358961.0,9.0,27.0,16.27,46.0,57.0,52.06




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.05619102209709977
RMSE: 0.2370464555674684
LogLoss: 0.20299158583991428
Mean Per-Class Error: 0.07405809605982172
AUC: 0.9820487009874413
AUCPR: 0.9747627645851848
Gini: 0.9640974019748827

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4157814191724123: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,514.0,35.0,0.0638,(35.0/549.0)
1,1,29.0,313.0,0.0848,(29.0/342.0)
2,Total,543.0,348.0,0.0718,(64.0/891.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.42,0.91,193.0
1,max f2,0.25,0.94,233.0
2,max f0point5,0.61,0.93,153.0
3,max accuracy,0.48,0.93,174.0
4,max precision,0.99,1.0,0.0
5,max recall,0.07,1.0,334.0
6,max specificity,0.99,1.0,0.0
7,max absolute_mcc,0.45,0.85,183.0
8,max min_per_class_accuracy,0.36,0.92,200.0
9,max mean_per_class_accuracy,0.34,0.93,205.0



Gains/Lift Table: Avg response rate: 38,38 %, avg score: 38,38 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01,0.99,2.61,2.61,1.0,0.99,1.0,0.99,0.03,0.03,160.53,160.53,0.03
1,2,0.02,0.99,2.61,2.61,1.0,0.99,1.0,0.99,0.03,0.05,160.53,160.53,0.05
2,3,0.03,0.99,2.61,2.61,1.0,0.99,1.0,0.99,0.03,0.08,160.53,160.53,0.08
3,4,0.04,0.99,2.61,2.61,1.0,0.99,1.0,0.99,0.03,0.11,160.53,160.53,0.11
4,5,0.05,0.98,2.61,2.61,1.0,0.98,1.0,0.99,0.03,0.13,160.53,160.53,0.13
5,6,0.1,0.97,2.61,2.61,1.0,0.98,1.0,0.98,0.13,0.26,160.53,160.53,0.26
6,7,0.15,0.94,2.61,2.61,1.0,0.96,1.0,0.98,0.13,0.39,160.53,160.53,0.39
7,8,0.2,0.88,2.61,2.61,1.0,0.91,1.0,0.96,0.13,0.52,160.53,160.53,0.52
8,9,0.3,0.68,2.43,2.55,0.93,0.79,0.98,0.9,0.24,0.77,142.96,154.69,0.76
9,10,0.4,0.36,1.56,2.3,0.6,0.52,0.88,0.81,0.16,0.92,56.32,129.96,0.85




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2021-04-20 20:29:35,3.495 sec,0.0,0.49,0.67,0.5,0.38,1.0,0.62
1,,2021-04-20 20:29:35,3.525 sec,10.0,0.39,0.49,0.93,0.91,2.61,0.13
2,,2021-04-20 20:29:35,3.560 sec,20.0,0.35,0.41,0.94,0.92,2.61,0.12
3,,2021-04-20 20:29:35,3.600 sec,30.0,0.32,0.36,0.95,0.93,2.61,0.12
4,,2021-04-20 20:29:35,3.645 sec,40.0,0.31,0.33,0.95,0.93,2.61,0.11
5,,2021-04-20 20:29:35,3.685 sec,50.0,0.3,0.31,0.95,0.94,2.61,0.1
6,,2021-04-20 20:29:35,3.720 sec,60.0,0.29,0.3,0.96,0.94,2.61,0.1
7,,2021-04-20 20:29:35,3.755 sec,70.0,0.29,0.29,0.96,0.94,2.61,0.1
8,,2021-04-20 20:29:35,3.795 sec,80.0,0.28,0.27,0.96,0.95,2.61,0.1
9,,2021-04-20 20:29:35,3.830 sec,90.0,0.28,0.27,0.96,0.95,2.61,0.09



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C8,1017.63,1.0,0.33
1,C5,841.96,0.83,0.27
2,C4,744.86,0.73,0.24
3,C3,193.21,0.19,0.06
4,C6,139.07,0.14,0.04
5,C7,64.95,0.06,0.02
6,C1,50.56,0.05,0.02
7,C2,38.61,0.04,0.01




In [28]:
best_model.accuracy()

[[0.4818895224478181, 0.9292929292929293]]

In [33]:
preds = best_model.predict(test_df_h2o)
preds.head()

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,p0,p1
0,0.966056,0.0339441
0,0.841551,0.158449
0,0.832241,0.167759
1,0.497027,0.502973
0,0.596389,0.403611
0,0.915552,0.0844483
0,0.678888,0.321112
0,0.971505,0.0284949
1,0.310666,0.689334
0,0.921239,0.0787613




In [37]:
pred_df = preds.as_data_frame()

submit = pd.DataFrame()
submit['PassengerId'] = df_test['PassengerId']
submit['Survived'] = pred_df['predict']
submit.to_csv('h2o.csv', sep=',', index=False)

# CatBoost

In [38]:
from catboost import CatBoostClassifier

param_grid = {
    'iterations': [2, 3, 4, 5],
    'depth': [2, 3, 4, 5],
    'learning_rate': [1, 0.1, 0.01, 0.001]
}
cbm = randomized_cv(CatBoostClassifier(), param_grid)

0:	learn: 0.6891317	total: 94.5ms	remaining: 283ms
1:	learn: 0.6854102	total: 97.1ms	remaining: 97.1ms
2:	learn: 0.6812586	total: 99.5ms	remaining: 33.2ms
3:	learn: 0.6775615	total: 102ms	remaining: 0us
0:	learn: 0.6888097	total: 2.11ms	remaining: 6.34ms
1:	learn: 0.6852665	total: 4.64ms	remaining: 4.64ms
2:	learn: 0.6812738	total: 7.21ms	remaining: 2.4ms
3:	learn: 0.6776518	total: 9.53ms	remaining: 0us
0:	learn: 0.6889784	total: 1.73ms	remaining: 5.19ms
1:	learn: 0.6854390	total: 3.33ms	remaining: 3.33ms
2:	learn: 0.6817863	total: 5ms	remaining: 1.67ms
3:	learn: 0.6783476	total: 6.56ms	remaining: 0us
0:	learn: 0.6887956	total: 1.49ms	remaining: 4.47ms
1:	learn: 0.6853348	total: 2.95ms	remaining: 2.95ms
2:	learn: 0.6816494	total: 4.49ms	remaining: 1.5ms
3:	learn: 0.6781409	total: 5.99ms	remaining: 0us
0:	learn: 0.6891340	total: 1.33ms	remaining: 4.01ms
1:	learn: 0.6857597	total: 2.88ms	remaining: 2.88ms
2:	learn: 0.6818863	total: 4.58ms	remaining: 1.53ms
3:	learn: 0.6783516	total: 5.96