### Практическое задание

### GLM для прогнозирования наступления страхового случая

#### Построить обобщенную линейную модель (GLM) для прогнозирования наступления страховых случаев на рассмотренных в ноутбуке данных. Подобрать необходимое распределение и тип связи, при необходимости ознакомиться с документацией H2O. Придумать и использовать дополнительные факторы при построении модели (например, пересечения признаков или функции от них и т.д.). Оценить результаты построенной модели при помощи различных метрик (можно использовать и другие метрики помимо представленных в ноутбуке), проанализировать вероятные проблемы. Предложить способы их решения и/или попробовать их решить, улучшив результат.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset, keeping only the records of sub-datasets 5-9

df = (
    pd.read_csv('freMPL-R.csv', low_memory=False)
    .loc[lambda frame: frame.Dataset.isin([5, 6, 7, 8, 9])]
    .drop(columns='Dataset')
    .dropna(axis=1, how='all')  # discard columns that are empty for every remaining row
    .drop_duplicates()
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Exposure           115155 non-null  float64
 1   LicAge             115155 non-null  int64  
 2   RecordBeg          115155 non-null  object 
 3   RecordEnd          59455 non-null   object 
 4   Gender             115155 non-null  object 
 5   MariStat           115155 non-null  object 
 6   SocioCateg         115155 non-null  object 
 7   VehUsage           115155 non-null  object 
 8   DrivAge            115155 non-null  int64  
 9   HasKmLimit         115155 non-null  int64  
 10  BonusMalus         115155 non-null  int64  
 11  ClaimAmount        115155 non-null  float64
 12  ClaimInd           115155 non-null  int64  
 13  ClaimNbResp        115155 non-null  float64
 14  ClaimNbNonResp     115155 non-null  float64
 15  ClaimNbParking     115155 non-null  float64
 16  Cl

In [3]:
# Inspect the rows with a negative claim amount and check which ClaimInd values they carry
negative_amount_rows = df.ClaimAmount < 0
NegClaimAmount = df.loc[negative_amount_rows, ['ClaimAmount', 'ClaimInd']]
print('Unique values of ClaimInd:', NegClaimAmount.ClaimInd.unique())
NegClaimAmount.head()

Unique values of ClaimInd: [0]


Unnamed: 0,ClaimAmount,ClaimInd
82,-74.206042,0
175,-1222.585196,0
177,-316.288822,0
363,-666.75861,0
375,-1201.600604,0


In [4]:
# Negative claim amounts (all of them have ClaimInd == 0, see the check above) are reset to zero
df['ClaimAmount'] = df.ClaimAmount.clip(lower=0)

In [5]:
def SeriesFactorizer(series):
    """Encode a series as integer codes and report the code -> label mapping.

    The mapping is printed as a side effect so the encoding is visible in the notebook.

    Parameters
    ----------
    series : values accepted by ``pd.factorize`` (e.g. a pandas Series of labels).

    Returns
    -------
    tuple
        ``(codes, reference)`` where ``codes`` is the integer-code array produced by
        ``pd.factorize`` and ``reference`` is a dict mapping each code to its original label.
    """
    codes, labels = pd.factorize(series)
    reference = dict(enumerate(labels))
    print(reference)
    return codes, reference

In [6]:
# Replace the Gender labels with integer codes; GenderRef keeps the code -> label mapping
df['Gender'], GenderRef = SeriesFactorizer(df['Gender'])

{0: 'Male', 1: 'Female'}


In [7]:
# Replace the MariStat labels with integer codes; MariStatRef keeps the code -> label mapping
df['MariStat'], MariStatRef = SeriesFactorizer(df['MariStat'])

{0: 'Other', 1: 'Alone'}


In [8]:
# Distinct vehicle-usage categories, in order of first appearance
df['VehUsage'].unique().tolist()

['Professional', 'Private+trip to office', 'Private', 'Professional run']

In [9]:
# Preview of the one-hot encoding of VehUsage (every level kept, none dropped).
# NOTE(review): VU_dummies is not referenced again in the visible cells; the frame itself is
# encoded later through pd.get_dummies(df, columns=[...]).
VU_dummies = pd.get_dummies(df['VehUsage'], prefix='VehUsg')
VU_dummies.head()

Unnamed: 0,VehUsg_Private,VehUsg_Private+trip to office,VehUsg_Professional,VehUsg_Professional run
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


In [10]:
# Distinct socio-professional category codes before they are coarsened
df.SocioCateg.unique()

array(['CSP50', 'CSP55', 'CSP60', 'CSP48', 'CSP6', 'CSP66', 'CSP1',
       'CSP46', 'CSP21', 'CSP47', 'CSP42', 'CSP37', 'CSP22', 'CSP3',
       'CSP49', 'CSP20', 'CSP2', 'CSP40', 'CSP7', 'CSP26', 'CSP65',
       'CSP41', 'CSP17', 'CSP57', 'CSP56', 'CSP38', 'CSP51', 'CSP59',
       'CSP30', 'CSP44', 'CSP61', 'CSP63', 'CSP45', 'CSP16', 'CSP43',
       'CSP39', 'CSP5', 'CSP32', 'CSP35', 'CSP73', 'CSP62', 'CSP52',
       'CSP27', 'CSP24', 'CSP19', 'CSP70'], dtype=object)

In [11]:
# Keep only the first four characters ("CSP" + leading digit) to merge the fine-grained codes
df['SocioCateg'] = df.SocioCateg.str[:4]

In [12]:
# Frequency of each coarsened socio-professional category, rarest first.
# to_frame('Frequency') names the column explicitly: renaming by the old column name
# ('SocioCateg') silently does nothing on pandas versions where value_counts() names it 'count'.
df.SocioCateg.value_counts().sort_values().to_frame('Frequency')

Unnamed: 0,Frequency
CSP7,14
CSP3,1210
CSP1,2740
CSP2,3254
CSP4,7648
CSP6,24833
CSP5,75456


In [13]:
# One-hot encode the remaining categorical columns.
# dtype is pinned so the indicators stay 0/1 integers (as in the displayed frame) on every
# pandas version; newer releases would otherwise produce boolean columns.
df = pd.get_dummies(df, columns=['VehUsage','SocioCateg'], dtype=np.uint8)

In [14]:
# Keep only non-string columns; per df.info() this removes at least RecordBeg and RecordEnd
df = df.select_dtypes(exclude='object')

In [15]:
# Additional factors: squares and natural logs of the numeric driver characteristics.
# Vectorized column arithmetic replaces the row-by-row .apply(lambda ...) calls; the resulting
# columns and their order (three *Sq columns, then three *Log columns) are unchanged.
for base_col in ('DrivAge', 'LicAge', 'BonusMalus'):
    df[f'{base_col}Sq'] = df[base_col] ** 2
for base_col in ('DrivAge', 'LicAge', 'BonusMalus'):
    df[f'{base_col}Log'] = np.log(df[base_col])
df.head()

Unnamed: 0,Exposure,LicAge,Gender,MariStat,DrivAge,HasKmLimit,BonusMalus,ClaimAmount,ClaimInd,ClaimNbResp,...,SocioCateg_CSP4,SocioCateg_CSP5,SocioCateg_CSP6,SocioCateg_CSP7,DrivAgeSq,LicAgeSq,BonusMalusSq,DrivAgeLog,LicAgeLog,BonusMalusLog
0,0.083,332,0,0,46,0,50,0.0,0,0.0,...,0,1,0,0,2116,110224,2500,3.828641,5.805135,3.912023
1,0.916,333,0,0,46,0,50,0.0,0,0.0,...,0,1,0,0,2116,110889,2500,3.828641,5.808142,3.912023
2,0.55,173,0,0,32,0,68,0.0,0,0.0,...,0,1,0,0,1024,29929,4624,3.465736,5.153292,4.219508
3,0.089,364,1,0,52,0,50,0.0,0,0.0,...,0,1,0,0,2704,132496,2500,3.951244,5.897154,3.912023
4,0.233,426,0,0,57,0,50,0.0,0,0.0,...,0,0,1,0,3249,181476,2500,4.043051,6.054439,3.912023


In [16]:
# Remove the per-type claim counters from the frame
claim_count_columns = ["ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking", "ClaimNbFireTheft", "ClaimNbWindscreen"]
df = df.drop(columns=claim_count_columns)

In [17]:
from sklearn.model_selection import train_test_split

In [47]:
# Split the dataset into train / validation / test (70% / 15% / 15%)

features = df.drop(columns=['ClaimInd', 'ClaimAmount'])
# First cut: 70% train, 30% hold-out
x_train, x_holdout, y_train, y_holdout = train_test_split(features, df.ClaimInd, test_size=0.3, random_state=1)
# Second cut: the hold-out is halved into validation and test
x_valid, x_test, y_valid, y_test = train_test_split(x_holdout, y_holdout, test_size=0.5, random_state=1)

In [19]:
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Connect to an H2O instance; as the log below shows, a local server is started when none is running
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "13.0.1" 2019-10-15; Java(TM) SE Runtime Environment (build 13.0.1+9); Java HotSpot(TM) 64-Bit Server VM (build 13.0.1+9, mixed mode, sharing)
  Starting server from /Users/user/opt/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/xr/jyhzcr794ml6k843yzr0knh00000gn/T/tmp5jwt48ns
  JVM stdout: /var/folders/xr/jyhzcr794ml6k843yzr0knh00000gn/T/tmp5jwt48ns/h2o_user_started_from_python.out
  JVM stderr: /var/folders/xr/jyhzcr794ml6k843yzr0knh00000gn/T/tmp5jwt48ns/h2o_user_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,25 days
H2O_cluster_name:,H2O_from_python_user_n2vwlt
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [55]:
# Convert the pandas splits to H2O frames

def _to_h2o_frame(features, target):
    """Build an H2OFrame from features plus target, with the ClaimInd target as a factor.

    The factor conversion is done here so that re-running this cell on its own (the
    out-of-order execution counts show it was re-run) cannot leave ClaimInd numeric.
    The target stays the last column of the frame.
    """
    frame = h2o.H2OFrame(pd.concat([features, target], axis=1))
    frame['ClaimInd'] = frame['ClaimInd'].asfactor()
    return frame

h2o_train = _to_h2o_frame(x_train, y_train)
h2o_valid = _to_h2o_frame(x_valid, y_valid)
h2o_test = _to_h2o_frame(x_test, y_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [54]:
# Turn the target ClaimInd into a categorical (factor) column in every H2O frame via asfactor

for h2o_frame in (h2o_train, h2o_valid, h2o_test):
    h2o_frame['ClaimInd'] = h2o_frame['ClaimInd'].asfactor()

In [22]:
# Initialise and train the GLM (binomial family, logit link) with 5-fold cross-validation

# Select predictors by name rather than by position, so the cell does not depend on the
# target being the last column of the frame.
predictors = [name for name in h2o_train.names if name != 'ClaimInd']
# A fixed seed makes the cross-validation fold assignment, and therefore the CV metrics,
# reproducible between runs.
glm = H2OGeneralizedLinearEstimator(family="binomial", link="logit", nfolds=5, seed=1)
glm.train(y="ClaimInd", x=predictors, training_frame=h2o_train, validation_frame=h2o_valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [23]:
# Model parameters: family, link function, regularization hyper-parameters, number of explanatory variables used

glm.summary()


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 1.029E-4 )",26,20,3,py_1_sid_b4f5




In [24]:
# Model quality metrics on cross-validation: mean, standard deviation and per-fold values

glm.cross_validation_metrics_summary().as_data_frame()

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.70810735,0.029157773,0.7444287,0.68957996,0.68608314,0.68528074,0.73516434
1,auc,0.6846673,0.006111024,0.6879943,0.67722946,0.6791816,0.6876152,0.69131595
2,aucpr,0.1705188,0.005635749,0.17911673,0.16817896,0.16671005,0.16541204,0.17317624
3,err,0.29189262,0.029157773,0.25557134,0.31042004,0.31391686,0.31471923,0.26483566
4,err_count,4705.2,463.81967,4140.0,4981.0,5082.0,5061.0,4262.0
5,f0point5,0.19464457,0.012648499,0.21466854,0.18962303,0.18637961,0.18344142,0.19911024
6,f1,0.25399238,0.0091175465,0.26880962,0.2508648,0.24800237,0.24608968,0.25619546
7,f2,0.36667985,0.0068450687,0.35947096,0.37053493,0.37050137,0.37372184,0.35917008
8,lift_top_group,2.7790751,0.23046863,2.4983222,2.7269793,2.7667425,2.7629871,3.1403453
9,logloss,0.29542395,0.009043915,0.3087482,0.2988216,0.29432043,0.28488475,0.29034477


In [25]:
# Model coefficient table (depending on the model, the standard error, z-score and p-value may also be shown)
# NOTE(review): the table is read through the underscore-prefixed `_model_json` attribute.

glm._model_json['output']['coefficients_table'].as_data_frame()

Unnamed: 0,names,coefficients,standardized_coefficients
0,Intercept,-4.132658,-2.430618
1,Exposure,2.105119,0.608827
2,LicAge,0.000769,0.122768
3,Gender,0.0,0.0
4,MariStat,-0.107308,-0.038657
5,DrivAge,0.0,0.0
6,HasKmLimit,-0.344011,-0.107431
7,BonusMalus,0.0,0.0
8,OutUseNb,0.093155,0.064809
9,RiskArea,0.006938,0.015372


In [26]:
# Standardized coefficients of the overall model side by side with each cross-validation model

cv_models = glm.cross_validation_models()
pmodels = {'overall': glm.coef_norm()}
pmodels.update({fold: cv_model.coef_norm() for fold, cv_model in enumerate(cv_models)})
pd.DataFrame.from_dict(pmodels).round(5)

Unnamed: 0,overall,0,1,2,3,4
Intercept,-2.43062,-2.44875,-2.43701,-2.43034,-2.41475,-2.42215
Exposure,0.60883,0.60438,0.61394,0.61476,0.60676,0.60324
LicAge,0.12277,0.04501,0.10413,0.10827,0.05843,0.09007
Gender,0.0,0.00036,0.00698,-0.00123,-0.00691,0.00241
MariStat,-0.03866,-0.03377,-0.03125,-0.04649,-0.0512,-0.02971
DrivAge,0.0,0.00022,0.0,0.0,0.00116,0.0
HasKmLimit,-0.10743,-0.1112,-0.10198,-0.11723,-0.09609,-0.11222
BonusMalus,0.0,0.02498,0.0,0.01601,0.0,0.02243
OutUseNb,0.06481,0.0621,0.06442,0.06072,0.06816,0.06792
RiskArea,0.01537,0.01763,0.01917,0.02072,0.00765,0.01204


In [27]:
# Predictions for the training, validation and test sets, as pandas frames

train_pred, valid_pred, test_pred = (
    glm.predict(h2o_frame).as_data_frame()
    for h2o_frame in (h2o_train, h2o_valid, h2o_test)
)

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [28]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score, log_loss

In [29]:
# Print the classification metrics imported above for the train, validation and test sets.
# Label-based metrics are computed from the "predict" column; ROC AUC and log loss use the
# positive-class probability column "p1". One table-driven loop replaces the copy-pasted
# print statements; the printed text is unchanged.

eval_splits = (
    ('Train', y_train, train_pred),
    ('Valid', y_valid, valid_pred),
    ('Test', y_test, test_pred),
)
eval_metrics = (
    ('Accuracy', accuracy_score, 'predict'),
    ('F1', f1_score, 'predict'),
    ('Precision', precision_score, 'predict'),
    ('Recall', recall_score, 'predict'),
    ('ROC AUC', roc_auc_score, 'p1'),
    ('Log Loss', log_loss, 'p1'),
)
for metric_name, metric_fn, pred_column in eval_metrics:
    for split_name, y_true, pred_frame in eval_splits:
        metric_value = metric_fn(y_true, pred_frame[pred_column].values)
        print(f'{split_name} {metric_name}: {np.round(metric_value, 4)}')
    print()  # blank line between metric groups

Train Accuracy: 0.7166
Valid Accuracy: 0.7179
Test Accuracy: 0.7164

Train F1: 0.2507
Valid F1: 0.2509
Test F1: 0.256

Train Precision: 0.1672
Valid Precision: 0.1667
Test Precision: 0.17

Train Recall: 0.5006
Valid Recall: 0.5071
Test Recall: 0.5188

Train ROC AUC: 0.6858
Valid ROC AUC: 0.6842
Test ROC AUC: 0.6925

Train Log Loss: 0.2952
Valid Log Loss: 0.2922
Test Log Loss: 0.2924



#### Какие проблемы вы здесь видите? Как можно улучшить данный результат?

У каждой модели есть настраиваемые гиперпараметры. Попробуем провести настройку степени регуляризации в нашей модели: возможно, это позволит улучшить результат.

In [122]:
from h2o.grid.grid_search import H2OGridSearch

In [127]:
# Random search over the elastic-net regularization settings (alpha, lambda) of the binomial GLM
hyper_parameters = {'alpha': [0.0001,0.001,0.01,0.3,0.5,1.0], 'lambda': [0.0001,0.001,0.01,0.3,0.5,1.0]}
search_criteria = {'strategy': "RandomDiscrete", 'max_models': 100, 'stopping_tolerance': 0.001, 'stopping_rounds': 3,'seed': 1}
# Select predictors by name rather than by position in the frame
predictors = [name for name in h2o_train.names if name != 'ClaimInd']
# The estimator gets its own seed so every candidate is cross-validated on the same,
# reproducible folds (the seed in search_criteria is a separate setting of the search itself).
grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial', nfolds=5, seed=1),
                     hyper_parameters, search_criteria=search_criteria)
grid.train(y="ClaimInd", x=predictors, training_frame=h2o_train, validation_frame=h2o_valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%


In [128]:
# Rank the grid models by logloss, best (lowest) first.
# get_grid() is used instead of the deprecated grid.sort_by(), which raised a warning here.
grid.get_grid(sort_by='logloss', decreasing=False)

  """Entry point for launching an IPython kernel.



Grid Search Results for H2OGeneralizedLinearEstimator: 


Unnamed: 0,Model Id,"Hyperparameters: [alpha, lambda]",logloss
0,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.0001, 0.0001]",0.295134
1,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.5, 0.0001]",0.295155
2,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.3, 0.0001]",0.295163
3,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.0001, 0.001]",0.295164
4,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.5, 0.001]",0.295243
5,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.001, 0.01]",0.295425
6,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.3, 0.01]",0.295974
7,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.5, 0.01]",0.296448
8,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.0001, 0.3]",0.305618
9,Grid_GLM_Key_Frame__upload_8523d7daaf5242389d79882176a34b45.hex_mo...,"[0.01, 0.3]",0.306259





In [130]:
# Initialise and train the GLM with cross-validation, using the alpha/lambda pair from the
# top row (lowest logloss) of the grid-search table above

# Select predictors by name rather than by position in the frame
predictors = [name for name in h2o_train.names if name != 'ClaimInd']
# A fixed seed makes the cross-validation fold assignment reproducible between runs
glm_tuned = H2OGeneralizedLinearEstimator(family="binomial", link="logit", alpha=0.0001, lambda_=0.0001, nfolds=5, seed=1)
glm_tuned.train(y="ClaimInd", x=predictors, training_frame=h2o_train, validation_frame=h2o_valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [133]:
# Quality metrics of the tuned model on cross-validation: mean, standard deviation and per-fold values

glm_tuned.cross_validation_metrics_summary().as_data_frame()

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.7116688,0.042102564,0.76442695,0.6721638,0.70160896,0.7464049,0.6737394
1,auc,0.68438655,0.0046685487,0.6885501,0.6767957,0.6872302,0.6860871,0.68326974
2,aucpr,0.17027879,0.0058368323,0.17413004,0.16366176,0.17795755,0.16600226,0.16964234
3,err,0.2883312,0.042102564,0.23557307,0.32783622,0.29839104,0.25359508,0.3262606
4,err_count,4647.2,669.53546,3825.0,5268.0,4859.0,4056.0,5228.0
5,f0point5,0.1948363,0.009584579,0.20795318,0.18258607,0.19747567,0.19709724,0.18906936
6,f1,0.2530084,0.0052971113,0.25684866,0.2452722,0.25873378,0.2508312,0.2533562
7,f2,0.36262557,0.020982277,0.33580574,0.37350553,0.37508845,0.3448451,0.38388297
8,lift_top_group,2.7520409,0.21625808,2.9915874,2.6740108,2.9661317,2.5041385,2.6243362
9,logloss,0.29547107,0.0038507476,0.29825634,0.29333338,0.29992846,0.29026464,0.2955726


In [136]:
# Coefficient table of the tuned model (depending on the model, the standard error, z-score and p-value may also be shown)
# NOTE(review): the table is read through the underscore-prefixed `_model_json` attribute.

glm_tuned._model_json['output']['coefficients_table'].as_data_frame()

Unnamed: 0,names,coefficients,standardized_coefficients
0,Intercept,-4.184089,-2.431279
1,Exposure,2.105948,0.609067
2,LicAge,0.001427113,0.22791
3,Gender,2.510607e-05,1.2e-05
4,MariStat,-0.110412,-0.039775
5,DrivAge,0.01253649,0.187977
6,HasKmLimit,-0.3427273,-0.10703
7,BonusMalus,-0.004034127,-0.061925
8,OutUseNb,0.09400241,0.065398
9,RiskArea,0.007236844,0.016034


In [137]:
# Standardized coefficients of the tuned model side by side with each cross-validation model

cv_models = glm_tuned.cross_validation_models()
pmodels = {'overall': glm_tuned.coef_norm()}
pmodels.update({fold: cv_model.coef_norm() for fold, cv_model in enumerate(cv_models)})
pd.DataFrame.from_dict(pmodels).round(5)

Unnamed: 0,overall,0,1,2,3,4
Intercept,-2.43128,-2.43452,-2.43021,-2.43713,-2.42395,-2.43176
Exposure,0.60907,0.60664,0.61787,0.60297,0.60815,0.60896
LicAge,0.22791,0.1809,0.23311,0.24711,0.23948,0.19082
Gender,1e-05,0.00305,-0.00084,-0.01337,0.00587,0.00517
MariStat,-0.03977,-0.03141,-0.02785,-0.03715,-0.04531,-0.05687
DrivAge,0.18798,0.14287,0.10805,0.17872,0.17676,0.18062
HasKmLimit,-0.10703,-0.0951,-0.11318,-0.10631,-0.11083,-0.10994
BonusMalus,-0.06193,-0.11931,-0.02195,-0.01091,-0.01512,-0.04086
OutUseNb,0.0654,0.06841,0.07498,0.05472,0.06377,0.0643
RiskArea,0.01603,0.01558,0.02044,0.01099,0.01196,0.02135


In [138]:
# Predictions of the tuned model for the training, validation and test sets, as pandas frames

train_pred, valid_pred, test_pred = (
    glm_tuned.predict(h2o_frame).as_data_frame()
    for h2o_frame in (h2o_train, h2o_valid, h2o_test)
)

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [139]:
# Print the classification metrics imported above for the train, validation and test sets.
# Label-based metrics are computed from the "predict" column; ROC AUC and log loss use the
# positive-class probability column "p1". One table-driven loop replaces the copy-pasted
# print statements; the printed text is unchanged.

eval_splits = (
    ('Train', y_train, train_pred),
    ('Valid', y_valid, valid_pred),
    ('Test', y_test, test_pred),
)
eval_metrics = (
    ('Accuracy', accuracy_score, 'predict'),
    ('F1', f1_score, 'predict'),
    ('Precision', precision_score, 'predict'),
    ('Recall', recall_score, 'predict'),
    ('ROC AUC', roc_auc_score, 'p1'),
    ('Log Loss', log_loss, 'p1'),
)
for metric_name, metric_fn, pred_column in eval_metrics:
    for split_name, y_true, pred_frame in eval_splits:
        metric_value = metric_fn(y_true, pred_frame[pred_column].values)
        print(f'{split_name} {metric_name}: {np.round(metric_value, 4)}')
    print()  # blank line between metric groups

Train Accuracy: 0.6964
Valid Accuracy: 0.6966
Test Accuracy: 0.6971

Train F1: 0.2514
Valid F1: 0.2499
Test F1: 0.2547

Train Precision: 0.164
Valid Precision: 0.1623
Test Precision: 0.1657

Train Recall: 0.5383
Valid Recall: 0.5426
Test Recall: 0.5502

Train ROC AUC: 0.6859
Valid ROC AUC: 0.6845
Test ROC AUC: 0.6924

Train Log Loss: 0.2951
Valid Log Loss: 0.2922
Test Log Loss: 0.2924



Подбор степени регуляризации с помощью перебора гиперпараметров не помог. Метрики качества остаются на том же уровне, с разницей до 3-го знака после запятой.

In [140]:
# Shut down the H2O cluster now that the analysis is finished (the session is closed, see the message below)
h2o.cluster().shutdown()

H2O session _sid_b4f5 closed.
