In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import random

from copy import deepcopy
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import  tqdm
from lifelines.utils import concordance_index

np.random.seed(42)
random.seed(42)

import warnings
warnings.filterwarnings("ignore")

In [19]:
def get_features_clients():
    """Load the train labels and the client table, and label the clients.

    Reads ``data/train.csv`` and ``data/clients.csv``, drops the
    ``bankemplstatus`` column from the clients, label-encodes
    ``employee_count_nm`` and adds a ``target`` column to the clients.
    Clients whose ``user_id`` does not occur in ``train`` get ``target == -1``.

    Returns:
        tuple: ``(clients, train)`` -- the prepared client frame and the
        unmodified train frame.
    """
    train = pd.read_csv("data/train.csv")
    clients = pd.read_csv("data/clients.csv").drop(columns=['bankemplstatus'])
    clients['employee_count_nm'] = LabelEncoder().fit_transform(clients['employee_count_nm'])

    # Look targets up by user_id instead of copying them by row position:
    # a positional copy silently mislabels users whenever the two files are
    # ordered differently, and raises when the row counts differ.
    # If a user_id occurs more than once in train, its first row wins.
    target_by_user = train.drop_duplicates(subset='user_id').set_index('user_id')['target']

    clients['target'] = -1  # placeholder for users without a label
    in_train = clients['user_id'].isin(target_by_user.index)
    clients.loc[in_train, 'target'] = clients.loc[in_train, 'user_id'].map(target_by_user).values
    return clients, train

In [20]:
# Labelled client table plus the raw train frame.
clients, train = get_features_clients()

# Left-join the feature table stored in features_data/ onto the clients,
# keeping every client row even when no features are available for it.
df_all2 = pd.read_parquet("features_data/df_all.parquet")
clients = pd.merge(clients, df_all2, how='left', on='user_id')

In [4]:
# Rows whose target is not the -1 placeholder form the training set.
train_new = clients[clients['target'] != -1]

# Label vector and feature frame (identifier and label columns removed).
y = train_new['target'].values
X = train_new.drop(columns=['user_id', 'target'])

In [13]:
# Keyword arguments forwarded to XGBClassifier.
param_xgb = {
    'lambda': 0.02952384119001387,
    'alpha': 9.073365018418769e-05,
    'colsample_bytree': 0.25,
    'subsample': 0.8,
    'learning_rate': 0.017846526484495713,
    'n_estimators': 500,
    'max_depth': 8,
    'min_child_weight': 28,
}

# Keyword arguments forwarded to LGBMClassifier.
param_lgb = {
    'lambda_l1': 3.629026441945485,
    'lambda_l2': 8.803696026000702,
    'num_leaves': 500,
    'feature_fraction': 0.55,
    'bagging_fraction': 0.50458230379904,
    'bagging_freq': 1,
    'min_child_samples': 100,
    'max_depth': 7,
    'n_estimators': 1500,
    'learning_rate': 0.008,
}

# Keyword arguments forwarded to CatBoostClassifier.
param_cat = {
    'learning_rate': 0.04886319540870409,
    'depth': 5,
    'l2_leaf_reg': 4.509225581343402,
    'bagging_temperature': 29.365597561189034,
}
# 5-fold stratified cross-validation of a soft-voting ensemble
# (CatBoost + XGBoost + LightGBM). Fitted fold models are kept in `models`
# and the per-fold concordance index in `score_list`.
models, score_list = [], []
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Convert once; the array is the same for every fold.
X_values = X.values

# Generate the folds from X / y themselves, not from `train`: the indices are
# applied positionally to X and y, so they have to come from data with the
# same length and row order, and stratification has to use the labels that
# are actually being trained on.
for train_index, valid_index in strat_kfold.split(X_values, y):
    X_train, X_val = X_values[train_index], X_values[valid_index]
    y_train, y_val = y[train_index], y[valid_index]

    model = VotingClassifier([
        ('cat', CatBoostClassifier(iterations=1500, verbose=0, random_state=42, **param_cat)),
        ('xgb', XGBClassifier(random_state=45, **param_xgb)),
        ('lgb', LGBMClassifier(seed=35, **param_lgb)),
    ], voting='soft')

    model.fit(X_train, y_train)

    # Column 1 of predict_proba (second class) is used as the ranking score.
    predict = model.predict_proba(X_val)[:, 1]
    fold_score = concordance_index(y_val, predict)

    models.append(model)
    score_list.append(fold_score)
    print(fold_score)

# Mean score over all folds.
print(np.mean(score_list))

# cv = 0.7868718879153279, lb = 0.7885853384520891
# catboost = 0.782391805532867

0.7804988702874366
0.7884091041806103
0.787184943533285
0.7797748592456905
0.7859880958830073
0.784371174626006


In [6]:
# Per-fold importances of the estimator at index 1 of each voting ensemble
# (the 'xgb' entry in the estimator list passed to VotingClassifier).
xgb_importances = [fold_model.estimators_[1].feature_importances_ for fold_model in models]

# Average over folds and rank the features from most to least important.
data = (
    pd.DataFrame({'col': X.columns, 'val': np.mean(xgb_importances, axis=0)})
    .sort_values(by='val', ascending=False)
)

In [7]:
# The users to score are the ones listed in the sample submission file.
submission_ids = pd.read_csv('data/sample_submit_naive.csv')['user_id'].unique()
test = clients[clients['user_id'].isin(submission_ids)]

# Same column drop as for the training matrix.
X_test = test.drop(columns=['user_id', 'target'])

In [12]:
# Score the test users with every fold model and blend the fold predictions.
predict_mean = []
X_test_values = X_test.values  # convert once, reused by every fold model

for m in models:
    print("Модель начинает обработку")
    predict_mean.append(m.predict_proba(X_test_values)[:, 1])

# Average the fold probabilities rather than summing them: the blended value
# then stays in [0, 1]. Sum and mean differ only by a constant factor, so the
# ordering of users is unchanged.
predict_test = np.mean(predict_mean, axis=0)

# assign() returns a new frame, so the filtered `test` slice is not mutated.
test = test.assign(predict=predict_test)
# Score noted by the author for this submission: 0.7888975433996723
test[['user_id', 'predict']].to_csv('sub/submission_ensemble.csv', index=False)