In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed.
import gc
import os
import random

import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

random.seed(42)
np.random.seed(42)

In [6]:
# Name of the label column: it is removed from the feature matrix and used as
# the target vector `y` in the cells below.
target_col = "churn"

In [7]:
def get_features(df):
    """Build the modelling frame from the raw per-quarter account table.

    Processing steps, in order:

    1. Drop ``slctn_nmbr``, ``postal_code``, ``phone_number`` and ``email``.
    2. Replace ``region`` with integer codes.
    3. Remove exact duplicate rows, then keep the first row for every
       ``quarter + npo_account_id`` key (stored in the ``id_`` column).
    4. Split ``lst_pmnt_date_per_qrtr`` into day / month / year columns and
       add ``day_sum1 = month * 30 + day``.
    5. Add ``quarter1``, an integer code for ``quarter``.
    6. For each column in the aggregate list add ``<column>_quarter``, the
       mean of that column over all rows of the same ``npo_account_id``.
    7. Add ``quarter_size``, the number of distinct quarters per account.
    8. Add ``quartal``, the last character of ``quarter`` as ``int32``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index; the argument itself is
        not modified.

    Notes
    -----
    Both encoders are fitted on the frame passed in, so the integer codes
    depend on the values present in that frame.
    NOTE(review): calling this separately on train and test data may give
    different codes for the same region/quarter - confirm this is intended.
    """
    df = df.drop(['slctn_nmbr', 'postal_code', 'phone_number', 'email'], axis=1)
    df['region'] = LabelEncoder().fit_transform(df["region"])
    df = df.drop_duplicates()
    df['id_'] = df['quarter'] + df['npo_account_id']
    # Renumber the rows once here; the per-account aggregates below are
    # aligned on the index, and the returned frame keeps this clean index.
    df = df.drop_duplicates("id_").reset_index(drop=True)

    df['lst_pmnt_date_per_qrtr'] = pd.to_datetime(df['lst_pmnt_date_per_qrtr'])
    df['lst_pmnt_date_per_qrtr_day'] = df['lst_pmnt_date_per_qrtr'].dt.day
    df['lst_pmnt_date_per_qrtr_month'] = df['lst_pmnt_date_per_qrtr'].dt.month
    df['lst_pmnt_date_per_qrtr_year'] = df['lst_pmnt_date_per_qrtr'].dt.year
    # Rough day-of-year position that treats every month as 30 days long.
    df['day_sum1'] = df['lst_pmnt_date_per_qrtr_month'] * 30 + df['lst_pmnt_date_per_qrtr_day']
    df['quarter1'] = LabelEncoder().fit_transform(df["quarter"].copy())

    col = ['balance', 'oprtn_sum_per_qrtr', 'oprtn_sum_per_year', 'pmnts_sum', 'incm_per_year',
           'pmnts_nmbr', 'pmnts_sum_per_qrtr', 'pmnts_sum_per_year', 'pmnts_nmbr_per_qrtr',
           'pmnts_nmbr_per_year', 'incm_sum', 'incm_per_qrtr']

    # transform() broadcasts each per-account statistic straight back onto the
    # account's rows, which replaces a groupby + reset_index + left-merge round
    # trip per column while producing the same columns in the same order.
    account_key = df['npo_account_id']
    for c in tqdm(col):
        df[f'{c}_quarter'] = df[c].groupby(account_key).transform('mean')

    df['quarter_size'] = df['quarter'].groupby(account_key).transform('nunique')
    df['quartal'] = df['quarter'].str[-1].astype('int32')
    return df

In [11]:
# Directory that holds train.parquet. Set the CHURN_DATA_DIR environment
# variable to point somewhere else; the fallback is the original location.
DATA_DIR = os.environ.get("CHURN_DATA_DIR", "C:/Users/Andrey/Documents/Ml")

# Read the raw training table and turn it into the modelling frame.
df = get_features(pd.read_parquet(f"{DATA_DIR}/train.parquet"))

100%|██████████| 12/12 [00:48<00:00,  4.01s/it]


In [None]:
# Columns kept out of the feature matrix: the label, the id columns, and the
# raw quarter / date columns.
non_feature_cols = [target_col, 'id_', 'client_id', 'npo_account_id', 'quarter',
                    'lst_pmnt_date_per_qrtr', 'frst_pmnt_date']

X = df.drop(columns=non_feature_cols).values
y = df[target_col].values

# 50/50 hold-out split. random_state is pinned so that re-running this cell
# (in any order) always produces the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [None]:
# Disabled Optuna search over LightGBM hyper-parameters. The whole study is
# wrapped in a string literal, so nothing in this cell is executed.
# NOTE(review): the dict key 'num_leaves ' below ends with a space, so the
# sampled value would not be passed under the name `num_leaves` - fix this
# before re-enabling the search.
'''def objective(trial):
    params = {
        'device':'gpu',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': 1,
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1),
        'n_estimators': 50,
        'num_leaves ': trial.suggest_int('num_leaves', 3, 100)
    }
    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print(study.best_params)
print(study.best_value)
'''

In [None]:
# How many of the 5 stratified folds are actually trained. Every fold fits a
# 5000-iteration CatBoost model and a 2350-tree LightGBM model on GPU, so only
# the first fold is run; raise this (up to 5) for a full cross-validation.
N_FOLDS_TO_RUN = 1

models = []
score_list_acc = []
score_list_auc = []
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The hyper-parameters are identical for every fold, so they are built once.
# Original inline note: "1500" (presumably an earlier n_estimators value).
lgb_params = {'max_depth': 18, 'min_child_samples': 96, 'min_child_weight': 0.13310829983031214, 'num_leaves': 85,
              'n_estimators': 2350, 'device': 'gpu', 'boosting_type': 'gbdt'}

for fold, (train_index, valid_index) in enumerate(strat_kfold.split(X, y)):
    X_train, X_val = X[train_index], X[valid_index]
    y_train, y_val = y[train_index], y[valid_index]

    lgb = LGBMClassifier(**lgb_params)
    # Original inline note: "3000" (presumably an earlier iterations value).
    cat = CatBoostClassifier(iterations=5000, verbose=100, random_state=42, task_type="GPU", devices='0')
    # Soft voting averages the predicted probabilities of the two models.
    model = VotingClassifier(estimators=[('catboost', cat), ('lgbm', lgb)], voting='soft')
    model.fit(X_train, y_train)
    predict = model.predict_proba(X_val)[:, 1]
    models.append(model)

    # Score the fold once and reuse the numbers for both the log and the lists.
    fold_auc = roc_auc_score(y_val, predict)
    fold_acc = accuracy_score(y_val, np.round(predict))
    score_list_auc.append(fold_auc)
    score_list_acc.append(fold_acc)

    print('roc_auc_score', fold_auc)
    print('accuracy_score', fold_acc)
    print()

    if fold + 1 >= N_FOLDS_TO_RUN:
        break

# Averages over the folds that were actually run (one fold by default).
print('roc_auc_score', np.mean(score_list_auc))
print('accuracy_score', np.mean(score_list_acc))

# iter = 1200
#roc_auc_score 0.9967203256838154
#accuracy_score 0.9953078990898689

# iter = 1500
#roc_auc_score 0.9969534858449924
#accuracy_score 0.9954274661400886

#roc_auc_score 0.9984288345924386
#accuracy_score 0.9963405236144924

#quartal+
#roc_auc_score 0.9984746575073541
#accuracy_score 0.9963795431646806

#roc_auc_score 0.9984746575073541
#accuracy_score 0.9963795431646806

#ensemble+
#roc_auc_score 0.9981023210843869
#accuracy_score 0.9967850677755651

#roc_auc_score 0.9981023210843869
#accuracy_score 0.9967850677755651

In [None]:
# Score every row of the training frame with the most recently fitted ensemble.
# NOTE(review): `model` was fitted on a subset of these same rows, so most of
# these probabilities are in-sample.
predict = model.predict_proba(X)[:, 1]

# The selection used to start with an empty column name (""), which is not a
# column created anywhere in this notebook and raises KeyError.
# NOTE(review): "quarter" is presumably what was intended - confirm.
# .copy() gives an independent frame so adding a column does not write into a
# slice of `df`.
predidct_dataframe = df[["quarter", "quartal", "npo_account_id"]].copy()
predidct_dataframe['predict'] = predict
predidct_dataframe.to_csv("train_predict.csv")

In [None]:
def predict_one_npo_account_id(test_data, npo_account_id, impo=False, estimator=None):
    """Return the model's predictions for every row of one account.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Frame containing the feature columns listed below plus ``quarter``
        and ``npo_account_id`` (presumably the output of ``get_features``).
    npo_account_id
        Account identifier to select; compared with ``==`` against the
        ``npo_account_id`` column.
    impo : bool, default False
        If True, ``predict`` holds the second column of ``predict_proba``
        (probability of class 1); otherwise it holds ``predict`` labels.
    estimator : optional
        Fitted model exposing ``predict`` / ``predict_proba``. When omitted,
        the notebook-level ``model`` variable is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``quarter``, ``npo_account_id`` and ``predict``, one row per
        matching input row. Empty (zero rows) if the account is not present.
    """
    # Must list the same columns, in the same order, as the matrix the model
    # was trained on. 'quartal' is the last feature added by get_features and
    # is part of that matrix, so it has to be included here as well.
    col = ['npo_accnts_nmbr', 'pmnts_type', 'year', 'gender', 'age',
           'clnt_cprtn_time_d', 'actv_prd_d', 'lst_pmnt_rcnc_d', 'balance',
           'oprtn_sum_per_qrtr', 'oprtn_sum_per_year', 'frst_pmnt', 'lst_pmnt',
           'pmnts_sum', 'pmnts_nmbr', 'pmnts_sum_per_qrtr', 'pmnts_sum_per_year',
           'pmnts_nmbr_per_qrtr', 'pmnts_nmbr_per_year', 'incm_sum',
           'incm_per_qrtr', 'incm_per_year', 'mgd_accum_period',
           'mgd_payment_period', 'lk', 'assignee_npo', 'assignee_ops', 'region',
           'citizen', 'fact_addrss', 'appl_mrkr', 'evry_qrtr_pmnt',
           'lst_pmnt_date_per_qrtr_day', 'lst_pmnt_date_per_qrtr_month',
           'lst_pmnt_date_per_qrtr_year', 'day_sum1', 'quarter1',
           'balance_quarter', 'oprtn_sum_per_qrtr_quarter',
           'oprtn_sum_per_year_quarter', 'pmnts_sum_quarter',
           'incm_per_year_quarter', 'pmnts_nmbr_quarter',
           'pmnts_sum_per_qrtr_quarter', 'pmnts_sum_per_year_quarter',
           'pmnts_nmbr_per_qrtr_quarter', 'pmnts_nmbr_per_year_quarter',
           'incm_sum_quarter', 'incm_per_qrtr_quarter', 'quarter_size',
           'quartal']

    fitted = model if estimator is None else estimator
    account_rows = test_data.loc[test_data["npo_account_id"] == npo_account_id]

    # Copy so that adding the prediction column never writes into a slice of
    # the caller's frame.
    predidct_dataframe = account_rows[["quarter", "npo_account_id"]].copy()

    if account_rows.empty:
        # Unknown account: return an empty result instead of handing the
        # estimator a zero-row matrix.
        predidct_dataframe['predict'] = np.array([], dtype=float)
        return predidct_dataframe

    features = account_rows[col].values
    if impo:
        predict = fitted.predict_proba(features)[:, 1]
    else:
        predict = fitted.predict(features)

    predidct_dataframe['predict'] = predict
    return predidct_dataframe

In [None]:
# Display the stored predictions for one example account.
predidct_dataframe[predidct_dataframe["npo_account_id"] == "0x9DADF88CB3407C4E89403315F640393E"]

In [None]:
# Line plot of the predicted class-1 (`churn`) probability over the rows of
# one example account; the x axis is the row index in predidct_dataframe.
account_predictions = predidct_dataframe.loc[
    predidct_dataframe["npo_account_id"] == "0x9DADF88CB3407C4E89403315F640393E", "predict"
]
ax = sns.lineplot(data=account_predictions)
ax.set(title="Predicted churn probability for one account",
       xlabel="row index", ylabel="predicted probability");