In [None]:
import numpy as np
import pandas as pd
import swifter
import re
import json
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from hashlib import sha1
from collections import defaultdict
from model_saver import save_params_scores
# Default figure size used by every plot in this notebook that does not pass its own figsize.
plt.rcParams['figure.figsize'] = (10, 10)

In [None]:
def proc_str(ss: str) -> str:
    """Normalise a raw pack-name string into a lowercase, space-separated form.

    The function is one long chain of literal ``str.replace`` calls, applied
    strictly left to right. Roughly, it:

    * lowercases the text and tightens spacing around ``:``, ``;`` and ``=``;
    * rewrites network keywords (``on-net``/``onnet``/``on net``,
      ``all-net``/``allnet``, ``off net``/``off``) and a handful of named
      offers (``cvm``, ``evc``, ``jokko``, ...) into padded marker words;
    * glues amounts to a unit suffix (e.g. ``'500 '`` -> ``'500f '``,
      ``'20k'`` -> ``'20000f '``, ``'1 month'``/``'30 days'`` -> ``'30d'``);
    * turns the separators ``= ; , : ( )`` into spaces and collapses repeated
      spaces (three passes of ``'  '`` -> ``' '``);
    * strips, lowercases again, and finally re-attaches ``offnet`` / ``onnet`` /
      ``allnet`` to the token in front of them.

    The order of the replacements is load-bearing: some steps deliberately
    undo collateral damage from earlier ones. For example ``'on'`` -> ``'ONNET'``
    turns ``month`` into ``mONNETth`` and ``zone`` into ``zONNETe``, and later
    replacements restore them. Do not reorder or merge steps without re-checking
    the output on the real pack names.

    Parameters
    ----------
    ss : str
        Raw pack name.

    Returns
    -------
    str
        The normalised, stripped, lowercase string.
    """
    return ss.replace(' Go', 'Go').lower().replace(': ', ':').replace('+', ' ').replace('; ', ';').replace(' ;', ';'
           ).replace('=5000', ' 5000f').replace(' =', '=').replace('= ', '=').replace('on-net', ' ONNET ').replace('onnet', ' ONNET '
           ).replace('on net', ' ONNET ').replace('all-net', ' ALLNET ').replace('allnet', ' ALLNET '
           ).replace('_', ' ').replace('1 month', '30d').replace('24', ' 24').replace(' f=', 'f='
           ).replace('0 f', '0f').replace('0=', '0f=').replace('zone ', 'ZONE').replace('0o', '0f o'
           ).replace('cvm', ' CVM ').replace('apanews', ' APANEWS ').replace('evc', ' EVC '
           ).replace('fifa ts', ' FIFATS ').replace('jokko', ' JOKKO ').replace('ivr', ' IVR '
           ).replace('mixt', ' MIXT ').replace('youth', 'youth ').replace('youth', ' YOUTH '
           ).replace('new clir', ' NEWCLIR ').replace('pilot', ' PILOT ').replace('wifi family', ' WIFIFAMILY '
           ).replace('fnf', 'FNF').replace('supermagik ', 'SUPERMAGIK').replace('0 o', '0f o'
           ).replace('unlimited', 'UNLIMITED').replace('30 days', '30d').replace('0 mo', '0mo'
           ).replace(' mb', 'mb').replace('=', ' ').replace(';', ' ').replace(',', ' ').replace(':', ' '
           ).replace('(', ' ').replace(')', ' ').replace('0f', '0f ').replace('off net', 'offnet').replace('mega', 'mega '
           ).replace('12500', '12500f').replace('off', 'offnet').replace('on', 'ONNET').replace('20k', '20000f '
           ).replace('80k', '80000f ').replace('offnetnet', 'offnet').replace('offnet', ' offnet ').replace('500 ', '500f '
           ).replace('600 ', '600f ').replace('5000 ', '5000f ').replace('mONNETth', 'month ').replace('1250  O', '1250f O'
           ).replace('500f 5000', '500f 5000f').replace('zONNETe', 'zone').replace('0ff', '0f').replace('1250 ONNET', '1250f ONNET'
           ).replace('00h-08h', '8h').replace('offnet er', 'offer').replace('4000 ', '4000f ').replace('O 30', 'O 30d'
           ).replace('24h/', '24h ').replace('equal', 'f = ').replace('ALLNET 1000 ', 'ALLNET 1000 ').replace('  ', ' '
           ).replace('  ', ' ').replace('  ', ' ').strip().lower().replace(' offnet', 'offnet').replace(' onnet', 'onnet').replace(' allnet', 'allnet'
           ).replace('fonnete', 'fonnet')

def embd_string(ss):
    """Embed one pack name as word counts followed by one numeric slot per unit.

    The name is normalised with ``proc_str``. Every ``<number><unit>`` token
    (e.g. ``500f``, ``30d``, ``1.5go``) is pulled out with a regex; what is left
    is vectorised with the module-level, already fitted ``cvec``
    (a ``CountVectorizer``), and the extracted numbers are appended as a fixed
    block of 24 values, one per known unit.

    Slot rules:

    * a unit that does not occur gets ``0``;
    * a unit that occurs once gets its value;
    * a unit that occurs more than once keeps its first value, and its second
      value is written to the shared ``'f2'`` slot (if several units repeat,
      the last one in unit order wins); any further values are dropped.

    Parameters
    ----------
    ss : str
        Raw pack name.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` vector of length ``len(cvec vocabulary) + 24``.

    Raises
    ------
    KeyError
        If the normalised string contains a number followed by a unit that is
        not in the known unit list (usually a sign that ``proc_str`` needs
        another rule).
    NameError
        If no fitted ``cvec`` exists at module scope when this is called.
    """
    units = ('milf', 'f', 'f2', 'fonnet', 'fallnet', 'foffnet', 'mnonnet', 'mnoffnet', 'mnallnet',
             'honnet', 'hoffnet', 'hallnet', 'mo', 'go', 'sms', 'd', 'gportal', 'day', 'mb', 'gb',
             'mbps', 'h', 'opia', 'mn')
    number_unit = r'([\d\.]+)([^\d\s]+)'

    string_processed = proc_str(ss)

    # Insertion order of this dict fixes the order of the numeric slots in the output.
    num_groups_dict = {tp: [] for tp in units}
    for num, tp in re.findall(number_unit, string_processed):
        if tp not in num_groups_dict:
            raise KeyError(f'unknown unit {tp!r} in {string_processed!r} (from {ss!r})')
        num_groups_dict[tp].append(float(num))

    for tp in units:
        values = num_groups_dict[tp]
        if not values:
            values.append(0)
        elif len(values) > 1:
            # Second occurrence goes to the shared overflow slot; the rest is dropped.
            num_groups_dict['f2'] = [values[1]]
            num_groups_dict[tp] = [values[0]]

    # Vectorise the processed string itself, with the number+unit tokens removed.
    words_only = re.sub(number_unit, '', string_processed)
    word_counts = cvec.transform([words_only]).toarray()[0]
    numeric_slots = np.array(list(num_groups_dict.values())).flatten()
    return np.concatenate([word_counts, numeric_slots], dtype=np.float32)


def create_mapping(dftr, dfts):
    """Build a ``TOP_PACK`` -> embedding-vector lookup covering train and test.

    Steps:

    1. collect the sorted union of non-missing ``TOP_PACK`` values of both frames;
    2. fit a whitespace-token ``CountVectorizer`` on the normalised pack names
       with their ``<number><unit>`` tokens removed;
    3. embed every pack name with ``embd_string``;
    4. add an all-zero vector for the missing-value key.

    The fitted vectorizer is published as the module-level name ``cvec``,
    because ``embd_string`` looks it up at module scope.

    Parameters
    ----------
    dftr, dfts : pandas.DataFrame
        Train and test frames; both must have a ``TOP_PACK`` column.

    Returns
    -------
    (dict, int)
        The mapping ``{pack name: float32 vector}`` (plus the missing-value
        key) and the length of each vector.

    Raises
    ------
    ValueError
        If neither frame contains a non-missing ``TOP_PACK`` value.
    """
    global cvec  # embd_string() reads the fitted vectorizer from module scope

    pack_names = sorted(
        set(dftr['TOP_PACK'].dropna().unique()) | set(dfts['TOP_PACK'].dropna().unique())
    )
    if not pack_names:
        raise ValueError('no non-missing TOP_PACK values to build a mapping from')

    # The vocabulary must be fitted before any call to embd_string().
    to_process = [proc_str(ss) for ss in pack_names]
    cvec = CountVectorizer(token_pattern=r'\S+')
    cvec.fit([re.sub(r'([\d\.]+)([^\d\s]+)', '', wtp) for wtp in to_process])

    mapping_toppack = {ss: embd_string(ss) for ss in pack_names}
    embedding_size = len(mapping_toppack[pack_names[-1]])

    # Key the zero vector by the NaN object stored in the test frame when there
    # is one; fall back to np.nan when the test frame has no missing TOP_PACK.
    missing_values = dfts.loc[dfts['TOP_PACK'].isna(), 'TOP_PACK'].values
    nan_key = missing_values[0] if len(missing_values) else np.nan
    mapping_toppack[nan_key] = np.zeros(embedding_size, dtype=np.float32)

    return mapping_toppack, embedding_size

In [None]:
def gb_features(dftr, dfts, categorical_column, continious_column_can_be_filled):
    """Add two group-mean based features for one (categorical, continuous) pair.

    For every row the baseline is the train-set mean of the raw continuous
    column within the row's category. Rows whose category is missing use the
    train-set mean of the raw column over the rows with a missing category.
    Two columns are then written to both frames, in place:

    * ``GB_DIFF_FEATURE__<cat>__<cont>`` = ``<cont>__FILLED`` - baseline
    * ``GB_FEATURE__<cat>__<cont>``      = diff + ``<cont>__FILLED``

    A category that occurs in ``dfts`` but not in ``dftr`` has no baseline and
    yields NaN in both new columns.

    Parameters
    ----------
    dftr, dfts : pandas.DataFrame
        Train and test frames. Both need ``categorical_column`` and
        ``<continious_column_can_be_filled>__FILLED``; ``dftr`` also needs the
        raw continuous column.
    categorical_column : str
        Grouping column.
    continious_column_can_be_filled : str
        Name of the raw continuous column.

    Returns
    -------
    (DataFrame, DataFrame, list[str])
        The two (mutated) input frames and ``[diff_name, feature_name]``.
    """
    filled_column = continious_column_can_be_filled + '__FILLED'
    feature_name_diff = f'GB_DIFF_FEATURE__{categorical_column}__{continious_column_can_be_filled}'
    feature_name = f'GB_FEATURE__{categorical_column}__{continious_column_can_be_filled}'

    # Baselines come from the training frame only.
    group_means = dftr.groupby([categorical_column])[continious_column_can_be_filled].mean()
    na_mean = dftr.loc[dftr[categorical_column].isna(), continious_column_can_be_filled].mean()

    for frame in (dftr, dfts):
        category_missing = frame[categorical_column].isna()
        # Vectorised lookup; categories without a train mean become NaN.
        baseline = frame[categorical_column].map(group_means).astype(float)
        baseline = baseline.where(~category_missing, na_mean)
        frame[feature_name_diff] = frame[filled_column] - baseline
        # NOTE(review): this equals 2 * filled - baseline, not the baseline itself.
        # Kept as in the original; confirm that is the intended feature.
        frame[feature_name] = frame[feature_name_diff] + frame[filled_column]

    return dftr, dfts, [feature_name_diff, feature_name]

In [None]:
def rank_target_encoding(dftr, dfts, feature_name):
    """Encode a categorical column by the rank of its mean ``CHURN`` in train.

    Each level of ``feature_name`` is replaced by the 0-based position of its
    train-set mean ``CHURN`` among all level means sorted ascending. Levels
    that share the same mean share the highest position of the tie. The result
    is written to ``<feature_name>__RANK_ENCODED`` on both frames in place;
    levels that never occur in ``dftr`` become NaN.

    Returns the two frames and the name of the new column.
    """
    encoded_name = feature_name + '__RANK_ENCODED'

    churn_rate_by_level = dict(dftr.groupby(feature_name)['CHURN'].mean())
    sorted_rates = np.sort(dftr.groupby(feature_name)['CHURN'].mean())
    # Later duplicates overwrite earlier ones, so tied rates keep the largest rank.
    rank_by_rate = dict(zip(sorted_rates, np.arange(len(churn_rate_by_level))))

    level_to_rank = {}
    for level, rate in churn_rate_by_level.items():
        level_to_rank[level] = rank_by_rate[rate]

    for frame in (dftr, dfts):
        frame[encoded_name] = frame[feature_name].map(level_to_rank)

    return dftr, dfts, encoded_name

In [None]:
def add_features(dftr, dfts):
    """Engineer every model feature on the train and test frames.

    Both frames are modified in place (new columns are added) and returned
    together with the ordered list of new feature names. The name of each
    feature is printed as it is created.

    Feature groups, in creation order:

    1. ``NANS_NUM``: number of missing values per row (computed first, before
       any column is added);
    2. zero-/median-filled copies of the raw numeric columns (``*__FILLED``;
       medians are taken from the train frame) and arithmetic/log combinations;
    3. group-mean features from ``gb_features`` for every
       (continuous, categorical) pair;
    4. rank target encodings of ``REGION``, ``TENURE`` and ``TOP_PACK``
       (missing values are first replaced by ``'unk'``);
    5. ``user_id_int``: the integer ``i`` in ``range(10000000)`` whose
       ``sha1(str(i))`` hex digest equals ``user_id``, or ``-1`` if none does.

    Parameters
    ----------
    dftr, dfts : pandas.DataFrame
        Raw train and test frames. ``dftr`` must contain ``CHURN``.

    Returns
    -------
    (DataFrame, DataFrame, list[str])
        The two frames and the list of new feature names.
    """
    new_features = []

    def _add(name, compute):
        # Apply the same recipe to train and test, then record and print the name.
        for frame in (dftr, dfts):
            frame[name] = compute(frame)
        new_features.append(name)
        print(name, end=', ')

    def _fill(column, value=0):
        # '<column>__FILLED' = raw column with missing values replaced by `value`.
        _add(column + '__FILLED', lambda frame: frame[column].fillna(value))

    # # # # #
    # Must come first so that only the raw columns are counted.
    _add('NANS_NUM', lambda frame: frame.isna().sum(axis=1))

    # # # # #
    for column in ('REVENUE', 'FREQUENCE', 'FREQUENCE_RECH'):
        _fill(column)
    _add('REVENUE__*__FREQUENCE_RECH',
         lambda frame: frame['REVENUE__FILLED'] * frame['FREQUENCE_RECH__FILLED'])
    _add('REVENUE__*__FREQUENCE',
         lambda frame: frame['REVENUE__FILLED'] * frame['FREQUENCE__FILLED'])
    _add('REVENUE__*__FREQUENCE__-__REVENUE__*__FREQUENCE_RECH',
         lambda frame: frame['REVENUE__*__FREQUENCE'] - frame['REVENUE__*__FREQUENCE_RECH'])

    # # # # #
    _fill('FREQ_TOP_PACK')
    # Medians come from the training frame and are reused for the test frame.
    _fill('MONTANT', dftr['MONTANT'].median())
    _add('LOG__MONTANT', lambda frame: frame['MONTANT__FILLED'].apply(np.log))
    _fill('DATA_VOLUME', dftr['DATA_VOLUME'].median())
    _add('LOG1P__DATA_VOLUME', lambda frame: frame['DATA_VOLUME__FILLED'].apply(np.log1p))

    # # # # #
    for column in ('ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2'):
        _fill(column)
    _add('ON_NET__-__ZONES',
         lambda frame: frame['ON_NET__FILLED'] - (frame['ZONE2__FILLED'] + frame['ZONE1__FILLED']))
    _add('ON_NET__-__OT',
         lambda frame: frame['ON_NET__FILLED'] - (frame['ORANGE__FILLED'] + frame['TIGO__FILLED']))
    _add('LOG__DATA_VOLUME__+1__/__ON_NET__+1',
         lambda frame: ((frame['DATA_VOLUME__FILLED'] + 1) / (frame['ON_NET__FILLED'] + 1)).apply(np.log))

    # # # # #
    _fill('ARPU_SEGMENT')

    def _arpu_over_revenue(frame):
        arpu = frame['ARPU_SEGMENT__FILLED']
        revenue = frame['REVENUE__FILLED']
        # 0 / 0 is defined as 1 here; other divisions by zero are left as they are.
        return (arpu / revenue).mask((arpu == 0) & (revenue == 0), 1)

    _add('ARPU_SEGMENT__/__REVENUE', _arpu_over_revenue)
    _add('LOG1P__REVENUE__-__ARPU_SEGMENT',
         lambda frame: (frame['REVENUE__FILLED'] - frame['ARPU_SEGMENT__FILLED']).apply(np.log1p))

    # # # # #
    categorical_features = ['TENURE', 'REGION']
    continuous_features = ['REVENUE', 'MONTANT', 'ARPU_SEGMENT', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2']

    for conf in continuous_features:
        for catf in categorical_features:
            dftr, dfts, new_feature_names = gb_features(dftr, dfts, catf, conf)
            new_features += new_feature_names
            print(new_features[-2:], end=', ')

    # # # # #
    for column in ('REGION', 'TENURE', 'TOP_PACK'):
        filled_column = column + '__FILLED'
        dftr[filled_column] = dftr[column].fillna('unk')
        dfts[filled_column] = dfts[column].fillna('unk')
        dftr, dfts, fname = rank_target_encoding(dftr, dfts, filled_column)
        if column == 'TOP_PACK':
            # Packs that appear only in the test frame have no rank; give them the
            # rank of the 'unk' level when the train frame has one, else leave NaN.
            unk_rank = dftr.loc[dftr['TOP_PACK'].isna(), fname].values
            if len(unk_rank):
                dfts.loc[dfts[fname].isna(), fname] = unk_rank[0]
        new_features.append(fname)
        print(fname, end=', ')

    # # # # #
    # Reverse lookup from sha1 hex digest to the integer that produced it.
    # Series.map uses the defaultdict's default (-1) for ids that are not found.
    # This builds ten million hashes, so it is slow and memory hungry.
    sha1_hashes = defaultdict(lambda: -1, {sha1(str(i).encode('utf-8')).hexdigest(): i for i in range(10000000)})
    dftr['user_id_int'] = dftr['user_id'].map(sha1_hashes)
    dfts['user_id_int'] = dfts['user_id'].map(sha1_hashes)
    new_features.append('user_id_int')
    print(new_features[-1])

    return dftr, dfts, new_features

In [None]:
# Read the raw train/test tables and engineer features on both in one step.
dftr, dfts, new_features = add_features(
    pd.read_csv('../data_orig/Train.csv'),
    pd.read_csv('../data_orig/Test.csv'),
)

# Candidate predictors: REGULARITY plus every engineered feature, de-duplicated
# and sorted by name. Indexing dftr also checks that every column exists.
predictors = ['REGULARITY'] + new_features
predictors = sorted(set(dftr[predictors].columns))

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [None]:
def get_feature_importances(data, shuffle, predictors, target, seed=None):
    """Train one LightGBM model and return its per-feature importances.

    With ``shuffle=True`` the target values are randomly permuted before
    training, which yields a "null" importance run: any importance a feature
    gets then comes from fitting noise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictor columns and the target column.
    shuffle : bool
        Whether to permute the target before training.
    predictors : list[str]
        Feature column names.
    target : str
        Target column name.
    seed : int or None
        Passed to LightGBM as ``seed`` and used as ``random_state`` of the
        permutation, so a given seed makes a shuffled run reproducible.
        ``None`` leaves both unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per predictor with columns ``feature``, ``importance_gain``,
        ``importance_split`` and ``trn_score`` (training ROC AUC against the
        labels the model was trained on, repeated on every row).
    """
    # Labels are handed over as plain arrays so that features and labels are
    # matched by position; the permuted Series would otherwise keep its old index.
    if shuffle:
        y = data[target].sample(frac=1.0, random_state=seed).to_numpy()
    else:
        y = data[target].to_numpy()

    # Dataset's `silent` argument no longer exists in LightGBM 4; log output is
    # silenced through the `verbose` parameter below instead.
    dtrain = lgb.Dataset(data[predictors], y, free_raw_data=False)
    lgb_params = {
        'objective': 'binary',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'seed': seed,
        'bagging_freq': 1,
        'n_jobs': -1,
        'verbose': -1,
    }

    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200)

    imp_df = pd.DataFrame({
        'feature': predictors,
        'importance_gain': clf.feature_importance(importance_type='gain'),
        'importance_split': clf.feature_importance(importance_type='split'),
    })
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(data[predictors]))

    return imp_df

In [None]:
# Null-importance distribution: retrain `nb_runs` times on a permuted target and
# stack the per-feature importances of every run into `null_imp_df`.
nb_runs = 80
start = time.time()
dsp = ''
null_runs = []

for i in range(nb_runs):
    imp_df = get_feature_importances(data=dftr, shuffle=True, predictors=predictors, target='CHURN')
    imp_df['run'] = i + 1
    # Collect and concatenate once at the end; concat inside the loop re-copies
    # every previous run on each iteration.
    null_runs.append(imp_df)

    # Erase the previous progress line with backspaces, then print the new one.
    print('\b' * len(dsp), end='', flush=True)
    spent = (time.time() - start) / 60
    dsp = f'Done with {i + 1} of {nb_runs} (Spent {spent:5.1f} min)'
    print(dsp, end='', flush=True)

null_imp_df = pd.concat(null_runs, axis=0) if null_runs else pd.DataFrame()

In [None]:
# Reference run on the real (unshuffled) target, with numpy and LightGBM seeded.
np.random.seed(3333)
actual_imp_df = get_feature_importances(
    data=dftr,
    shuffle=False,
    predictors=predictors,
    target='CHURN',
    seed=3333,
)

In [None]:
def display_distributions(actual_imp_df_, null_imp_df_, feature_):
    """Plot null vs. actual importance of one feature, split and gain side by side.

    Each panel shows a histogram of the feature's importance over the null
    (shuffled-target) runs and a red vertical line at the mean importance
    obtained with the real target. The left panel uses split importance, the
    right panel gain importance. A new 13x6 figure is created on every call;
    nothing is returned.
    """
    panels = (
        ('importance_split', 'Split Importance of %s', 'Null Importance (split) Distribution for %s '),
        ('importance_gain', 'Gain Importance of %s', 'Null Importance (gain) Distribution for %s '),
    )
    feature_label = feature_.upper()
    null_rows = null_imp_df_['feature'] == feature_
    actual_rows = actual_imp_df_['feature'] == feature_

    plt.figure(figsize=(13, 6))
    gs = gridspec.GridSpec(1, 2)
    for position, (column, title, xlabel) in enumerate(panels):
        ax = plt.subplot(gs[0, position])
        counts, _, _ = ax.hist(null_imp_df_.loc[null_rows, column].values, label='Null importances')
        # The marker line is as tall as the highest histogram bar.
        ax.vlines(x=actual_imp_df_.loc[actual_rows, column].mean(),
                  ymin=0, ymax=np.max(counts), color='r', linewidth=10, label='Real Target')
        ax.legend()
        ax.set_title(title % feature_label, fontweight='bold')
        ax.set_xlabel(xlabel % feature_label)

In [None]:
# One null-vs-actual figure per predictor. Each figure is rendered and closed
# before the next one is created, so the loop never holds dozens of open figures.
for ff in predictors:
    display_distributions(actual_imp_df_=actual_imp_df, null_imp_df_=null_imp_df, feature_=ff)
    plt.show()
    plt.close()

In [None]:
def _pct_null_below_actual(feature, importance_column):
    """Percentage of null runs whose importance is below the actual importance.

    The actual importance is summarised by its 25th percentile.
    """
    null_values = null_imp_df.loc[null_imp_df['feature'] == feature, importance_column].values
    actual_values = actual_imp_df.loc[actual_imp_df['feature'] == feature, importance_column].values
    return 100 * (null_values < np.percentile(actual_values, 25)).sum() / null_values.size


# Score every feature on both importance types: (feature, split_score, gain_score).
correlation_scores = [
    (
        feature,
        _pct_null_below_actual(feature, 'importance_split'),
        _pct_null_below_actual(feature, 'importance_gain'),
    )
    for feature in actual_imp_df['feature'].unique()
]

corr_scores_df = pd.DataFrame(correlation_scores, columns=['feature', 'split_score', 'gain_score'])

# Bar charts of the 70 best-scoring features, split scores left and gain scores right.
fig = plt.figure(figsize=(16, 16))
gs = gridspec.GridSpec(1, 2)
for position, (score_column, panel_title) in enumerate((
    ('split_score', 'Feature scores wrt split importances'),
    ('gain_score', 'Feature scores wrt gain importances'),
)):
    ax = plt.subplot(gs[0, position])
    top_rows = corr_scores_df.sort_values(score_column, ascending=False).iloc[0:70]
    sns.barplot(x=score_column, y='feature', data=top_rows, ax=ax)
    ax.set_title(panel_title, fontweight='bold', fontsize=14)
plt.tight_layout()
plt.suptitle("Features' split and gain scores", fontweight='bold', fontsize=16)
# Leave room at the top for the figure-level title.
fig.subplots_adjust(top=0.93)

In [None]:
# Keep the features whose actual split importance beat at least one null run.
keeps_feature = corr_scores_df['split_score'] > 0
predictors2 = corr_scores_df.loc[keeps_feature, 'feature'].tolist()

In [None]:
# Final model: DART-boosted LightGBM classifier trained on the selected features.
mdl = lgb.LGBMClassifier(
    boosting_type='dart',
    class_weight='balanced',
    n_estimators=1000,
    num_leaves=64,
    learning_rate=0.03,
    reg_lambda=0.00001,
    reg_alpha=0.00001,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=3333,
    verbose=1,
)
mdl.fit(dftr[predictors2], dftr['CHURN'])

# Keep the second probability column (index 1) of predict_proba for the test rows.
test_probabilities = mdl.predict_proba(dfts[predictors2])
preds_test = test_probabilities[:, 1]

In [None]:
# Write the submission file. The CHURN column holds 1 - prediction rather than
# the prediction itself; the original note says this is done to hide the score.
sub = dfts[['user_id']].copy()
sub['CHURN'] = 1 - preds_test  # hide score
sub.to_csv('../submissions/new_features__first.csv', index=False)