In [None]:
import numpy as np
import pandas as pd
import swifter

import re
import json
import time
from collections import defaultdict, Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from scipy import stats
from scipy.signal import savgol_filter

from hashlib import sha1

from model_saver import save_params_scores


import warnings
from sklearn.exceptions import DataConversionWarning
# Notebook-wide settings.
# Silence sklearn's DataConversionWarning for the whole session.
warnings.filterwarnings('ignore', category=DataConversionWarning)
# Default to square 10x10 figures.
plt.rcParams.update({'figure.figsize': (10, 10)})
# Show up to 120 columns when a frame is displayed.
pd.set_option('display.max_columns', 120)
# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)

In [None]:
def gb_features(df, cat, num):
    """Add group-mean features of ``num`` computed within each level of ``cat``.

    Two columns are written onto ``df`` in place:

    * ``GB_FEATURE__{cat}__{num}``      - mean of ``num`` inside the row's ``cat`` group
    * ``GB_DIFF_FEATURE__{cat}__{num}`` - the row's ``num`` minus that group mean

    Returns the same ``df`` object together with the two new column names
    (difference column first, mean column second).
    """
    mean_col = f'GB_FEATURE__{cat}__{num}'
    diff_col = f'GB_DIFF_FEATURE__{cat}__{num}'

    # dropna=False keeps rows whose category is missing as a group of their own.
    group_means = dict(df.groupby(cat, dropna=False)[num].mean())

    df[mean_col] = df[cat].map(group_means)
    df[diff_col] = df[num] - df[mean_col]

    return df, [diff_col, mean_col]

In [None]:
# Hand-written lookup from a TOP_PACK label to six numeric descriptors.
# add_features() expands the six positions, in order, into the columns
#   TOP_PACK_l, TOP_PACK_r, TOP_PACK_p, TOP_PACK_m, TOP_PACK_i, TOP_PACK_t
#
# Judging from the keys, the slots presumably mean (TODO confirm with the author):
#   l - validity length in days (fractions for hour-based packs)
#   r - renewal period in days for recurring subscriptions
#   p - price in F
#   m - call credit in F, with -1 standing for "unlimited"
#   i - data volume in MB, with -1 standing for "unlimited"
#   t - pack type code in 1..5
# float('nan') marks "not applicable / unknown" for a slot.
#
# The float('nan') key near the end is the row used for missing TOP_PACK values.
top_pack_mapping = {
    '1000=Unlimited7Day': [7, float('nan'), 1000, -1, float('nan'), 1],
    '1500=Unlimited7Day': [7, float('nan'), 1500, -1, float('nan'), 1],
    '150=unlimited pilot auto': [float('nan'), float('nan'), 150, -1, float('nan'), 1],
    '200=Unlimited1Day': [1, float('nan'), 200, -1, float('nan'), 1],
    '200=unlimited pilot auto': [float('nan'), float('nan'), 200, -1, float('nan'), 1],
    '200F=10mnOnNetValid1H': [0.04167, float('nan'), 200, float('nan'), float('nan'), 2],
    '301765007': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    '305155009': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    '500=Unlimited3Day': [3, float('nan'), 500, -1, float('nan'), 1],
    'APANews_monthly': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 4],
    'APANews_weekly': [float('nan'), 7, float('nan'), float('nan'), float('nan'), 4],
    'All-net 1000=5000;5d': [5, float('nan'), 1000, 5000, float('nan'), 3],
    'All-net 1000F=(3000F On+3000F Off);5d': [5, float('nan'), 1000, 6000, float('nan'), 3],
    'All-net 300=600;2d': [2, float('nan'), 300, 600, float('nan'), 3],
    'All-net 5000= 20000off+20000on;30d': [30, float('nan'), 5000, 40000, float('nan'), 3],
    'All-net 500= 4000off+4000on;24H': [1, float('nan'), 500, 8000, float('nan'), 3],
    'All-net 500F =2000F_AllNet_Unlimited': [float('nan'), float('nan'), 500, -1, float('nan'), 3],
    'All-net 500F=1250F_AllNet_1250_Onnet;48h': [2, float('nan'), 500, 2500, float('nan'), 3],
    'All-net 500F=2000F;5d': [5, float('nan'), 500, 2000, float('nan'), 3],
    'All-net 500F=4000F ; 5d': [5, float('nan'), 500, 4000, float('nan'), 3],
    'All-net 600F= 3000F ;5d': [5, float('nan'), 600, 3000, float('nan'), 3],
    'CVM_100F_unlimited': [float('nan'), float('nan'), 100, -1, float('nan'), 3],
    'CVM_100f=200 MB': [float('nan'), float('nan'), 100, float('nan'), 200, 5],
    'CVM_100f=500 onNet': [float('nan'), float('nan'), 100, 500, float('nan'), 3],
    'CVM_150F_unlimited': [float('nan'), float('nan'), 150, -1, float('nan'), 3],
    'CVM_200f=400MB': [float('nan'), float('nan'), 200, float('nan'), 400, 5],
    'CVM_500f=2GB': [float('nan'), float('nan'), 500, float('nan'), 2000, 5],
    'CVM_On-net 1300f=12500': [float('nan'), float('nan'), 1300, 12500, float('nan'), 3],
    'CVM_On-net 400f=2200F': [float('nan'), float('nan'), 400, 2200, float('nan'), 3],
    'CVM_on-net bundle 500=5000': [float('nan'), float('nan'), 500, 5000, float('nan'), 3],
    'Data: 100 F=40MB,24H': [1, float('nan'), 100, float('nan'), 40, 5],
    'Data: 200 F=100MB,24H': [1, float('nan'), 200, float('nan'), 100, 5],
    'Data: 200F=1GB,24H': [1, float('nan'), 200, float('nan'), 1000, 5],
    'Data: 490F=Night,00H-08H': [0.33333, float('nan'), 490, float('nan'), -1, 5],
    'Data:1000F=2GB,30d': [30, float('nan'), 1000, float('nan'), 2000, 5],
    'Data:1000F=5GB,7d': [7, float('nan'), 1000, float('nan'), 5000, 5],
    'Data:1000F=700MB,7d': [7, float('nan'), 1000, float('nan'), 700, 5],
    'Data:1500F=3GB,30D': [30, float('nan'), 1500, float('nan'), 3000, 5],
    'Data:1500F=SPPackage1,30d': [30, float('nan'), 1500, float('nan'), float('nan'), 4],
    'Data:150F=SPPackage1,24H': [1, float('nan'), 150, float('nan'), float('nan'), 4],
    'Data:200F=Unlimited,24H': [1, float('nan'), 200, -1, -1, 4],
    'Data:3000F=10GB,30d': [30, float('nan'), 3000, float('nan'), 10000, 5],
    'Data:300F=100MB,2d': [2, float('nan'), 300, float('nan'), 100, 5],
    'Data:30Go_V 30_Days': [30, float('nan'), float('nan'), float('nan'), 30000, 5],
    'Data:490F=1GB,7d': [7, float('nan'), 490, float('nan'), 1000, 5],
    'Data:500F=2GB,24H': [1, float('nan'), 500, float('nan'), 2000, 5],
    'Data:50F=30MB_24H': [1, float('nan'), 50, float('nan'), 30, 5],
    'Data:700F=1.5GB,7d': [7, float('nan'), 700, float('nan'), 1500, 5],
    'Data:700F=SPPackage1,7d': [7, float('nan'), 700, float('nan'), float('nan'), 4],
    'Data:DailyCycle_Pilot_1.5GB': [float('nan'), 1, float('nan'), float('nan'), 1500, 5],
    'Data:New-GPRS_PKG_1500F': [float('nan'), float('nan'), float('nan'), 1500, float('nan'), 4],
    'Data:OneTime_Pilot_1.5GB': [float('nan'), float('nan'), float('nan'), float('nan'), 1500, 5],
    'DataPack_Incoming': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'Data_EVC_2Go24H': [1, float('nan'), float('nan'), float('nan'), 2000, 5],
    'Data_Mifi_10Go': [float('nan'), float('nan'), float('nan'), float('nan'), 10000, 5],
    'Data_Mifi_10Go_Monthly': [float('nan'), 30, float('nan'), float('nan'), 10000, 5],
    'Data_Mifi_20Go': [float('nan'), float('nan'), float('nan'), float('nan'), 20000, 5],
    'ESN_POSTPAID_CLASSIC_RENT': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'EVC_1000=6000 F': [float('nan'), float('nan'), 1000, 6000, float('nan'), 3],
    'EVC_100Mo': [float('nan'), float('nan'), float('nan'), float('nan'), 100, 5],
    'EVC_1Go': [float('nan'), float('nan'), float('nan'), float('nan'), 1000, 5],
    'EVC_4900=12000F': [float('nan'), float('nan'), 4900, 12000, float('nan'), 3],
    'EVC_500=2000F': [float('nan'), float('nan'), 500, 2000, float('nan'), 3],
    'EVC_700Mo': [float('nan'), float('nan'), float('nan'), float('nan'), 700, 5],
    'EVC_JOKKO30': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 1],
    'EVC_Jokko_Weekly': [float('nan'), 7, float('nan'), float('nan'), float('nan'), 1],
    'EVC_MEGA10000F': [float('nan'), float('nan'), float('nan'), 10000, float('nan'), 3],
    'EVC_PACK_2.2Go': [float('nan'), float('nan'), float('nan'), float('nan'), 2200, 5],
    'FIFA_TS_daily': [float('nan'), 1, float('nan'), float('nan'), float('nan'), 4],
    'FIFA_TS_monthly': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 4],
    'FIFA_TS_weekly': [float('nan'), 7, float('nan'), float('nan'), float('nan'), 4],
    'FNF2 ( JAPPANTE)': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'FNF_Youth_ESN': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'Facebook_MIX_2D': [2, float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'GPRS_3000Equal10GPORTAL': [float('nan'), float('nan'), 3000, float('nan'), 10000, 5],
    'GPRS_5Go_7D_PORTAL': [7, float('nan'), float('nan'), float('nan'), 5000, 5],
    'GPRS_BKG_1000F MIFI': [float('nan'), float('nan'), float('nan'), 1000, float('nan'), 3],
    'GPRS_PKG_5GO_ILLIMITE': [float('nan'), float('nan'), float('nan'), float('nan'), 5000, 5],
    'Go-NetPro-4 Go': [float('nan'), float('nan'), float('nan'), float('nan'), 4000, 5],
    'IVR Echat_Daily_50F': [float('nan'), 1, 50, float('nan'), float('nan'), 4],
    'IVR Echat_Monthly_500F': [float('nan'), 30, 500, float('nan'), float('nan'), 4],
    'IVR Echat_Weekly_200F': [float('nan'), 7, 200, float('nan'), float('nan'), 4],
    'Incoming_Bonus_woma': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'Internat: 1000F_Zone_1;24H		': [1, float('nan'), 1000, float('nan'), float('nan'), 2],
    'Internat: 1000F_Zone_3;24h		': [1, float('nan'), 1000, float('nan'), float('nan'), 2],
    'Internat: 2000F_Zone_2;24H		': [1, float('nan'), 2000, float('nan'), float('nan'), 2],
    'Jokko_Daily': [float('nan'), 1, float('nan'), float('nan'), float('nan'), 4],
    'Jokko_Monthly': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 4],
    'Jokko_Weekly': [float('nan'), 7, float('nan'), float('nan'), float('nan'), 4],
    'Jokko_promo': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'MIXT: 200mnoff net _unl on net _5Go;30d': [30, float('nan'), float('nan'), -1, 5000, 2],
    'MIXT: 390F=04HOn-net_400SMS_400 Mo;4h	': [0.16667, float('nan'), 390, float('nan'), 400, 2],
    # NOTE(review): the key reads "1,5Go" but the fifth slot is 5000, whereas
    # 'Data:700F=1.5GB,7d' uses 1500 - possibly a typo, confirm before changing.
    'MIXT: 4900F= 10H on net_1,5Go ;30d': [30, float('nan'), 4900, float('nan'), 5000, 2],
    'MIXT: 5000F=80Konnet_20Koffnet_250Mo;30d		': [30, float('nan'), 5000, 100000, 250, 3], 
    'MIXT: 500F=75(SMS, ONNET, Mo)_1000FAllNet;24h		': [1, float('nan'), 500, 1000, 75, 3],
    'MIXT: 590F=02H_On-net_200SMS_200 Mo;24h		': [1, float('nan'), 590, float('nan'), 200, 2],
    'MIXT:10000F=10hAllnet_3Go_1h_Zone3;30d		': [30, float('nan'), 10000, float('nan'), 3000, 2],
    'MIXT:1000F=4250 Off net _ 4250F On net _100Mo; 5d': [5, float('nan'), 1000, 8500, 100, 3],
    # NOTE(review): on-net + off-net amounts are summed elsewhere (e.g. the
    # 'Mixt : 500F=2500Fonnet_2500Foffnet ;5d' row stores 5000) but this row
    # stores 2500 - confirm which convention is intended.
    'MIXT:500F= 2500F on net _2500F off net;2d': [2, float('nan'), 500, 2500, float('nan'), 3],
    'MROMO_TIMWES_OneDAY': [1, float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'MROMO_TIMWES_RENEW': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'MegaChrono_3000F=12500F TOUS RESEAUX': [float('nan'), float('nan'), 3000, 12500, float('nan'), 3],
    'Mixt 250F=Unlimited_call24H': [1, float('nan'), 250, -1, float('nan'), 2],
    'Mixt : 500F=2500Fonnet_2500Foffnet ;5d': [5, float('nan'), 500, 5000, float('nan'), 3],
    'NEW_CLIR_PERMANENT_LIBERTE_MOBILE': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'NEW_CLIR_TEMPALLOWED_LIBERTE_MOBILE': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'NEW_CLIR_TEMPRESTRICTED_LIBERTE_MOBILE': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 4],
    'New_YAKALMA_4_ALL': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'On net 200F= 3000F_10Mo ;24H': [1, float('nan'), 200, 3000, 10, 3],
    'On net 200F=Unlimited _call24H': [1, float('nan'), 200, -1, float('nan'), 2],
    # NOTE(review): the key reads "10MilF" yet the fourth slot is -1, the value
    # other rows use for "unlimited" - confirm this is deliberate.
    'On-net 1000F=10MilF;10d': [10, float('nan'), 1000, -1, float('nan'), 3],
    'On-net 2000f_One_Month_100H; 30d': [30, float('nan'), 2000, float('nan'), float('nan'), 2],
    'On-net 200F=60mn;1d': [1, float('nan'), 200, float('nan'), float('nan'), 2],
    'On-net 300F=1800F;3d': [3, float('nan'), 300, 1800, float('nan'), 3],
    'On-net 500=4000,10d': [10, float('nan'), 500, 4000, float('nan'), 3],
    'On-net 500F_FNF;3d': [3, float('nan'), 500, float('nan'), float('nan'), 1],
    'Package3_Monthly': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 1],
    'Pilot_Youth1_290': [float('nan'), float('nan'), 290, float('nan'), float('nan'), 4],
    'Pilot_Youth4_490': [float('nan'), float('nan'), 490, float('nan'), float('nan'), 4],
    'Postpaid FORFAIT 10H Package': [0.41667, float('nan'), float('nan'), float('nan'), float('nan'), 2],
    'SMS Max': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'SUPERMAGIK_1000': [float('nan'), float('nan'), 1000, float('nan'), float('nan'), 1],
    'SUPERMAGIK_5000': [float('nan'), float('nan'), 5000, float('nan'), float('nan'), 1],
    'Staff_CPE_Rent': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'TelmunCRBT_daily': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'Twter_U2opia_Daily': [float('nan'), 1, float('nan'), float('nan'), float('nan'), 4],
    'Twter_U2opia_Monthly': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 4],
    'Twter_U2opia_Weekly': [float('nan'), 7, float('nan'), float('nan'), float('nan'), 4],
    'VAS(IVR_Radio_Daily)': [float('nan'), 1, float('nan'), float('nan'), float('nan'), 4],
    'VAS(IVR_Radio_Monthly)': [float('nan'), 30, float('nan'), float('nan'), float('nan'), 4],
    'VAS(IVR_Radio_Weekly)': [float('nan'), 7, float('nan'), float('nan'), float('nan'), 4],
    'WIFI_ Family _10MBPS': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 5],
    'WIFI_ Family _4MBPS': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 5],
    'WIFI_Family_2MBPS': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 5],
    'YMGX 100=1 hour FNF, 24H/1 month': [1, float('nan'), 100, float('nan'), float('nan'), 2],
    'YMGX on-net 100=700F, 24H': [1, float('nan'), 100, 700, float('nan'), 3],
    'Yewouleen_PKG': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    # Row for a missing TOP_PACK value.
    float('nan'): [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'pack_chinguitel_24h': [1, float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'pilot_offer4': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'pilot_offer5': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'pilot_offer6': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1],
    'pilot_offer7': [float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), 1]
}

In [None]:
def add_features(df, verbose=False):
    """Engineer the model features; the notebook calls this on train+test stacked together.

    Steps, in order:

    1. ``NANS_NUM`` - per-row count of missing values over every column except
       the ``CHURN`` target.
    2. For each raw numeric column: a ``<col>__ISNA`` indicator, then imputation
       of the column itself, plus a few products / differences / log transforms.
    3. Region-level ``IWI`` and ``POP`` lookups.
    4. Group-mean features from ``gb_features`` for every (continuous, categorical)
       pair, after which ``TENURE`` is label-encoded.
    5. Six ``TOP_PACK_*`` columns expanded from ``top_pack_mapping``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns read below (REVENUE, FREQUENCE, ...,
        REGION, TENURE, TOP_PACK). It is modified in place up to step 4.
    verbose : bool, default False
        When true, print each feature name as it is created.

    Returns
    -------
    (DataFrame, list[str])
        The frame with all features and the list of added feature names in
        creation order. The returned frame is a new object (the TOP_PACK_*
        columns are attached with ``pd.concat``), so always use the return value.
    """
    new_features = []

    # The helpers below close over ``df``; they are only used before ``df`` is
    # rebound by the final concat.
    def register(name):
        """Record a new feature name and echo it when verbose."""
        new_features.append(name)
        if verbose:
            print(name, end=', ')

    def add(name, values):
        """Store ``values`` as column ``name`` and register it as a feature."""
        df[name] = values
        register(name)

    def flag_and_fill(column, fill_value):
        """Add ``<column>__ISNA`` and then impute ``column`` with ``fill_value``."""
        add(f'{column}__ISNA', df[column].isna())
        df[column] = df[column].fillna(fill_value)

    # --- missing-value count ------------------------------------------------
    # Must run before any imputation. The target is excluded: in the stacked
    # train+test frame CHURN is NaN for every test row, so counting it would
    # shift the feature by one between train and test.
    add('NANS_NUM', df.loc[:, df.columns != 'CHURN'].isna().sum(axis=1))

    # --- revenue / frequency ------------------------------------------------
    flag_and_fill('REVENUE', 500)
    flag_and_fill('FREQUENCE', 1)
    flag_and_fill('FREQUENCE_RECH', 1)

    add('REVENUE__*__FREQUENCE_RECH', df['REVENUE'] * df['FREQUENCE_RECH'])
    add('REVENUE__*__FREQUENCE', df['REVENUE'] * df['FREQUENCE'])
    add(
        'REVENUE__*__FREQUENCE__-__REVENUE__*__FREQUENCE_RECH',
        df['REVENUE__*__FREQUENCE'] - df['REVENUE__*__FREQUENCE_RECH'],
    )

    # --- top-up amount / data volume ----------------------------------------
    flag_and_fill('FREQ_TOP_PACK', 1)
    # The fill values are computed from the column before it is imputed.
    flag_and_fill('MONTANT', df['MONTANT'].mode()[0])
    add('LOG__MONTANT', df['MONTANT'].apply(np.log))
    flag_and_fill('DATA_VOLUME', df['DATA_VOLUME'].median())
    add('LOG1P__DATA_VOLUME', df['DATA_VOLUME'].apply(np.log1p))

    # --- call counters ------------------------------------------------------
    flag_and_fill('ON_NET', 0)
    flag_and_fill('ORANGE', 1)
    flag_and_fill('TIGO', 1)
    flag_and_fill('ZONE1', 0)
    flag_and_fill('ZONE2', 0)

    add('ON_NET__-__ZONES', df['ON_NET'] - (df['ZONE2'] + df['ZONE1']))
    add('ON_NET__-__OT', df['ON_NET'] - (df['ORANGE'] + df['TIGO']))
    add(
        'LOG__DATA_VOLUME__+1__/__ON_NET__+1',
        ((df['DATA_VOLUME'] + 1) / (df['ON_NET'] + 1)).apply(np.log),
    )

    # --- ARPU ---------------------------------------------------------------
    flag_and_fill('ARPU_SEGMENT', 0)
    add('ARPU_SEGMENT__/__REVENUE', df['ARPU_SEGMENT'] / df['REVENUE'])
    # 0 / 0 yields NaN; define that ratio as 1.
    df.loc[(df['ARPU_SEGMENT'] == 0) & (df['REVENUE'] == 0), 'ARPU_SEGMENT__/__REVENUE'] = 1
    add(
        'LOG1P__REVENUE__-__ARPU_SEGMENT',
        (df['REVENUE'] - df['ARPU_SEGMENT']).apply(np.log1p),
    )

    # --- region lookups -----------------------------------------------------
    POP = {
        'FATICK': 24_243,
        np.nan: -1,
        'DAKAR': 1_056_000,
        'LOUGA': 104_000,
        'TAMBACOUNDA': 872_156,
        'KAOLACK': 410_577,
        'THIES': 618_436,
        'SAINT-LOUIS': 277_245,
        'KOLDA': 62_258,
        'KAFFRINE': 39_357,
        'DIOURBEL': 279_667,
        'ZIGUINCHOR': 337_295,
        'MATAM': 17_324,
        'SEDHIOU': 24_213,
        'KEDOUGOU': 18_860,
    }
    # International wealth index
    IWI = {
        'FATICK': 51.3,
        np.nan: 35,
        'DAKAR': 75.3,
        'LOUGA': 55.3,
        'TAMBACOUNDA': 47.5,
        'KAOLACK': 53.5,
        'THIES': 68.6,
        'SAINT-LOUIS': 52.3,
        'KOLDA': 39.6,
        'KAFFRINE': 45,
        'DIOURBEL': 55.8,
        'ZIGUINCHOR': 55.3,
        'MATAM': 46,
        'SEDHIOU': 45.5,
        'KEDOUGOU': 45,
    }
    # Regions absent from the tables fall back to the same value as a missing region.
    add('IWI', df['REGION'].map(IWI).fillna(35))
    add('POP', df['REGION'].map(POP).fillna(-1))

    # --- group-mean features ------------------------------------------------
    categorical_features = ['TENURE', 'REGION']
    continuous_features = ['REVENUE', 'MONTANT', 'ARPU_SEGMENT', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2']

    for conf in continuous_features:
        for catf in categorical_features:
            df, new_feature_names = gb_features(df, catf, conf)
            new_features += new_feature_names
            if verbose:
                print(new_feature_names, end=', ')

    # Encode TENURE only after it has served as a grouping key above.
    df['TENURE'] = LabelEncoder().fit_transform(df['TENURE'])

    # --- TOP_PACK descriptors -----------------------------------------------
    pack_cols = ['TOP_PACK_l', 'TOP_PACK_r', 'TOP_PACK_p', 'TOP_PACK_m', 'TOP_PACK_i', 'TOP_PACK_t']
    # Packs absent from top_pack_mapping come back from .map() as a scalar NaN
    # rather than a 6-item list; give them the same row the mapping uses for a
    # missing pack instead of crashing while stacking.
    unknown_pack = [float('nan')] * 5 + [1]
    pack_rows = [
        row if isinstance(row, list) else unknown_pack
        for row in df['TOP_PACK'].map(top_pack_mapping)
    ]
    # Reuse df's index so the concat lines rows up even if the index is not 0..n-1.
    pack_frame = pd.DataFrame(pack_rows, columns=pack_cols, index=df.index, dtype=float)

    new_features += pack_cols
    df = pd.concat((df, pack_frame), axis=1)
    if verbose:
        print(pack_cols)

    return df, new_features

In [None]:
# Reverse lookup from the SHA-1 hex digest of str(i) back to the integer i,
# for i in 0 .. 9,999,999. Built eagerly, so this cell hashes ten million
# strings and keeps them all in memory. Digests outside that range map to -1.
sha1_hashes = defaultdict(lambda: -1, {sha1(str(i).encode('utf-8')).hexdigest(): i for i in range(10000000)})
def add_int_uid(df, window_length=24001, polyorder=2, id_lookup=None):
    """Replace ``user_id`` by a smoothed churn rate over neighbouring integer ids.

    Each ``user_id`` is translated to an integer through ``id_lookup``, rows are
    ordered by that integer, the ``CHURN`` column (missing values counted as 0)
    is smoothed along that order with a Savitzky-Golay filter, and the smoothed
    value is written back to every row as ``user_id_int``.

    NOTE(review): the smoothing window contains each row's own label, and rows
    without a label (the test rows in the stacked frame) are treated as
    non-churners - both are worth a second look before reusing this feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id`` and ``CHURN`` columns; ``user_id_int`` is added in place.
    window_length : int, default 24001
        Savitzky-Golay window, passed straight to ``savgol_filter``. With the
        filter's default mode it must not exceed ``len(df)``.
    polyorder : int, default 2
        Polynomial order of the filter.
    id_lookup : mapping, optional
        ``user_id`` -> integer table; defaults to the module-level ``sha1_hashes``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if id_lookup is None:
        id_lookup = sha1_hashes

    df['user_id_int'] = df['user_id'].map(id_lookup)

    # One sort is enough: ``order`` both sorts the labels and scatters the
    # smoothed values back to the original row positions.
    order = np.argsort(df['user_id_int'].values)

    # Fancy indexing returns a copy, so the CHURN column itself is untouched.
    churn_sorted = df['CHURN'].values[order]
    churn_sorted[np.isnan(churn_sorted)] = 0

    smoothed = savgol_filter(churn_sorted, window_length, polyorder)

    restored = np.empty_like(smoothed)
    restored[order] = smoothed
    df['user_id_int'] = restored
    return df

In [None]:
def get_all_isna_features(df):
    """Return a tuple of boolean missing-value masks, one per raw column.

    The masks come in a fixed order: REVENUE, FREQUENCE, FREQUENCE_RECH,
    FREQ_TOP_PACK, MONTANT, DATA_VOLUME, ON_NET, ORANGE, TIGO, ZONE1, ZONE2,
    ARPU_SEGMENT, TOP_PACK.
    """
    checked_columns = (
        'REVENUE',
        'FREQUENCE',
        'FREQUENCE_RECH',
        'FREQ_TOP_PACK',
        'MONTANT',
        'DATA_VOLUME',
        'ON_NET',
        'ORANGE',
        'TIGO',
        'ZONE1',
        'ZONE2',
        'ARPU_SEGMENT',
        'TOP_PACK',
    )
    return tuple(df[column].isna() for column in checked_columns)

In [None]:
# Read both splits, remembering where train ends so they can be separated again.
dftr = pd.read_csv('../data_orig/Train.csv')
dfts = pd.read_csv('../data_orig/Test.csv')
n_train_rows = len(dftr)

# Engineer features once on the stacked frame (train rows first, then test).
df = pd.concat([dftr, dfts], ignore_index=True)
df = add_int_uid(df)
df, new_features = add_features(df)
df = df.drop(columns=['TOP_PACK', 'MRG', 'REGION'])

# Split back by position; the test part gets a fresh 0..n-1 index.
dftr = df.iloc[:n_train_rows]
dfts = df.iloc[n_train_rows:].reset_index(drop=True)

In [None]:
# Single-column target, and every remaining column except the row id as a predictor.
target = ['CHURN']
predictors = [col for col in dftr.columns if col not in ('user_id', 'CHURN')]

In [None]:
# For every low-cardinality column (fewer than 50 distinct values in the stacked
# frame), drop the training rows whose value never occurs in the test split.
for c in df.columns[df.nunique() < 50]:
    if c == 'CHURN':
        continue
    # The value list is taken once per column, before any rows are removed.
    for u in dftr[c].unique():
        if not (dfts[c] == u).any():
            # Log "<column> <value> <train rows before> -> <train rows after>".
            # The before-count must come from dftr; len(df) is the constant
            # size of the stacked train+test frame.
            print(c, u, len(dftr), end=' -> ')
            dftr = dftr[dftr[c] != u]
            print(len(dftr))

In [None]:
from lightgbm import LGBMClassifier
import boostaroota as br
import catboost as cb

In [None]:
# First pass: fit LightGBM on every predictor and use its importances as a coarse filter.
X = dftr[predictors].astype(float).values
y = dftr[target].values
mdl = LGBMClassifier(class_weight='balanced', n_estimators=500, learning_rate=0.075, silent=False)
mdl.fit(X, y)

# Predictors with an importance below 10 are dropped.
remove = [
    name
    for name, importance in zip(predictors, mdl.feature_importances_)
    if importance < 10
]
predictors2 = [name for name in predictors if name not in remove]

In [None]:
# Run BoostARoota feature selection three times on the same inputs; the
# per-run selections are combined in the next cell.
boostaroota_runs = []
for _ in range(3):
    brm = br.BoostARoota(clf=LGBMClassifier(class_weight='balanced', n_estimators=500, learning_rate=0.075, silent=False))
    boostaroota_runs.append(brm.fit_transform(dftr[predictors2], y))
dftr_new, dftr_new_v2, dftr_new_v3 = boostaroota_runs

In [None]:
# Column sets kept by each of the three selection runs.
selected_per_run = [set(run.columns) for run in (dftr_new, dftr_new_v2, dftr_new_v3)]
# less_cols: kept by every run; more_cols: kept by at least one run. Both sorted by name.
less_cols = sorted(set.intersection(*selected_per_run))
more_cols = sorted(set.union(*selected_per_run))

In [None]:
# Persist both column selections. The context managers make sure each file is
# flushed and closed as soon as it has been written.
with open('less_cols.json', 'w') as less_file:
    json.dump(less_cols, less_file)
with open('more_cols.json', 'w') as more_file:
    json.dump(more_cols, more_file)

# Training matrix restricted to the columns every selection run agreed on.
X, y = dftr[less_cols].astype(float).values, dftr[target].values

In [None]:
# Add the plain integer id (looked up from the SHA-1 table) to both splits.
for frame in (dftr, dfts):
    frame['user_id_int2'] = frame['user_id'].map(sha1_hashes)

# Same column list as less_cols, with 'user_id_int' swapped for 'user_id_int2'
# at the same position. Raises ValueError if 'user_id_int' was not selected.
uid_position = less_cols.index('user_id_int')
less_cols2 = less_cols[:uid_position] + ['user_id_int2'] + less_cols[uid_position + 1:]
X2 = dftr[less_cols2]

In [None]:
# Final model: CatBoost trained on the GPU over the less_cols2 feature set.
catboost_params = {
    'iterations': 1000,
    'task_type': 'GPU',
    'l2_leaf_reg': 0.03,
    'learning_rate': 0.05,
}
mdl = cb.CatBoostClassifier(**catboost_params)
mdl.fit(X2, y)

In [None]:
# Second column of predict_proba, computed for the test rows.
test_proba = mdl.predict_proba(dfts[less_cols2])
preds_ts = test_proba[:, 1]

# Two earlier submissions that will be blended with the new predictions.
df1 = pd.read_csv('../submissions/new_features__first.csv')
df2 = pd.read_csv('../submissions/hard_dart_weightned_local_churn.csv')

In [None]:
# The two stored submissions are flipped (1 - value) before blending; the
# author labels the flipped values the "real score".
pred_ts1 = 1 - df1['CHURN']  # real score
pred_ts2 = 1 - df2['CHURN']  # real score
# Independent copy of the new model's test predictions.
pred_ts3 = preds_ts.copy()

In [None]:
# Unweighted sum of the three prediction vectors, rescaled so the largest value is 1.
pred_ultimate = pred_ts1 + pred_ts2 + pred_ts3
# Use the vectorised .max(): the builtin max() walks the values one by one in
# Python and its result depends on element order when a NaN is present.
pred_ultimate /= pred_ultimate.max()

In [None]:
# Store the three-way blend as the test predictions and write the submission file.
dfts['CHURN'] = pred_ultimate
dfts.loc[:, ['user_id', 'CHURN']].to_csv('../submissions/3best.csv', index=False)  # best solution

In [None]:
# Alternative blend: 80/20 weighting of the two earlier submissions only.
pred_ultimate = pred_ts1 * 0.8 + pred_ts2 * 0.2
dfts['CHURN'] = pred_ultimate
dfts.loc[:, ['user_id', 'CHURN']].to_csv('../submissions/2best_08_02.csv', index=False)  # second best solution