In [None]:
import numpy as np
import pandas as pd
import swifter
import matplotlib.pyplot as plt
from hashlib import sha1
from collections import defaultdict
from category_encoders import LeaveOneOutEncoder, CatBoostEncoder

In [None]:
# Read the two raw CSV tables: dftr from Train.csv and dfts from Test.csv.
dftr, dfts = (
    pd.read_csv(csv_path)
    for csv_path in ('../data_orig/Train.csv', '../data_orig/Test.csv')
)

In [None]:
# Reverse lookup: SHA-1 hex digest of the decimal string of every integer in
# range(10000000) -> that integer. Digests missing from the table resolve to -1
# through the defaultdict factory.
sha1_hashes = defaultdict(
    lambda: -1,
    ((sha1(str(n).encode('utf-8')).hexdigest(), n) for n in range(10000000)),
)
# Translate the 'user_id' column of both frames into integers via the lookup table.
dftr['user_id_int'], dfts['user_id_int'] = (
    frame['user_id'].map(sha1_hashes) for frame in (dftr, dfts)
)

In [None]:
# Label column name, kept as a one-element list so that dftr[target] selects a
# single-column DataFrame rather than a Series.
target = ['CHURN']

In [None]:
# Columns replaced in place by their CatBoost target encoding.
cat_cols = ['REGION', 'TENURE', 'MRG', 'TOP_PACK']
encoder = CatBoostEncoder(return_df=True)
# Fit on the training rows together with the target; the test frame is then
# transformed with the already-fitted encoder (no target supplied).
dftr[cat_cols] = encoder.fit_transform(dftr[cat_cols], dftr[target])
dfts[cat_cols] = encoder.transform(dfts[cat_cols])

In [None]:
# Median imputation of the numeric columns below.
# Medians come from the training frame only and are applied to both frames.
median_impute_cols = [
    'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
    'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2', 'FREQ_TOP_PACK',
]

# DataFrame.median skips NaN by default, which matches taking np.median over
# the non-missing values of each column.
train_medians = dftr[median_impute_cols].median()

# Keep the individually named medians this cell has always exposed, in the
# same order as median_impute_cols, in case other cells refer to them.
(montant_median, freq_rech_median, rev_median, arpu_median, freq_median,
 dvolume_median, net_median, orange_median, tigo_median, z1_median,
 z2_median, freq_top_median) = train_medians.to_numpy()

# Assign the filled columns back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that form is chained assignment on a
# temporary Series, which warns in pandas 2.x and leaves the frame untouched
# once Copy-on-Write is enabled.
dftr[median_impute_cols] = dftr[median_impute_cols].fillna(train_medians)
dfts[median_impute_cols] = dfts[median_impute_cols].fillna(train_medians)

In [None]:
# Feature columns handed to the model, in the order they are passed to it.
predictors = [
    'REGION',
    'TENURE',
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET',
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'MRG',
    'REGULARITY',
    'TOP_PACK',
    'FREQ_TOP_PACK',
    'user_id_int',
]

In [None]:
from scipy import stats
from joblib import Parallel, delayed

In [None]:
# Grid of non-negative offsets on which the kernel tail is searched.
space_for_check = np.linspace(0, 2000, 100000)
def get_half_window_len(std, eps=1e-6):
    """Return the integer offset at which a zero-mean normal PDF drops below ``eps``.

    The PDF with standard deviation ``std`` is evaluated on ``space_for_check``;
    the first grid point whose density is strictly below ``eps`` is rounded to
    the nearest integer and returned. If the density is still at least ``eps``
    at the end of the grid (large ``std``), the crossing point is computed in
    closed form instead of failing with an IndexError on an empty match.

    Parameters
    ----------
    std : float
        Standard deviation of the normal kernel; must be positive.
    eps : float, optional
        Density threshold; must be positive.

    Returns
    -------
    int
        Half window length, in the same units as ``std``.

    Raises
    ------
    ValueError
        If ``std`` or ``eps`` is not a positive number.
    """
    if not (std > 0 and eps > 0):
        raise ValueError(f"std and eps must be positive, got std={std!r}, eps={eps!r}")

    below_eps = stats.norm.pdf(space_for_check, loc=0, scale=std) < eps
    if below_eps.any():
        # argmax over a boolean array yields the index of the first True.
        return int(np.round(space_for_check[np.argmax(below_eps)]))

    # Grid exhausted: solve pdf(x) = eps for x, i.e.
    # x = std * sqrt(-2 * ln(eps * std * sqrt(2 * pi))).
    log_ratio = np.log(eps * std * np.sqrt(2.0 * np.pi))
    crossing = std * np.sqrt(max(-2.0 * log_ratio, 0.0))
    # The crossing cannot lie inside the grid that was just searched.
    return int(np.round(max(crossing, space_for_check[-1])))

def get_smoothed_value2(a, v, std):
    """Sum a zero-mean normal PDF (standard deviation ``std``) over the offsets ``a - v``.

    Parameters
    ----------
    a : array-like supporting subtraction of ``v``
        Points contributing to the sum.
    v : scalar
        Location at which the sum is evaluated.
    std : float
        Standard deviation of the normal kernel.

    Returns
    -------
    The sum of the kernel densities, one term per element of ``a``.
    """
    offsets = a - v
    kernel_weights = stats.norm.pdf(offsets, loc=0, scale=std)
    return np.sum(kernel_weights)

In [None]:
# Kernel standard deviation and the half window derived from it. These must be
# defined before trw2 is built, because the padding below reads wsize; with the
# previous ordering a fresh kernel raised NameError.
std = 200
wsize = get_half_window_len(std)

# Row positions, after ordering the training rows by user_id_int, at which the
# target value is non-zero.
# NOTE(review): these are positions in the sorted order, while the smoothing
# cell evaluates the kernel at integer ids; the two coincide only if the
# training ids are exactly 0..n-1 -- confirm.
trw = np.argwhere(dftr[target].values[np.argsort(dftr['user_id_int'])])[:, 0]

# Pad both ends: negated copies of the positions below wsize, and copies of the
# positions within wsize of the largest one shifted up by wsize.
# NOTE(review): the upper padding is a shift, not a mirror image -- confirm intent.
trw2 = np.concatenate([-trw[trw < wsize], trw, wsize + trw[trw > trw.max() - wsize]])

In [None]:
# Kernel-weighted sum over trw2 for every integer id from 0 up to and
# INCLUDING the largest training id. The "+ 1" matters: arange(max_id) stops
# one short, which left the row with the largest id without a value (NaN) when
# the results are later mapped back by user_id_int. Series.max() also replaces
# the builtin max(), which iterates the Series element by element.
smoothed_vals_dftr = Parallel(n_jobs=-1)(
    delayed(
        # Only points within wsize of i are passed to the kernel sum.
        lambda i: get_smoothed_value2(trw2[(trw2 >= i - wsize) & (trw2 <= i + wsize)], i, std)
    )(v) for v in np.arange(dftr['user_id_int'].max() + 1)
)

In [None]:
# Lookup from integer id (position in smoothed_vals_dftr) to its smoothed value.
td = dict(enumerate(smoothed_vals_dftr))
# Ids without an entry in td become NaN in the new column.
dftr['weightned_local_churn'] = dftr['user_id_int'].map(td)
dfts['weightned_local_churn'] = dfts['user_id_int'].map(td)
del td
# Guarded so that re-running this cell does not add the feature twice.
if 'weightned_local_churn' not in predictors:
    predictors.append('weightned_local_churn')

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM classifier using DART boosting and balanced class weights.
# NOTE(review): per the LightGBM parameter docs, subsample (bagging_fraction)
# only takes effect when subsample_freq > 0, and none is set here -- confirm
# whether row subsampling was intended.
lgbt = lgb.LGBMClassifier(
    boosting_type='dart',
    n_estimators=500,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    class_weight='balanced',
)
# Train on every predictor column against the single-column target frame.
lgbt.fit(X=dftr[predictors], y=dftr[target])

In [None]:
# Class-probability matrix for the test rows; only the first column is kept.
# NOTE(review): column 0 is presumably the probability of the first class in
# lgbt.classes_ (i.e. non-churn), not of churn. The original "hide score"
# remark suggests this is deliberate -- confirm before using the file as a
# real submission.
class_probs = lgbt.predict_proba(dfts[predictors])
ts_preds = class_probs[:, 0]  # hide score
# Two-column submission frame: the original user_id plus the selected probability.
sub1 = dfts[['user_id']].assign(CHURN=ts_preds)
sub1.to_csv('../submissions/hard_dart_weightned_local_churn.csv', index=False)