In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import warnings
import numpy as np
import dask.dataframe as dd
import os
import itertools
import matplotlib.ticker as ticker
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 

# Notebook-wide display and plotting configuration.
for _display_option in ('display.max_columns', 'display.max_rows'):
    # Let pandas render up to 500 columns / rows before truncating a frame.
    pd.set_option(_display_option, 500)
alt.renderers.enable('default')
# set_theme() runs before set_palette() so the Set3 palette chosen below is
# the one left active.
sns.set_theme(style="darkgrid")
# NOTE(review): this silences *every* warning, including deprecation
# warnings from pandas / xgboost raised by later cells.
warnings.filterwarnings('ignore')
sns.set_palette(sns.color_palette("Set3"))

In [2]:
dfMasterTable = pd.read_csv('dfMasterTable.csv')

In [3]:
dfMasterTable.head()

Unnamed: 0,msno,is_churn,date_pred,bd,gender,city_1,city_4,city_5,city_13,city_15,city_22,city_other,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_11,registered_via_13,registered_via_other,tenure_days,actual_amount_paid_min,actual_amount_paid_max,actual_amount_paid_mean,actual_amount_paid_std,actual_amount_paid_sum,is_auto_renew_mean,is_auto_renew_sum,is_cancel_sum,is_cancel_mean,msno_count_x,diff_dates_min,diff_dates_max,diff_dates_mean,diff_dates_std,churn_mean,churn_sum,days_last_trx,total_secs_sum,total_secs_mean,total_secs_std,num_unq_sum,num_unq_mean,num_unq_std,msno_count_y,nbr_logins_monthly_mean,nbr_logins_monthly_max,nbr_logins_monthly_min,nbr_logins_monthly_std,nbr_logins_monthly_count,last_login_days
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,201702,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4319.0,0.0,149.0,74.5,105.35891,149.0,0.0,0.0,0.0,0.0,2.0,68.0,68.0,68.0,,1.0,1.0,25.0,92279.21,4613.9604,4240.848005,394.0,19.7,16.012807,20.0,8.666667,18.0,2.0,8.326664,3.0,3.0
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1,201702,38.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4318.0,149.0,180.0,150.409091,6.609222,3309.0,0.954545,21.0,1.0,0.045455,22.0,16.0,95.0,34.809524,16.0954,0.25,1.0,1.0,2614515.0,5027.913888,5564.554567,7229.0,13.901923,14.006454,520.0,20.84,31.0,1.0,9.485427,25.0,33.0
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,201702,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4126.0,149.0,150.0,149.2,0.421637,1492.0,0.8,8.0,1.0,0.1,10.0,12.0,118.0,38.333333,30.512293,0.333333,1.0,20.0,2597385.0,10959.428928,8172.306431,11533.0,48.662447,35.829472,237.0,19.75,30.0,4.0,8.945542,12.0,1.0
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,201702,23.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4109.0,0.0,1788.0,894.0,1264.306925,1788.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,,,,392.0,4840589.0,6779.536436,6303.076898,17943.0,25.130252,24.978054,714.0,28.269231,31.0,21.0,3.244166,26.0,2.0
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,201702,27.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4053.0,149.0,1788.0,447.0,601.297882,3576.0,0.0,0.0,0.0,0.0,8.0,3.0,409.0,108.142857,151.25853,0.0,0.0,4.0,18378210.0,25038.428218,11939.337626,71565.0,97.5,45.000447,734.0,29.153846,31.0,21.0,2.460769,26.0,2.0


In [4]:
dfMasterTable.groupby('date_pred')['is_churn'].value_counts()

date_pred  is_churn
201702     0           929460
           1            63471
201703     0           883630
           1            87330
201704     0           907471
Name: is_churn, dtype: int64

In [5]:
dfMasterTable.columns

Index(['msno', 'is_churn', 'date_pred', 'bd', 'gender', 'city_1', 'city_4',
       'city_5', 'city_13', 'city_15', 'city_22', 'city_other',
       'registered_via_3', 'registered_via_4', 'registered_via_7',
       'registered_via_9', 'registered_via_11', 'registered_via_13',
       'registered_via_other', 'tenure_days', 'actual_amount_paid_min',
       'actual_amount_paid_max', 'actual_amount_paid_mean',
       'actual_amount_paid_std', 'actual_amount_paid_sum',
       'is_auto_renew_mean', 'is_auto_renew_sum', 'is_cancel_sum',
       'is_cancel_mean', 'msno_count_x', 'diff_dates_min', 'diff_dates_max',
       'diff_dates_mean', 'diff_dates_std', 'churn_mean', 'churn_sum',
       'days_last_trx', 'total_secs_sum', 'total_secs_mean', 'total_secs_std',
       'num_unq_sum', 'num_unq_mean', 'num_unq_std', 'msno_count_y',
       'nbr_logins_monthly_mean', 'nbr_logins_monthly_max',
       'nbr_logins_monthly_min', 'nbr_logins_monthly_std',
       'nbr_logins_monthly_count', 'last_login_days

In [6]:
# Continuous feature names: every column except those whose name starts with
# 'city' or 'registered' and except the four explicitly excluded columns.
_excluded_names = ['gender', 'msno', 'is_churn', 'date_pred']
cont_cols = [
    name
    for name in dfMasterTable.columns
    if not name.startswith(('city', 'registered')) and name not in _excluded_names
]

In [7]:
# Replace every missing value in the master table with 0 before modelling.
# NOTE(review): this also zero-fills the *_std / churn_* columns that are
# empty in the preview above -- confirm 0 is the intended sentinel.
dfMasterTable = dfMasterTable.fillna(0)

## Training using all columns

### Basic Default Parameters XGBoost Model

In [41]:
# Temporal split: rows with date_pred up to 201702 are used for training and
# the 201703 rows for validation.
train_df = dfMasterTable[dfMasterTable.date_pred <= 201702]
validation_df = dfMasterTable[dfMasterTable.date_pred == 201703]

# Identifier, target and split-key columns must not be fed to the model.
non_feature_cols = ['msno', 'is_churn', 'date_pred']

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# newer pandas versions reject.
X_train = train_df.drop(columns=non_feature_cols).values
y_train = train_df['is_churn'].values

X_val = validation_df.drop(columns=non_feature_cols).values
y_val = validation_df['is_churn'].values

In [42]:
# Baseline XGBoost classifier trained on the current X_train / y_train
# (all feature columns at this point in the notebook).
xgb_all_cols = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=0,
    n_jobs=-1,
)
xgb_all_cols.fit(X_train, y_train)

In [43]:
def evaluate_model(model, X=None, y=None):
    """Return the ROC AUC of ``model`` on a validation set.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    X : optional feature matrix to score. When omitted, the notebook-level
        ``X_val`` currently in scope is used (original behaviour).
    y : optional true labels for ``X``. When omitted, the notebook-level
        ``y_val`` currently in scope is used (original behaviour).

    Returns
    -------
    The value returned by ``roc_auc_score`` for the positive-class
    probabilities (column 1 of ``predict_proba``).

    Notes
    -----
    ``X_val`` / ``y_val`` are rebound by several later cells; pass ``X`` and
    ``y`` explicitly to score a model against a specific split regardless of
    which cell ran last.
    """
    features = X_val if X is None else X
    labels = y_val if y is None else y
    # Column 1 holds the predicted probability of the positive class.
    positive_proba = model.predict_proba(features)[:, 1]
    return roc_auc_score(labels, positive_proba)

In [44]:
evaluate_model(xgb_all_cols)

0.8375388410642726

In [46]:
# Random-forest comparison model fitted on the same X_train / y_train as the
# XGBoost baseline above.
rf_all_cols = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    n_jobs=-1,
)
rf_all_cols.fit(X_train, y_train)

In [47]:
evaluate_model(rf_all_cols)

0.8345624302096066

## Training using only continuous columns

In [12]:
# Same temporal split as before: date_pred up to 201702 for training,
# 201703 for validation.
train_df = dfMasterTable[dfMasterTable.date_pred <= 201702]
validation_df = dfMasterTable[dfMasterTable.date_pred == 201703]

# Select the continuous columns directly. `cont_cols` already excludes
# 'msno', 'is_churn' and 'date_pred', so the former `drop([...], 1)` call was
# redundant -- and its positional axis argument is rejected by newer pandas.
X_train = train_df[cont_cols].values
y_train = train_df['is_churn'].values

X_val = validation_df[cont_cols].values
y_val = validation_df['is_churn'].values

In [13]:
# XGBoost with the same settings as the baseline, fitted on the
# continuous-columns-only X_train.
xgb_cont_cols = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=0,
    n_jobs=-1,
)
xgb_cont_cols.fit(X_train, y_train)

In [14]:
evaluate_model(xgb_cont_cols)

0.8383515160420465

### Oversampling the minority class using SMOTE (continuous variables only)


In [15]:
# Oversample the training split with SMOTE (25 nearest neighbours, fixed
# seed). Only the training data is resampled; the validation split is untouched.
sm = SMOTE(
    k_neighbors=25,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [16]:
X_train.shape, X_res.shape

((992931, 32), (1858920, 32))

In [17]:
# Same XGBoost settings, fitted on the SMOTE-resampled training data.
xgb_cont_cols_res = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=0,
    n_jobs=-1,
)
xgb_cont_cols_res.fit(X_res, y_res)

In [18]:
evaluate_model(xgb_cont_cols_res)

0.8161753543298687

### Basic For Loop for Feature Selection

In [19]:
# Forward pass over the continuous columns: at step k the model is trained on
# the first k entries of `cont_cols` and its validation AUC is recorded as
# (columns_used, auc).
#
# NOTE(review): the validation split is part of `eval_set` (used for early
# stopping) and is also the split scored by evaluate_model, so the recorded
# AUCs are likely optimistic.
auc_feature_selection = []

for n_cols in range(1, len(cont_cols) + 1):
    keep_cols = cont_cols[:n_cols]

    # evaluate_model() reads the notebook-level X_val / y_val, so these names
    # must be rebound on every iteration.
    X_train = train_df[keep_cols]
    y_train = train_df['is_churn']
    X_val = validation_df[keep_cols]
    y_val = validation_df['is_churn']
    evaluation = [(X_train, y_train), (X_val, y_val)]

    clf = XGBClassifier(
        eval_metric="logloss",
        tree_method="hist",
        random_state=0,
        n_jobs=-1,
    )
    clf.fit(
        X_train,
        y_train,
        eval_set=evaluation,
        early_stopping_rounds=10,
        verbose=False,
    )

    auc = evaluate_model(clf)
    auc_feature_selection.append((list(keep_cols), auc))
    # The list was emptied at the top of the cell, so the newest entry is
    # the one for this step.
    print(auc_feature_selection[-1])


(['bd'], 0.6159114065758324)
(['bd', 'tenure_days'], 0.6379051564073593)
(['bd', 'tenure_days', 'actual_amount_paid_min'], 0.7680373258591725)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max'], 0.7862980927794)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean'], 0.7897941704544412)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std'], 0.7907600573233717)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum'], 0.7989305859008905)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum', 'is_auto_renew_mean'], 0.8110515772034894)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_pai

(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum', 'is_auto_renew_mean', 'is_auto_renew_sum', 'is_cancel_sum', 'is_cancel_mean', 'msno_count_x', 'diff_dates_min', 'diff_dates_max', 'diff_dates_mean', 'diff_dates_std', 'churn_mean', 'churn_sum', 'days_last_trx', 'total_secs_sum', 'total_secs_mean', 'total_secs_std', 'num_unq_sum', 'num_unq_mean', 'num_unq_std', 'msno_count_y', 'nbr_logins_monthly_mean', 'nbr_logins_monthly_max', 'nbr_logins_monthly_min'], 0.8383552603430133)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum', 'is_auto_renew_mean', 'is_auto_renew_sum', 'is_cancel_sum', 'is_cancel_mean', 'msno_count_x', 'diff_dates_min', 'diff_dates_max', 'diff_dates_mean', 'diff_dates_std', 'churn_mean', 'churn_sum', 'days_last_trx', 'total_secs_sum', 'total_secs_mean', 'total_secs_std'

In [20]:
# Group the indicator columns by name prefix.
_all_columns = train_df.columns
city_columns = _all_columns[_all_columns.str.startswith('city')].tolist()
registration_columns = _all_columns[_all_columns.str.startswith('registered')].tolist()

# Groups are added to the model one at a time, in this order.
categ_columns = [city_columns, registration_columns]

# Rebind the results list to a shallow copy of itself; `auc_copy` and
# `auc_feature_selection` refer to that same new list afterwards.
auc_feature_selection = auc_copy = auc_feature_selection.copy()

# Start the categorical pass from the full set of continuous columns.
keep_cols = cont_cols.copy()

In [21]:
# Add each group of categorical indicator columns on top of all continuous
# columns, retrain, and record the validation AUC.
#
# Reset the state this cell mutates so that re-running it does not keep
# appending duplicate columns / results: start again from the continuous
# columns and keep only the continuous entries of the results list.
keep_cols = list(cont_cols)
auc_feature_selection = auc_feature_selection[:len(cont_cols)]

for categ_feature in categ_columns:
    # Build a new list instead of extending in place.
    keep_cols = keep_cols + categ_feature

    # evaluate_model() reads the notebook-level X_val / y_val.
    X_train, y_train = train_df[keep_cols], train_df['is_churn']
    X_val, y_val = validation_df[keep_cols], validation_df['is_churn']
    evaluation = [(X_train, y_train), (X_val, y_val)]

    clf = XGBClassifier(n_jobs=-1, tree_method="hist", random_state=0, eval_metric="logloss")
    clf.fit(X_train, y_train,
            eval_set=evaluation,
            early_stopping_rounds=10, verbose=False)

    auc = evaluate_model(clf)
    auc_feature_selection.append((list(keep_cols), auc))
    # Print the entry just appended rather than relying on a computed index.
    print(auc_feature_selection[-1])
    

(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum', 'is_auto_renew_mean', 'is_auto_renew_sum', 'is_cancel_sum', 'is_cancel_mean', 'msno_count_x', 'diff_dates_min', 'diff_dates_max', 'diff_dates_mean', 'diff_dates_std', 'churn_mean', 'churn_sum', 'days_last_trx', 'total_secs_sum', 'total_secs_mean', 'total_secs_std', 'num_unq_sum', 'num_unq_mean', 'num_unq_std', 'msno_count_y', 'nbr_logins_monthly_mean', 'nbr_logins_monthly_max', 'nbr_logins_monthly_min', 'nbr_logins_monthly_std', 'nbr_logins_monthly_count', 'last_login_days', 'city_1', 'city_4', 'city_5', 'city_13', 'city_15', 'city_22', 'city_other'], 0.8379866982547692)
(['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum', 'is_auto_renew_mean', 'is_auto_renew_sum', 'is_cancel_sum', 'is_cancel_mean', 'msno_count_x', 'diff_dates_min', 'd

In [22]:
# AUC of every feature-selection step, in the order the steps were run.
final_auc = [step_auc for _step_cols, step_auc in auc_feature_selection]

In [23]:
print(final_auc)

[0.6159114065758324, 0.6379051564073593, 0.7680373258591725, 0.7862980927794, 0.7897941704544412, 0.7907600573233717, 0.7989305859008905, 0.8110515772034894, 0.8165552789094008, 0.8390008153753211, 0.8362458612659969, 0.8356292745645018, 0.8379178243526305, 0.8404104480681927, 0.8457484191133495, 0.8396134719914572, 0.8450958009177862, 0.8402960774065342, 0.8435332432554079, 0.8428005328529377, 0.8454476447432931, 0.843603909125215, 0.8428075168441675, 0.8434922543057197, 0.8433644251318695, 0.8468989242089082, 0.8430265213106892, 0.8419235164111298, 0.8383552603430133, 0.83896050966745, 0.832512755407714, 0.8390873057043555, 0.8379866982547692, 0.8390647256145557]


In [24]:
# A step is flagged True when its AUC is strictly higher than the AUC of the
# step immediately before it; the first step is always kept.
keep_features = [True] + [
    bool(current > previous)
    for previous, current in zip(final_auc, final_auc[1:])
]

In [25]:
print(keep_features)

[True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, False, True, False, True, False, True, False, False, True, False, True, False, False, False, True, False, True, False, True]


In [29]:
# Pair every selection step ((columns_used, auc)) with its keep flag.
result_feature_selection = list(zip(auc_feature_selection, keep_features))

# The first len(cont_cols) steps are the continuous-column steps; slicing by
# that length (instead of a hard-coded [:-2]) stays correct if the number of
# categorical groups appended afterwards changes.
result_continous = result_feature_selection[:len(cont_cols)]

# Each step's column list is cumulative, so its last element is the column
# introduced at that step; keep it when the step was flagged True.
final_keep_cols = [
    step_cols[-1]
    for (step_cols, _step_auc), keep in result_continous
    if keep
]

# continuous columns kept after feature selection
cont_keep_cols = final_keep_cols.copy()

# we keep all the continuous columns that we got from the feature selection
# and we add the registration columns
registration_columns = [x for x in train_df.columns if x.startswith('registered')]

final_keep_cols += registration_columns

In [33]:
print(cont_keep_cols)

['bd', 'tenure_days', 'actual_amount_paid_min', 'actual_amount_paid_max', 'actual_amount_paid_mean', 'actual_amount_paid_std', 'actual_amount_paid_sum', 'is_auto_renew_mean', 'is_auto_renew_sum', 'is_cancel_sum', 'diff_dates_min', 'diff_dates_max', 'diff_dates_mean', 'churn_mean', 'days_last_trx', 'total_secs_mean', 'num_unq_mean', 'msno_count_y', 'nbr_logins_monthly_std', 'last_login_days']


In [36]:
# Retrain on the continuous columns retained by the feature selection.
# evaluate_model() reads the notebook-level X_val / y_val rebound here.
X_train = train_df[cont_keep_cols]
y_train = train_df['is_churn']
X_val = validation_df[cont_keep_cols]
y_val = validation_df['is_churn']
evaluation = [(X_train, y_train), (X_val, y_val)]

clf = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=0,
    n_jobs=-1,
)
clf.fit(
    X_train,
    y_train,
    eval_set=evaluation,
    early_stopping_rounds=10,
    verbose=False,
)

evaluate_model(clf)

0.8474713248855934

In [37]:
# SMOTE oversampling (25 neighbours, fixed seed) of the current training
# split, i.e. the selected continuous columns set up in the previous cell.
sm = SMOTE(
    k_neighbors=25,
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [38]:
# Refit the resampled-data model; this rebinds the `xgb_cont_cols_res` name
# used earlier in the notebook.
xgb_cont_cols_res = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=0,
    n_jobs=-1,
)
xgb_cont_cols_res.fit(X_res, y_res)

In [39]:
evaluate_model(xgb_cont_cols_res)

0.8094146524066411

In [35]:
# Final model: the columns in `final_keep_cols`.
# evaluate_model() reads the notebook-level X_val / y_val rebound here.
X_train = train_df[final_keep_cols]
y_train = train_df['is_churn']
X_val = validation_df[final_keep_cols]
y_val = validation_df['is_churn']
evaluation = [(X_train, y_train), (X_val, y_val)]

clf = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=0,
    n_jobs=-1,
)
clf.fit(
    X_train,
    y_train,
    eval_set=evaluation,
    early_stopping_rounds=10,
    verbose=False,
)

evaluate_model(clf)

0.849724805412312

In [48]:
old = pd.read_csv('old_data_dfMasterTable.csv')

In [49]:
old.head()

Unnamed: 0,msno,is_churn,date_pred,bd,gender,city_1,city_4,city_5,city_13,city_15,city_22,city_other,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_11,registered_via_13,registered_via_other,tenure_days,actual_amount_paid_min,actual_amount_paid_max,actual_amount_paid_mean,actual_amount_paid_std,actual_amount_paid_sum,is_auto_renew_mean,is_auto_renew_sum,is_cancel_sum,is_cancel_mean,msno_count_x,diff_dates_min,diff_dates_max,diff_dates_mean,diff_dates_std,churn_mean,churn_sum,days_last_trx,total_secs_sum,total_secs_mean,total_secs_std,num_unq_sum,num_unq_mean,num_unq_std,msno_count_y,nbr_logins_monthly_mean,nbr_logins_monthly_max,nbr_logins_monthly_min,nbr_logins_monthly_std,nbr_logins_monthly_count,last_login_days
0,++4RuqBw0Ss6bQU4oMxaRlbBPoWzoEiIZaxPM04Y4+U=,0,201603,27.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,596.0,129,149,140.428571,10.271052,1966,1.0,14,0,0.0,14,28.0,31.0,30.461538,0.877058,0.0,0,17,734871.354,4175.40542,4063.203137,3793.0,21.551136,19.23443,176.0,16.576923,29.0,1.0,8.782588,26.0,1.0
1,++OepqRK4wiYg4Chl+qqo7TrwM+i9KZc3Ez/Swbjjew=,0,201603,27.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,36.0,0,149,74.5,105.35891,149,1.0,2,0,0.0,2,30.0,30.0,30.0,,0.0,0,6,324433.423,10138.544469,8666.467367,1217.0,38.03125,29.233419,32.0,16.75,29.0,5.0,13.022417,4.0,1.0
2,+/namlXq+u3izRjHCFJV4MgqcXcLidZYszVsROOq/y4=,0,201603,31.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3559.0,149,149,149.0,0.0,1490,1.0,10,0,0.0,10,29.0,91.0,37.222222,20.17906,0.0,0,1,69004.891,9857.841571,14085.226397,234.0,33.428571,31.775359,7.0,6.882353,25.0,1.0,8.469669,17.0,1.0
3,+0/X9tkmyHyet9X80G6GTrDFHnJqvai8d1ZPhayT0os=,0,201603,31.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4354.0,149,149,149.0,0.0,1490,1.0,10,0,0.0,10,29.0,91.0,37.222222,20.17906,0.0,0,1,2648108.911,8301.281853,6947.821911,10616.0,33.278997,26.210037,319.0,24.269231,31.0,14.0,4.229021,26.0,1.0
4,+09YGn842g6h2EZUXe0VWeC4bBoCbDGfUboitc0vIHw=,0,201603,29.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2901.0,149,149,149.0,0.0,1490,1.0,10,0,0.0,10,29.0,91.0,37.222222,20.17906,0.0,0,1,4114332.184,12172.580426,10113.703549,14881.0,44.026627,35.22887,338.0,25.038462,30.0,19.0,3.504064,26.0,1.0


In [51]:
old.date_pred.value_counts()

201701    867529
201603    806704
201604    730156
Name: date_pred, dtype: int64