In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import statistics
import pickle
import os

import matplotlib.pylab as plt
import lightgbm as lgb

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold,
)

from pandas.tseries.offsets import DateOffset

# Select matplotlib's built-in 'classic' style sheet for any figure drawn after this point.
plt.style.use('classic')

In [2]:
# Load the training data for pre-processing.
# (The original file was opened in Excel and re-saved as CSV.)
df = pd.read_csv('ecoshare_sales_v3.csv', low_memory=False)

# Calendar features derived from the order date.
df['order_day'] = pd.to_datetime(df['order_day'])
# month_name() returns English names by default. The previous locale='English'
# is a Windows locale alias and is not a valid locale name on typical
# Linux/macOS systems, so the notebook could not be re-run there. Dropping it
# also matches day_name() below, which never passed a locale.
df['order_day_month'] = df['order_day'].dt.month_name()
df['order_day_day_of_week'] = df['order_day'].dt.day_name()
df['order_day_day_of_month'] = df['order_day'].dt.day
# Keep only the last two characters of the quarterly period string (e.g. 'Q1').
df['order_day_quarter'] = pd.PeriodIndex(df.order_day, freq='Q').astype(str).str[-2:]

# Categorical columns: a missing value becomes its own 'blank_value' level.
#
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True). That form is chained assignment on an
# intermediate Series: pandas deprecates it (the FutureWarning is silenced at
# the top of this notebook) and it no longer updates df once Copy-on-Write is
# active, which would silently leave the blanks unfilled.
blank_as_category_columns = [
    'tos_flg',
    'disconotice_flg',
    'latefee_flg',
    'dwelling_type_cd',
    'product_type_cd',
    'automatic_payment_flg',
    'weblog_flg',
    'risk_level',
    'ebill_enroll_flag',
    'called_flg',
    'oam_flg',
    'sap_productname',
    'disconnects_flg',
    'load_profile',
    'city',
    'county',
    'tdsp',
    'dma',
    'order_day_month',
    'order_day_day_of_week',
    'order_day_quarter',
]
for col in blank_as_category_columns:
    df[col] = df[col].fillna('blank_value').astype('category')

# segment is prefixed with 'S' so it is handled as a label, not a number.
# astype(str) runs first and turns a missing segment into the text 'nan', so
# missing rows end up in an 'Snan' level; the fillna('blank_value') that used
# to follow could never match anything and is dropped without changing the
# result.
# NOTE(review): if 'blank_value' was the intended level for missing segments,
# change this together with the test-set preprocessing so both frames encode
# missing segments identically.
df['segment'] = ('S' + df['segment'].astype(str)).astype('category')


# Binary indicators: a blank is assumed to mean "no" ('N' for the categorical
# flags, 0 for the deposit amount).
# The fills are assigned back rather than done via df[col].fillna(...,
# inplace=True), which is chained assignment and stops updating df under
# pandas Copy-on-Write.
for col in ['pool', 'ev_driver']:
    df[col] = df[col].fillna('N').astype('category')

df['deposit_onhand_amt'] = df['deposit_onhand_amt'].fillna(0).astype(np.int64)


# Numeric columns: replace missing entries with the median of the observed values.
# Each median is kept in its own module-level variable because the test-set
# preprocessing cell reuses them.
def _median_of_observed(values):
    """Return statistics.median over the non-null entries of a Series."""
    return statistics.median(values.dropna().to_list())


oam_active_login_cnt_median = _median_of_observed(df['oam_activelogin_cnt'])
called_numcalls_cnt_median = _median_of_observed(df['called_numcalls_cnt'])
curr_usage_median = _median_of_observed(df['curr_usage'])
numweblog_cnt_median = _median_of_observed(df['numweblog_cnt'])
home_value_median = _median_of_observed(df['home_value'])

median_by_column = {
    'oam_activelogin_cnt': oam_active_login_cnt_median,
    'called_numcalls_cnt': called_numcalls_cnt_median,
    'curr_usage': curr_usage_median,
    'numweblog_cnt': numweblog_cnt_median,
    'home_value': home_value_median,
}
for col, fill_value in median_by_column.items():
    df[col] = df[col].fillna(fill_value)

# Only these two columns are cast to integers once the gaps are filled.
for col in ('curr_usage', 'home_value'):
    df[col] = df[col].astype(np.int64)

# Special cases.
# term_length has many distinct values; its value counts were exported to CSV
# and reviewed in MS Excel with:
#df['term_length'].value_counts(dropna=False).reset_index().rename(columns={"index": "term_length", "term_length": "count"}).to_csv('term_length.csv', index=False)
# That review showed 1 null record, 32 records holding the string 'MM' and 17
# holding 'C&'. Nulls and both strings are replaced by the median of the
# remaining values.
non_numeric_terms = ['MM', 'C&']
term_length_impute_list = [
    int(term)
    for term in df['term_length'].dropna().to_list()
    if term not in non_numeric_terms
]
term_length_imputed_value = statistics.median(term_length_impute_list)
df['term_length'] = (
    df['term_length']
    .fillna(term_length_imputed_value)
    .replace(non_numeric_terms, term_length_imputed_value)
    .astype(np.int64)
)

df['order_day_day_of_month'] = df['order_day_day_of_month'].astype(np.int64)

# Reduce zipcode to its first three characters, prefixed with 'Z', as a category.
zip_prefix = df['zipcode'].astype(str).str[:3]
df['zipcode'] = ('Z' + zip_prefix).astype('category')

In [3]:
# Lagged aggregate features: accept rate and call count of the calendar month
# one month before each order.
def _year_month_label(dates):
    """Build '<year>-<month>' labels (month not zero-padded) from a datetime column."""
    stamps = pd.DatetimeIndex(dates)
    return stamps.year.astype(str) + '-' + stamps.month.astype(str)


df['year_month'] = _year_month_label(df['order_day'])

# Per-month totals of the target: number of accepts and number of rows.
monthly_accepts = df.groupby('year_month')['accept'].agg(['sum', 'count']).reset_index()
accept_rate_by_month = pd.DataFrame({
    'yr_m': monthly_accepts['year_month'],
    'call_count': monthly_accepts['count'],
    'accept_rate': monthly_accepts['sum'] / monthly_accepts['count'],
})

# Look the monthly figures up by the label of the month before the order.
df['order_day_minus_one'] = df['order_day'] - DateOffset(months=1)
df['year_month_minus_one'] = _year_month_label(df['order_day_minus_one'])
df = df.merge(
    accept_rate_by_month,
    how='left',
    left_on='year_month_minus_one',
    right_on='yr_m',
).drop(columns='yr_m')

# Rows whose previous month has no data get rate 0 and the median call count.
df['accept_rate'] = df['accept_rate'].fillna(0)
df = df.rename(columns={
    "accept_rate": "prior_month_accept_rate",
    "call_count": "prior_month_call_count",
})
median_prior_month_calls = statistics.median(df['prior_month_call_count'].dropna().to_list())
df['prior_month_call_count'] = (
    df['prior_month_call_count'].fillna(median_prior_month_calls).astype(np.int64)
)


# df['order_day_minus_three'] = df['order_day'] - DateOffset(months=3)
# df['year_month_minus_three'] = pd.DatetimeIndex(df['order_day_minus_three']).year.astype(str) + '-' + pd.DatetimeIndex(df['order_day_minus_three']).month.astype(str)
# df = df.merge(accept_rate_by_month, how='left', left_on=['year_month_minus_three'], right_on=['yr_m'])
# del df['yr_m']
# df['accept_rate'] = df['accept_rate'].fillna(0)
# df.rename(columns={"accept_rate": "prior_three_month_accept_rate"}, inplace=True)


# call_attempt: for every row, the number of rows of the same customer with a
# strictly earlier order_day (0 for a customer's first contact).
#
# rank(method='min') within each customer equals 1 + the number of strictly
# earlier dates and gives tied dates the same rank, so rank - 1 is the count
# the previous row-by-row loop produced. The loop re-filtered the whole frame
# once per row (quadratic in the number of rows) and needed a full copy of df.
# Rows with a missing customer_id or order_day come back as NaN and are set to
# 0, as in the loop, where comparisons against a missing value never matched.
prior_contacts = df.groupby('customer_id')['order_day'].rank(method='min') - 1

# NOTE(review): the test-set cell further down reads `df_temp`, the frame the
# old loop left behind after its final iteration (earlier contacts of the last
# row's customer). It is reproduced here only so that cell keeps running;
# remove it once nothing depends on that leftover state.
if len(df) > 0:
    last_row = df.iloc[-1]
    df_temp = df[
        (df['customer_id'] == last_row['customer_id'])
        & (df['order_day'] < last_row['order_day'])
    ]

df['call_attempt'] = prior_contacts.fillna(0).astype(np.int64)


In [4]:
# Model input columns. The order is kept exactly as before because it fixes the
# column order of the frames handed to the model.
features = [
    'tos_flg',
    'disconotice_flg',
    'oam_activelogin_cnt',
    'term_length',
    'called_numcalls_cnt',
    'latefee_flg',
    'dwelling_type_cd',
    'curr_usage',
    'product_type_cd',
    'pool',
    'automatic_payment_flg',
    'weblog_flg',
    'risk_level',
    'deposit_onhand_amt',
    'ebill_enroll_flag',
    'called_flg',
    'oam_flg',
    'sap_productname',
    'numweblog_cnt',
    'disconnects_flg',
    'load_profile',
    'city',
    'home_value',
    'county',
    'tdsp',
    'dma',
    'ev_driver',
    'segment',
    'order_day_month',
    'order_day_day_of_week',
    'order_day_day_of_month',
    'order_day_quarter',
    'zipcode',
    'call_attempt',
    # currently disabled: 'prior_month_accept_rate', 'prior_month_call_count'
]

# Name of the label column.
target = 'accept'

In [5]:
# Time-series cross-validation splitter: 5 folds, each validating on 15,000 rows.
# NOTE(review): the split is positional, so it is only chronological if the
# rows of df are ordered by order_day -- confirm.
tss = TimeSeriesSplit(n_splits=5, test_size=15000)

# First and last positional index of the train and validation part of each fold.
fold_bounds = [
    (fit_idx.min(), fit_idx.max(), hold_idx.min(), hold_idx.max())
    for fit_idx, hold_idx in tss.split(df)
]
train_min, train_max, val_min, val_max = (list(bounds) for bounds in zip(*fold_bounds))

for bounds in (train_min, train_max, val_min, val_max):
    print(bounds)

[0, 0, 0, 0, 0]
[6314, 21314, 36314, 51314, 66314]
[6315, 21315, 36315, 51315, 66315]
[21314, 36314, 51314, 66314, 81314]


In [6]:
# from sklearn.model_selection import train_test_split, GridSearchCV
# param_grid = {
#     'num_leaves': [50, 100],
#     'reg_alpha': [0.1, 0.5],
#     'min_data_in_leaf': [30, 50, 100, 300, 400],
#     'lambda_l1': [0, 1, 1.5],
#     'lambda_l2': [0, 1]
#     }
    
    
# clf = lgb.LGBMClassifier(n_estimators = 100)
# grid = GridSearchCV(clf, param_grid, cv=tss, scoring="roc_auc")

# # Fit the grid search object to the training data
# grid.fit(df[features], df[target])
# # Print best parameters and score
# print(f"Best parameters: {grid.best_params_}")
# print(f"Best score: {grid.best_score_}")

#best score returned from this using these parameters:
#     clf = lgb.LGBMClassifier(lambda_l1= 0
#                              ,lambda_l2= 0
#                              ,min_data_in_leaf= 30
#                              ,num_leaves= 100
#                              ,reg_alpha= 0.1)

#pd.DataFrame(grid.cv_results_).to_csv('Grid Search Results.csv', index=False)

In [7]:
# Cross-validation over the folds of `tss`: fit a fresh LightGBM classifier on
# each training part and record the ROC AUC on the matching validation part.
# NOTE(review): fold, preds and every score list except auc_scores are
# initialised here but never filled in this cell.
fold = 0
preds, auc_scores = [], []
balanced_accuracy_scores, accuracy_scores, f1_scores = [], [], []
precision_scores, recall_scores = [], []

# NOTE(review): LightGBM documents reg_alpha as an alias of lambda_l1, so
# passing both presumably lets only one of the two values take effect --
# confirm which before relying on reg_alpha=0.1.
cv_model_params = dict(
    lambda_l1=0,
    lambda_l2=0,
    min_data_in_leaf=30,
    num_leaves=100,
    reg_alpha=0.1,
)

for train_idx, val_idx in tss.split(df):
    train_rows = df.iloc[train_idx]
    valid_rows = df.iloc[val_idx]

    clf = lgb.LGBMClassifier(**cv_model_params)
    clf.fit(train_rows[features], train_rows[target])

    # Probability of the positive class (second column of predict_proba).
    pred_prob = clf.predict_proba(valid_rows[features])[:, 1]
    auc_scores.append(roc_auc_score(valid_rows[target], pred_prob))

[LightGBM] [Info] Number of positive: 539, number of negative: 5776
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1047
[LightGBM] [Info] Number of data points in the train set: 6315, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085352 -> initscore=-2.371751
[LightGBM] [Info] Start training from score -2.371751
[LightGBM] [Info] Number of positive: 7022, number of negative: 14293
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1440
[LightGBM] [Info] Number of data points in the train set: 21315, number of used features: 32
[LightGBM] [Info] [bina

[LightGBM] [Info] Number of positive: 12998, number of negative: 53317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1624
[LightGBM] [Info] Number of data points in the train set: 66315, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196004 -> initscore=-1.411460
[LightGBM] [Info] Start training from score -1.411460


In [8]:
# Average of the per-fold ROC AUC values collected in auc_scores; as the cell's
# last expression it is displayed as the cell output.
statistics.mean(auc_scores)

0.7854438360326522

In [9]:
# Potential concerns:
# - The forecasting horizon in the test set is not good.

# Ideas that were explored but did not work:
# - Prior-month call count
# - Prior-month feature with a 3-month lag

# What I had to do for the feature engineering
# My concerns about the test set

In [10]:
# Load and process the test data.
# (The "_ap" suffix marks a copy that was opened in Excel and re-saved as CSV
# so its formatting matches the training data, which was originally provided
# as xlsx.)
NRG_test_set = pd.read_csv('ecoshare_sales_test_ap.csv', low_memory=False)

# Calendar features derived from the order date.
NRG_test_set['order_day'] = pd.to_datetime(NRG_test_set['order_day'])
# month_name() returns English names by default. The previous locale='English'
# is a Windows locale alias and is not a valid locale name on typical
# Linux/macOS systems. Dropping it also matches day_name() below, which never
# passed a locale.
NRG_test_set['order_day_month'] = NRG_test_set['order_day'].dt.month_name()
NRG_test_set['order_day_day_of_week'] = NRG_test_set['order_day'].dt.day_name()
NRG_test_set['order_day_day_of_month'] = NRG_test_set['order_day'].dt.day
# Keep only the last two characters of the quarterly period string (e.g. 'Q1').
NRG_test_set['order_day_quarter'] = pd.PeriodIndex(NRG_test_set.order_day, freq='Q').astype(str).str[-2:]

# Categorical columns: a missing value becomes its own 'blank_value' level.
#
# The result is assigned back to the column instead of calling
# NRG_test_set[col].fillna(..., inplace=True). That form is chained assignment
# on an intermediate Series: pandas deprecates it (the FutureWarning is
# silenced at the top of this notebook) and it no longer updates the frame once
# Copy-on-Write is active, which would silently leave the blanks unfilled.
test_blank_as_category_columns = [
    'tos_flg',
    'disconotice_flg',
    'latefee_flg',
    'dwelling_type_cd',
    'product_type_cd',
    'automatic_payment_flg',
    'weblog_flg',
    'risk_level',
    'ebill_enroll_flag',
    'called_flg',
    'oam_flg',
    'sap_productname',
    'disconnects_flg',
    'load_profile',
    'city',
    'county',
    'tdsp',
    'dma',
    'order_day_month',
    'order_day_day_of_week',
    'order_day_quarter',
]
for col in test_blank_as_category_columns:
    NRG_test_set[col] = NRG_test_set[col].fillna('blank_value').astype('category')

# segment is prefixed with 'S' so it is handled as a label, not a number.
# astype(str) runs first and turns a missing segment into the text 'nan', so
# missing rows end up in an 'Snan' level; the fillna('blank_value') that used
# to follow could never match anything and is dropped without changing the
# result.
# NOTE(review): if 'blank_value' was the intended level for missing segments,
# change this together with the training-set preprocessing so both frames
# encode missing segments identically.
NRG_test_set['segment'] = ('S' + NRG_test_set['segment'].astype(str)).astype('category')


# Binary indicators: a blank is assumed to mean "no" ('N' for the categorical
# flags, 0 for the deposit amount).
# The fills are assigned back rather than done via
# NRG_test_set[col].fillna(..., inplace=True), which is chained assignment and
# stops updating the frame under pandas Copy-on-Write.
for col in ['pool', 'ev_driver']:
    NRG_test_set[col] = NRG_test_set[col].fillna('N').astype('category')

NRG_test_set['deposit_onhand_amt'] = NRG_test_set['deposit_onhand_amt'].fillna(0).astype(np.int64)


# Numeric gaps in the test set are filled with the medians that were computed
# on the training frame (no statistics are re-estimated here).
training_median_by_column = {
    'oam_activelogin_cnt': oam_active_login_cnt_median,
    'called_numcalls_cnt': called_numcalls_cnt_median,
    'curr_usage': curr_usage_median,
    'numweblog_cnt': numweblog_cnt_median,
    'home_value': home_value_median,
}
for col, fill_value in training_median_by_column.items():
    NRG_test_set[col] = NRG_test_set[col].fillna(fill_value)

# Only these two columns are cast to integers once the gaps are filled.
for col in ('curr_usage', 'home_value'):
    NRG_test_set[col] = NRG_test_set[col].astype(np.int64)


# Special cases.
# term_length: nulls and the strings 'MM' / 'C&' are replaced by the value
# imputed on the training frame, then the column is cast to integers.
NRG_test_set['term_length'] = (
    NRG_test_set['term_length']
    .fillna(term_length_imputed_value)
    .replace(['MM', 'C&'], term_length_imputed_value)
    .astype(np.int64)
)

NRG_test_set['order_day_day_of_month'] = NRG_test_set['order_day_day_of_month'].astype(np.int64)

# Reduce zipcode to its first three characters, prefixed with 'Z', as a category.
test_zip_prefix = NRG_test_set['zipcode'].astype(str).str[:3]
NRG_test_set['zipcode'] = ('Z' + test_zip_prefix).astype('category')

# call_attempt for the test set = number of rows the customer has in the
# training frame + number of test rows of the same customer with a strictly
# earlier order_day.
#
# Fix: the previous loop built the per-row frame `temp` but then added
# len(df_temp), a frame left over from the training-set loop, so the
# "earlier test contacts" part was the same stale number for every row.
#
# The counts are computed without a per-row scan:
#   * value_counts() gives each customer's number of training rows; customers
#     not present in training (or with a missing id) map to NaN -> 0.
#   * rank(method='min') within each customer is 1 + the number of strictly
#     earlier dates (ties share a rank), so rank - 1 is the count of earlier
#     test rows; rows with a missing id or date yield NaN -> 0.
training_contacts = df['customer_id'].value_counts()
base_number = NRG_test_set['customer_id'].map(training_contacts).fillna(0)
earlier_test_contacts = (
    NRG_test_set.groupby('customer_id')['order_day'].rank(method='min') - 1
).fillna(0)
NRG_test_set['call_attempt'] = (base_number + earlier_test_contacts).astype(np.int64)

# The identifier and raw date columns are removed once the feature is built.
NRG_test_set = NRG_test_set.drop(columns=['customer_id', 'order_day', 'meter_id'])

In [11]:
# Refit on the whole training set with the hyper-parameters used in
# cross-validation.
# NOTE(review): LightGBM documents reg_alpha as an alias of lambda_l1, so
# passing both presumably lets only one of the two values take effect --
# confirm which before relying on reg_alpha=0.1.
clf = lgb.LGBMClassifier(lambda_l1= 0
                         ,lambda_l2= 0
                         ,min_data_in_leaf= 30
                         ,num_leaves= 100
                         ,reg_alpha= 0.1)

clf.fit(df[features], df[target])

# Predict on the test set.
# Fix: select exactly the training columns, in training order. Passing the
# whole NRG_test_set scored whatever columns remained after preprocessing, in
# the test file's own order, which need not match df[features]; a missing
# feature now fails loudly with a KeyError instead.
NRG_final_preds = clf.predict_proba(NRG_test_set[features])[:, 1]

[LightGBM] [Info] Number of positive: 13724, number of negative: 67591
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1683
[LightGBM] [Info] Number of data points in the train set: 81315, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168776 -> initscore=-1.594329
[LightGBM] [Info] Start training from score -1.594329


In [12]:
# Store the predicted probabilities as a one-column table and write it out in
# both CSV (without the index) and pickle form.
NRG_final_preds_df = pd.DataFrame({"probability": NRG_final_preds})
NRG_final_preds_df.to_csv('aditya_parikh_predictions.csv', index=False)
NRG_final_preds_df.to_pickle('aditya_parikh_predictions.pkl')