In [2]:
%load_ext autoreload
%autoreload 2

import optuna
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import shap
import numpy as np
import scipy
import seaborn as sns

from steps.prepare_data import load_processed_data, load_split_processed_data
from utils.model import predict, load_model, predict_booster, predict_booster_model
from steps.load_data import LoadData
from utils.helpers import reduce_mem_usage

In [9]:
data_loader = LoadData()

project_step_df = data_loader.df_test_fe.copy()

## Calculate diff

In [10]:
columns_groups = {}

for column in project_step_df.columns[project_step_df.columns.str.contains(pat="_mnt|_wk")].to_list():
    column_name = "_".join(column.split("_")[:-1])

    if column_name not in columns_groups:
        columns_groups[column_name] = []

    columns_groups[column_name].append(column)

columns_groups

{'Ama_rchrgmnt_sum_max': ['Ama_rchrgmnt_sum_max_mnt1',
  'Ama_rchrgmnt_sum_max_mnt3'],
 'content_clc_mea': ['content_clc_mea_mnt1',
  'content_clc_mea_mnt3',
  'content_clc_mea_wk1'],
 'content_cnt_max': ['content_cnt_max_mnt1', 'content_cnt_max_mnt3'],
 'voice_out_short_part_max': ['voice_out_short_part_max_mnt1',
  'voice_out_short_part_max_mnt3'],
 'voice_mts_in_nrest_part_std': ['voice_mts_in_nrest_part_std_mnt1',
  'voice_mts_in_nrest_part_std_mnt3'],
 'num_act_days_max': ['num_act_days_max_mnt1', 'num_act_days_max_mnt3'],
 'sms_roam_clc_min': ['sms_roam_clc_min_mnt1', 'sms_roam_clc_min_mnt3'],
 'voice_in_cmpttrs_avg_durmin': ['voice_in_cmpttrs_avg_durmin_mnt1',
  'voice_in_cmpttrs_avg_durmin_mnt3'],
 'com_num_part_mea': ['com_num_part_mea_mnt1',
  'com_num_part_mea_mnt3',
  'com_num_part_mea_wk1'],
 'pay_avg_mea': ['pay_avg_mea_mnt1', 'pay_avg_mea_wk1', 'pay_avg_mea_mnt3'],
 'voice_out_tar_dur_std': ['voice_out_tar_dur_std_mnt1',
  'voice_out_tar_dur_std_mnt3'],
 'voice_out_tar_d

In [12]:
df = project_step_df.copy()
new_features_df = pd.DataFrame()

new_features_df["abon_id"] = df["abon_id"]

In [91]:
new_features = {}

def calculate_diff_for_group(x):
    if x[-1] == 0:
        return 0

    value = round(((x[0] - x[-1]) / x[-1]) * 100, 2)

    if value == 0:
        return 0

    return value


def calculate_diff(dataframe, columns):
    y = df[columns].ffill(axis=1).values.T

    return np.array(np.apply_along_axis(calculate_diff_for_group, 0, y))

for column in columns_groups.keys():
    statistic_columns = columns_groups[column]
    print(f"Processing {column}...")

    new_features[f"{column}_diff"] = calculate_diff(df, statistic_columns)

new_features_df = pd.concat([new_features_df, pd.DataFrame(new_features)], axis=1)

Processing Ama_rchrgmnt_sum_max...
Processing content_clc_mea...
Processing content_cnt_max...
Processing voice_out_short_part_max...
Processing voice_mts_in_nrest_part_std...
Processing num_act_days_max...
Processing sms_roam_clc_min...
Processing voice_in_cmpttrs_avg_durmin...
Processing com_num_part_mea...
Processing pay_avg_mea...
Processing voice_out_tar_dur_std...
Processing voice_out_tar_dur_min...
Processing gprs_clc_mea...
Processing pay_max_min...
Processing voice_in_short_part_mea...
Processing voice_in_roam_clc_std...
Processing conn_out_uniq_cnt_std...
Processing voice_mts_in_dwork_part_td...
Processing data_3g_tar_vol_mea...
Processing voice_in_fix_tar_dur_mea...
Processing voice_in_tar_dur_max...
Processing ks_num_part_td...
Processing data_3g_dou_mea...
Processing voice_mts_out_drest_partmea...
Processing conn_com_part_min...
Processing Ama_rchrgmnt_sum_mea...
Processing voice_mts_in_dwork_part_mea...
Processing voice_mts_in_dwork_part_std...
Processing gprs_clc_std...


In [94]:
new_features_df

Unnamed: 0,abon_id,Ama_rchrgmnt_sum_max_diff,content_clc_mea_diff,content_cnt_max_diff,voice_out_short_part_max_diff,voice_mts_in_nrest_part_std_diff,num_act_days_max_diff,sms_roam_clc_min_diff,voice_in_cmpttrs_avg_durmin_diff,com_num_part_mea_diff,...,accum_oth_dur_td_diff,data_3g_tv_cnt_td_diff,voice_in_td_cnt_min_diff,abon_part_td_diff,com_num_part_std_diff,sms_in_cnt_std_diff,pay_p2p_out_sum_td_diff,pay_max_td_diff,voice_in_short_part_td_diff,voice_out_cmpttrs_avg_dumin_diff
0,1518212,0,0,0,-100.00,0.01,0,0,0,0,...,0,0,-8.64,0,-0.11,8.37,0,0,0,0
1,1535923,0,0,-29,-0.03,-0.28,0,0,0,-1,...,0,0,-8.62,0,1.24,-62.94,0,0,0,-9
2,1538501,0,0,-7,0.00,0.33,0,0,0,1,...,0,0,0.00,0,-0.17,-47.11,0,0,0,-100
3,1544214,0,0,-13,-0.02,-0.04,0,0,0,0,...,0,0,-21.75,0,0.01,-17.48,0,0,0,0
4,1545752,0,0,-4,-0.34,0.93,0,0,0,-8,...,0,0,0.00,0,-0.78,-41.91,0,0,0,-100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,130753142,0,0,0,0.00,0.02,0,0,0,-3,...,0,0,0.00,0,0.00,0.00,0,0,0,-100
149996,130753520,0,0,0,0.00,0.13,0,0,0,0,...,0,0,0.00,0,0.00,0.00,0,0,0,0
149997,130753807,0,0,-9,0.00,0.00,0,0,0,9,...,0,0,0.00,0,0.68,-0.97,0,0,0,-100
149998,130754180,0,0,-34,-31.70,0.00,0,0,0,0,...,0,0,-100.00,0,0.87,8.43,0,0,0,0


In [95]:
import pickle

with open("cache/project_step_5_test.pkl", "wb") as f:
    pickle.dump(new_features_df, f)

with open("cache/project_step_5_test.pkl", "rb") as f:
    df_new_loaded = pickle.load(f)

df_new_loaded.shape

(150000, 301)

In [11]:
df = project_step_df.copy()
new_total_features_df = pd.DataFrame()

new_total_features_df["abon_id"] = df["abon_id"]

In [12]:
new_total_features = {}


def calculate_total_sum(dataframe, columns):
    y = df[columns].ffill(axis=1).values.T

    return np.array(np.apply_along_axis(lambda x: x.sum(), 0, y))


for column in columns_groups.keys():
    statistic_columns = columns_groups[column]
    print(f"Processing {column}...")

    new_total_features[f"{column}_total"] = calculate_total_sum(df, statistic_columns)

new_total_features_df = pd.concat([new_total_features_df, pd.DataFrame(new_total_features)], axis=1)

Processing Ama_rchrgmnt_sum_max...
Processing content_clc_mea...
Processing content_cnt_max...
Processing voice_out_short_part_max...
Processing voice_mts_in_nrest_part_std...
Processing num_act_days_max...
Processing sms_roam_clc_min...
Processing voice_in_cmpttrs_avg_durmin...
Processing com_num_part_mea...
Processing pay_avg_mea...
Processing voice_out_tar_dur_std...
Processing voice_out_tar_dur_min...
Processing gprs_clc_mea...
Processing pay_max_min...
Processing voice_in_short_part_mea...
Processing voice_in_roam_clc_std...
Processing conn_out_uniq_cnt_std...
Processing voice_mts_in_dwork_part_td...
Processing data_3g_tar_vol_mea...
Processing voice_in_fix_tar_dur_mea...
Processing voice_in_tar_dur_max...
Processing ks_num_part_td...
Processing data_3g_dou_mea...
Processing voice_mts_out_drest_partmea...
Processing conn_com_part_min...
Processing Ama_rchrgmnt_sum_mea...
Processing voice_mts_in_dwork_part_mea...
Processing voice_mts_in_dwork_part_std...
Processing gprs_clc_std...


In [13]:
new_total_features_df

Unnamed: 0,abon_id,Ama_rchrgmnt_sum_max_total,content_clc_mea_total,content_cnt_max_total,voice_out_short_part_max_total,voice_mts_in_nrest_part_std_total,num_act_days_max_total,sms_roam_clc_min_total,voice_in_cmpttrs_avg_durmin_total,com_num_part_mea_total,...,accum_oth_dur_td_total,data_3g_tv_cnt_td_total,voice_in_td_cnt_min_total,abon_part_td_total,com_num_part_std_total,sms_in_cnt_std_total,pay_p2p_out_sum_td_total,pay_max_td_total,voice_in_short_part_td_total,voice_out_cmpttrs_avg_dumin_total
0,1518212,0,0.0,24.207174,1.007571,2.000426,10.648154,0,0.000000,2.008994,...,-1.0,0.000000,11.151873,0.000000,2.019277,2.607890,0.0,2.004043,1.000003,0.000000
1,1535923,0,0.0,22.579814,2.002413,2.002785,10.648154,0,0.000000,3.213744,...,-1.0,0.000000,30.594029,-1.000300,2.041957,5.016078,0.0,4.416582,1.000000,46.917114
2,1538501,0,0.0,29.185354,2.009520,2.010591,10.648154,0,15.532889,3.062195,...,0.0,-1.223784,22.722323,-1.000012,2.012333,3.365791,0.0,3.259911,-1.000000,17.899284
3,1544214,0,0.0,18.003257,2.001698,2.000595,10.648154,0,27.675014,3.008709,...,0.0,0.000000,24.672932,0.000000,2.001995,7.302661,0.0,0.000000,1.000000,0.000000
4,1545752,0,0.0,27.473947,2.004937,2.021197,10.648154,0,9.873995,3.562493,...,1.0,-3.315847,15.929247,1.001039,2.043050,30.454277,0.0,-4.676958,1.000071,26.820670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,130753142,0,0.0,23.725081,2.005847,2.000782,10.648154,0,9.669721,3.402279,...,1.0,1.526821,21.662648,0.000000,2.016016,2.415410,0.0,0.000000,1.000006,17.899284
149996,130753520,0,0.0,17.374496,2.047525,2.018002,10.648154,0,0.000000,2.058089,...,1.0,1.566312,13.499803,0.000000,2.028530,4.731064,0.0,0.000000,1.000005,0.000000
149997,130753807,0,0.0,23.929305,2.008330,2.000070,10.648154,0,15.233725,3.554850,...,1.0,2.347327,8.420804,0.000000,2.066427,2.725903,0.0,4.557832,0.000000,32.163078
149998,130754180,0,0.0,21.125804,2.491554,0.000000,10.648154,0,0.000000,2.022957,...,0.0,1.109204,1.480453,0.000000,2.090517,3.536108,0.0,0.000000,0.000000,0.000000


In [14]:
import pickle

with open("cache/project_step_5_total_sum_test.pkl", "wb") as f:
    pickle.dump(new_total_features_df, f)

with open("cache/project_step_5_total_sum_test.pkl", "rb") as f:
    df_new_loaded = pickle.load(f)

df_new_loaded.shape

(150000, 301)