In [2]:
%load_ext autoreload
%autoreload 2

import optuna
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import shap
import numpy as np
import scipy
import seaborn as sns

from steps.prepare_data import load_processed_data, load_split_processed_data
from utils.model import predict, load_model, predict_booster, predict_booster_model
from steps.load_data import LoadData
from utils.helpers import reduce_mem_usage

In [10]:
data_loader = LoadData()

project_step_df = data_loader.df_train_fe.copy()

## Calculate diff

In [11]:
columns_groups = {}

for column in project_step_df.columns[project_step_df.columns.str.contains(pat="_mnt|_wk")].to_list():
    column_name = "_".join(column.split("_")[:-1])

    if column_name not in columns_groups:
        columns_groups[column_name] = []

    columns_groups[column_name].append(column)

columns_groups

{'Ama_rchrgmnt_sum_max': ['Ama_rchrgmnt_sum_max_mnt1',
  'Ama_rchrgmnt_sum_max_mnt3'],
 'content_clc_mea': ['content_clc_mea_mnt1',
  'content_clc_mea_mnt3',
  'content_clc_mea_wk1'],
 'content_cnt_max': ['content_cnt_max_mnt1', 'content_cnt_max_mnt3'],
 'voice_out_short_part_max': ['voice_out_short_part_max_mnt1',
  'voice_out_short_part_max_mnt3'],
 'voice_mts_in_nrest_part_std': ['voice_mts_in_nrest_part_std_mnt1',
  'voice_mts_in_nrest_part_std_mnt3'],
 'num_act_days_max': ['num_act_days_max_mnt1', 'num_act_days_max_mnt3'],
 'sms_roam_clc_min': ['sms_roam_clc_min_mnt1', 'sms_roam_clc_min_mnt3'],
 'voice_in_cmpttrs_avg_durmin': ['voice_in_cmpttrs_avg_durmin_mnt1',
  'voice_in_cmpttrs_avg_durmin_mnt3'],
 'com_num_part_mea': ['com_num_part_mea_mnt1',
  'com_num_part_mea_mnt3',
  'com_num_part_mea_wk1'],
 'pay_avg_mea': ['pay_avg_mea_mnt1', 'pay_avg_mea_wk1', 'pay_avg_mea_mnt3'],
 'voice_out_tar_dur_std': ['voice_out_tar_dur_std_mnt1',
  'voice_out_tar_dur_std_mnt3'],
 'voice_out_tar_d

In [12]:
df = project_step_df.copy()
new_features_df = pd.DataFrame()

new_features_df["abon_id"] = df["abon_id"]

In [91]:
new_features = {}

def calculate_diff_for_group(x):
    if x[-1] == 0:
        return 0

    value = round(((x[0] - x[-1]) / x[-1]) * 100, 2)

    if value == 0:
        return 0

    return value


def calculate_diff(dataframe, columns):
    y = df[columns].ffill(axis=1).values.T

    return np.array(np.apply_along_axis(calculate_diff_for_group, 0, y))

for column in columns_groups.keys():
    statistic_columns = columns_groups[column]
    print(f"Processing {column}...")

    new_features[f"{column}_diff"] = calculate_diff(df, statistic_columns)

new_features_df = pd.concat([new_features_df, pd.DataFrame(new_features)], axis=1)

Processing Ama_rchrgmnt_sum_max...
Processing content_clc_mea...
Processing content_cnt_max...
Processing voice_out_short_part_max...
Processing voice_mts_in_nrest_part_std...
Processing num_act_days_max...
Processing sms_roam_clc_min...
Processing voice_in_cmpttrs_avg_durmin...
Processing com_num_part_mea...
Processing pay_avg_mea...
Processing voice_out_tar_dur_std...
Processing voice_out_tar_dur_min...
Processing gprs_clc_mea...
Processing pay_max_min...
Processing voice_in_short_part_mea...
Processing voice_in_roam_clc_std...
Processing conn_out_uniq_cnt_std...
Processing voice_mts_in_dwork_part_td...
Processing data_3g_tar_vol_mea...
Processing voice_in_fix_tar_dur_mea...
Processing voice_in_tar_dur_max...
Processing ks_num_part_td...
Processing data_3g_dou_mea...
Processing voice_mts_out_drest_partmea...
Processing conn_com_part_min...
Processing Ama_rchrgmnt_sum_mea...
Processing voice_mts_in_dwork_part_mea...
Processing voice_mts_in_dwork_part_std...
Processing gprs_clc_std...


In [94]:
new_features_df

Unnamed: 0,abon_id,Ama_rchrgmnt_sum_max_diff,content_clc_mea_diff,content_cnt_max_diff,voice_out_short_part_max_diff,voice_mts_in_nrest_part_std_diff,num_act_days_max_diff,sms_roam_clc_min_diff,voice_in_cmpttrs_avg_durmin_diff,com_num_part_mea_diff,...,accum_oth_dur_td_diff,data_3g_tv_cnt_td_diff,voice_in_td_cnt_min_diff,abon_part_td_diff,com_num_part_std_diff,sms_in_cnt_std_diff,pay_p2p_out_sum_td_diff,pay_max_td_diff,voice_in_short_part_td_diff,voice_out_cmpttrs_avg_dumin_diff
0,1518212,0,0,0,-100.00,0.01,0,0,0,0,...,0,0,-8.64,0,-0.11,8.37,0,0,0,0
1,1535923,0,0,-29,-0.03,-0.28,0,0,0,-1,...,0,0,-8.62,0,1.24,-62.94,0,0,0,-9
2,1538501,0,0,-7,0.00,0.33,0,0,0,1,...,0,0,0.00,0,-0.17,-47.11,0,0,0,-100
3,1544214,0,0,-13,-0.02,-0.04,0,0,0,0,...,0,0,-21.75,0,0.01,-17.48,0,0,0,0
4,1545752,0,0,-4,-0.34,0.93,0,0,0,-8,...,0,0,0.00,0,-0.78,-41.91,0,0,0,-100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,130753142,0,0,0,0.00,0.02,0,0,0,-3,...,0,0,0.00,0,0.00,0.00,0,0,0,-100
149996,130753520,0,0,0,0.00,0.13,0,0,0,0,...,0,0,0.00,0,0.00,0.00,0,0,0,0
149997,130753807,0,0,-9,0.00,0.00,0,0,0,9,...,0,0,0.00,0,0.68,-0.97,0,0,0,-100
149998,130754180,0,0,-34,-31.70,0.00,0,0,0,0,...,0,0,-100.00,0,0.87,8.43,0,0,0,0


In [95]:
import pickle

with open("cache/project_step_5_test.pkl", "wb") as f:
    pickle.dump(new_features_df, f)

with open("cache/project_step_5_test.pkl", "rb") as f:
    df_new_loaded = pickle.load(f)

df_new_loaded.shape

(150000, 301)

In [13]:
df = project_step_df.copy()
new_total_features_df = pd.DataFrame()

new_total_features_df["abon_id"] = df["abon_id"]

In [14]:
new_total_features = {}


def calculate_total_sum(dataframe, columns):
    y = df[columns].ffill(axis=1).values.T

    return np.array(np.apply_along_axis(lambda x: x.sum(), 0, y))


for column in columns_groups.keys():
    statistic_columns = columns_groups[column]
    print(f"Processing {column}...")

    new_total_features[f"{column}_total"] = calculate_total_sum(df, statistic_columns)

new_total_features_df = pd.concat([new_total_features_df, pd.DataFrame(new_total_features)], axis=1)

Processing Ama_rchrgmnt_sum_max...
Processing content_clc_mea...
Processing content_cnt_max...
Processing voice_out_short_part_max...
Processing voice_mts_in_nrest_part_std...
Processing num_act_days_max...
Processing sms_roam_clc_min...
Processing voice_in_cmpttrs_avg_durmin...
Processing com_num_part_mea...
Processing pay_avg_mea...
Processing voice_out_tar_dur_std...
Processing voice_out_tar_dur_min...
Processing gprs_clc_mea...
Processing pay_max_min...
Processing voice_in_short_part_mea...
Processing voice_in_roam_clc_std...
Processing conn_out_uniq_cnt_std...
Processing voice_mts_in_dwork_part_td...
Processing data_3g_tar_vol_mea...
Processing voice_in_fix_tar_dur_mea...
Processing voice_in_tar_dur_max...
Processing ks_num_part_td...
Processing data_3g_dou_mea...
Processing voice_mts_out_drest_partmea...
Processing conn_com_part_min...
Processing Ama_rchrgmnt_sum_mea...
Processing voice_mts_in_dwork_part_mea...
Processing voice_mts_in_dwork_part_std...
Processing gprs_clc_std...


In [15]:
new_total_features_df

Unnamed: 0,abon_id,Ama_rchrgmnt_sum_max_total,content_clc_mea_total,content_cnt_max_total,voice_out_short_part_max_total,voice_mts_in_nrest_part_std_total,num_act_days_max_total,sms_roam_clc_min_total,voice_in_cmpttrs_avg_durmin_total,com_num_part_mea_total,...,accum_oth_dur_td_total,data_3g_tv_cnt_td_total,voice_in_td_cnt_min_total,abon_part_td_total,com_num_part_std_total,sms_in_cnt_std_total,pay_p2p_out_sum_td_total,pay_max_td_total,voice_in_short_part_td_total,voice_out_cmpttrs_avg_dumin_total
0,1545052,0,0.0,27.683216,2.960906,0.000000,10.648154,0,33.657543,4.886765,...,-1.0,0.000000,2.960906,0.0,2.174367,3.167671,0.0,-5.320441,0.000000,17.899284
1,1545235,0,0.0,25.001661,2.007266,2.000448,10.648154,0,0.000000,3.009318,...,0.0,2.293848,24.740526,0.0,2.003137,4.419537,0.0,-3.945089,-1.000001,0.000000
2,1549358,0,0.0,20.538234,1.000529,0.000000,10.648154,0,0.000000,3.053266,...,0.0,0.000000,4.402265,0.0,2.017439,4.180687,0.0,-1.514559,0.000000,0.000000
3,1549591,0,0.0,21.836953,2.066482,2.000246,10.648154,0,0.000000,3.015916,...,-1.0,0.000000,16.266213,0.0,2.006835,3.422190,0.0,1.514559,-1.000009,0.000000
4,1558772,0,0.0,15.437150,2.001282,2.001425,10.648154,0,28.858494,3.170624,...,0.0,1.002963,18.696819,0.0,2.007375,2.619909,0.0,0.000000,1.000000,35.798568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,130753784,0,0.0,18.307948,2.020033,2.043722,10.648154,0,0.000000,3.939870,...,1.0,1.091466,10.038198,0.0,2.035632,5.300809,0.0,2.656705,1.000001,19.424822
149996,130753819,0,0.0,28.077456,2.012156,2.092287,10.648154,0,0.000000,3.761001,...,1.0,1.112542,7.800692,0.0,2.235331,3.383737,0.0,3.977385,1.000002,0.000000
149997,130754333,0,0.0,26.451130,2.002676,2.001251,10.648154,0,0.000000,3.240886,...,-1.0,1.676536,15.497517,0.0,2.054923,11.377113,0.0,1.047934,1.000001,0.000000
149998,130754629,0,0.0,39.049953,2.002463,0.000000,10.648154,0,15.464618,3.786021,...,1.0,2.068344,14.144834,0.0,2.006151,9.149862,0.0,3.603785,1.000001,21.074264


In [16]:
import pickle

with open("cache/project_step_5_total_sum_train.pkl", "wb") as f:
    pickle.dump(new_total_features_df, f)

with open("cache/project_step_5_total_sum_train.pkl", "rb") as f:
    df_new_loaded = pickle.load(f)

df_new_loaded.shape

(150000, 301)