In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Specify Features and Data Types

In [2]:
# Columns that receive an explicit dtype, listed in the order of the original mapping.
_typed_columns = [
    'md5_cust_party_key', 'province_cd', 'z_age', 'Gender_CD',
    'z_census_household_1p_pct', 'z_census_education_high_pct',
    'z_census_purchase_household', 'z_census_purchase_capita',
    'z_census_household_cnt', 'prod_monodual_cd', 'multiplay_cnt',
    'z_line_cnt', 'z_sim_cnt', 'fixed_prod_cat1_ind', 'tenure_fixed_month',
    'tenure_mobile_month', 'z_line_voice_cat1_cnt', 'fixed_data_cat1_ind',
    'fixed_data_cat2_ind', 'z_fixed_prod_cat2_cnt', 'z_fixed_prod_cat1_cnt',
    'z_fixed_data_cat3_cnt', 'fixed_prod_cat3_cnt', 'device_smartphone_cnt',
    'z_mobile_voice_cat1_cnt', 'z_mobile_data_cat1_cnt', 'mobile_data_cat2_cnt',
    'z_mobile_voice_cat3_cnt', 'z_mobile_data_cat3_cnt', 'z_usg_fv_3m_avg',
    'z_usg_fd_mb_1m_sum', 'z_usg_fd_mb_3m_avg', 'z_usg_mv_ib_a_3m_avg',
    'z_usg_md_sms_ib_a_3m_avg', 'z_usg_md_ib_mb_3m_avg', 'payment_method_cash_cnt',
    'customer_value_cd', 'z_rev_1m_sum', 'z_device_netcube_cnt',
    'z_tariff_netcube_cnt', 'z_min_Prog_Max_BB_Down', 'z_line_Fib2h_CNT',
    'z_min_Speed_Product_KBit', 'z_Max_Speed_Missing_KBit',
    'z_Min_Speed_Reserve_KBit', 'z_Max_DSL_OOS_PCT', 'z_PR_Relocation_CNT',
    'z_PR_Relocation_Days', 'z_PR_ActivationSupportOpt_CNT',
    'z_PR_ActivationSupportOpt_Days', 'z_PR_DeactivationThreat_CNT',
    'z_PR_DeactivationSupport_CNT', 'z_PR_DeactivationProdOpt_CNT',
    'z_PR_DeactivationProdOpt_Days', 'z_PR_OtherWOTopic_CNT',
    'z_PR_OtherWOTopic_Days', 'z_PR_AddressChange_CNT',
    'z_PR_AddressChange_Days', 'z_PR_ServiceDisruption_CNT',
    'z_PR_ServiceDisruption_Days', 'z_PR_BasketSupport_CNT',
    'z_PR_BasketSupport_Days', 'z_PR_SellingSalesSupport_CNT',
    'z_PR_SellingSalesSupport_Days', 'z_PR_DigitalUsage_CNT',
    'z_PR_DigitalUsage_Days', 'z_TNPS_Last_Days', 'z_TNPS_Score_Avg',
    'target_ind',
]

# These columns are kept as strings ('object'); every other listed column is 'float64'.
_string_columns = {
    'md5_cust_party_key', 'province_cd', 'Gender_CD', 'prod_monodual_cd', 'customer_value_cd',
}

# Mapping of column name -> dtype string, in the same key order as _typed_columns.
column_types = {
    name: ('object' if name in _string_columns else 'float64')
    for name in _typed_columns
}

### Parse Dates, Specify Special NA Format, and Load Training Data

In [3]:
def parser(date):
    """Parse a 'YYYYMM' string (e.g. '201810') into a datetime.

    No day is present in the format, so strptime yields the first day of the month.
    """
    return datetime.strptime(date, '%Y%m')

In [4]:
# Load the history extract; the literal '*******' is treated as a missing value.
# The period column is read as text and converted afterwards with a vectorised
# pd.to_datetime(format='%Y%m'). read_csv's `date_parser=` keyword is deprecated
# since pandas 2.0, and with a scalar strptime callback it falls back to one
# Python call per row, which is slow on a file of this size.
training_data = pd.read_csv(
    'history.csv',
    dtype={**column_types, 'report_period_m_cd': 'object'},
    na_values=['*******'],
)
training_data['report_period_m_cd'] = pd.to_datetime(
    training_data['report_period_m_cd'], format='%Y%m'
)

### Preliminary Exploration

In [7]:
# Raise the pandas display limits, then preview the first 50 rows.
for option_name, limit in (('display.max_rows', 100), ('display.max_columns', 500)):
    pd.set_option(option_name, limit)
training_data.head(n=50)

Unnamed: 0,report_period_m_cd,md5_cust_party_key,province_cd,z_age,Gender_CD,z_census_household_1p_pct,z_census_education_high_pct,z_census_purchase_household,z_census_purchase_capita,z_census_household_cnt,prod_monodual_cd,multiplay_cnt,z_line_cnt,z_sim_cnt,fixed_prod_cat1_ind,tenure_fixed_month,tenure_mobile_month,z_line_voice_cat1_cnt,fixed_data_cat1_ind,fixed_data_cat2_ind,z_fixed_prod_cat2_cnt,z_fixed_prod_cat1_cnt,z_fixed_data_cat3_cnt,fixed_prod_cat3_cnt,device_smartphone_cnt,z_mobile_voice_cat1_cnt,z_mobile_data_cat1_cnt,mobile_data_cat2_cnt,z_mobile_voice_cat3_cnt,z_mobile_data_cat3_cnt,z_usg_fv_3m_avg,z_usg_fd_mb_1m_sum,z_usg_fd_mb_3m_avg,z_usg_mv_ib_a_3m_avg,z_usg_md_sms_ib_a_3m_avg,z_usg_md_ib_mb_3m_avg,payment_method_cash_cnt,customer_value_cd,z_rev_1m_sum,z_device_netcube_cnt,z_tariff_netcube_cnt,z_min_Prog_Max_BB_Down,z_line_Fib2h_CNT,z_min_Speed_Product_KBit,z_Max_Speed_Missing_KBit,z_Min_Speed_Reserve_KBit,z_Max_DSL_OOS_PCT,z_PR_Relocation_CNT,z_PR_Relocation_Days,z_PR_ActivationSupportOpt_CNT,z_PR_ActivationSupportOpt_Days,z_PR_DeactivationThreat_CNT,z_PR_DeactivationSupport_CNT,z_PR_DeactivationProdOpt_CNT,z_PR_DeactivationProdOpt_Days,z_PR_OtherWOTopic_CNT,z_PR_OtherWOTopic_Days,z_PR_AddressChange_CNT,z_PR_AddressChange_Days,z_PR_ServiceDisruption_CNT,z_PR_ServiceDisruption_Days,z_PR_BasketSupport_CNT,z_PR_BasketSupport_Days,z_PR_SellingSalesSupport_CNT,z_PR_SellingSalesSupport_Days,z_PR_DigitalUsage_CNT,z_PR_DigitalUsage_Days,z_TNPS_Last_Days,z_TNPS_Score_Avg,target_ind
0,2018-10-01,3E04B4AE041ADB76C059,D,1.4077,F,0.4236,0.4746,0.4081,-0.2102,-0.3446,F,2.0,-0.1555,-0.6735,0.0,213.0,0.0,-0.0658,0.0,1.0,0.8654,-0.159,-0.2875,0.0,0.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.055,-0.5597,-0.5818,-0.3814,-0.088,-0.1015,0.0,MEDIUM,0.0148,-0.0912,-0.0799,0.4407,-0.1374,0.032,-0.2854,0.3807,3.6245,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.1672,0.9908,-0.3377,-0.3868,0.0
1,2019-06-01,E887D9E85019AAD32315,G,-0.6065,M,0.2456,-0.0154,-0.3068,0.1433,-0.2343,D,3.0,-0.1555,1.5909,0.0,96.0,44.0,-0.0658,0.0,1.0,0.8654,-0.159,-0.2875,0.0,2.0,1.8029,-0.155,0.0,-0.1979,-0.0628,-0.301,-0.2809,-0.2629,0.621,0.0487,0.5252,0.0,MEDIUM,0.1216,-0.0912,-0.0799,0.6817,-0.1374,-0.1629,-0.2854,0.6343,0.2923,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,0.3856,-0.4229,4.7913,2.843,0.0
2,2019-06-01,08B9FC0BC79ABF8531D5,I,-1.4248,F,-0.2529,-0.3296,1.2839,0.0081,-0.4742,D,3.0,-0.1555,0.4587,1.0,66.0,144.0,-0.0658,0.0,1.0,-1.0954,-0.159,3.407,0.0,1.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.301,-0.4455,-0.4773,-0.1892,0.1854,-0.0592,0.0,HIGH,0.1196,-0.0912,-0.0799,-0.2592,-0.1374,0.032,-0.2854,-0.2692,-0.2411,-0.0981,-0.0973,1.9183,5.7945,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,2.2396,1.3249,1.7952,-0.3876,-0.3377,-0.3868,0.0
3,2018-08-01,628CA396332EC05545A5,C,0.9671,F,0.4045,1.3985,1.1191,0.6785,-0.3727,F,1.0,-0.1555,-0.6735,0.0,53.0,0.0,-0.0658,0.0,1.0,-1.0954,-0.159,-0.2875,0.0,0.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.301,-0.2531,-0.2006,-0.3814,-0.088,-0.1015,0.0,MEDIUM,-0.2217,-0.0912,-0.0799,-0.1016,-0.1374,0.032,-0.2854,-0.1229,-0.3743,-0.0981,-0.0973,2.9949,1.8522,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.181,-0.5878,1.2053,2.843,0.0
4,2018-11-01,82FFB9F7CAC1786E4835,B,1.156,M,-0.2969,0.1716,1.4036,0.5833,-0.5379,D,3.0,-0.1555,1.5909,0.0,358.0,111.0,-0.0658,0.0,1.0,0.8654,-0.159,-0.2875,0.0,1.0,3.9081,-0.155,0.0,-0.1979,-0.0628,-0.2846,-0.5475,-0.5653,0.067,-0.0507,-0.1015,0.0,HIGH,0.228,-0.0912,-0.0799,-0.3058,-0.1374,-0.1629,-0.2854,-0.2825,-0.3743,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.181,-0.5878,-0.3377,-0.3868,0.0
5,2018-07-01,D4BE640D33B941D54B9D,H,-0.103,F,-0.5965,-0.3408,0.1875,-1.3955,-0.5438,F,2.0,-0.1555,-0.6735,0.0,87.0,0.0,-0.0658,0.0,1.0,0.8654,-0.159,-0.2875,0.0,0.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.301,0.3399,0.0844,-0.3814,-0.088,-0.1015,0.0,MEDIUM,-0.1027,-0.0912,-0.0799,0.7627,-0.1374,0.5194,-0.2854,0.6049,-0.3743,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.0566,1.8154,-0.3377,-0.3868,0.0
6,2019-05-01,3CAA1D0802A2BAC0563A,D,-1.7396,M,0.4447,-0.5821,-0.0473,-0.2246,-0.5714,F,2.0,-0.1555,-0.6735,1.0,2.0,0.0,-0.0658,0.0,1.0,-1.0954,-0.159,3.407,0.0,0.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.301,0.645,0.1744,-0.3814,-0.088,-0.1015,1.0,NEW,-0.1637,-0.0912,-0.0799,-0.4369,-0.1374,1.0067,-0.2854,-0.3636,-0.3743,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.181,-0.5878,0.8196,1.5511,0.0
7,2018-07-01,0FEB43C0730E5B1325F7,E,-0.6065,M,-0.6194,-0.2118,0.202,-0.9188,-0.3795,F,2.0,-0.1555,-0.6735,0.0,114.0,0.0,-0.0658,0.0,0.0,0.8654,-0.159,-0.2875,0.0,0.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.1206,1.9267,1.7961,-0.3814,-0.088,-0.1015,0.0,MEDIUM,-0.2078,-0.0912,-0.0799,-0.43,-0.1374,-0.5528,-0.2854,-0.338,0.3995,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.1672,1.0261,-0.3377,-0.3868,0.0
8,2018-12-01,B38806E4E854CABD20E3,D,0.2747,M,-0.0472,-0.6494,-0.1551,-0.3487,-0.4907,D,3.0,-0.1555,0.4587,0.0,281.0,127.0,-0.0658,0.0,0.0,0.8654,-0.159,-0.2875,0.0,0.0,1.8029,-0.155,0.0,-0.1979,-0.0628,-0.2081,-0.4143,-0.4681,-0.3776,-0.088,-0.1015,0.0,MEDIUM,-0.2475,-0.0912,-0.0799,-0.4254,-0.1374,-0.5528,-0.2854,-0.3338,-0.3743,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,-0.3854,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.181,-0.5878,-0.3377,-0.3868,0.0
9,2018-03-01,3608C7D45BA2607D6156,A,-1.5507,M,0.6227,0.8487,0.9808,1.5844,0.9136,F,2.0,-0.1555,-0.6735,1.0,13.0,0.0,-0.0658,0.0,0.0,-1.0954,-0.159,-0.2875,0.0,0.0,-0.3024,-0.155,0.0,-0.1979,-0.0628,-0.301,0.7579,0.2974,-0.3814,-0.088,-0.1015,0.0,LOW,-3.5799,-0.0912,-0.0799,-0.4777,-0.1374,-0.5528,0.0317,-0.3636,-0.3743,-0.0981,-0.0973,-0.235,-0.2652,-0.0429,-0.0738,-0.198,-0.204,-0.1146,-0.1216,-0.0909,-0.1009,1.7141,-0.4208,-0.0926,-0.1012,-0.2405,-0.26,-0.181,-0.5878,-0.3377,-0.3868,0.0


In [None]:
# Larger preview: first 100 rows (matches the display.max_rows limit of 100 set earlier).
training_data.head(n=100)

In [6]:
# Row count of the raw frame, kept for the before/after-cleaning comparison.
n_rows = training_data.shape[0]

In [7]:
# Per-column count of missing values; the Series is the cell's displayed result.
print('NA Count by Feature')
na_counts = training_data.isna().sum()
na_counts

NA Count by Feature


report_period_m_cd                    0
md5_cust_party_key                    0
province_cd                        2321
z_age                             76035
Gender_CD                         67982
z_census_household_1p_pct             0
z_census_education_high_pct           0
z_census_purchase_household           0
z_census_purchase_capita          15896
z_census_household_cnt                0
prod_monodual_cd                      0
multiplay_cnt                         0
z_line_cnt                         4358
z_sim_cnt                          1196
fixed_prod_cat1_ind                   0
tenure_fixed_month                    0
tenure_mobile_month                   0
z_line_voice_cat1_cnt             29195
fixed_data_cat1_ind                   0
fixed_data_cat2_ind                   0
z_fixed_prod_cat2_cnt                59
z_fixed_prod_cat1_cnt               612
z_fixed_data_cat3_cnt               161
fixed_prod_cat3_cnt                   0
device_smartphone_cnt               278


### Data Cleaning: Delete All Rows with Any Missing Data

In [8]:
# Keep only rows with no missing value in any column (dropna defaults, spelled out).
no_nan = training_data.dropna(axis=0, how='any')

In [9]:
# Number of rows that remain in `no_nan` after the NA rows were removed.
n_rows_no_na = no_nan.shape[0]
print(f'Number of Rows with NAs Dropped: {n_rows_no_na!s}')

Number of Rows with NAs Dropped: 5900486


In [10]:
# Before/after summary of the NA drop.
# NOTE: the "Percentage" lines print plain ratios (count / count), not values scaled by 100.
retained_ratio = n_rows_no_na/n_rows
class1_ratio_original = training_data['target_ind'].sum()/n_rows
class1_ratio_cleaned = no_nan['target_ind'].sum()/n_rows_no_na

print(f'Number of Rows (Original): {n_rows!s}')
print(f'Number of Rows (Cleaned): {n_rows_no_na!s}')
print(f'Percentage of Retained Data After Cleaning: {retained_ratio!s}')
print(f'Percentage of Class 1 (Original): {class1_ratio_original!s}')
print(f'Percentage of Class 1 (Cleaned): {class1_ratio_cleaned!s}')
print(f'Training Data Shape (Original):  {training_data.shape!s}')
print(f'Training Data Shape (Cleaned):  {no_nan.shape!s}')

Number of Rows (Original): 6421184
Number of Rows (Cleaned): 5900486
Percentage of Retained Data After Cleaning: 0.9189093475595778
Percentage of Class 1 (Original): 0.018711190957929256
Percentage of Class 1 (Cleaned): 0.018260021293161274
Training Data Shape (Original):  (6421184, 70)
Training Data Shape (Cleaned):  (5900486, 70)


### Drop Uninformative Features

In [11]:
# Remove the reporting-period and customer-key columns from the cleaned frame.
# Re-running this cell on the already-reduced frame raises KeyError.
no_nan = no_nan.drop(columns=['report_period_m_cd', 'md5_cust_party_key'])

In [12]:
# Show the column labels currently present in the cleaned frame.
no_nan.columns

Index(['province_cd', 'z_age', 'Gender_CD', 'z_census_household_1p_pct',
       'z_census_education_high_pct', 'z_census_purchase_household',
       'z_census_purchase_capita', 'z_census_household_cnt',
       'prod_monodual_cd', 'multiplay_cnt', 'z_line_cnt', 'z_sim_cnt',
       'fixed_prod_cat1_ind', 'tenure_fixed_month', 'tenure_mobile_month',
       'z_line_voice_cat1_cnt', 'fixed_data_cat1_ind', 'fixed_data_cat2_ind',
       'z_fixed_prod_cat2_cnt', 'z_fixed_prod_cat1_cnt',
       'z_fixed_data_cat3_cnt', 'fixed_prod_cat3_cnt', 'device_smartphone_cnt',
       'z_mobile_voice_cat1_cnt', 'z_mobile_data_cat1_cnt',
       'mobile_data_cat2_cnt', 'z_mobile_voice_cat3_cnt',
       'z_mobile_data_cat3_cnt', 'z_usg_fv_3m_avg', 'z_usg_fd_mb_1m_sum',
       'z_usg_fd_mb_3m_avg', 'z_usg_mv_ib_a_3m_avg',
       'z_usg_md_sms_ib_a_3m_avg', 'z_usg_md_ib_mb_3m_avg',
       'payment_method_cash_cnt', 'customer_value_cd', 'z_rev_1m_sum',
       'z_device_netcube_cnt', 'z_tariff_netcube_cnt',
     

In [13]:
# Allow up to 500 columns in rendered frames (attribute form of the same pandas option).
pd.options.display.max_columns = 500

### One-Hot Encoding of Categorical Features

In [14]:
# Categorical columns to one-hot encode; each key maps to the dtype string 'object'.
one_hot_columns = dict.fromkeys(['province_cd', 'Gender_CD', 'prod_monodual_cd'], 'object')

In [15]:
# Replace each listed categorical column with float 0/1 indicator columns
# named '<column>_<value>' (pandas' default prefixing).
no_nan = pd.get_dummies(
    no_nan,
    columns=list(one_hot_columns),
    dtype=float,
)

#### Drop Gender_CD_M (Complement of Gender_CD_F) and prod_monodual_cd_F (Complement of prod_monodual_cd_D)

In [16]:
# Drop one indicator from each of the two pairs named in the heading above.
# NOTE(review): all province_cd_* indicators are kept — confirm that is intended.
no_nan = no_nan.drop(columns=['Gender_CD_M', 'prod_monodual_cd_F'])

### Label Encoding of customer_value_cd Column

In [17]:
from pandas.api.types import CategoricalDtype

# Ordinal encoding of customer_value_cd: the position of each label in `categories`
# is its integer code (NEW=0, LOW=1, MEDIUM=2, HIGH=3, HIGHEST=4).
cat_type = CategoricalDtype(categories=["NEW", "LOW", "MEDIUM", "HIGH", "HIGHEST"], ordered=True)
customer_value = no_nan['customer_value_cd'].astype(cat_type)

# A label outside `categories` becomes NaN on the cast and `.cat.codes` would then
# encode it as -1. Raise instead of letting a bogus rank slip into the features.
unmapped = customer_value.isna() & no_nan['customer_value_cd'].notna()
if unmapped.any():
    unexpected = no_nan.loc[unmapped, 'customer_value_cd'].unique().tolist()
    raise ValueError(f'Unexpected customer_value_cd labels: {unexpected}')

# Append the integer codes as a new column, then remove the original string column.
no_nan['customer_value_cd_cat'] = customer_value.cat.codes
no_nan = no_nan.drop(['customer_value_cd'], axis=1)

In [18]:
# Show the column labels of the frame after the encoding steps.
no_nan.columns

Index(['z_age', 'z_census_household_1p_pct', 'z_census_education_high_pct',
       'z_census_purchase_household', 'z_census_purchase_capita',
       'z_census_household_cnt', 'multiplay_cnt', 'z_line_cnt', 'z_sim_cnt',
       'fixed_prod_cat1_ind', 'tenure_fixed_month', 'tenure_mobile_month',
       'z_line_voice_cat1_cnt', 'fixed_data_cat1_ind', 'fixed_data_cat2_ind',
       'z_fixed_prod_cat2_cnt', 'z_fixed_prod_cat1_cnt',
       'z_fixed_data_cat3_cnt', 'fixed_prod_cat3_cnt', 'device_smartphone_cnt',
       'z_mobile_voice_cat1_cnt', 'z_mobile_data_cat1_cnt',
       'mobile_data_cat2_cnt', 'z_mobile_voice_cat3_cnt',
       'z_mobile_data_cat3_cnt', 'z_usg_fv_3m_avg', 'z_usg_fd_mb_1m_sum',
       'z_usg_fd_mb_3m_avg', 'z_usg_mv_ib_a_3m_avg',
       'z_usg_md_sms_ib_a_3m_avg', 'z_usg_md_ib_mb_3m_avg',
       'payment_method_cash_cnt', 'z_rev_1m_sum', 'z_device_netcube_cnt',
       'z_tariff_netcube_cnt', 'z_min_Prog_Max_BB_Down', 'z_line_Fib2h_CNT',
       'z_min_Speed_Product_KBit', 

In [19]:
# Print a label, then display (rows, columns) of the final frame as the cell result.
# The column count includes the target_ind label column.
print('Final Number of Features + Class: ')
no_nan.shape

Final Number of Features + Class: 


(5900486, 76)

### Baseline No. 1 (Always Predict Class 0)

In [20]:
# Accuracy of a constant "always predict 0" classifier: 1 - sum(target_ind) / count(target_ind).
# Assumes target_ind holds only 0/1 labels — TODO confirm.
target_sum = no_nan['target_ind'].sum()
target_count = no_nan['target_ind'].count()
print('Classification Accuracy = ' + str(1 - target_sum/target_count))

Classification Accuracy = 0.9817399787068387


### Save Final Training CSV

In [21]:
# Export already done; uncomment to regenerate training_data_final.csv.
# no_nan.to_csv('training_data_final.csv', index=False)

### Initial Analysis of Class-Conditional Feature Means and Standard Deviations

In [30]:
# Mean of every column within each target_ind group (one row per class value).
rows_by_class = no_nan.groupby(["target_ind"])
cond_mean = rows_by_class.mean()
cond_mean.head(n=5)

Unnamed: 0_level_0,z_age,z_census_household_1p_pct,z_census_education_high_pct,z_census_purchase_household,z_census_purchase_capita,z_census_household_cnt,multiplay_cnt,z_line_cnt,z_sim_cnt,fixed_prod_cat1_ind,tenure_fixed_month,tenure_mobile_month,z_line_voice_cat1_cnt,fixed_data_cat1_ind,fixed_data_cat2_ind,z_fixed_prod_cat2_cnt,z_fixed_prod_cat1_cnt,z_fixed_data_cat3_cnt,fixed_prod_cat3_cnt,device_smartphone_cnt,z_mobile_voice_cat1_cnt,z_mobile_data_cat1_cnt,mobile_data_cat2_cnt,z_mobile_voice_cat3_cnt,z_mobile_data_cat3_cnt,z_usg_fv_3m_avg,z_usg_fd_mb_1m_sum,z_usg_fd_mb_3m_avg,z_usg_mv_ib_a_3m_avg,z_usg_md_sms_ib_a_3m_avg,z_usg_md_ib_mb_3m_avg,payment_method_cash_cnt,z_rev_1m_sum,z_device_netcube_cnt,z_tariff_netcube_cnt,z_min_Prog_Max_BB_Down,z_line_Fib2h_CNT,z_min_Speed_Product_KBit,z_Max_Speed_Missing_KBit,z_Min_Speed_Reserve_KBit,z_Max_DSL_OOS_PCT,z_PR_Relocation_CNT,z_PR_Relocation_Days,z_PR_ActivationSupportOpt_CNT,z_PR_ActivationSupportOpt_Days,z_PR_DeactivationThreat_CNT,z_PR_DeactivationSupport_CNT,z_PR_DeactivationProdOpt_CNT,z_PR_DeactivationProdOpt_Days,z_PR_OtherWOTopic_CNT,z_PR_OtherWOTopic_Days,z_PR_AddressChange_CNT,z_PR_AddressChange_Days,z_PR_ServiceDisruption_CNT,z_PR_ServiceDisruption_Days,z_PR_BasketSupport_CNT,z_PR_BasketSupport_Days,z_PR_SellingSalesSupport_CNT,z_PR_SellingSalesSupport_Days,z_PR_DigitalUsage_CNT,z_PR_DigitalUsage_Days,z_TNPS_Last_Days,z_TNPS_Score_Avg,province_cd_A,province_cd_B,province_cd_C,province_cd_D,province_cd_E,province_cd_F,province_cd_G,province_cd_H,province_cd_I,Gender_CD_F,prod_monodual_cd_D,customer_value_cd_cat
target_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1
0.0,-0.147939,0.020901,0.007708,-0.025858,-0.014186,0.023898,2.490101,-0.016966,0.19274,0.271932,163.746872,57.598861,-0.0658,0.000343,0.634444,-0.050648,-0.034697,0.027289,0.001578,0.498288,0.088164,0.007954,0.012176,-0.025316,-0.0628,-0.083949,0.062512,0.070255,0.10897,0.024987,-0.002133,0.143391,0.041187,0.009442,0.0057,0.012285,-0.008385,0.108414,-0.005165,0.012701,-0.067431,-0.057468,-0.062142,0.144324,0.16365,-0.0429,-0.0738,0.004859,0.01845,-0.018885,-0.026642,-0.037488,-0.048901,0.085336,0.076453,-0.013934,-0.023529,0.064593,0.057612,0.097201,0.096284,0.151423,0.200889,0.160954,0.236497,0.032104,0.153609,0.164004,0.078414,0.046761,0.08566,0.041997,0.341587,0.532833,2.388085
1.0,-0.620345,0.223482,0.128339,-0.088736,0.060298,0.219519,2.299156,0.130513,0.050833,0.350742,108.254643,43.484635,-0.0658,0.000575,0.678578,-0.392832,-0.05826,0.125282,0.002859,0.448521,-0.09604,0.024735,0.008353,-0.07987,-0.0628,-0.226683,0.080258,0.108301,0.086507,0.032551,0.06417,0.220395,0.084639,0.113201,0.079719,0.080401,0.03561,0.136324,-0.014377,0.079458,0.285785,-0.023455,-0.047204,0.231468,0.267264,-0.0429,-0.0738,0.086354,0.063959,-0.019837,-0.02767,-0.013553,-0.030614,0.046823,0.047905,-0.021877,-0.02896,0.084979,0.072538,0.112732,0.064791,0.128771,0.12242,0.209916,0.183641,0.030072,0.145021,0.16028,0.078966,0.056134,0.095245,0.040727,0.387283,0.468003,2.49245


In [32]:
# Standard deviation of every column within each target_ind group (one row per class value).
rows_by_class = no_nan.groupby(["target_ind"])
cond_std = rows_by_class.std()
cond_std.head(n=5)

Unnamed: 0_level_0,z_age,z_census_household_1p_pct,z_census_education_high_pct,z_census_purchase_household,z_census_purchase_capita,z_census_household_cnt,multiplay_cnt,z_line_cnt,z_sim_cnt,fixed_prod_cat1_ind,tenure_fixed_month,tenure_mobile_month,z_line_voice_cat1_cnt,fixed_data_cat1_ind,fixed_data_cat2_ind,z_fixed_prod_cat2_cnt,z_fixed_prod_cat1_cnt,z_fixed_data_cat3_cnt,fixed_prod_cat3_cnt,device_smartphone_cnt,z_mobile_voice_cat1_cnt,z_mobile_data_cat1_cnt,mobile_data_cat2_cnt,z_mobile_voice_cat3_cnt,z_mobile_data_cat3_cnt,z_usg_fv_3m_avg,z_usg_fd_mb_1m_sum,z_usg_fd_mb_3m_avg,z_usg_mv_ib_a_3m_avg,z_usg_md_sms_ib_a_3m_avg,z_usg_md_ib_mb_3m_avg,payment_method_cash_cnt,z_rev_1m_sum,z_device_netcube_cnt,z_tariff_netcube_cnt,z_min_Prog_Max_BB_Down,z_line_Fib2h_CNT,z_min_Speed_Product_KBit,z_Max_Speed_Missing_KBit,z_Min_Speed_Reserve_KBit,z_Max_DSL_OOS_PCT,z_PR_Relocation_CNT,z_PR_Relocation_Days,z_PR_ActivationSupportOpt_CNT,z_PR_ActivationSupportOpt_Days,z_PR_DeactivationThreat_CNT,z_PR_DeactivationSupport_CNT,z_PR_DeactivationProdOpt_CNT,z_PR_DeactivationProdOpt_Days,z_PR_OtherWOTopic_CNT,z_PR_OtherWOTopic_Days,z_PR_AddressChange_CNT,z_PR_AddressChange_Days,z_PR_ServiceDisruption_CNT,z_PR_ServiceDisruption_Days,z_PR_BasketSupport_CNT,z_PR_BasketSupport_Days,z_PR_SellingSalesSupport_CNT,z_PR_SellingSalesSupport_Days,z_PR_DigitalUsage_CNT,z_PR_DigitalUsage_Days,z_TNPS_Last_Days,z_TNPS_Score_Avg,province_cd_A,province_cd_B,province_cd_C,province_cd_D,province_cd_E,province_cd_F,province_cd_G,province_cd_H,province_cd_I,Gender_CD_F,prod_monodual_cd_D,customer_value_cd_cat
target_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1
0.0,0.951594,0.987375,0.970665,0.921186,0.828255,1.020247,0.818728,0.821288,1.029139,0.444955,122.8442,75.254035,0.0,0.018522,0.481586,1.008598,0.883211,1.036006,0.039925,0.687774,1.159182,0.834284,0.112802,0.882906,0.0,0.69779,0.947489,0.964409,1.001383,0.445115,0.408217,0.483167,0.82031,0.967809,0.738849,1.043736,0.963682,0.97817,0.92085,1.07174,0.876821,0.476645,0.48141,1.053938,1.228081,0.0,0.0,0.895208,1.037167,0.713462,0.78896,0.554569,0.587934,1.046266,1.058052,0.62597,0.717119,1.000937,1.080715,0.819493,1.020905,1.154911,1.182733,0.367489,0.424931,0.176278,0.360574,0.370279,0.268822,0.211126,0.279862,0.200583,0.474242,0.498921,0.933773
1.0,0.939256,1.009463,1.005686,0.928775,0.816551,1.156927,0.859199,1.175577,0.955407,0.477205,101.012568,64.282159,0.0,0.023982,0.467025,0.986447,0.796602,1.180429,0.05339,0.655795,0.864023,0.871698,0.093826,0.732406,0.0,0.419804,0.979581,0.996792,1.019556,0.492071,0.632468,0.607803,0.883449,1.371538,1.00306,1.227848,1.112554,0.954711,0.927682,1.234605,1.358013,0.644076,0.552069,1.195546,1.374869,0.0,0.0,1.054639,1.104099,0.7117,0.786937,0.676792,0.68214,1.023671,1.046624,0.594002,0.700106,1.037437,1.109799,0.817657,1.010254,1.149369,1.112725,0.40725,0.387193,0.170785,0.352124,0.366867,0.269686,0.230181,0.293555,0.197657,0.487131,0.498977,0.931234


### Train/Validation Set Split

In [1]:
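# Disabled (commented out): stratified 80/20 train/validation split written to CSV.
# Before re-enabling: `train_test_split` is not imported in the cells shown here
# (it is provided by sklearn.model_selection), and no `random_state` is passed,
# so each run would produce a different shuffle.
# training_data = pd.read_csv('training_data_final.csv', dtype=column_types)
# X = training_data.drop('target_ind', axis=1)
# y = training_data['target_ind']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, stratify=y)
# X_train.to_csv('X_train.csv', index=False)
# X_val.to_csv('X_val.csv', index=False)
# y_train.to_csv('y_train.csv', index=False)
# y_val.to_csv('y_val.csv', index=False)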