In [13]:
import pandas as pd
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

import pickle


In [2]:
# Columns that hold calendar dates; parsing them at load time gives datetime64
# columns instead of plain strings.
date_columns = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']
data = pd.read_csv('cleanse_data_file.csv', parse_dates=date_columns)

# Display the dtype of every column to confirm how the file was parsed.
data.dtypes

Unnamed: 0                           int64
id                                  object
channel_sales                       object
cons_12m                             int64
cons_gas_12m                         int64
cons_last_month                      int64
date_activ                  datetime64[ns]
date_end                    datetime64[ns]
date_modif_prod             datetime64[ns]
date_renewal                datetime64[ns]
forecast_cons_12m                  float64
forecast_cons_year                   int64
forecast_discount_energy           float64
forecast_meter_rent_12m            float64
forecast_price_energy_p1           float64
forecast_price_energy_p2           float64
forecast_price_pow_p1              float64
has_gas                             object
imp_cons                           float64
margin_gross_pow_ele               float64
margin_net_pow_ele                 float64
nb_prod_act                          int64
net_margin                         float64
num_years_a

In [3]:
# Preview five random rows. The seed is fixed so that a fresh
# "Restart Kernel -> Run All" shows the same rows every time.
data.sample(5, random_state=42)

Unnamed: 0.1,Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn,total_power_price,total_energy_price
13530,13530,fe39c6aa6d535b73d6041f838959ac95,foosdfpfkusacimwkcsosbicdxkicaua,279,0,21,2010-02-05,2016-06-15,2014-12-01,2015-06-23,40.51,21,0.0,16.96,0.144902,0.0,44.311378,f,3.08,26.04,26.04,1,3.68,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,10.392,1,481.104602,1.60795
10418,10418,9055ff958a909f0c9be2b3009adc8036,missing_channel_sales,35519,0,0,2009-11-13,2016-11-13,2014-09-18,2015-11-18,5265.71,0,0.0,13.97,0.144631,0.0,44.311378,f,0.0,6.18,6.18,1,498.29,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,0,533.33652,1.772148
5426,5426,ff232ea90c874c735d047a9d778a8be3,foosdfpfkusacimwkcsosbicdxkicaua,42695,0,3230,2009-09-01,2016-09-01,2014-06-26,2015-09-02,4489.25,3230,0.0,129.61,0.115174,0.098837,40.606701,f,327.53,23.99,23.99,1,492.34,6,lxidpiddsbxsbosboudacockeimpuepw,50.0,0,977.49324,3.57357
10346,10346,db6b63ba024c5240deaf5430cf78cec9,missing_channel_sales,57850,0,2263,2009-07-17,2016-12-31,2015-08-14,2016-01-01,0.0,0,0.0,168.94,0.092453,0.086393,59.173468,f,0.0,0.12,0.12,1,0.0,7,ldkssxwpmemidmecebumciepifcamkci,50.0,0,1248.382656,3.146838
447,447,c2fa31883fbf46a4e99f6f28f6af7122,usilxuppasemubllopkaafesmlibmsdf,3769,0,0,2012-10-08,2016-11-12,2014-11-12,2015-11-13,558.82,0,0.0,16.5,0.144902,0.0,44.311378,f,0.0,6.96,6.96,1,62.88,3,lxidpiddsbxsbosboudacockeimpuepw,13.15,0,531.736506,1.777435


In [4]:
# Elapsed time between `date_activ` and `date_end`, expressed in years.
# Dividing by np.timedelta64(1, 'Y') is rejected by pandas 2.x because the 'Y'
# unit has no fixed duration, so divide by an explicit fixed-length year instead.
# 365.2425 days (= 365 d 5 h 49 min 12 s) is the mean Gregorian year, the length
# the old 'Y' conversion used, so the resulting values are unchanged.
MEAN_GREGORIAN_YEAR = pd.Timedelta(days=365, hours=5, minutes=49, seconds=12)
data['active_years'] = (data['date_end'] - data['date_activ']) / MEAN_GREGORIAN_YEAR

In [5]:
# Categorical features = object-dtype columns with fewer than 10 distinct values.
# Narrowing to object columns first means the cardinality check only runs on
# those columns; the selected names and their order are the same.
object_columns = data.select_dtypes(include=[object]).columns
cat_col = [name for name in object_columns if data[name].nunique() < 10]

cat_col

['channel_sales', 'has_gas', 'origin_up']

In [6]:
# Integer-encode each categorical column in place.
# A separate encoder is fitted per column and stored in `label_encoders`, so each
# column's label <-> code mapping (encoder.classes_) remains available for
# decoding or for encoding new data the same way. Re-fitting one shared encoder
# would retain only the mapping of the last column processed.
label_encoders = {}

for cat in cat_col:
    label_encoder = LabelEncoder()
    data[cat] = label_encoder.fit_transform(data[cat].values)
    label_encoders[cat] = label_encoder

In [7]:
# Peek at the first two rows to check the encoded categorical columns and the
# new `active_years` column.
data.head(n=2)

Unnamed: 0.1,Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn,total_power_price,total_energy_price,active_years
0,0,d29c2c54acc38ff3c0614d0a653813dd,4,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,0,0.0,16.27,0.145711,0.0,44.311378,0,0.0,16.38,16.38,1,18.89,6,1,13.8,0,487.469573,1.649274,7.025469
1,1,764c75f661154dac3a6c254cd082ea7d,2,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,0,0.0,38.72,0.165794,0.087899,44.311378,0,0.0,28.6,28.6,1,6.6,6,1,13.856,0,532.625404,3.107195,6.001492


In [8]:
# Columns excluded from the feature matrix: the `Unnamed: 0` column (presumably a
# leftover CSV index — confirm), the `id` column, the raw date columns, and
# `churn`, which is used as the target.
non_feature_columns = [
    'Unnamed: 0', 'id',
    'date_activ', 'date_end', 'date_modif_prod', 'date_renewal',
    'churn',
]
X = data.drop(columns=non_feature_columns)

In [9]:
# Target vector: the `churn` column of the same frame the features are taken from.
target_column = 'churn'
y = data[target_column]

In [10]:
# Show the first five feature rows to confirm which columns remain.
X.head(n=5)

Unnamed: 0,channel_sales,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,total_power_price,total_energy_price,active_years
0,4,4660,0,0,189.95,0,0.0,16.27,0.145711,0.0,44.311378,0,0.0,16.38,16.38,1,18.89,6,1,13.8,487.469573,1.649274,7.025469
1,2,544,0,0,47.96,0,0.0,38.72,0.165794,0.087899,44.311378,0,0.0,28.6,28.6,1,6.6,6,1,13.856,532.625404,3.107195,6.001492
2,3,1584,0,0,240.04,0,0.0,19.83,0.146694,0.0,44.311378,0,0.0,30.22,30.22,1,25.46,6,1,13.2,532.803183,1.814521,6.001492
3,2,121335,0,12400,10865.02,12400,0.0,170.74,0.110083,0.093746,40.606701,0,1052.37,-3.18,-3.18,1,823.18,6,3,75.0,976.189915,3.428996,6.001492
4,4,4425,0,526,445.75,526,0.0,131.73,0.1169,0.100015,40.606701,0,52.32,44.91,44.91,1,47.98,6,1,19.8,976.515746,3.608133,6.146601


In [11]:
# Rescale every feature column with min-max scaling (MinMaxScaler's default
# target range is [0, 1]). After this cell `X` is whatever the scaler returns
# (by default a NumPy array), no longer the DataFrame built above.
# NOTE(review): the scaler is fitted on the full dataset, before any train/test
# split — confirm that is acceptable, or fit on the training portion only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [16]:
# Persist the scaled features together with the target as a single (X, y) tuple.
# Note that the fitted `scaler` itself is not part of the saved payload.
payload = (X, y)
with open('data_pick.pkl', 'wb') as out_file:
    pickle.dump(payload, out_file)

In [12]:
# TODO: the train/test split below is disabled, so this notebook never splits the
# data; X and y are pickled whole by the cell that writes data_pick.pkl.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)