### This is some of our feature engineering part. For easier using, we make the final feature to final_train.csv and final_test.csv in the end

In [161]:
import numpy as np
import pandas as pd
from datetime import datetime
from skopt import gp_minimize
import scipy.stats as stats
from lightgbm import LGBMClassifier
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Training Dataset
* To use the training dataset, we need to merge them on 'msno' which is the user id to use the features and labels from our large dataset.

In [3]:
train_1 = pd.read_csv('train.csv')
train_2 = pd.read_csv('train_v2.csv')
train1_notin_2 = train_1[~train_1['msno'].isin(train_2['msno'])]
train = pd.concat((train1_notin_2, train_2), axis=0, ignore_index=True).reset_index(drop=True)
train.head()

Unnamed: 0,msno,is_churn
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1
1,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1
2,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1
3,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1
4,GBy8qSz16X5iYWD+3CMxv/Hm6OPSrXBYtmbnlRtknW0=,1


* We need to add count of transactions and count of logs to train

In [146]:
# count of transactions
transactions = pd.read_csv('transactions.csv', usecols=['msno'])
transactions = pd.concat((transactions, pd.read_csv('transactions_v2.csv', usecols=['msno'])), axis=0)
transactions = pd.DataFrame(transactions['msno'].value_counts().reset_index())
transactions.columns = ['msno','trans_count']
train = pd.merge(train, transactions, how='left', on='msno')
# count of user_logs
user_logs = pd.read_csv('user_logs_v2.csv', usecols=['msno'])
user_logs = pd.DataFrame(user_logs['msno'].value_counts().reset_index())
user_logs.columns = ['msno','logs_count']
train = pd.merge(train, user_logs, how='left', on='msno')

In [148]:
train.drop(['trans_count_y'],axis = 1,inplace = True)

In [151]:
train.columns = ['msno','is_churn','trans_count','logs_count']

## Transactions ( For Training )
* First, we are going to use transactions.csv for training

In [4]:
transactions = pd.read_csv('transactions.csv')

## Add new features

### Create discount, is_discount, amt_per_day features

In [5]:
transactions['discount'] = transactions['plan_list_price'] - transactions['actual_amount_paid']
transactions['is_discount'] = transactions['discount'].apply(lambda x: 1 if x > 0 else 0)
transactions['amt_per_day'] = transactions['actual_amount_paid'] / transactions['payment_plan_days']

In [6]:
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,discount,is_discount,amt_per_day
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,20150930,20151101,0,0,0,4.3
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,20150930,20151031,0,0,0,4.966667
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,20150930,20160427,0,0,0,4.3
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,20150930,20151128,0,0,0,4.966667
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,20150930,20151121,0,0,0,4.966667


### Make the dates from int into datetime format

In [7]:
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
transactions['membership_expire_date'] = pd.to_datetime(transactions['membership_expire_date'])
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,discount,is_discount,amt_per_day
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,2015-09-30,2015-11-01,0,0,0,4.3
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,2015-09-30,2015-10-31,0,0,0,4.966667
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,2015-09-30,2016-04-27,0,0,0,4.3
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,2015-09-30,2015-11-28,0,0,0,4.966667
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,2015-09-30,2015-11-21,0,0,0,4.966667


### Create day in week features

In [11]:
transactions['transaction_day']=transactions['transaction_date'].dt.weekday
transactions['membership_expire_day']=transactions['membership_expire_date'].dt.weekday
transactions = transactions.drop(['transaction_date', 'membership_expire_date'],axis = 1)

In [12]:
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,discount,is_discount,amt_per_day,membership_duration,transaction_day,membership_expire_day
0,YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc=,41,30,129,129,1,0,0,0,4.3,32,2,6
1,AZtu6Wl0gPojrEQYB8Q3vBSmE2wnZ3hi1FbK1rQQ0A4=,41,30,149,149,1,0,0,0,4.966667,31,2,5
2,UkDFI97Qb6+s2LWcijVVv4rMAsORbVDT2wNXF0aVbns=,41,30,129,129,1,0,0,0,4.3,210,2,2
3,M1C56ijxozNaGD0t2h68PnH2xtx5iO5iR2MVYQB6nBI=,39,30,149,149,1,0,0,0,4.966667,59,2,5
4,yvj6zyBUaqdbUQSrKsrZ+xNDVM62knauSZJzakS9OW4=,39,30,149,149,1,0,0,0,4.966667,52,2,5


## Members(For Training)
* Then we are going to use members_v3.csv for more features.

In [17]:
members = pd.read_csv('members_v3.csv')

### Create day in week features

In [18]:
members['registration_init_time'] = pd.to_datetime(members['registration_init_time'], format='%Y%m%d')
members['registration_day']=members['registration_init_time'].dt.weekday
members = members.drop(['registration_init_time','city','bd','gender'],axis = 1)

In [19]:
members.head()

Unnamed: 0,msno,registered_via,registration_day
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,11,6
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,7,2
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,11,3
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,11,3
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,9,3


## Create: Feature = 'Transaction' merge 'Members'
* This step is to merge the transacation and members together, combine them to feature.

In [20]:
feature = pd.merge(transactions, members, on='msno', how='inner')

In [34]:
feature.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18891703 entries, 0 to 18891702
Data columns (total 15 columns):
msno                     object
payment_method_id        int64
payment_plan_days        int64
plan_list_price          int64
actual_amount_paid       int64
is_auto_renew            int64
is_cancel                int64
discount                 int64
is_discount              int64
amt_per_day              float32
membership_duration      int64
transaction_day          int64
membership_expire_day    int64
registered_via           int64
registration_day         int64
dtypes: float32(1), int64(13), object(1)
memory usage: 2.2+ GB


## Create: Auto_renewed and didn't cancel, as well as didn't auto renew but canceled

In [47]:
feature['autorenew_&_not_cancel'] = ((feature.is_auto_renew == 1) == (feature.is_cancel == 0))
feature['notAutorenew_&_cancel'] = ((feature.is_auto_renew == 0) == (feature.is_cancel == 1))

## Memory Reduction

In [48]:
def change_datatype(df):
    int_cols = list(df.loc[:, df.dtypes == np.int64].columns)
    #print(int_cols)
    for col in int_cols:
        if ((np.max(df[col]) <= 127) and(np.min(df[col] >= -128))):
            df[col] = df[col].astype(np.int8)
        elif ((np.max(df[col]) <= 32767) and(np.min(df[col] >= -32768))):
            df[col] = df[col].astype(np.int16)
        elif ((np.max(df[col]) <= 2147483647) and(np.min(df[col] >= -2147483648))):
            df[col] = df[col].astype(np.int32)
        else:
            df[col] = df[col].astype(np.int64)
    float_cols = list(df.loc[:, df.dtypes == np.float64].columns)
    #print(float_cols)
    for col in float_cols:
        df[col] = df[col].astype(np.float32)

In [49]:
change_datatype(feature)

In [50]:
feature.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18891703 entries, 0 to 18891702
Data columns (total 17 columns):
msno                      object
payment_method_id         int8
payment_plan_days         int16
plan_list_price           int16
actual_amount_paid        int16
is_auto_renew             int8
is_cancel                 int8
discount                  int16
is_discount               int8
amt_per_day               float32
membership_duration       int16
transaction_day           int8
membership_expire_day     int8
registered_via            int8
registration_day          int8
autorenew_&_not_cancel    bool
notAutorenew_&_cancel     bool
dtypes: bool(2), float32(1), int16(5), int8(8), object(1)
memory usage: 720.7+ MB


## Groupby and Agg

In [51]:
grouped_feature = feature.groupby('msno')

In [61]:
agg_func = {'payment_method_id':[lambda x: stats.mode(x)[0][0]], # the mode of payment_method_id
            'payment_plan_days':['mean'], # the mean of payment_plan_days
            'plan_list_price':['mean'], # the mean of payment_plan_days
            'actual_amount_paid':['mean'], # the mean of actual_amount_paid
            'is_auto_renew':[lambda x: stats.mode(x)[0][0]], # the mode of is_auto_renew
            'is_cancel':[lambda x: stats.mode(x)[0][0]], # the mode of is_cancel or not
            'discount':['mean'], # the mean of discount
            'is_discount':[lambda x: stats.mode(x)[0][0]], # the mode of is discount or not
            'amt_per_day':['mean'], # the mean of amount paid per day
            'membership_duration':['mean'], # the mean of membership_duration
            'registered_via':[lambda x: stats.mode(x)[0][0]], # the mode of the way of register
            'autorenew_&_not_cancel':[lambda x: stats.mode(x)[0][0]], # the mode of autorenew_&_not_cancel
            'notAutorenew_&_cancel':[lambda x: stats.mode(x)[0][0]]} # the mode of notAutorenew_&_cancel

In [109]:
agged_feature = grouped_feature.agg(agg_func, as_index=False)
agged_feature.head()

Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,discount,is_discount,amt_per_day,membership_duration,registered_via,autorenew_&_not_cancel,notAutorenew_&_cancel
Unnamed: 0_level_1,<lambda>,mean,mean,mean,<lambda>,<lambda>,mean,<lambda>,mean,mean,<lambda>,<lambda>,<lambda>
msno,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,35,7.0,0.0,0.0,0,0,0.0,0,0.0,5.0,9,False,False
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,38,410.0,1788.0,1788.0,0,0,0.0,0,4.360976,410.0,3,False,False
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,41,30.0,99.0,99.0,1,0,0.0,0,3.3,29.75,7,True,True
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,39,28.578947,141.157895,149.0,1,0,-7.842105,0,inf,49.052632,9,True,True
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,41,28.846154,143.269231,149.0,1,0,-5.730769,0,inf,30.384615,7,True,True


In [114]:
save = agged_feature

In [115]:
agged_feature.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1931003 entries, +++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY= to zzzyOgMk9MljCerbCCYrVtvu85aSCiy7yCMjAEgNYMs=
Data columns (total 13 columns):
payment_method_id_mode         int8
payment_plan_days_mean         float32
plan_list_price_mean           float32
actual_amount_paid_mean        float32
is_auto_renew_mean             int8
is_cancel_mode                 int8
discount_mean                  float32
is_discount_mode               int8
amt_per_day_mean               float32
membership_duration_mean       float32
registered_via_mode            int8
autorenew_&_not_cancel_mode    bool
notAutorenew_&_cancel_mode     bool
dtypes: bool(2), float32(6), int8(5)
memory usage: 71.8+ MB


In [116]:
change_datatype(agged_feature)
agged_feature.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1931003 entries, +++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY= to zzzyOgMk9MljCerbCCYrVtvu85aSCiy7yCMjAEgNYMs=
Data columns (total 13 columns):
payment_method_id_mode         int8
payment_plan_days_mean         float32
plan_list_price_mean           float32
actual_amount_paid_mean        float32
is_auto_renew_mean             int8
is_cancel_mode                 int8
discount_mean                  float32
is_discount_mode               int8
amt_per_day_mean               float32
membership_duration_mean       float32
registered_via_mode            int8
autorenew_&_not_cancel_mode    bool
notAutorenew_&_cancel_mode     bool
dtypes: bool(2), float32(6), int8(5)
memory usage: 71.8+ MB


### Rename

In [117]:
agged_feature.columns = ['payment_method_id_mode',
                        'payment_plan_days_mean',
                        'plan_list_price_mean',
                        'actual_amount_paid_mean',
                        'is_auto_renew_mean',
                        'is_cancel_mode',
                        'discount_mean',
                        'is_discount_mode',
                        'amt_per_day_mean',
                        'membership_duration_mean',
                        'registered_via_mode',
                        'autorenew_&_not_cancel_mode',
                        'notAutorenew_&_cancel_mode']

In [118]:
agged_feature.columns

Index(['payment_method_id_mode', 'payment_plan_days_mean',
       'plan_list_price_mean', 'actual_amount_paid_mean', 'is_auto_renew_mean',
       'is_cancel_mode', 'discount_mean', 'is_discount_mode',
       'amt_per_day_mean', 'membership_duration_mean', 'registered_via_mode',
       'autorenew_&_not_cancel_mode', 'notAutorenew_&_cancel_mode'],
      dtype='object')

### Mapping True or False into 1 or 0

In [119]:
foo = {True:1, False:0}
agged_feature['autorenew_&_not_cancel_mode']=agged_feature['autorenew_&_not_cancel_mode'].map(foo)
agged_feature['notAutorenew_&_cancel_mode']=agged_feature['notAutorenew_&_cancel_mode'].map(foo)

In [124]:
agged_feature['payment_method_id_mode'].unique()

array([35, 38, 41, 39, 40, 14, 32, 16, 36, 25, 37, 23, 30, 28, 34, 31, 22,
       29, 33, 20, 27, 21, 17, 13, 12, 19, 18, 24,  5, 26,  8,  4, 15, 10,
        6, 11,  7,  2,  3])

In [125]:
agged_feature['registered_via_mode'].unique()

array([ 9,  3,  7,  4, 13, 10, 16])

### Make the categorical features one-hot

In [136]:
agged_feature_payment_id = pd.get_dummies(agged_feature['payment_method_id_mode'], prefix = 'payment_id')
agged_feature_payment_id.head()

Unnamed: 0_level_0,payment_id_2,payment_id_3,payment_id_4,payment_id_5,payment_id_6,payment_id_7,payment_id_8,payment_id_10,payment_id_11,payment_id_12,...,payment_id_32,payment_id_33,payment_id_34,payment_id_35,payment_id_36,payment_id_37,payment_id_38,payment_id_39,payment_id_40,payment_id_41
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [137]:
agged_feature_reg_via = pd.get_dummies(agged_feature['registered_via_mode'], prefix = 'reg_via')
agged_feature_reg_via.head()

Unnamed: 0_level_0,reg_via_3,reg_via_4,reg_via_7,reg_via_9,reg_via_10,reg_via_13,reg_via_16
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,0,0,1,0,0,0
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,1,0,0,0,0,0,0
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,0,1,0,0,0,0
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,0,0,1,0,0,0
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,0,1,0,0,0,0


In [138]:
agged_feature = pd.concat([agged_feature, agged_feature_payment_id], axis=1)
agged_feature = pd.concat([agged_feature, agged_feature_reg_via], axis=1)

In [143]:
agged_feature.drop( ['payment_method_id_mode','registered_via_mode'], axis = 1 ,inplace = True)

### Train the training data

In [156]:
change_datatype(train)

* Merge train and features

In [158]:
agged_feature.reset_index(inplace =True)
train = pd.merge(train, agged_feature, how='left', on='msno')

In [159]:
train = train.replace([np.inf,-np.inf], np.nan).fillna(0)
train.head()

Unnamed: 0,msno,is_churn,trans_count,logs_count,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,is_auto_renew_mean,is_cancel_mode,discount_mean,...,payment_id_39,payment_id_40,payment_id_41,reg_via_3,reg_via_4,reg_via_7,reg_via_9,reg_via_10,reg_via_13,reg_via_16
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,2,0.0,18.5,74.5,74.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,10,0.0,30.0,149.199997,149.199997,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,3,5.0,410.0,894.0,894.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,9,17.0,98.125,447.0,447.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,GBy8qSz16X5iYWD+3CMxv/Hm6OPSrXBYtmbnlRtknW0=,1,22,0.0,21.818182,108.36364,149.0,1.0,0.0,-40.636364,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
train.columns

#### LGBM and Keras Test ( it's just some test of some models, just to make sure our features are not bad )

In [283]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [270]:
train = train.reindex_axis(sorted(train.columns), axis=1)

  """Entry point for launching an IPython kernel.


In [281]:
X_train, X_test, y_train, y_test = train_test_split(train[cols],train['is_churn'])

In [456]:
lgb_train = lgb.Dataset(X_train, y_train)  
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 

params = {  
    'learning_rate': 0.05,
    'application': 'binary',
    'max_depth': 5,
    'num_leaves': 128,
    'verbosity': -1,
    'metric': 'binary_logloss'
}  

gbm = lgb.train(params,  
                lgb_train,  
                num_boost_round=300,  
                valid_sets=lgb_eval)

[1]	valid_0's binary_logloss: 0.360543
[2]	valid_0's binary_logloss: 0.345162
[3]	valid_0's binary_logloss: 0.332874
[4]	valid_0's binary_logloss: 0.322635
[5]	valid_0's binary_logloss: 0.313258
[6]	valid_0's binary_logloss: 0.305142
[7]	valid_0's binary_logloss: 0.297991
[8]	valid_0's binary_logloss: 0.291592
[9]	valid_0's binary_logloss: 0.285845
[10]	valid_0's binary_logloss: 0.28061
[11]	valid_0's binary_logloss: 0.275932
[12]	valid_0's binary_logloss: 0.271801
[13]	valid_0's binary_logloss: 0.267892
[14]	valid_0's binary_logloss: 0.264348
[15]	valid_0's binary_logloss: 0.261024
[16]	valid_0's binary_logloss: 0.258118
[17]	valid_0's binary_logloss: 0.255313
[18]	valid_0's binary_logloss: 0.252764
[19]	valid_0's binary_logloss: 0.250301
[20]	valid_0's binary_logloss: 0.248129
[21]	valid_0's binary_logloss: 0.2461
[22]	valid_0's binary_logloss: 0.244164
[23]	valid_0's binary_logloss: 0.242373
[24]	valid_0's binary_logloss: 0.240761
[25]	valid_0's binary_logloss: 0.239172
[26]	valid_0

[207]	valid_0's binary_logloss: 0.202709
[208]	valid_0's binary_logloss: 0.20269
[209]	valid_0's binary_logloss: 0.202678
[210]	valid_0's binary_logloss: 0.20261
[211]	valid_0's binary_logloss: 0.202577
[212]	valid_0's binary_logloss: 0.202549
[213]	valid_0's binary_logloss: 0.202506
[214]	valid_0's binary_logloss: 0.202452
[215]	valid_0's binary_logloss: 0.202412
[216]	valid_0's binary_logloss: 0.202378
[217]	valid_0's binary_logloss: 0.202369
[218]	valid_0's binary_logloss: 0.20231
[219]	valid_0's binary_logloss: 0.202291
[220]	valid_0's binary_logloss: 0.202281
[221]	valid_0's binary_logloss: 0.202221
[222]	valid_0's binary_logloss: 0.202193
[223]	valid_0's binary_logloss: 0.202181
[224]	valid_0's binary_logloss: 0.202172
[225]	valid_0's binary_logloss: 0.20213
[226]	valid_0's binary_logloss: 0.202081
[227]	valid_0's binary_logloss: 0.202059
[228]	valid_0's binary_logloss: 0.202029
[229]	valid_0's binary_logloss: 0.202022
[230]	valid_0's binary_logloss: 0.201971
[231]	valid_0's bina

In [537]:
lgb_params = {}
lgb_params['learning_rate'] = 0.013078
lgb_params['n_estimators'] = 2189
lgb_params['num_leaves'] = 357


In [540]:
lgb_model = LGBMClassifier(**lgb_params)
cols = [c for c in train.columns if c not in ['is_churn','msno']]
#bst = lgb_model.fit(X_train, y_train)
bst = lgb_model.fit(train[cols], train['is_churn'])

In [541]:
y = bst.predict_proba(test[cols])
sum(y)

array([816498.1833545 ,  90972.81664558])

In [458]:
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import LambdaCallback
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD
from keras.constraints import maxnorm
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Lambda
from keras.layers.core import Dropout
from keras import regularizers
from keras.models import Model, load_model
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Activation, MaxPooling1D
from sklearn.preprocessing import StandardScaler,MinMaxScaler

cols = [c for c in train.columns if c not in ['is_churn','msno']]

X_train = StandardScaler().fit_transform(train[cols].as_matrix())
y_train = train['is_churn'].as_matrix()
X_test = StandardScaler().fit_transform(test[cols].as_matrix())

lsize = 128
model = Sequential()
model.add(Dense(lsize, input_dim=int(X_train.shape[1]),activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(rate=0.25))
model.add(Dense(int(lsize/2), activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(rate=0.25))
model.add(Dense(int(lsize/4),kernel_regularizer=regularizers.l2(0.1), activation='relu'))
model.add(Dropout(rate=0.1))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 128)               7680      
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 64)                8256      
_________________________________________________________________
batch_normalization_2 (Batch (None, 64)                256       
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 32)                2080      
__________

In [459]:
history = model.fit(X_train, y_train, epochs=10, batch_size=1026,#512, 
                    validation_split=0.2, verbose=1)

Train on 865752 samples, validate on 216438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [464]:
predictions = model.predict(test[cols])

## Prepare the test data

In [240]:
test = pd.read_csv('sample_submission_v2.csv')
test = pd.merge(test, transactions, how='left', on='msno')
test = pd.merge(test, user_logs, how='left', on='msno')
test.head()

Unnamed: 0,msno,is_churn,trans_count,logs_count
0,4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=,0,21,
1,aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=,0,23,31.0
2,rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=,0,11,10.0
3,WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=,0,17,
4,aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=,0,4,


In [178]:
transactions_v2 = pd.read_csv('transactions_v2.csv')
transactions_v2['discount'] = transactions_v2['plan_list_price'] - transactions_v2['actual_amount_paid']
transactions_v2['is_discount'] = transactions_v2['discount'].apply(lambda x: 1 if x > 0 else 0)
transactions_v2['amt_per_day'] = transactions_v2['actual_amount_paid'] / transactions_v2['payment_plan_days']

In [180]:
transactions_v2['transaction_date'] = pd.to_datetime(transactions_v2['transaction_date'], format='%Y%m%d')
transactions_v2['membership_expire_date'] = pd.to_datetime(transactions_v2['membership_expire_date'], format='%Y%m%d')
transactions_v2.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,discount,is_discount,amt_per_day
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,2017-01-31,2017-05-04,0,0,0,3.311111
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,2015-08-09,2019-04-12,0,0,0,4.966667
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,2017-03-03,2017-04-22,0,0,0,6.0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,2017-03-29,2017-03-31,1,0,0,6.0
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,2017-03-23,2017-04-23,0,0,0,3.3


In [181]:
transactions_v2['membership_duration'] = ((transactions_v2['membership_expire_date'] - transactions_v2['transaction_date'])/ np.timedelta64(1, 'D')).astype(int)

In [182]:
transactions_v2['transaction_day']=transactions_v2['transaction_date'].dt.weekday
transactions_v2['membership_expire_day']=transactions_v2['membership_expire_date'].dt.weekday
transactions_v2 = transactions_v2.drop(['transaction_date', 'membership_expire_date'],axis = 1)

In [183]:
feature_test = pd.merge(transactions_v2, members, on='msno', how='inner')

In [184]:
feature_test['autorenew_&_not_cancel'] = ((feature_test.is_auto_renew == 1) == (feature_test.is_cancel == 0))
feature_test['notAutorenew_&_cancel'] = ((feature_test.is_auto_renew == 0) == (feature_test.is_cancel == 1))

In [185]:
change_datatype(feature_test)

In [186]:
feature_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1303156 entries, 0 to 1303155
Data columns (total 17 columns):
msno                      1303156 non-null object
payment_method_id         1303156 non-null int8
payment_plan_days         1303156 non-null int16
plan_list_price           1303156 non-null int16
actual_amount_paid        1303156 non-null int16
is_auto_renew             1303156 non-null int8
is_cancel                 1303156 non-null int8
discount                  1303156 non-null int16
is_discount               1303156 non-null int8
amt_per_day               1303153 non-null float32
membership_duration       1303156 non-null int16
transaction_day           1303156 non-null int8
membership_expire_day     1303156 non-null int8
registered_via            1303156 non-null int8
registration_day          1303156 non-null int8
autorenew_&_not_cancel    1303156 non-null bool
notAutorenew_&_cancel     1303156 non-null bool
dtypes: bool(2), float32(1), int16(5), int8(8), object(1)
mem

In [218]:
grouped_feature_test = feature_test.groupby('msno')
agged_feature_test = grouped_feature_test.agg(agg_func, as_index=False)
agged_feature_test.head()

Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,discount,is_discount,amt_per_day,membership_duration,registered_via,autorenew_&_not_cancel,notAutorenew_&_cancel
Unnamed: 0_level_1,<lambda>,mean,mean,mean,<lambda>,<lambda>,mean,<lambda>,mean,mean,<lambda>,<lambda>,<lambda>
msno,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,22,395.0,1599.0,1599.0,0,0,0.0,0,4.048101,471.0,3,False,False
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,41,30.0,99.0,99.0,1,0,0.0,0,3.3,31.0,7,True,True
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,39,30.0,149.0,149.0,1,0,0.0,0,4.966667,49.5,9,True,True
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,41,30.0,149.0,149.0,1,0,0.0,0,4.966667,31.0,7,True,True
++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,41,30.0,149.0,149.0,1,0,0.0,0,4.966667,31.0,7,True,True


In [216]:
save2 = agged_feature_test

In [219]:
change_datatype(agged_feature_test)
agged_feature_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1077434 entries, +++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s= to zzzF1KsGfHH3qI6qiSNSXC35UXmVKMVFdxkp7xmDMc0=
Data columns (total 13 columns):
(payment_method_id, <lambda>)         1077434 non-null int8
(payment_plan_days, mean)             1077434 non-null float32
(plan_list_price, mean)               1077434 non-null float32
(actual_amount_paid, mean)            1077434 non-null float32
(is_auto_renew, <lambda>)             1077434 non-null int8
(is_cancel, <lambda>)                 1077434 non-null int8
(discount, mean)                      1077434 non-null float32
(is_discount, <lambda>)               1077434 non-null int8
(amt_per_day, mean)                   1077434 non-null float32
(membership_duration, mean)           1077434 non-null float32
(registered_via, <lambda>)            1077434 non-null int8
(autorenew_&_not_cancel, <lambda>)    1077434 non-null bool
(notAutorenew_&_cancel, <lambda>)     1077434 non-null bool
dtypes: bool(2

In [220]:
agged_feature_test.columns = ['payment_method_id_mode',
                        'payment_plan_days_mean',
                        'plan_list_price_mean',
                        'actual_amount_paid_mean',
                        'is_auto_renew_mean',
                        'is_cancel_mode',
                        'discount_mean',
                        'is_discount_mode',
                        'amt_per_day_mean',
                        'membership_duration_mean',
                        'registered_via_mode',
                        'autorenew_&_not_cancel_mode',
                        'notAutorenew_&_cancel_mode']

In [221]:
foo = {True:1, False:0}
agged_feature_test['autorenew_&_not_cancel_mode']=agged_feature_test['autorenew_&_not_cancel_mode'].map(foo)
agged_feature_test['notAutorenew_&_cancel_mode']=agged_feature_test['notAutorenew_&_cancel_mode'].map(foo)

In [222]:
sorted(agged_feature_test['registered_via_mode'].unique())

[3, 4, 7, 9, 13]

In [223]:
sorted([ 9,  3,  7,  4, 13, 10, 16])

[3, 4, 7, 9, 10, 13, 16]

In [224]:
agged_feature_test_payment_id = pd.get_dummies(agged_feature_test['payment_method_id_mode'], prefix = 'payment_id')
agged_feature_test_reg_via = pd.get_dummies(agged_feature_test['registered_via_mode'], prefix = 'reg_via')

In [225]:
agged_feature_test = pd.concat([agged_feature_test, agged_feature_test_payment_id], axis=1)
agged_feature_test = pd.concat([agged_feature_test, agged_feature_test_reg_via], axis=1)
agged_feature_test.drop( ['payment_method_id_mode','registered_via_mode'], axis = 1 ,inplace = True)

In [226]:
agged_feature_test['payment_id_4'] = 0
agged_feature_test['payment_id_5'] = 0
agged_feature_test['payment_id_7'] = 0
agged_feature_test['payment_id_24'] = 0

agged_feature_test['reg_via_10'] = 0
agged_feature_test['reg_via_16'] = 0

In [227]:
agged_feature_test.head()

Unnamed: 0_level_0,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,is_auto_renew_mean,is_cancel_mode,discount_mean,is_discount_mode,amt_per_day_mean,membership_duration_mean,autorenew_&_not_cancel_mode,...,reg_via_4,reg_via_7,reg_via_9,reg_via_13,payment_id_4,payment_id_5,payment_id_7,payment_id_24,reg_via_10,reg_via_16
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,395.0,1599.0,1599.0,0,0,0.0,0,4.048101,471.0,0,...,0,0,0,0,0,0,0,0,0,0
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,30.0,99.0,99.0,1,0,0.0,0,3.3,31.0,1,...,0,1,0,0,0,0,0,0,0,0
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,30.0,149.0,149.0,1,0,0.0,0,4.966667,49.5,1,...,0,0,1,0,0,0,0,0,0,0
+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,30.0,149.0,149.0,1,0,0.0,0,4.966667,31.0,1,...,0,1,0,0,0,0,0,0,0,0
++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,30.0,149.0,149.0,1,0,0.0,0,4.966667,31.0,1,...,0,1,0,0,0,0,0,0,0,0


In [214]:
#agged_feature_test = agged_feature_test.reindex_axis(sorted(agged_feature_test.columns), axis=1)

  """Entry point for launching an IPython kernel.


In [241]:
agged_feature_test.reset_index(inplace =True)
test = pd.merge(test, agged_feature_test, how='left', on='msno')

In [247]:
test = test.replace([np.inf,-np.inf], np.nan).fillna(0)
test.drop(['level_0','index'],axis = 1,inplace = True)
test.head()

Unnamed: 0,msno,is_churn,trans_count,logs_count,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,is_auto_renew_mean,is_cancel_mode,discount_mean,...,reg_via_4,reg_via_7,reg_via_9,reg_via_13,payment_id_4,payment_id_5,payment_id_7,payment_id_24,reg_via_10,reg_via_16
0,4n+fXlyJvfQnTeKXTWT507Ll4JVYGrOC8LHCfwBmPE4=,0,21,0.0,30.0,99.0,99.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,aNmbC1GvFUxQyQUidCVmfbQ0YeCuwkPzEdQ0RwWyeZM=,0,23,31.0,30.0,149.0,149.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,rFC9eSG/tMuzpre6cwcMLZHEYM89xY02qcz7HL4//jc=,0,11,10.0,30.0,99.0,99.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,WZ59dLyrQcE7ft06MZ5dj40BnlYQY7PHgg/54+HaCSE=,0,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aky/Iv8hMp1/V/yQHLtaVuEmmAxkB5GuasQZePJ7NU4=,0,4,0.0,30.0,129.0,129.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [262]:

test = test.reindex_axis(sorted(test.columns), axis=1)

  


In [525]:
predictions = bst.predict_proba(test[cols])

In [526]:
predictions

array([[0.95586529, 0.04413471],
       [0.97048021, 0.02951979],
       [0.96853388, 0.03146612],
       ...,
       [0.96853388, 0.03146612],
       [0.95417045, 0.04582955],
       [0.97135293, 0.02864707]])

In [542]:
sample_submission = pd.read_csv('sample_submission_v2.csv')
sample_submission['is_churn'] = y[:,1:2]
sample_submission.to_csv('submission_test.csv',index=False)

In [528]:
count = 0
for i in predictions[:,1:2]:
    if i > 0.5:
        count+=1
count

28246

In [355]:
y_pred

array([0.0823829 , 0.0205161 , 0.03071002, ..., 0.01349572, 0.03485314,
       0.04898566])

In [251]:
train.head()

Unnamed: 0,msno,is_churn,trans_count,logs_count,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,is_auto_renew_mean,is_cancel_mode,discount_mean,...,payment_id_39,payment_id_40,payment_id_41,reg_via_3,reg_via_4,reg_via_7,reg_via_9,reg_via_10,reg_via_13,reg_via_16
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1,2,0.0,18.5,74.5,74.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1,10,0.0,30.0,149.199997,149.199997,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1,3,5.0,410.0,894.0,894.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1,9,17.0,98.125,447.0,447.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,GBy8qSz16X5iYWD+3CMxv/Hm6OPSrXBYtmbnlRtknW0=,1,22,0.0,21.818182,108.36364,149.0,1.0,0.0,-40.636364,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [400]:
test.describe()

Unnamed: 0,actual_amount_paid_mean,amt_per_day_mean,autorenew_&_not_cancel_mode,discount_mean,is_auto_renew_mean,is_cancel_mode,is_churn,is_discount_mode,logs_count,membership_duration_mean,...,payment_plan_days_mean,plan_list_price_mean,reg_via_10,reg_via_13,reg_via_16,reg_via_3,reg_via_4,reg_via_7,reg_via_9,trans_count
count,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,...,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0,907471.0
mean,135.442825,3.782038,0.749105,0.31343,0.758196,4.1e-05,0.0,0.002319,13.675057,35.280056,...,31.463923,135.759171,0.0,0.003802,0.0,0.094792,0.052548,0.5235,0.201517,16.028259
std,173.438812,1.700939,0.433528,6.412093,0.428176,0.006385,0.0,0.048095,11.29878,60.052673,...,40.039433,173.340561,0.0,0.061541,0.0,0.292928,0.22313,0.499448,0.401134,9.090499
min,0.0,0.0,0.0,-99.333336,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,99.0,3.3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,30.0,...,30.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
50%,129.0,4.048101,1.0,0.0,1.0,0.0,0.0,0.0,13.0,31.0,...,30.0,129.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,16.0
75%,149.0,4.966667,1.0,0.0,1.0,0.0,0.0,0.0,25.0,31.0,...,30.0,149.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,24.0
max,2000.0,6.0,1.0,533.0,1.0,1.0,0.0,1.0,31.0,2118.795898,...,450.0,2000.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,78.0


In [401]:
train.describe()

Unnamed: 0,actual_amount_paid_mean,amt_per_day_mean,autorenew_&_not_cancel_mode,discount_mean,is_auto_renew_mean,is_cancel_mode,is_churn,is_discount_mode,logs_count,membership_duration_mean,...,payment_plan_days_mean,plan_list_price_mean,reg_via_10,reg_via_13,reg_via_16,reg_via_3,reg_via_4,reg_via_7,reg_via_9,trans_count
count,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,...,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0,1082190.0
mean,148.5782,2.521229,0.727436,-2.74749,0.7317246,2.31013e-05,0.1242665,0.001503433,13.55284,36.28483,...,33.46945,146.1111,0.0,0.003366322,0.0,0.1158697,0.05904046,0.4621582,0.2461,16.29348
std,192.5306,2.162715,0.4452786,13.99609,0.443062,0.004806328,0.3298855,0.03874499,11.41146,110.5211,...,44.85985,192.2747,0.0,0.0579223,0.0,0.3200687,0.2357005,0.4985662,0.4307377,8.833111
min,0.0,0.0,0.0,-1788.0,0.0,0.0,0.0,0.0,0.0,-8600.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,99.0,0.0,0.0,-3.068965,0.0,0.0,0.0,0.0,1.0,29.04,...,28.57895,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
50%,139.7692,3.3,1.0,0.0,1.0,0.0,0.0,0.0,13.0,30.28572,...,30.0,132.4444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0
75%,149.0,4.611905,1.0,0.0,1.0,0.0,0.0,0.0,25.0,30.57895,...,30.0,149.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0
max,2000.0,7.361111,1.0,174.75,1.0,1.0,1.0,1.0,31.0,814.0,...,450.0,2000.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,244.0


* Showing the features of train and test data

In [371]:
train[cols].columns

Index(['actual_amount_paid_mean', 'amt_per_day_mean',
       'autorenew_&_not_cancel_mode', 'discount_mean', 'is_auto_renew_mean',
       'is_cancel_mode', 'is_discount_mode', 'logs_count',
       'membership_duration_mean', 'notAutorenew_&_cancel_mode',
       'payment_id_10', 'payment_id_11', 'payment_id_12', 'payment_id_13',
       'payment_id_14', 'payment_id_15', 'payment_id_16', 'payment_id_17',
       'payment_id_18', 'payment_id_19', 'payment_id_2', 'payment_id_20',
       'payment_id_21', 'payment_id_22', 'payment_id_23', 'payment_id_24',
       'payment_id_25', 'payment_id_26', 'payment_id_27', 'payment_id_28',
       'payment_id_29', 'payment_id_3', 'payment_id_30', 'payment_id_31',
       'payment_id_32', 'payment_id_33', 'payment_id_34', 'payment_id_35',
       'payment_id_36', 'payment_id_37', 'payment_id_38', 'payment_id_39',
       'payment_id_4', 'payment_id_40', 'payment_id_41', 'payment_id_5',
       'payment_id_6', 'payment_id_7', 'payment_id_8',
       'payment_pla

In [372]:
test[cols].columns

Index(['actual_amount_paid_mean', 'amt_per_day_mean',
       'autorenew_&_not_cancel_mode', 'discount_mean', 'is_auto_renew_mean',
       'is_cancel_mode', 'is_discount_mode', 'logs_count',
       'membership_duration_mean', 'notAutorenew_&_cancel_mode',
       'payment_id_10', 'payment_id_11', 'payment_id_12', 'payment_id_13',
       'payment_id_14', 'payment_id_15', 'payment_id_16', 'payment_id_17',
       'payment_id_18', 'payment_id_19', 'payment_id_2', 'payment_id_20',
       'payment_id_21', 'payment_id_22', 'payment_id_23', 'payment_id_24',
       'payment_id_25', 'payment_id_26', 'payment_id_27', 'payment_id_28',
       'payment_id_29', 'payment_id_3', 'payment_id_30', 'payment_id_31',
       'payment_id_32', 'payment_id_33', 'payment_id_34', 'payment_id_35',
       'payment_id_36', 'payment_id_37', 'payment_id_38', 'payment_id_39',
       'payment_id_4', 'payment_id_40', 'payment_id_41', 'payment_id_5',
       'payment_id_6', 'payment_id_7', 'payment_id_8',
       'payment_pla

In [405]:
train.to_csv('final_train.csv',index=False)
test.to_csv('final_test.csv',index=False)