In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the data into pandas dataframes
members = pd.read_csv('./Filtered_Dataset/members_filtered.csv')
transactions = pd.read_csv('./Filtered_Dataset/transactions_filtered.csv')
user_logs = pd.read_csv('./Filtered_Dataset/user_logs_filtered.csv')
labels = pd.read_csv('./Filtered_Dataset/labels_filtered.csv')
features_all = pd.read_csv('./features_all.csv')


In [None]:
# Getting the date in YMD format

def pd_to_date(df_col):
    df_col = pd.to_datetime(df_col, format = '%Y%m%d')
    return df_col

In [None]:
# Getting the timestamps in a consistent format
# Great for EDA, not so much for analysis

# user_logs['date'] = pd_to_date(user_logs['date'])
# transactions['transaction_date'] = pd_to_date(transactions['transaction_date'])
# transactions['membership_expire_date'] = pd_to_date(transactions['membership_expire_date'])
# members['registration_init_time'] = pd_to_date(members['registration_init_time'])

In [3]:
# Getting more info on data

print('Members: \n')
members.info()
print()
print('Labels: \n')
labels.info()
print()
print('Transactions: \n')
transactions.info()
print()
print('User Logs: \n')
user_logs.info()
print()
print('All Features: \n')
features_all.info()


Members: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17375 entries, 0 to 17374
Data columns (total 6 columns):
msno                      17375 non-null object
city                      17375 non-null int64
bd                        17375 non-null int64
gender                    7772 non-null object
registerd_via             17375 non-null int64
registration_init_time    17375 non-null int64
dtypes: int64(4), object(2)
memory usage: 814.5+ KB

Labels: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17375 entries, 0 to 17374
Data columns (total 2 columns):
msno        17375 non-null object
is_churn    17375 non-null int64
dtypes: int64(1), object(1)
memory usage: 271.6+ KB

Transactions: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282160 entries, 0 to 282159
Data columns (total 9 columns):
msno                      282160 non-null object
payment_method_id         282160 non-null int64
payment_plan_days         282160 non-null int64
plan_list_price           282160 non

In [None]:
user_logs.head()

In [None]:
members.head()

In [4]:
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,ynoHZAirdqCE1WTLtSbR4DRBh+d/MCNRQyGJf3etfKQ=,22,410,1788,1788,0,20150108,20160222,0
1,6s+NSU5/dUQPW6V9xofh9MjL1KZpiGb4NSheT1zXmYY=,20,100,480,480,0,20161117,20170225,0
2,ntc5gs/VPO/6H8fOE4cytmRrDz4du8CWIiA1r05BkbI=,20,100,480,480,0,20151011,20160119,0
3,fNl44y6NC4flr2h6lKQvsQN+lOFO5Lm9t8gbb1/XQXQ=,20,100,480,480,0,20150727,20151104,0
4,ypOGEQcfsMHOc3gArywCzV0muy8aM/pm1MBPLTnSmqo=,12,410,1788,1788,0,20151213,20170205,0


In [None]:
transactions.tail()

In [None]:
features_all.head()

In [None]:
transactions.describe()

In [5]:
# Grouping by the member (msno)
transactions_gb = transactions.sort_values(["transaction_date"]).groupby(['msno'])

# How many groups i.e. members i.e. msno's. We're good if this is the same number as the members table
print('%d Groups/msnos' %(len(transactions_gb.groups)))

17375 Groups/msnos


In [6]:
transactions_gb.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
113677,v7y0FbBVGXFXaaDkiatrm6sTtPt1pjB+JcirG8ohDcw=,40,31,149,0,1,20150101,20150205,0
40745,qFIkhCRnrrU0YQVeSJ1Q2h/CoOkccJuscXiGxzbiTiA=,41,30,149,149,1,20150101,20150201,0
7728,xpJ8QLmjEgy6g7QxEMa9L2n80OUNVVqxRx0wOtm5HOA=,37,31,149,149,1,20150101,20150201,0
10791,+bbLXN6CFptKb00CaRn/Adrf+CXsTfNEsWsVSiWH7zA=,38,30,149,149,0,20150101,20150302,0
179367,33b8ZRAtQcjvlze+Eoad9LxHfUiusGPgl4EmPQ3Ts8I=,41,30,149,149,1,20150101,20150202,0
159018,RezInwPOCs7QO2h9ogo7vzp0Dbr9DqWkysZE9tFtfHI=,40,31,149,149,1,20150101,20150203,0
207,vSqFCyy1z0k9L9h9eu8KpKpAcPGAojiLwg6A+zmYgjo=,27,31,149,149,1,20150101,20150202,0
243777,+3RDqkZp1VLy9Gb8xLyO+mmJjXMQTca1vV1DNiv3z7o=,37,31,149,149,1,20150101,20150202,0
83411,9cBmnSynhMPa9HOztDuw+13EzlE+ONk7livSOBGp0VM=,41,30,149,119,1,20150101,20150224,0
114404,GPcFXXSTzyQib0ErdePs/MD3afEMrpynLzcq+PXEuOM=,40,31,149,149,1,20150101,20150201,0


Features:
    Plan no of days for the latest transaction
    Plan actual amount paid/day for the latest transaction
    plan total amount paid for the latest transaction
    Is_auto_renew for the latest transaction
    is_cancel for the latest transaction
    Total number of plan days
    Total of all the amounts paid for the plan
    Plan day difference among the latest and previous transaction
    Amount paid/day difference among the latest and previous transaction
    ....



In [7]:
# Features: Total_plan_days, Total_amount_paid
transactions_features = (transactions_gb
    .agg({'payment_plan_days':'sum', 'actual_amount_paid':'sum' })
    .rename(columns={'payment_plan_days': 'Total_plan_days', 'actual_amount_paid': 'Total_amount_paid',})
                      )
transactions_features.head()

Unnamed: 0_level_0,Total_plan_days,Total_amount_paid
msno,Unnamed: 1_level_1,Unnamed: 2_level_1
++L+G2jsvbkHMHlwvb2KQiRLAvB4VyEfjVJTUrs5auE=,720,3725
++XZCubnx3mWCWwaVKNyjfZjYZTysD8qvfPb/QMomeI=,240,1192
++am6f+rLDE3gjQM7pKLVAthwCgaI46WHbTNuKtgpbI=,780,3354
++k5broOoWP/P2WkW2N4C/sXL2bowW56Ep/emCCafeQ=,210,1043
++kqM73xL/v0vqbSItFKWgtEyIkW2POP4c/SEA0WZmw=,180,894


In [10]:
# Features: latest transaction, renaming the collumns
latest_transaction_gb = transactions_gb.tail([1]).rename(columns={'payment_plan_days': 'latest_plan_days', 'actual_amount_paid': 'latest_amount_paid','is_auto_renew': 'latest_auto_renew', 
                                                                  'transaction_date': 'latest_transaction_date',
                                                                  'membership_expire_date': 'latest_expire_date', 'is_cancel': 'latest_is_cancel' })

# Index by msno
latest_transaction_gb.set_index('msno', inplace = True)

# Test
latest_transaction_gb.head()

Unnamed: 0_level_0,payment_method_id,latest_plan_days,plan_list_price,latest_amount_paid,latest_auto_renew,latest_transaction_date,latest_expire_date,latest_is_cancel
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NRzJYmVfPD9TfCGGP9ai1fHAr69INjKXREgt6ViaYwA=,38,450,1788,1788,0,20150630,20170218,0
GVfMVTbKtQgHkmTaJZq0PSM/49neJxN2CdM/j6SGGNs=,41,30,149,149,1,20150919,20170320,0
NSHsvkXPX2yjnIzWXW373xGMssTh1u26rqRVzHadXR0=,32,410,1788,1788,0,20150923,20170203,0
H3036d7cObAUujRSPEq7xEHJboHgQJ58Kv0zWfJKh0E=,22,395,1599,1599,0,20151013,20170209,0
tT2aF3w8Ox+2bC2GaEU/jp8dVvhhAO5inykLclBtXy8=,32,395,1599,1599,0,20151112,20170213,0


In [11]:
latest_transaction_gb.tail()

Unnamed: 0_level_0,payment_method_id,latest_plan_days,plan_list_price,latest_amount_paid,latest_auto_renew,latest_transaction_date,latest_expire_date,latest_is_cancel
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PNBiuUY5s+aY7A8L88Q800yOSol3RJ5esloNp3zxZNk=,41,30,99,99,1,20170228,20170331,0
ypCvgiDkRM89xjjA19V4Vk4tyilqRjoCtNi1PmXIffo=,33,30,149,149,1,20170228,20170331,0
erkAT63+JXADIyV7aqfOyuZzWJ8rGKywNkHkqG4ygAw=,41,30,99,99,1,20170228,20170331,0
Mirvl6P6h3HEMrAoNtMSh8LpEtL9vlhX8WoHfDZrZ1E=,33,30,149,149,1,20170228,20170331,0
bZrVmiBVAl/M+3OvpGnnb3JmgTn7eCsbST+JbnrL9hA=,31,30,149,149,1,20170228,20170331,0


In [13]:
# Plan actual amount paid/day for the latest transaction
# Adding the collumn amount_paid_per_day

latest_transaction_gb['amount_paid_per_day'] = (latest_transaction_gb['latest_amount_paid']/latest_transaction_gb['latest_plan_days'])
latest_transaction_gb.head()

Unnamed: 0_level_0,payment_method_id,latest_plan_days,plan_list_price,latest_amount_paid,latest_auto_renew,latest_transaction_date,latest_expire_date,latest_is_cancel,amount_paid_per_day
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NRzJYmVfPD9TfCGGP9ai1fHAr69INjKXREgt6ViaYwA=,38,450,1788,1788,0,20150630,20170218,0,3.973333
GVfMVTbKtQgHkmTaJZq0PSM/49neJxN2CdM/j6SGGNs=,41,30,149,149,1,20150919,20170320,0,4.966667
NSHsvkXPX2yjnIzWXW373xGMssTh1u26rqRVzHadXR0=,32,410,1788,1788,0,20150923,20170203,0,4.360976
H3036d7cObAUujRSPEq7xEHJboHgQJ58Kv0zWfJKh0E=,22,395,1599,1599,0,20151013,20170209,0,4.048101
tT2aF3w8Ox+2bC2GaEU/jp8dVvhhAO5inykLclBtXy8=,32,395,1599,1599,0,20151112,20170213,0,4.048101


In [None]:
# TODO Differences among latest and previous transaction

In [14]:
# Get all features in a single DF
transactions_features = transactions_features.join(latest_transaction_gb, how = 'inner')
transactions_features.head()

Unnamed: 0_level_0,Total_plan_days,Total_amount_paid,payment_method_id,latest_plan_days,plan_list_price,latest_amount_paid,latest_auto_renew,latest_transaction_date,latest_expire_date,latest_is_cancel,amount_paid_per_day
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
++L+G2jsvbkHMHlwvb2KQiRLAvB4VyEfjVJTUrs5auE=,720,3725,38,30,149,149,0,20170216,20170318,0,4.966667
++XZCubnx3mWCWwaVKNyjfZjYZTysD8qvfPb/QMomeI=,240,1192,41,30,149,149,1,20170106,20170107,1,4.966667
++am6f+rLDE3gjQM7pKLVAthwCgaI46WHbTNuKtgpbI=,780,3354,41,30,129,129,1,20170214,20170315,0,4.3
++k5broOoWP/P2WkW2N4C/sXL2bowW56Ep/emCCafeQ=,210,1043,39,30,149,149,1,20170131,20170311,0,4.966667
++kqM73xL/v0vqbSItFKWgtEyIkW2POP4c/SEA0WZmw=,180,894,41,30,149,149,1,20170123,20170123,1,4.966667


In [15]:
# Export in a pickle file
transactions_features.to_pickle('tf.pkl')