In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the data into pandas dataframes
members = pd.read_csv('./Filtered_Dataset/members_filtered.csv')
transactions = pd.read_csv('./Filtered_Dataset/transactions_filtered.csv')
user_logs = pd.read_csv('./Filtered_Dataset/user_logs_filtered.csv')
labels = pd.read_csv('./Filtered_Dataset/labels_filtered.csv')
features_all = pd.read_csv('./features_all.csv')


In [3]:
# Getting the date in YMD format

def pd_to_date(df_col):
    df_col = pd.to_datetime(df_col, format = '%Y%m%d')
    return df_col

In [4]:
# Getting the timestamps in a consistent format

user_logs['date'] = pd_to_date(user_logs['date'])
transactions['transaction_date'] = pd_to_date(transactions['transaction_date'])
transactions['membership_expire_date'] = pd_to_date(transactions['membership_expire_date'])
members['registration_init_time'] = pd_to_date(members['registration_init_time'])

In [5]:
# Getting more info on data

print('Members: \n')
members.info()
print()
print('Labels: \n')
labels.info()
print()
print('Transactions: \n')
transactions.info()
print()
print('User Logs: \n')
user_logs.info()
print()
print('All Features: \n')
features_all.info()


Members: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17375 entries, 0 to 17374
Data columns (total 6 columns):
msno                      17375 non-null object
city                      17375 non-null int64
bd                        17375 non-null int64
gender                    7772 non-null object
registerd_via             17375 non-null int64
registration_init_time    17375 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 814.5+ KB

Labels: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17375 entries, 0 to 17374
Data columns (total 2 columns):
msno        17375 non-null object
is_churn    17375 non-null int64
dtypes: int64(1), object(1)
memory usage: 271.6+ KB

Transactions: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282160 entries, 0 to 282159
Data columns (total 9 columns):
msno                      282160 non-null object
payment_method_id         282160 non-null int64
payment_plan_days         282160 non-null int64
plan_lis

In [None]:
user_logs.head()

In [None]:
members.head()

In [None]:
transactions.head()

In [None]:
transactions.tail()

In [None]:
features_all.head()

In [None]:
transactions.describe()

In [6]:
# Grouping by the member (msno)
transactions_gb = transactions.sort_values(["transaction_date"]).groupby(['msno'])

# How many groups i.e. members i.e. msno's. We're good if this is the same number as the members table
print('%d Groups/msnos' %(len(transactions_gb.groups)))

17375 Groups/msnos


In [None]:
transactions_gb.head()

Features:
    Plan no of days for the latest transaction
    Plan actual amount paid/day for the latest transaction
    plan total amount paid for the latest transaction
    Is_auto_renew for the latest transaction
    is_cancel for the latest transaction
    Total number of plan days
    Total of all the amounts paid for the plan
    Plan day difference among the latest and previous transaction
    Amount paid/day difference among the latest and previous transaction
    ....



In [7]:
# Features: Total_plan_days, Total_amount_paid
transactions_features = (transactions_gb
    .agg({'payment_plan_days':'sum', 'actual_amount_paid':'sum' })
    .rename(columns={'payment_plan_days': 'Total_plan_days', 'actual_amount_paid': 'Total_amount_paid',})
                      )
transactions_features.head()

Unnamed: 0_level_0,Total_plan_days,Total_amount_paid
msno,Unnamed: 1_level_1,Unnamed: 2_level_1
++L+G2jsvbkHMHlwvb2KQiRLAvB4VyEfjVJTUrs5auE=,720,3725
++XZCubnx3mWCWwaVKNyjfZjYZTysD8qvfPb/QMomeI=,240,1192
++am6f+rLDE3gjQM7pKLVAthwCgaI46WHbTNuKtgpbI=,780,3354
++k5broOoWP/P2WkW2N4C/sXL2bowW56Ep/emCCafeQ=,210,1043
++kqM73xL/v0vqbSItFKWgtEyIkW2POP4c/SEA0WZmw=,180,894


In [8]:
# Features: latest transaction
latest_transaction_gb = transactions_gb.tail([1])
latest_transaction_gb.set_index('msno', inplace = True)
latest_transaction_gb.head()

Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NRzJYmVfPD9TfCGGP9ai1fHAr69INjKXREgt6ViaYwA=,38,450,1788,1788,0,2015-06-30,2017-02-18,0
GVfMVTbKtQgHkmTaJZq0PSM/49neJxN2CdM/j6SGGNs=,41,30,149,149,1,2015-09-19,2017-03-20,0
NSHsvkXPX2yjnIzWXW373xGMssTh1u26rqRVzHadXR0=,32,410,1788,1788,0,2015-09-23,2017-02-03,0
H3036d7cObAUujRSPEq7xEHJboHgQJ58Kv0zWfJKh0E=,22,395,1599,1599,0,2015-10-13,2017-02-09,0
tT2aF3w8Ox+2bC2GaEU/jp8dVvhhAO5inykLclBtXy8=,32,395,1599,1599,0,2015-11-12,2017-02-13,0


In [None]:
latest_transaction_gb.tail()

In [9]:
# Plan actual amount paid/day for the latest transaction
# Adding the collumn amount_paid_per_day
latest_transaction_gb['amount_paid_per_day'] = (latest_transaction_gb['actual_amount_paid']/latest_transaction_gb['payment_plan_days'])
latest_transaction_gb.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,amount_paid_per_day
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NRzJYmVfPD9TfCGGP9ai1fHAr69INjKXREgt6ViaYwA=,38,450,1788,1788,0,2015-06-30,2017-02-18,0,3.973333
GVfMVTbKtQgHkmTaJZq0PSM/49neJxN2CdM/j6SGGNs=,41,30,149,149,1,2015-09-19,2017-03-20,0,4.966667
NSHsvkXPX2yjnIzWXW373xGMssTh1u26rqRVzHadXR0=,32,410,1788,1788,0,2015-09-23,2017-02-03,0,4.360976
H3036d7cObAUujRSPEq7xEHJboHgQJ58Kv0zWfJKh0E=,22,395,1599,1599,0,2015-10-13,2017-02-09,0,4.048101
tT2aF3w8Ox+2bC2GaEU/jp8dVvhhAO5inykLclBtXy8=,32,395,1599,1599,0,2015-11-12,2017-02-13,0,4.048101


In [None]:
# TODO Differences among latest and previous transaction

In [10]:
# TODO: DOES NOT WORK YET: Add to the features table
latest_transaction_gb.head()
# transactions_features = transactions_features.join(latest_transaction_gb, how = 'inner')
# transactions_features.head()

Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,amount_paid_per_day
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NRzJYmVfPD9TfCGGP9ai1fHAr69INjKXREgt6ViaYwA=,38,450,1788,1788,0,2015-06-30,2017-02-18,0,3.973333
GVfMVTbKtQgHkmTaJZq0PSM/49neJxN2CdM/j6SGGNs=,41,30,149,149,1,2015-09-19,2017-03-20,0,4.966667
NSHsvkXPX2yjnIzWXW373xGMssTh1u26rqRVzHadXR0=,32,410,1788,1788,0,2015-09-23,2017-02-03,0,4.360976
H3036d7cObAUujRSPEq7xEHJboHgQJ58Kv0zWfJKh0E=,22,395,1599,1599,0,2015-10-13,2017-02-09,0,4.048101
tT2aF3w8Ox+2bC2GaEU/jp8dVvhhAO5inykLclBtXy8=,32,395,1599,1599,0,2015-11-12,2017-02-13,0,4.048101


In [11]:
transactions_features.head()

Unnamed: 0_level_0,Total_plan_days,Total_amount_paid
msno,Unnamed: 1_level_1,Unnamed: 2_level_1
++L+G2jsvbkHMHlwvb2KQiRLAvB4VyEfjVJTUrs5auE=,720,3725
++XZCubnx3mWCWwaVKNyjfZjYZTysD8qvfPb/QMomeI=,240,1192
++am6f+rLDE3gjQM7pKLVAthwCgaI46WHbTNuKtgpbI=,780,3354
++k5broOoWP/P2WkW2N4C/sXL2bowW56Ep/emCCafeQ=,210,1043
++kqM73xL/v0vqbSItFKWgtEyIkW2POP4c/SEA0WZmw=,180,894


In [12]:
transactions_features = transactions_features.join(latest_transaction_gb, how = 'inner')
transactions_features.head()

Unnamed: 0_level_0,Total_plan_days,Total_amount_paid,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,amount_paid_per_day
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
++L+G2jsvbkHMHlwvb2KQiRLAvB4VyEfjVJTUrs5auE=,720,3725,38,30,149,149,0,2017-02-16,2017-03-18,0,4.966667
++XZCubnx3mWCWwaVKNyjfZjYZTysD8qvfPb/QMomeI=,240,1192,41,30,149,149,1,2017-01-06,2017-01-07,1,4.966667
++am6f+rLDE3gjQM7pKLVAthwCgaI46WHbTNuKtgpbI=,780,3354,41,30,129,129,1,2017-02-14,2017-03-15,0,4.3
++k5broOoWP/P2WkW2N4C/sXL2bowW56Ep/emCCafeQ=,210,1043,39,30,149,149,1,2017-01-31,2017-03-11,0,4.966667
++kqM73xL/v0vqbSItFKWgtEyIkW2POP4c/SEA0WZmw=,180,894,41,30,149,149,1,2017-01-23,2017-01-23,1,4.966667


In [13]:
transactions_features.to_pickle('tf.pkl')