In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the data into pandas dataframes
members = pd.read_csv('./Filtered_Dataset/members_filtered.csv')
transactions = pd.read_csv('./Filtered_Dataset/transactions_filtered.csv')
user_logs = pd.read_csv('./Filtered_Dataset/user_logs_filtered.csv')
labels = pd.read_csv('./Filtered_Dataset/labels_filtered.csv')
features_all = pd.read_csv('./features_all.csv')


In [3]:
# Getting the date in YMD format

def pd_to_date(df_col):
    df_col = pd.to_datetime(df_col, format = '%Y%m%d')
    return df_col

In [4]:
# Getting the timestamps in a consistent format

user_logs['date'] = pd_to_date(user_logs['date'])
transactions['transaction_date'] = pd_to_date(transactions['transaction_date'])
transactions['membership_expire_date'] = pd_to_date(transactions['membership_expire_date'])
members['registration_init_time'] = pd_to_date(members['registration_init_time'])

In [5]:
# Getting more info on data

print('Members: \n')
members.info()
print()
print('Labels: \n')
labels.info()
print()
print('Transactions: \n')
transactions.info()
print()
print('User Logs: \n')
user_logs.info()
print()
print('All Features: \n')
features_all.info()


Members: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17375 entries, 0 to 17374
Data columns (total 6 columns):
msno                      17375 non-null object
city                      17375 non-null int64
bd                        17375 non-null int64
gender                    7772 non-null object
registerd_via             17375 non-null int64
registration_init_time    17375 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 814.5+ KB

Labels: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17375 entries, 0 to 17374
Data columns (total 2 columns):
msno        17375 non-null object
is_churn    17375 non-null int64
dtypes: int64(1), object(1)
memory usage: 271.6+ KB

Transactions: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282160 entries, 0 to 282159
Data columns (total 9 columns):
msno                      282160 non-null object
payment_method_id         282160 non-null int64
payment_plan_days         282160 non-null int64
plan_lis

In [None]:
user_logs.head()

In [None]:
members.head()

In [10]:
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,ynoHZAirdqCE1WTLtSbR4DRBh+d/MCNRQyGJf3etfKQ=,22,410,1788,1788,0,2015-01-08,2016-02-22,0
1,6s+NSU5/dUQPW6V9xofh9MjL1KZpiGb4NSheT1zXmYY=,20,100,480,480,0,2016-11-17,2017-02-25,0
2,ntc5gs/VPO/6H8fOE4cytmRrDz4du8CWIiA1r05BkbI=,20,100,480,480,0,2015-10-11,2016-01-19,0
3,fNl44y6NC4flr2h6lKQvsQN+lOFO5Lm9t8gbb1/XQXQ=,20,100,480,480,0,2015-07-27,2015-11-04,0
4,ypOGEQcfsMHOc3gArywCzV0muy8aM/pm1MBPLTnSmqo=,12,410,1788,1788,0,2015-12-13,2017-02-05,0


In [11]:
transactions.tail()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
282155,/aaJ2gqAqZEiu4km/XvQC6oEpdhyPFtJFglv0RlRnv4=,41,30,149,149,1,2016-12-06,2017-01-07,0
282156,SC2+L0yvRUFfuH47zaMOHslq6rW8ymU0ObRiWbYB9jA=,41,30,149,149,1,2015-01-27,2015-02-28,0
282157,ZifE1cenXDgl6UmHa6iwDToVCwho+XcCYdMYeIq7DBw=,41,30,149,149,1,2015-09-25,2016-08-25,0
282158,8j8SpJ3nmalMgT30AyC5aZG0V/S9hXFkZt8jCgfqXow=,41,30,149,119,1,2015-05-15,2015-06-16,0
282159,UZGFfSXJi1AkHz13jxT7fpy/DjmL0SIkx2AnZzlgI8k=,41,30,149,149,1,2016-08-14,2016-09-14,0


In [None]:
features_all.head()

In [8]:
transactions.describe()

Unnamed: 0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel
count,282160.0,282160.0,282160.0,282160.0,282160.0,282160.0
mean,39.110763,29.694135,133.454497,138.224373,0.918358,0.018773
std,3.335138,16.495047,74.47478,70.729399,0.273818,0.135723
min,6.0,0.0,0.0,0.0,0.0,0.0
25%,38.0,30.0,99.0,99.0,1.0,0.0
50%,41.0,30.0,149.0,149.0,1.0,0.0
75%,41.0,30.0,149.0,149.0,1.0,0.0
max,41.0,450.0,1788.0,1788.0,1.0,1.0


In [6]:
# Grouping by the member (msno)
transactions_gb = transactions.sort_values(["transaction_date"]).groupby(['msno'])

# How many groups i.e. members i.e. msno's. We're good if this is the same number as the members table
print('%d Groups/msnos' %(len(transactions_gb.groups)))

17375 Groups/msnos


In [12]:
transactions_gb.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
113677,v7y0FbBVGXFXaaDkiatrm6sTtPt1pjB+JcirG8ohDcw=,40,31,149,0,1,2015-01-01,2015-02-05,0
40745,qFIkhCRnrrU0YQVeSJ1Q2h/CoOkccJuscXiGxzbiTiA=,41,30,149,149,1,2015-01-01,2015-02-01,0
7728,xpJ8QLmjEgy6g7QxEMa9L2n80OUNVVqxRx0wOtm5HOA=,37,31,149,149,1,2015-01-01,2015-02-01,0
10791,+bbLXN6CFptKb00CaRn/Adrf+CXsTfNEsWsVSiWH7zA=,38,30,149,149,0,2015-01-01,2015-03-02,0
179367,33b8ZRAtQcjvlze+Eoad9LxHfUiusGPgl4EmPQ3Ts8I=,41,30,149,149,1,2015-01-01,2015-02-02,0
159018,RezInwPOCs7QO2h9ogo7vzp0Dbr9DqWkysZE9tFtfHI=,40,31,149,149,1,2015-01-01,2015-02-03,0
207,vSqFCyy1z0k9L9h9eu8KpKpAcPGAojiLwg6A+zmYgjo=,27,31,149,149,1,2015-01-01,2015-02-02,0
243777,+3RDqkZp1VLy9Gb8xLyO+mmJjXMQTca1vV1DNiv3z7o=,37,31,149,149,1,2015-01-01,2015-02-02,0
83411,9cBmnSynhMPa9HOztDuw+13EzlE+ONk7livSOBGp0VM=,41,30,149,119,1,2015-01-01,2015-02-24,0
114404,GPcFXXSTzyQib0ErdePs/MD3afEMrpynLzcq+PXEuOM=,40,31,149,149,1,2015-01-01,2015-02-01,0


Features:
    Plan no of days for the latest transaction
    Plan actual amount paid/day for the latest transaction
    plan total amount paid for the latest transaction
    Is_auto_renew for the latest transaction
    is_cancel for the latest transaction
    Total number of plan days
    Total of all the amounts paid for the plan
    Plan day difference among the latest and previous transaction
    Amount paid/day difference among the latest and previous transaction
    ....



In [None]:
# Features: Total_plan_days, Total_amount_paid
transactions_features = (transactions_gb
    .agg({'payment_plan_days':'sum', 'actual_amount_paid':'sum' })
    .rename(columns={'payment_plan_days': 'Total_plan_days', 'actual_amount_paid': 'Total_amount_paid',})
                      )
transactions_features.head()

In [None]:
# Features: latest transaction
latest_transaction_gb = transactions_gb.tail([1])
latest_transaction_gb.head()

In [None]:
# Plan actual amount paid/day for the latest transaction
# Adding the collumn amount_paid_per_day
latest_transaction_gb['amount_paid_per_day'] = (latest_transaction_gb['actual_amount_paid']/latest_transaction_gb['payment_plan_days'])
latest_transaction_gb.head()

In [None]:
# TODO Differences among latest and previous transaction

In [None]:
# TODO: DOES NOT WORK YET: Add to the features table

transactions_features = transactions_features.join(latest_transaction_gb, how = 'inner')
transactions_features.head()