In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import warnings
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import dask.dataframe as dd
import os
import itertools
import matplotlib.ticker as ticker
# Use Altair's 'default' renderer for chart output.
alt.renderers.enable('default')
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
sns.set_theme(style="darkgrid")
# NOTE(review): this hides every warning category for the whole session,
# including pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')
# Applied after set_theme() — presumably so the theme call does not override the
# palette; keep this order (TODO confirm).
sns.set_palette(sns.color_palette("Set3"))

In [2]:
# Name of the user-identifier column; used below as the group-by key and to
# match users between tables.
uid = 'msno'

In [3]:
# Folder that holds the dataset CSV files; change it to match the local layout
data_dir = './data'

# January training labels: a user id column plus the binary churn target.
# The target column is read as a string rather than as a number.
train = pd.read_csv(
    os.path.join(data_dir, 'train.csv'),
    dtype={'is_churn': str},
)

train.head()

Unnamed: 0,msno,is_churn
0,waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1
1,QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1
2,fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1
3,mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1
4,XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1


In [4]:
# The user-log file is read through Dask; it has no header row, so the column
# names are supplied explicitly.
user_logs = dd.read_csv(
    os.path.join(data_dir, 'user_logs_cleaned.csv'),
    header=None,
    names=[
        'msno', 'date',
        'num_25', 'num_50', 'num_75', 'num_985', 'num_100',
        'num_unq', 'total_secs',
    ],
)
# The transactions file is read directly into a pandas DataFrame.
transactions = pd.read_csv(
    os.path.join(data_dir, 'transactions_cleaned.csv')
)

In [5]:
# Member profile table, read with pandas from the data directory.
members = pd.read_csv(os.path.join(data_dir, 'members_v3.csv'))

In [6]:
# Preview the first rows of the members table.
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [10]:
# Cut-off date written as a YYYYMMDD integer; user logs and transactions dated
# after it are filtered out in the cells below.
date_pred = 20170131

In [12]:
# Keep only the log rows dated on or before the cut-off date.
user_logs = user_logs[user_logs['date'] <= date_pred]

In [13]:
# Basic per-user features from the user logs, computed with the Dask API:
# mean and sum of total_secs and of num_unq. compute() evaluates the result.
aggs_user_logs = (
    user_logs
    .groupby(uid)
    .agg({
        'total_secs': ['mean', 'sum'],
        'num_unq': ['mean', 'sum'],
    })
    .compute()
)

In [14]:
# Flatten the (column, statistic) header produced by the aggregation into
# single-level names such as 'total_secs_mean'.
# The guard keeps the cell safe to re-run: once the header is flat, a second
# pass would join the characters of each name and add a spurious 'index' column.
if isinstance(aggs_user_logs.columns, pd.MultiIndex):
    aggs_user_logs = aggs_user_logs.reset_index()
    # Iterating a MultiIndex yields one tuple per column, so .ravel() is not needed.
    aggs_user_logs.columns = ['_'.join(x) for x in aggs_user_logs.columns]
    # After reset_index() the user-id column is ('msno', ''), which joins to 'msno_'.
    aggs_user_logs = aggs_user_logs.rename(columns={'msno_': 'msno'})

# Keep only the users present in the training labels.
# reset_index() is chained onto the filter so that it returns a new frame
# instead of mutating a filtered slice in place. As before, it keeps the old
# row position as an 'index' column, which a later cell drops before saving.
aggs_user_logs_filtered = aggs_user_logs[aggs_user_logs[uid].isin(train[uid])].reset_index()

In [17]:
# Preview the per-user log features restricted to users in the training set.
aggs_user_logs_filtered.head()

Unnamed: 0,index,msno,total_secs_mean,total_secs_sum,num_unq_mean,num_unq_sum
0,3,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,7412.483605,563348.754,28.460526,2163
1,4,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,6665.32657,3872554.737,25.051635,14555
2,6,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,11250.902094,6559275.921,41.360206,24113
3,8,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,5269.431311,1406938.16,13.745318,3670
4,12,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,3160.614991,1349582.601,14.9274,6374


In [16]:
# Preview the first rows of the transactions table.
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,JQqsvSC2BNWif2jSUS0BJ3JT1/oREqxrrL4jNjbjFWU=,41,30,149,149,1,20150101,20150202,0
1,Z1CV2CrBP9tR4Xr5H4VqgGzx650b2HtwJWJV/ExQNaw=,41,30,149,149,1,20150101,20150201,0
2,flo7XEv3URucBlewH+Y/QgdmrAapKq3+NMs97Z+A9Sc=,41,30,149,149,1,20150101,20150201,0
3,Bv0w1EeVjiPLB8r5mR1ny2x8guIdqzGNpKa1LgJ1d9M=,41,30,149,149,1,20150101,20150203,0
4,fSn3mhLjNEREk1f/84iU1FFas2l4zAaMk4JT15WLlvE=,41,30,149,149,1,20150101,20150201,0


In [5]:
# Keep only transactions dated on or before the cut-off date.
# Requires date_pred to be defined first; the comparison is between integers,
# so it has to run before transaction_date is converted to datetime.
transactions = transactions[transactions['transaction_date'] <= date_pred]

NameError: name 'date_pred' is not defined

In [None]:
# Parse the YYYYMMDD transaction dates into pandas datetimes, overwriting the
# column in place.
# NOTE(review): once this has run, the integer comparison against date_pred in
# the filtering cell presumably no longer works on this column — run that
# filter first (confirm).
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'], format='%Y%m%d')

In [20]:
# Daily aggregates per (transaction date, cancellation flag): number of
# transaction rows and total amount actually paid.
agg_transactions = (
    transactions
    .groupby(['transaction_date', 'is_cancel'])
    .agg(
        nbr_daily_transactions=('msno', 'count'),
        total_daily_amount=('actual_amount_paid', 'sum'),
    )
    .reset_index()
)

In [21]:
# Save the daily transaction aggregates to the working directory, without the
# DataFrame index.
agg_transactions.to_csv('agg_transactions.csv', index=False)

In [22]:
# Save the training-user log features to the working directory, without the
# DataFrame index. The 'index' column created by the earlier reset_index() is
# an ordinary column and is still written.
aggs_user_logs_filtered.to_csv('agg_user_logs.csv', index=False)

In [2]:
# Reload the two saved aggregate tables from the working directory.
# The log features come back under the name agg_user_logs, which differs from
# the in-memory aggs_user_logs_filtered that was written out.
agg_transactions = pd.read_csv('agg_transactions.csv')
agg_user_logs = pd.read_csv('agg_user_logs.csv')

In [3]:
# Preview the aggregates reloaded from 'agg_transactions.csv'.
agg_transactions.head()

Unnamed: 0,transaction_date,is_cancel,nbr_daily_transactions,total_daily_amount
0,2015-01-01,0,21309,3570800
1,2015-01-01,1,544,57174
2,2015-01-02,0,15532,2726927
3,2015-01-02,1,855,101020
4,2015-01-03,0,15904,2799779


In [12]:
# Preview the first rows of the transactions table again.
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,JQqsvSC2BNWif2jSUS0BJ3JT1/oREqxrrL4jNjbjFWU=,41,30,149,149,1,20150101,20150202,0
1,Z1CV2CrBP9tR4Xr5H4VqgGzx650b2HtwJWJV/ExQNaw=,41,30,149,149,1,20150101,20150201,0
2,flo7XEv3URucBlewH+Y/QgdmrAapKq3+NMs97Z+A9Sc=,41,30,149,149,1,20150101,20150201,0
3,Bv0w1EeVjiPLB8r5mR1ny2x8guIdqzGNpKa1LgJ1d9M=,41,30,149,149,1,20150101,20150203,0
4,fSn3mhLjNEREk1f/84iU1FFas2l4zAaMk4JT15WLlvE=,41,30,149,149,1,20150101,20150201,0


In [None]:
# Display the transactions DataFrame itself.
# NOTE(review): transactions.head() or transactions.shape would be a lighter
# check than rendering the whole frame.
transactions

In [14]:
# Per-user sum of the is_cancel flag.
nbr_cancels = (
    transactions
    .groupby(uid)
    .agg(nbr_cancels=("is_cancel", "sum"))
    .reset_index()
)

In [6]:
# Keep only the transactions whose is_cancel flag is 0.
filtered_trx = transactions[transactions['is_cancel'] == 0]

In [7]:
# (rows, columns) of the transactions with is_cancel == 0.
filtered_trx.shape

(22086771, 9)

In [17]:
# Per-user features over the transactions with is_cancel == 0: totals and
# means of the plan length and of the amount paid, plus the mean of the
# is_auto_renew flag (stored under the name percentage_auto_renew).
aggs_trx = (
    filtered_trx
    .groupby(uid)
    .agg(
        total_days_subscribed=("payment_plan_days", "sum"),
        total_amount_paid=("actual_amount_paid", "sum"),
        average_days_subscribed=("payment_plan_days", "mean"),
        average_amount_paid=("actual_amount_paid", "mean"),
        percentage_auto_renew=("is_auto_renew", "mean"),
    )
    .reset_index()
)

In [18]:
# Preview the per-user transaction features.
aggs_trx.head()

Unnamed: 0,msno,total_days_subscribed,total_amount_paid,average_days_subscribed,average_amount_paid,percentage_auto_renew
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,7,0,7.0,0.0,0.0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,805,3387,402.5,1693.5,0.0
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,150,495,30.0,99.0,1.0
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,603,3129,28.714286,149.0,1.0
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,780,4023,28.888889,149.0,1.0


In [20]:
# Save the per-user transaction features under their own file name.
# 'agg_transactions.csv' already holds the daily aggregates written earlier and
# is read back as `agg_transactions`; writing this table to the same path
# would overwrite them.
aggs_trx.to_csv('agg_user_transactions.csv', index=False)

In [25]:
# Summary of the total amount paid by users whose total subscribed days equal 7.
aggs_trx.loc[aggs_trx['total_days_subscribed'] == 7, 'total_amount_paid'].agg(['min', 'max', 'median', 'mean'])

min          0.000000
max       2000.000000
median       0.000000
mean         0.992147
Name: total_amount_paid, dtype: float64

In [27]:
# Users whose total subscribed days equal exactly 7.
seven_days_sub = aggs_trx.loc[aggs_trx['total_days_subscribed'] == 7]

In [29]:
# Fraction of those 7-day users whose total amount paid is zero.
seven_days_sub['total_amount_paid'].eq(0).mean()

0.9955495361278527

In [22]:
# Distribution summary of the per-user total subscribed days.
aggs_trx['total_days_subscribed'].agg(['min', 'max', 'mean', 'median', 'std'])

min          0.000000
max       7695.000000
mean       295.317309
median     213.000000
std        271.597388
Name: total_days_subscribed, dtype: float64

In [33]:
# Drop the leftover 'index' column and rewrite the user-log features file.
# `columns=` replaces the positional axis argument of drop('index', 1), which
# pandas 2.0 no longer accepts. errors='ignore' keeps the cell re-runnable once
# the file has already been rewritten without that column.
agg_user_logs.drop(columns='index', errors='ignore').to_csv('agg_user_logs.csv', index=False)

In [36]:
# Display the per-user transaction features frame.
# NOTE(review): aggs_trx.head() or aggs_trx.shape would be a lighter check
# than rendering the whole frame.
aggs_trx

Unnamed: 0,msno,total_days_subscribed,total_amount_paid
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,7,0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,805,3387
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,90,297
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,543,2831
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,720,3725
...,...,...,...
2354614,zzz672Xpk1wKox75rJ5gak43ZkFQUV1f7Xek3jnPeRM=,7,0
2354615,zzz9+ZF4+GMyt63oU8xfjo1EkvRqH5OINlES0RUJI6I=,420,1846
2354616,zzzN9thH22os1dRS0VHReY/8FTfGHOi86//d+wGGFsQ=,330,1089
2354617,zzztsqkufVj9DPVJDM3FxDkhlbCL5z4aiYxgPSGkIK4=,7,0


In [10]:
# Transactions with is_cancel == 0 whose actual amount paid is below 1.
free_trx = filtered_trx[filtered_trx['actual_amount_paid'].lt(1)]

In [11]:
# Preview the transactions with actual_amount_paid below 1.
free_trx.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
34,bZbiJQudC6SZnQjKmX0JEt38qxLM0l4XYrPWuDTG+I8=,40,31,149,0,1,20150101,20150131,0
41,dQfQJhtaW7tIRR+4btyRsNMwMoZVjZ0pS38EXsNzNhk=,40,31,149,0,1,20150101,20150131,0
70,iPBYwVbW/JZzihWRGeyo31ZA0gew345hzw0LaHQ/eC4=,41,30,149,0,1,20150101,20150228,0
127,AdO/q0fOLGiT2xaMIa7GHAjhBQpHWovPg4oWlbdmc18=,41,30,149,0,1,20150101,20150228,0
141,e3Qu5mwLPc5K7Rs/JtFR11wkOI/fb9RDK6pHSh55bFE=,41,30,149,0,1,20150101,20150228,0


In [13]:
# Distinct amounts paid among the selected "free" transactions.
free_trx['actual_amount_paid'].unique()

array([0])

In [15]:
# Plan-length statistics for the free transactions, transposed so that each
# statistic is a column.
free_trx.agg(
    {'payment_plan_days': ['mean', 'median', 'min', 'max']}
).transpose()

Unnamed: 0,mean,median,min,max
payment_plan_days,17.032884,7.0,0.0,415.0


In [16]:
# Full descriptive summary of the plan length for the free transactions.
free_trx['payment_plan_days'].describe()

count    971561.000000
mean         17.032884
std          27.341103
min           0.000000
25%           7.000000
50%           7.000000
75%          30.000000
max         415.000000
Name: payment_plan_days, dtype: float64