In [None]:
# Data source: KKBox Churn Prediction Challenge on Kaggle: https://www.kaggle.com/c/kkbox-churn-prediction-challenge/overview/evaluation

In [2]:
# import necessary libraries and packages.
# we will be using dask to read data as we are dealing with large files 
import dask.dataframe as dd

In [None]:
# Next, we will read each of our csv files into dataframes and examine them

In [3]:
# Churn labels. dask builds a lazy, partitioned dataframe here rather than
# loading the whole CSV into memory.
train_df = dd.read_csv("../data/raw/train_v2.csv")

In [8]:
# Preview the first rows to confirm the columns parsed as expected.
train_df.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1


In [5]:
# Member attributes, read lazily with dask like the other raw files.
members_df = dd.read_csv("../data/raw/members_v3.csv")

In [9]:
# Preview the first rows of the members data.
members_df.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [20]:
# 'bd' is not self-explanatory. According to the information provided by the data
# source it holds the customer's age, so give the column a descriptive name.
column_renames = {"bd": "age"}
members_df = members_df.rename(columns=column_renames)
members_df.head()

Unnamed: 0,msno,city,age,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [21]:
# Right away, we can see that there are NaNs in the gender column and that some ages are 0.
# The data source also warns that the age column contains outliers such as -7000 and 2015.
# We will address these special cases shortly. For now, we continue examining the remaining files.

In [10]:
# Transaction records, read lazily with dask.
transactions_df = dd.read_csv("../data/raw/transactions_v2.csv")

In [11]:
# Preview the first rows of the transactions data.
transactions_df.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0


In [22]:
# Daily listening logs, read lazily with dask.
user_logs_df = dd.read_csv("../data/raw/user_logs_v2.csv")

In [23]:
# Preview the first rows of the user logs.
user_logs_df.head()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,20170331,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,20170330,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,20170331,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,20170331,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,20170331,2,1,0,1,112,93,28401.558


In [45]:
# Report the number of rows in each dataset.
# The frames are looked up through an explicit name -> dataframe mapping rather
# than eval() on variable-name strings, so a typo or missing frame fails at the
# mapping and no arbitrary string is ever executed.
# Iterating `dfs` still yields the same four names as before.
dfs = {
    "train_df": train_df,
    "members_df": members_df,
    "transactions_df": transactions_df,
    "user_logs_df": user_logs_df,
}
for name, frame in dfs.items():
    # len() on a dask dataframe triggers a computation over all partitions
    print(name, "length:", f"{len(frame):,}")

train_df length: 970,960
members_df length: 6,769,473
transactions_df length: 1,431,009
user_logs_df length: 18,396,362


In [42]:
# Number of distinct customers (msno values) appearing in the transactions data.
unique_msno = transactions_df["msno"].unique()
f"{len(unique_msno):,}"

'1,197,050'

In [60]:
# Every dataframe carries an 'msno' customer id, which serves as the join key.
# Start by inner-joining the churn labels with the transactions to see how many
# rows match.

full_df = train_df.merge(transactions_df, on="msno", how="inner")
# Not enabled yet: joins against the remaining two dataframes.
#full_df = full_df.merge(members_df, on='msno')
#full_df = full_df.merge(user_logs_df, on='msno')

In [61]:
# Preview the merged rows. The same msno can appear on several rows, one per
# matching transaction.
full_df.head()

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,36,30,180,180,0,20170311,20170411,0
1,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,17,60,0,0,0,20170311,20170314,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,15,90,300,300,0,20170314,20170615,0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,41,30,149,149,1,20150908,20170608,0
4,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,41,30,149,149,1,20150908,20170708,0


In [62]:
# Row count after the inner merge. Note that this can exceed the number of rows
# in train_df: a customer with several transactions contributes one merged row
# per transaction, so this counts rows, not customers.
f'{len(full_df):,}'

'1,132,036'

In [63]:
# Class balance of the target in the original labels (one count per is_churn value).
train_df.is_churn.value_counts().compute()

0    883630
1     87330
Name: is_churn, dtype: int64

In [64]:
# Count missing churn labels in the training labels.
# The reduction runs inside dask (.sum().compute()) instead of passing the series
# to Python's built-in sum(), which would iterate it element by element.
int(train_df.is_churn.isna().sum().compute())

0

In [65]:
# Class balance after the merge. Counts are per merged row, so customers with
# multiple transactions are counted more than once.
full_df.is_churn.value_counts().compute()

0    1018463
1     113573
Name: is_churn, dtype: int64

In [66]:
# Count missing churn labels in the merged frame.
# This cell previously re-checked train_df, so the merge result itself was never
# verified. The reduction runs inside dask rather than through Python's built-in
# sum().
int(full_df.is_churn.isna().sum().compute())

0

In [None]:
# I'm getting repeated customers because they have multiple transactions... do I want to just
# keep the most recent, or can I keep all of them? 
# I'm trying to predict if individual customers will churn... 
# based on the mo