In [1]:
# data is from: https://www.kaggle.com/c/kkbox-churn-prediction-challenge/overview/evaluation 

In [2]:
# import necessary libraries and packages.
# we will be using dask to read data as we are dealing with large files 
import dask.dataframe as dd

In [3]:
# Next, we will read each of our csv files into dataframes and examine them
# my goal is to have a data frame with: 
# each row is a unique customer, 
# each row has info for each of the columns from each dataset, 
# any NaNs or outliers are dealt with.

In [4]:
train_df = dd.read_csv("../data/raw/train_v2.csv")

In [5]:
train_df.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1


In [6]:
# Let's explore our target column, is_churn:
print(sum(train_df.is_churn.isna()))
train_df.is_churn.value_counts().compute()

0


0    883630
1     87330
Name: is_churn, dtype: int64

In [7]:
members_df = dd.read_csv("../data/raw/members_v3.csv")

In [8]:
members_df.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [9]:
members_df.describe()

Unnamed: 0_level_0,city,bd,registered_via,registration_init_time
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,float64,float64,float64,float64
,...,...,...,...


In [10]:
# the 'bd' column is a little unclear. From reading the information provided by the data source,
# I see that it is actually just the age of the customer. I will change this for clarity.
members_df = members_df.rename(columns={'bd':'age'})
members_df.head()

Unnamed: 0,msno,city,age,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [11]:
# Right away, we can also see that there are NaNs for the gender, and some ages that are 0. 
# the data source also warns that there are outliers like -7000 and 2015 in the age column. 
# we will address these special cases shortly. Now, we will continue examining the rest of the sheets.

In [12]:
transactions_df = dd.read_csv("../data/raw/transactions_v2.csv")

In [13]:
transactions_df.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0


In [14]:
transactions_df.describe()

Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...


In [15]:
user_logs_df = dd.read_csv("../data/raw/user_logs_v2.csv")

In [16]:
user_logs_df.head()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,20170331,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,20170330,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,20170331,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,20170331,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,20170331,2,1,0,1,112,93,28401.558


In [17]:
user_logs_df.describe()

Unnamed: 0_level_0,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...


In [18]:
# find out the number of entries in each dataset
dfs = ["train_df", "members_df", "transactions_df", "user_logs_df"]
for df in dfs:
    print(df, "length:", f'{len(eval(df)):,}')

train_df length: 970,960
members_df length: 6,769,473
transactions_df length: 1,431,009
user_logs_df length: 18,396,362


In [19]:
f'{len(transactions_df.msno.unique()):,}'

'1,197,050'

In [20]:
# each dataframe has an 'msno' column which we will use to match the dataframes
# to see how many customers we have all the data for

train_trans_df = dd.merge(train_df, transactions_df, on='msno', how='inner')

In [21]:
train_trans_df.head()

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,36,30,180,180,0,20170311,20170411,0
1,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,17,60,0,0,0,20170311,20170314,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,15,90,300,300,0,20170314,20170615,0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,41,30,149,149,1,20150908,20170608,0
4,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,41,30,149,149,1,20150908,20170708,0


In [22]:
# let's see how many rows we have:
f'{len(train_trans_df):,}'

'1,132,036'

In [23]:
# I'm getting repeated customers because they have multiple transactions... do I want to just
# keep the most recent, or can I keep all of them? 
# I'm trying to predict if individual customers will churn... 
# based on the most recent information available... 
# so I should filter things so that I only keep the most recent transaction. 

In [24]:
# compare the transaction_date column for individual columns and choose the most recent date. 
# delete older entries
# I can keep the date column as float64 and just choose the largest value
train_trans_df_recent = full_df.groupby("msno")['transaction_date'].max()


In [25]:
# let's see how many rows we have now:
f'{len(train_trans_df_recent):,}'

'933,578'

In [28]:
# this number is lower than the total number of unique customers we have churn data for, 
# 970,,060. I need to verify if these missing customers are just not recorded in transaction_df, 
# or if there is some other mistake somewhere. But this does get me much closer to my goal. 
# let's check the number of unique entries in full_df. this should match our previous number of 933,578:
f'{len(train_trans_df.msno.unique()):,}'

'933,578'

In [29]:
# and it does! Perfect. so now I see that some of the customers we have churn data for were missing from the
# transactions dataframe, that's why the number is lower. 
# now, we will join the train data and the member data
# first, let's check the number of unique entries in members: 
train_mem_df = dd.merge(train_df, members_df, on='msno', how='inner')

In [31]:
f'{len(train_mem_df):,}'

'860,967'

In [32]:
f'{len(train_mem_df.msno.unique()):,}'

'860,967'

In [None]:
# ok, so this means that again, some of the entries in train_df are not in members_df. 
# we are only interested in entries we have churn info for (for now...)

In [33]:
# now, let's explore the relationship between train_df and user_logs_df
# I expect to get several million rows, as each user's daily habits are recorded in this table
train_user_df = dd.merge(train_df, user_logs_df, on='msno', how='inner')

In [34]:
f'{len(train_user_df):,}'

'13,532,944'

In [35]:
f'{len(train_user_df.msno.unique()):,}'

'754,551'

In [36]:
f'{len(user_logs_df.msno.unique()):,}'

'1,103,894'

In [38]:
# so there are 1,103,894 total unique customers in user_logs_df, and 754,551 of those match the 
# 970,060 unique customers we have churn data for
# we need to group by msno so we get the unique customers isolated
# we will find the some of the other columns, to get an "all time" info, 
# meaning we will go from how many songs the user listened to in a day 
# to how many songs the user listened to all time... 
# first, we will convert the date column to just a bunch of 1s so we can get the total number of days
all_time_df = train_user_df.copy()
all_time_df.date = 1

# next, we will group by msno and aggregate using sum
all_time_df = all_time_df.groupby('msno').sum()
all_time_df.head()

Unnamed: 0_level_0,is_churn,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,31,191,90,75,144,589,885,192527.892
+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,28,43,12,15,12,485,468,115411.26
++0/NopttBsaAn6qHZA2AWWrDg7Me7UOMs1vsyo4tSI=,0,8,21,8,17,7,104,115,28450.268
++0EzISdtKY48Z0GY62jer/LFQwrNIAbADdtU5xStGY=,0,14,32,5,8,8,266,186,65936.728
++38dVm0EHKCOfND6oEal3xFKtqJYZTZJmnpbAOOZBk=,0,17,27,14,2,0,63,102,18422.129


In [41]:
# I actually want to BEFORE I add the train dataset, because I've just summed churn as well... 
all_time_df.is_churn.unique().compute()

0      0
1     23
2     14
3      7
4     31
5      1
6     17
7      8
8     11
9     30
10     9
11    22
12    15
13    21
14    29
15    12
16     4
17    28
18    25
19    27
20    20
21    13
22    26
23    10
24    19
25    24
26     2
27     3
28    18
29    16
30     6
31     5
Name: is_churn, dtype: int64

In [43]:
# but before we go back and do that, let me make sure this worked. 
# I want to see the number of unique rows I have now:
print(f'{len(all_time_df.reset_index().msno.unique()):,}')

754,551


In [27]:
# for user_logs, I will create new "all time" columns that add up a user's metrics for each day that is recorded. 

In [None]:
# next step is to read back through, make sure everything makes sense, make it pretty, then combine all 4 datasets