# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# keep matplotlib interactive
%matplotlib notebook
# %matplotlib inline
# use ggplot style
plt.style.use('ggplot')

# Churn Prediction

Any subscription-based business is committed to keeping its customers happy in exchange for their loyalty. Despite these efforts, some customers will eventually not renew their subscription. If a customer has not renewed within a given time window after the subscription expiration date, this customer is said to have churned. Although each service provider offers several subscription options (monthly, yearly, basic, premium,...), the time window used to decide that a customer has churned varies quite a lot: each company seems to set its own.

This project is based on a [Kaggle competition](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) where KKBOX, a leading Asian __music streaming service__, is interested in learning from its customers' behavior to prevent them from churning.

The __grace period is 30 days__: a customer is considered to have churned when no new subscription is made within 30 days after the current membership ends. The goal is to determine churn from one month to the next.

In [3]:
# train.csv only contains the user id (msno, used as index) and the churn label (is_churn)
train_dir = os.path.join(os.pardir, 'data', 'raw', 'train.csv')
df_train = pd.read_csv(train_dir, index_col = 'msno')

In [4]:
# preview the first rows of the churn labels
df_train.head()

Unnamed: 0_level_0,is_churn
msno,Unnamed: 1_level_1
waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y=,1
QA7uiXy8vIbUSPOkCf9RwQ3FsT8jVq2OxDr8zqa7bRQ=,1
fGwBva6hikQmTJzrbz/2Ezjm5Cth5jZUNvXigKK2AFA=,1
mT5V8rEpa+8wuqi6x0DoVd3H5icMKkE9Prt49UlmK+4=,1
XaPhtGLk/5UvvOYHcONTwsnH97P4eGECeq+BARGItRw=,1


In [4]:
# index range, column dtypes and memory usage of the churn labels
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 992931 entries, waLDQMmcOu2jLDaV1ddDkgCrB/jl6sD66Xzs0Vqax1Y= to ZoVEIVMCpKT9/MJgqO+fh9xYrNM2vNisLhJ4xY/CVyk=
Data columns (total 1 columns):
is_churn    992931 non-null int64
dtypes: int64(1)
memory usage: 15.2+ MB


In [5]:
# _ , axChurn = plt.subplots(figsize=(10, 2))

# # one could histogram is_churn column directly but it doesn't look good
# # df_train.is_churn.plot(kind='hist', bins=[0, 1, 2], rwidth=0.5, align = 'left', ax=axChurn)

# # histogram via value_counts() then make bar plot, change ticks labels
# # churn_distrib = df_train.is_churn.value_counts()
# # churn_distrib.plot(kind='barh', ax=axChurn)
# # axChurn.set_yticklabels(('No Churn', 'Churn'), rotation = 0)
# ## plt.xticks(churn_distrib.index, ('No Churn', 'Churn'), rotation = 0)

In [6]:
# single stacked horizontal bar showing the share of non-churners vs churners
_ , axChurn = plt.subplots(figsize=(9, 2))
axChurn.set_position([0.05, 0.05, 0.9, 0.65])

# normalized counts of each is_churn value (fractions summing to 1)
churn_distrib = df_train['is_churn'].value_counts(normalize=True)
no_churn = churn_distrib.loc[0]

# first segment: no churn; second segment: churn, stacked to the right of the first
axChurn.barh(0, no_churn)
h_bar2 = axChurn.barh(0, churn_distrib.loc[1], left=no_churn)
axChurn.legend(['No Churn', 'Churn'], bbox_to_anchor=(0.5,1.4), loc='upper center')

# half of the bar thickness, used to span the vertical marker lines across the bar
bar_height = h_bar2.patches[-1].get_height()/2

# vertical markers with a label underneath: the 50% reference, then the no-churn boundary
markers = [
    (0.5, '50%', 'b', ':'),
    (no_churn, '{:.0f}%'.format(no_churn*100), 'k', '-'),
]
for x_pos, label, line_color, line_style in markers:
    axChurn.plot([x_pos, x_pos], [-bar_height, bar_height], color=line_color, linestyle=line_style)
    axChurn.text(x_pos, -bar_height*1.2, label)

axChurn.axis('off')

<IPython.core.display.Javascript object>

(0.0, 1.05, -0.44000000000000006, 0.44000000000000006)

# Data Description
There are 3 data files that can tell us about customer behavior.

                                                

|                                       transactions.csv                                        |
|-----------------------------------------------------------------------------------------------|
| msno                   | user id  (letters, digits and special characters)                    |
| payment_method_id      | payment method   (masked)                                            |
| payment_plan_days      | length of membership plan in days                                    |
| plan_list_price        | in New Taiwan Dollar (NTD)                                           |
| actual_amount_paid     | in New Taiwan Dollar (NTD)                                           |
| is_auto_renew          | true when customer opted in renewing its subscription automatically  |
| transaction_date       | format %Y%m%d                                                        |
| membership_expire_date | format %Y%m%d                                                        |
| is_cancel              | whether or not the user canceled the membership in this transaction. |



|              user_logs.csv                                              |
|-------------------------------------------------------------------------|
| msno       | user id  (letters, digits and special characters)          |
| date       | format %Y%m%d                                              |
| num_25     | # of songs played less than 25% of the song length         |
| num_50     | # of songs played between 25% to 50% of the song length    |
| num_75     | # of songs played between 50% to 75% of the song length    |
| num_985    | # of songs played between 75% to 98.5% of the song length  |
| num_100    | # of songs played over 98.5% of the song length            |
| num_unq    | # of unique songs played                                   |
| total_secs | total seconds played                                       |

|                                    members.csv                  |
|---------------------------------|-------------------------------|
| msno                            |user id  (letters, digits and special characters) |
| city                            |          name of city         |
| bd                              | age$^i$                       |
| gender                          |            gender             |
| registered_via                  |     registration method       |
| registration_init_time          |          format %Y%m%d        |
| expiration_date                 | format %Y%m%d $^{ii}$         |

$^i$Note: this column has outlier values ranging from -7000 to 2015; please use your judgement.

$^{ii}$Note: taken as a snapshot at the time members.csv was extracted. It does not represent the actual churn behavior.

### Transactions

In [2]:
# transactions are indexed by user id (msno); both date columns are parsed to datetimes
transaction_dir = os.path.join(os.pardir, 'data', 'raw', 'transactions.csv')
df_transac = pd.read_csv(
    transaction_dir,
    index_col='msno',
    parse_dates=['transaction_date', 'membership_expire_date'],
)

In [3]:
# number of non-null values in each column
df_transac.count()

payment_method_id         21547746
payment_plan_days         21547746
plan_list_price           21547746
actual_amount_paid        21547746
is_auto_renew             21547746
transaction_date          21547746
membership_expire_date    21547746
is_cancel                 21547746
dtype: int64

In [4]:
# treat payment_method_id as a category
df_transac.payment_method_id = df_transac.payment_method_id.astype('category')
# convert relevant columns to boolean
df_transac.is_auto_renew = df_transac.is_auto_renew.astype(bool)
df_transac.is_cancel = df_transac.is_cancel.astype(bool)

In [5]:
# inspect column dtypes and memory usage of the transactions
df_transac.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21547746 entries, YyO+tlZtAXYXoZhNr3Vg3+dfVQvrBVGO8j1mfqe4ZHc= to oE4y2wK5E7OR8zyrCHeW02uTeI6wTwT4QTApEVBNEdM=
Data columns (total 8 columns):
payment_method_id         category
payment_plan_days         int64
plan_list_price           int64
actual_amount_paid        int64
is_auto_renew             bool
transaction_date          datetime64[ns]
membership_expire_date    datetime64[ns]
is_cancel                 bool
dtypes: bool(2), category(1), datetime64[ns](2), int64(3)
memory usage: 1.0+ GB


In [38]:
def plot_bar(df, bar_list, is_Norm=True):
    """
        Make one bar plot per column of bar_list showing how often each value occurs.

        df: pandas DataFrame
        bar_list: non-empty list of column names of df
        is_Norm: when True (default) bars show the percentage of rows per value,
                 when False they show the raw number of rows per value

        A single column is drawn alone on a 9x5 figure; several columns are laid
        out on a two-column grid, hiding the last axes when the count is odd.
        Returns None.
        Raises ValueError when bar_list is empty.
    """
    # local import: the notebook only imports matplotlib.pyplot at the top
    from matplotlib.ticker import FormatStrFormatter

    if len(bar_list) == 0:
        raise ValueError('bar_list must contain at least one column name')

    subrows = int(np.ceil(len(bar_list) / 2))
    discard_last_ax = bool(len(bar_list) % 2)
    # special case when there is only one column specified
    if subrows == 1 and discard_last_ax:
        fig, ax = plt.subplots(figsize=(9,5))
        ax = [ax]
    else:
        fig, ax = plt.subplots(subrows, 2)
        ax = ax.flatten()

    for k, colname in enumerate(bar_list):

        # count occurrences of each value, ordered by value
        distrib = df[colname].value_counts(normalize = is_Norm).sort_index()
        # fractions become percentages; raw counts must stay untouched
        if is_Norm:
            distrib = distrib * 100
        distrib.plot(kind='bar', ax=ax[k], color='b')

        # add percent character to y tick labels if plot normalized; a formatter
        # (rather than fixed label strings) stays correct when the axis is rescaled
        if is_Norm:
            ax[k].yaxis.set_major_formatter(FormatStrFormatter('%.0f%%'))

        # add title; use the column name directly because the name carried by the
        # value_counts result depends on the pandas version
        ax[k].set_title('{} Distribution'.format(colname))

        # reset x axis label to 45 rotation
        ax[k].tick_params(axis = 'x', rotation = 45)

    # an odd number of columns on the grid leaves the last axes empty: hide it
    if discard_last_ax and subrows > 1:
        ax[-1].set_visible(False)
    # apply the layout to the figure created here, not to whichever figure is current
    fig.tight_layout()

### Payment ID method
Payment ID 41 represents more than 50% of transactions

In [39]:
# share of transactions per payment method (normalized by default)
plot_bar(df_transac, ['payment_method_id'])

<IPython.core.display.Javascript object>

### Payment Plan in days
There is a lot more variety in subscriptions than I expected
From [KKBOX website](https://help.kkbox.com/hk/zh-tw/billing/pay-types/260), here are the options:
1. Options are monthly plan (30/31 days) with automatic renewal (is_auto_renew true) 
2. Single purchase, no automatic renewal but more expensive:
    3. 30 days
    4. 90 days (plus 7 days bonus?)
    5. 180 days (plus 21 days bonus?)
    6. 365 days (plus 50 days bonus?)
There are discount programs too (getting a credit card with their partners and so on)

0 days doesn't make sense. Because it is low percentage, we can discard it.

We may want to keep only plans shorter than 30/31 days as predictions are made on a monthly basis.
Indeed, a longer subscription means the customer will not churn next month.

In [13]:
# share of transactions per plan length in days (normalized by default)
plot_bar(df_transac, ['payment_plan_days'])

<IPython.core.display.Javascript object>

### Auto-renew and active cancellation
People who have auto renewal active also don't cancel actively. It is consistent with the fact that most customers don't churn.

In [14]:
# side-by-side distributions of the auto-renew and cancellation flags
plot_bar(df_transac, ['is_auto_renew' , 'is_cancel'])

<IPython.core.display.Javascript object>

### Plan list price
Plan prices are rather discrete therefore we could categorize those for analysis.<br>
Notice some transactions were free probably due to trial period or initial offering.<br>
PS: There are entries greater than 250 NTD (~1.5% of all transactions), which correspond to plans longer than XXX days.<br>

In [43]:
# summary statistics of the plan list price
df_transac.plan_list_price.describe()

count    2.154775e+07
mean     1.398850e+02
std      1.309647e+02
min      0.000000e+00
25%      9.900000e+01
50%      1.490000e+02
75%      1.490000e+02
max      2.000000e+03
Name: plan_list_price, dtype: float64

In [50]:
# normalized histogram of list prices restricted to 0-250, using 250 equal-width bins
_ , axListPrice = plt.subplots()
df_transac.plan_list_price.plot.hist(bins=250, range=(0, 250), density=True, ax=axListPrice)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1e4b2cc09b0>

### Subscription cost per subscription length
How does subscription length scale with list price? <br>
First, let's group plan duration by 0 - 7, 8 - 29, 30 - 89, 90 - 179, 180 - 364, 365 - 450


In [6]:
# create custom intervals (bin edges will be left inclusive) in increasing order
days_plan = [0, 8, 30, 90, 180, 365, 450 ]

# compile labels
days_plan_upperbounds = [d-1 for d in days_plan[1:-1] ]
days_plan_upperbounds.append(days_plan[-1])
days_plan_labels = [ "{} - {}".format(l,u) for l,u in zip(days_plan[:-1], days_plan_upperbounds) ]

print('Bin edges = {}'.format(days_plan))
print('Associated labels = {}'.format(days_plan_labels))

Bin edges = [0, 8, 30, 90, 180, 365, 450]
Associated labels = ['0 - 7', '8 - 29', '30 - 89', '90 - 179', '180 - 364', '365 - 450']


In [7]:
# create new column with plan duration category
df_transac['plan_duration'] = pd.cut(df_transac.payment_plan_days, days_plan, right=False, labels=days_plan_labels)

In [14]:
# two stacked axes; the top one shows the share of transactions per plan duration bin
_ , axCostDays = plt.subplots(nrows=2, ncols=1)
plan_duration_distrib = df_transac['plan_duration'].value_counts(normalize=True).sort_index()
top_ax = axCostDays[0]
plan_duration_distrib.plot.bar(ax=top_ax, color='b', alpha=0.5)
# keep the bin labels horizontal
top_ax.tick_params(axis='x', rotation=0)

<IPython.core.display.Javascript object>

In [86]:
# list price per plan duration bin, split by the auto-renew flag, drawn on the second axes
sns.stripplot(
    data=df_transac,
    x="plan_duration", y="plan_list_price", hue='is_auto_renew',
    dodge=True, jitter=True, alpha=0.25,
    ax=axCostDays[1],
)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1e4975f0c18>

### List price vs paid price
About 10% of transactions do not have a discount.

In [17]:
# percent difference between plan price and price actually paid
discount_percent = 1 - (df_transac.actual_amount_paid/df_transac.plan_list_price)

# plot histogram of discounts
_ , axDiscount = plt.subplots()
discount_percent.plot(kind = 'hist', range = (-1, 1), bins = 20, ax = axDiscount, density = True)

### Transaction and expiration dates

In [18]:
# compute difference in time between expiration date and transaction date
deltaTransacDate = df_transac.membership_expire_date - df_transac.transaction_date

# convert timedelta object to days (result cast to int)
deltaTransacDate = deltaTransacDate.astype('timedelta64[D]')
# convert timedelta object to days (keep decimals)
# deltaTransacDate = deltaTransacDate/pd.Timedelta('1 day')
# deltaTransacDate = deltaTransacDate/np.timedelta64(1, 'D')

# histogram days
_ , axTransacDate = plt.subplots()
deltaTransacDate.plot(kind = 'hist', range= (-30, 30), bins = 60, normed = True, ax = axTransacDate)

# NOTE: some unrealistic out of range values like -17599

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1e482fa0898>

In [None]:
# add is_churn to df_transac by merging
df_transac.join(df_train)
pd.merge(df_transac, df_train, how='inner', on = 'msno')

### User_log

In [None]:
user_log_dir = os.path.join(os.pardir, 'data', 'raw', 'user_logs.csv')
# number of rows per chunk: the log file is read piece by piece
Num_rows = int(20e6)
# only the num_* play counters are loaded, since they are the only columns histogrammed.
# index_col='msno' and parse_dates=['date'] were removed: both name columns that
# usecols excludes, which makes read_csv raise a ValueError
reader_iter = pd.read_csv(user_log_dir, chunksize=Num_rows,
        usecols = [ 'num_25', 'num_50', 'num_75', 'num_985', 'num_100'])

In [None]:
# bin edges must be defined before they are used by the histogram below
Nbin = 1000
bin_edges = np.linspace( 0, Nbin, num=Nbin+1)

# NOTE: get_chunk advances reader_iter, so any later iteration over reader_iter
# starts from the chunk that follows this one
df_chunk = reader_iter.get_chunk(Num_rows)
# histogram all columns separately, np.histogram return histo and bin edges, only need histo
# apply columnwise (column by column or along rows)
hist_num = np.apply_along_axis(lambda a: np.histogram(a, bins=bin_edges)[0], 0, df_chunk)

In [None]:
# compile bin edges to histogram
Nbin = 1000
bin_edges = np.linspace( 0, Nbin, num=Nbin+1)
hist_num_tot = np.array(np.zeros((Nbin,5)))

for df_chunk in reader_iter:
    
    # histogram all log for num_x
#     dfhistoNum = df_chunk.drop(labels='msno', axis=1)
    hist_num = np.apply_along_axis(lambda a: np.histogram(a, bins=bin_edges)[0], 0, dfhistoNum)
    hist_num_tot = hist_num_tot + hist_num