In [4]:
import pandas as pd
import numpy as np

In [5]:
# Read the transactions CSV into a DataFrame.
x_transaction = pd.read_csv(
    '../../kkbox-churn-prediction-challenge/under_sample/X_val_transactions.csv'
)

In [7]:
# Discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
x_transaction = x_transaction.drop(columns=['Unnamed: 0'])

In [9]:
# Display (rows, columns) as a quick sanity check on the loaded frame.
x_transaction.shape

(110700, 9)

In [10]:
## Discount_amount
# Discount = list price minus the amount actually paid. Negative values mean
# the user paid more than the recorded list price.
x_transaction['discount_amount'] = x_transaction['plan_list_price'].sub(
    x_transaction['actual_amount_paid']
)
# Show the distinct discount values present in the data.
x_transaction['discount_amount'].unique()

array([    0,  -129,  -149,    30,  -119,   149,  -894,  -150,    20,
         180,  -131, -1788,    50,  -100,  -447,  -134,   -35,  -480,
        -500,  -799,   120])

In [11]:
# change "plan_list_price"
# Where the discount is negative (paid more than the list price), overwrite
# the list price with the amount actually paid; other rows are left as-is.
x_transaction['plan_list_price'] = x_transaction['plan_list_price'].mask(
    x_transaction['discount_amount'] < 0,
    x_transaction['actual_amount_paid'],
)

In [12]:
# change "payment_plan_days"
# Rule table: (discount_amount values, payment_plan_days to assign).
# Rules are evaluated in order; rows matching no rule keep their current
# payment_plan_days.
# NOTE(review): -500 appears among the discount values displayed earlier but
# has no rule here, so such rows keep their original payment_plan_days.
days_by_discount = [
    ([-35], 7),
    ([-149, -129, -119, -150, -100, -134, -131], 30),
    ([-300], 60),
    ([-400], 80),
    ([-480, -447, -450], 90),
    ([-894, -930, -799, -536], 180),
    ([-1788], 365),
]
conditions = [
    x_transaction['discount_amount'].isin(values) for values, _ in days_by_discount
]
choices = [days for _, days in days_by_discount]
x_transaction['payment_plan_days'] = np.select(
    conditions,
    choices,
    x_transaction['payment_plan_days'],
)

In [14]:
# Among rows with a negative discount, count how many have a positive
# payment_plan_days (True) versus not (False). Rows counted under False
# were not covered by any of the discount -> plan-length rules.
x_transaction.loc[
    x_transaction['discount_amount'] < 0, 'payment_plan_days'
].gt(0).value_counts()

True     4738
False       1
Name: payment_plan_days, dtype: int64

In [18]:
# Rows with a negative discount whose payment_plan_days is still 0.
to_delete = x_transaction.loc[
    x_transaction['discount_amount'].lt(0)
    & x_transaction['payment_plan_days'].eq(0)
]

In [22]:
# Remove the rows collected in `to_delete`. The result has to be bound back
# to `x_transaction` (the previous spelling `x_trainsaction` created an unused
# variable, so every later cell kept working on the unfiltered frame).
x_transaction = x_transaction.drop(to_delete.index)

In [24]:
# change the negative discount to be 0
# (non-negative values are left untouched)
x_transaction['discount_amount'] = x_transaction['discount_amount'].clip(lower=0)

In [25]:
# get a new column called free
# 1 when nothing was paid for the transaction, 0 otherwise.
x_transaction['free'] = x_transaction['actual_amount_paid'].eq(0).astype('int64')

In [26]:
# Order rows by user, then by transaction date (ascending), so that each
# user's transactions are contiguous and chronological.
x_transaction = x_transaction.sort_values(
    ['msno', 'transaction_date'],
    ascending=True,
)

In [27]:
# days between transaction & expiration
# Both date columns are parsed from YYYYMMDD values into datetimes first.
x_transaction['membership_expire_date'] = pd.to_datetime(
    x_transaction['membership_expire_date'], format="%Y%m%d"
)
x_transaction['transaction_date'] = pd.to_datetime(
    x_transaction['transaction_date'], format="%Y%m%d"
)
x_transaction['length'] = (
    x_transaction['membership_expire_date'] - x_transaction['transaction_date']
).dt.days

# the average amount paid by user in each payment (amount per plan day)
# A plan length of 0 would make the division yield +/-inf, which survives the
# later `.fillna(0)` and turns the per-user mean into inf. Treat a zero-length
# plan as "undefined" (NaN) instead so it is skipped by the mean.
x_transaction['amtperday'] = (
    x_transaction['actual_amount_paid']
    / x_transaction['payment_plan_days'].replace(0, np.nan)
)

In [28]:
# get the consecutive difference from rows
# Days elapsed since the same user's previous transaction. The diff is taken
# within each 'msno' group: a plain row-wise diff would compare a user's first
# transaction with the previous user's last one and leak a meaningless gap
# into the feature. Each user's first row has no predecessor and gets 0.
# Relies on the frame already being ordered by ['msno', 'transaction_date'].
# `.dt.days` is used for the timedelta -> day-count conversion, as for 'length'.
# NOTE(review): if training features were produced with the row-wise diff,
# regenerate them the same way so both sets stay consistent.
x_transaction["d_transaction"] = (
    x_transaction.groupby("msno")["transaction_date"]
    .diff()
    .dt.days
    .fillna(0)
    .astype(int)
    .clip(lower=0)  # keep the original guard: never report a negative gap
)

In [29]:
# same for expire_date
# Days by which the membership expiry moved relative to the same user's
# previous transaction. The diff is taken within each 'msno' group so a user's
# first row is not compared against the previous user's expiry date; that
# first row gets 0. Negative moves (expiry pulled earlier) are clamped to 0.
# `.dt.days` is used for the timedelta -> day-count conversion, as for 'length'.
# NOTE(review): if training features were produced with the row-wise diff,
# regenerate them the same way so both sets stay consistent.
x_transaction["d_expire_date"] = (
    x_transaction.groupby("msno")["membership_expire_date"]
    .diff()
    .dt.days
    .fillna(0)
    .astype(int)
    .clip(lower=0)
)

In [30]:
#potential churn
from datetime import datetime

# Window (inclusive on both ends) in which an expiry is considered.
startdate = datetime(2017, 2, 1)
enddate = datetime(2017, 2, 28)

# 1 when the transaction is not auto-renewing AND its membership expires
# inside [startdate, enddate]; 0 otherwise.
x_transaction['potential_churn'] = np.where(
    x_transaction['is_auto_renew'].eq(0)
    & x_transaction['membership_expire_date'].between(startdate, enddate),
    1,
    0,
)

### Groupby

In [31]:
# define some commonly used functions
def most_common(x):
    """Return the most frequent value in ``x``.

    When several values tie, the smallest one is returned (``Series.mode``
    returns its modes sorted). If ``x`` has no non-null values the mode is
    empty and NaN is returned instead of raising.

    Parameters
    ----------
    x : pandas.Series
        Values of one column for a single group.

    Returns
    -------
    scalar
        The modal value, or NaN when there is none.
    """
    modes = x.mode()
    # Positional access: an empty result must not be indexed by label 0.
    return modes.iloc[0] if len(modes) else np.nan
def max_min_diff(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    lowest, highest = x.min(), x.max()
    return highest - lowest
def pcet_of_zero(x):
    """Return one minus the mean of ``x``.

    For a column holding only 0/1 values this equals the fraction of zeros.
    """
    average = x.mean()
    return 1 - average
def change_or_not(x):
    """Return the number of distinct non-null values in ``x`` minus one.

    The result is 0 when ``x`` holds a single distinct value and positive
    when it holds more than one.
    """
    distinct = x.nunique()
    return distinct - 1
def find_positive_pct(x):
    """Return the share of non-null entries in ``x`` that are greater than 0."""
    positives = x.gt(0).sum()
    observed = x.count()  # non-null entries only
    return positives / observed
def has_discount(x):
    """Return True when ``x`` contains more than one distinct non-null value.

    Note that this tests for variation, not for the presence of a non-zero
    value: a series whose entries are all the same (even all non-zero)
    yields False.
    """
    distinct_values = x.nunique()
    return distinct_values > 1

# def change column name
def chagne_name(df):
    """Build flat column names for a frame with two-level columns.

    Each column ``(top, sub)`` becomes ``"top_sub"``; a column whose second
    level is the empty string (the group key, e.g. ``('msno', '')``) keeps
    just ``top``. Names are derived from the column tuples themselves, in
    column order, rather than from ``MultiIndex.levels``, whose ordering is
    not guaranteed to match the order of the columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` is a two-level MultiIndex.

    Returns
    -------
    list of str
        One flat name per column, in column order. ``df`` is not modified.
    """
    return [
        top if sub == "" else "{}_{}".format(top, sub)
        for top, sub in df.columns
    ]

# def change column name 2:
def change_name_2(df):
    """Flatten the two-level columns of ``df`` in place and return ``df``.

    Every column tuple is joined with "_"; the resulting name 'msno_'
    (the group key paired with an empty second level) is renamed to 'msno'.
    """
    flat_names = []
    for parts in df.columns:
        joined = "_".join(parts)
        flat_names.append('msno' if joined == 'msno_' else joined)
    df.columns = flat_names
    return df

In [None]:
# payment_method
# Per user: number of distinct payment methods and the most frequent one.
payment_method = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'payment_method_id': ['nunique', most_common]})
)
payment_method.columns = chagne_name(payment_method)

In [None]:
# payment_plan_days
# Per user: number of distinct plan lengths and the most frequent one.
payment_plan_days = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'payment_plan_days': ['nunique', most_common]})
)
payment_plan_days.columns = chagne_name(payment_plan_days)

In [None]:
# plan_list_price
# Per user: distinct count, most frequent value, mean and max-min spread.
plan_list_price = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'plan_list_price': ['nunique', most_common, 'mean', max_min_diff]})
)
plan_list_price.columns = chagne_name(plan_list_price)

In [None]:
# actual_amount_paid
# Per user: distinct count, most frequent value, mean and max-min spread.
actual_amount_paid = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'actual_amount_paid': ['nunique', most_common, 'mean', max_min_diff]})
)
actual_amount_paid.columns = chagne_name(actual_amount_paid)

In [None]:
# is_auto_renew
# Per user: 1 - mean of the flag, (distinct values - 1), and the most
# frequent value.
is_auto_renew = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'is_auto_renew': [pcet_of_zero, change_or_not, most_common]})
)
is_auto_renew.columns = chagne_name(is_auto_renew)

In [None]:
# is_cancel
# Per user: mean of the flag and (distinct values - 1).
is_cancel = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'is_cancel': ['mean', change_or_not]})
)
is_cancel.columns = chagne_name(is_cancel)

In [None]:
# discount_amount
# Per user: share of positive discounts, mean, total, and whether the
# discount takes more than one distinct value.
discount_amount = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'discount_amount': [find_positive_pct, 'mean', 'sum', has_discount]})
)
discount_amount.columns = chagne_name(discount_amount)

In [None]:
# free
# Per user: mean of the free flag and whether it takes more than one
# distinct value.
free = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'free': ['mean', has_discount]})
)
free.columns = chagne_name(free)

In [None]:
# length
# Per user: mean, total, standard deviation and first value of 'length';
# missing results (e.g. std of a single row) are filled with 0.
length = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'length': ['mean', 'sum', 'std', 'first']})
    .fillna(0)
)
length = change_name_2(length)

In [None]:
# amtperday
# Per user: mean amount paid per plan day; missing results are filled with 0.
amtperday = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'amtperday': ['mean']})
    .fillna(0)
)
amtperday = change_name_2(amtperday)

In [None]:
# transaction_date
# Per user: the first transaction date in the frame's current row order.
transaction_date = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'transaction_date': ['first']})
)
transaction_date = change_name_2(transaction_date)

In [None]:
# d_transaction
# Per user: mean and standard deviation of the gap between transactions;
# missing results (e.g. std of a single row) are filled with 0.
d_transaction = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'d_transaction': ['mean', 'std']})
    .fillna(0)
)
d_transaction = change_name_2(d_transaction)

In [None]:
# d_expire_date
# Per user: mean, standard deviation and total of the expiry-date shifts;
# missing results (e.g. std of a single row) are filled with 0.
d_expire_date = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'d_expire_date': ['mean', 'std', 'sum']})
    .fillna(0)
)
d_expire_date = change_name_2(d_expire_date)

In [None]:
# potential_churn
# Per user: whether the potential_churn flag takes more than one distinct
# value across that user's transactions.
potential_churn = (
    x_transaction
    .groupby('msno', as_index=False)
    .agg({'potential_churn': [has_discount]})
)
potential_churn.columns = chagne_name(potential_churn)

In [None]:
# Per-user feature tables, in the order they are merged.
data_frames = [
    payment_method,
    payment_plan_days,
    plan_list_price,
    actual_amount_paid,
    is_auto_renew,
    is_cancel,
    discount_amount,
    free,
    length,
    amtperday,
    transaction_date,
    d_transaction,
    d_expire_date,
    potential_churn,
]

In [None]:
from functools import reduce

# Fold the feature tables left to right, inner-joining each one on 'msno'.
df_merged = reduce(
    lambda left, right: left.merge(right, on=['msno'], how='inner'),
    data_frames,
)

In [None]:
# get total membership days:
# sum of the expiry-date shifts plus the length of the first transaction.
df_merged['membership_days'] = df_merged['d_expire_date_sum'].add(
    df_merged['length_first']
)
# The two inputs are only needed to build membership_days; drop them.
df_merged = df_merged.drop(columns=['d_expire_date_sum', 'length_first'])