In [1]:
import numpy as np 
import pandas as pd
from datetime import datetime
from functools import reduce

In [2]:
# Input directory, source file name and name of the feature file to write.
INPUT_PATH = "../../kkbox-churn-prediction-challenge/50_under_sample/"
FILE_NAME = "test_transactions.csv"
OUTPUT_NAME = "test_transactions_transformed.csv"

# Load the raw transaction rows (one row per transaction).
x_transaction = pd.read_csv(f"{INPUT_PATH}{FILE_NAME}")

In [3]:
# Show the first row to check which columns were loaded.
x_transaction.head(n=1)

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,3faY9Bx/5LqsIytjwgYau4yRCWm+HD9kmBYva5p4JW4=,21,30,149,149,1,2015-12-02T00:00:00,2016-01-08T00:00:00,0


In [4]:
# Define some commonly used functions
def most_common(x):
    """Return the most frequent value of a Series.

    When several values tie, the first entry of ``Series.mode()`` is
    returned. If the Series has no mode (empty, or nothing but missing
    values), NaN is returned instead of raising, so a single user without
    usable values cannot abort a whole ``groupby().agg()`` run.

    The function is a regular ``def`` so that pandas labels the aggregated
    column ``most_common`` without patching ``__name__`` by hand.
    """
    modes = x.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]

def pcet_of_zero(x):
    """Return one minus the mean of ``x``.

    For a column that only holds 0 and 1 this is the share of zeros.
    """
    average = x.mean()
    return 1 - average
def change_or_not(x):
    """Return the number of distinct non-null values in ``x`` minus one.

    The result is 0 when the value never changed and grows by one for every
    additional distinct value.
    """
    distinct_values = x.nunique()
    return distinct_values - 1
def find_positive_pct(x):
    """Return the share of non-null entries of ``x`` that are greater than 0."""
    positives = x.gt(0).sum()
    observed = x.count()  # count() ignores missing values
    return positives / observed
def binary_has(x):
    """Return True when at least one entry of ``x`` is greater than 0.

    Counting distinct values (``nunique() > 1``) is not used here because it
    reports False whenever all entries are equal, even if every one of them
    is positive (for example a user whose only row carries the flag 1).

    NOTE: feature files produced with a different definition of this flag
    are not comparable; apply the same definition to every data split.
    """
    # Missing values compare as False, so they never set the flag.
    return bool((x > 0).any())

In [5]:
# Build flat column names for an aggregated frame
def change_name(df):
    """Return flat names for the (possibly two-level) columns of ``df``.

    Each column label is turned into one string by joining its non-empty
    parts with ``_``; e.g. ``('price', 'mean')`` becomes ``'price_mean'`` and
    ``('msno', '')`` becomes ``'msno'``. Labels that are not tuples are
    returned unchanged.

    The names are read from the column labels themselves, in column order,
    rather than from ``df.columns.levels``: the levels only list the distinct
    values of each level and their order need not match the column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed; it is not modified.

    Returns
    -------
    list
        One name per column, in column order, suitable for assignment to
        ``df.columns``.
    """
    flat_names = []
    for label in df.columns:
        if isinstance(label, tuple):
            parts = [str(part) for part in label if part != ""]
            flat_names.append("_".join(parts))
        else:
            flat_names.append(label)
    return flat_names

# Flatten aggregated column labels in place
def change_name_2(df):
    """Replace the columns of ``df`` with ``_``-joined labels and return ``df``.

    Every column label is joined with ``_``; the resulting name ``msno_``
    (the key column, whose second level is empty) is then shortened to
    ``msno``. The frame passed in is modified and the same object is
    returned.
    """
    flattened = []
    for label in df.columns:
        joined = "_".join(label)
        flattened.append('msno' if joined == 'msno_' else joined)
    df.columns = flattened
    return df

In [6]:
def transaction_features(df, churn_window_start=datetime(2017, 2, 1),
                         churn_window_end=datetime(2017, 2, 28)):
    """Collapse raw transaction rows into one feature row per user (``msno``).

    Row-level columns are derived first (discount, free flag, amount paid per
    plan day, membership length, day gaps between a user's consecutive
    transactions / expiry dates, potential-churn flag) and are then
    aggregated per ``msno``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction log with the columns ``msno``, ``payment_method_id``,
        ``payment_plan_days``, ``plan_list_price``, ``actual_amount_paid``,
        ``is_auto_renew``, ``transaction_date``, ``membership_expire_date``
        and ``is_cancel``. The frame passed in is not modified.
    churn_window_start, churn_window_end : datetime, optional
        Inclusive bounds of the expiry window used for ``potential_churn``.
        The defaults are 2017-02-01 and 2017-02-28.

    Returns
    -------
    pandas.DataFrame
        One row per ``msno``. Aggregates are named
        ``<source column>_<aggregation>``; ``membership_days`` is the sum of
        the positive expiry-date gaps plus the length of the user's first
        transaction (``d_expire_date_sum`` and ``length_first`` are dropped
        after being combined).

    Notes
    -----
    * ``discount_amount`` keeps its sign (it is negative when more than the
      list price was paid); ``discount_positive`` floors it at 0.
    * Rows with ``payment_plan_days == 0`` get ``amtperday = 0``.
    * ``d_transaction`` / ``d_expire_date`` are NaN for a user's first row
      and for non-positive gaps; their aggregates are filled with 0.
    * Feature files built with a different version of this function are not
      comparable; use the same version for every data split.
    """
    # Parse the dates before sorting so that the order is chronological even
    # when the input still holds date strings. assign() returns a new frame,
    # so the caller's data is left untouched.
    df = df.assign(
        transaction_date=pd.to_datetime(df['transaction_date']),
        membership_expire_date=pd.to_datetime(df['membership_expire_date']),
    ).sort_values(by=['msno', 'transaction_date'])

    # ---- row-level features -------------------------------------------
    df['discount_amount'] = df['plan_list_price'] - df['actual_amount_paid']
    df['discount_positive'] = np.where(df['discount_amount'] >= 0, df['discount_amount'], 0)

    df['free'] = np.where(df['actual_amount_paid'] == 0, 1, 0)

    # Amount paid per plan day. A zero-day plan makes the ratio inf, -inf or
    # NaN (0/0); all of those are replaced by 0.
    paid_per_day = df['actual_amount_paid'] / df['payment_plan_days']
    df['amtperday'] = paid_per_day.where(df['payment_plan_days'] != 0, 0)

    # Days between the transaction and the membership expiry it buys.
    df['length'] = (df['membership_expire_date'] - df['transaction_date']).dt.days

    # Day gaps between consecutive rows. The diff is taken within each user
    # so that a user's first row is never compared with the previous user's
    # last row; non-positive gaps are treated as missing.
    for source, target in (('transaction_date', 'd_transaction'),
                           ('membership_expire_date', 'd_expire_date')):
        gap_days = df.groupby('msno')[source].diff().dt.days
        df[target] = gap_days.where(gap_days > 0)

    # Potential churn: not on auto-renew and expiring inside the window.
    expires_in_window = df['membership_expire_date'].between(churn_window_start,
                                                             churn_window_end)
    df['potential_churn'] = np.where((df['is_auto_renew'] == 0) & expires_in_window, 1, 0)

    # ---- per-user aggregation -----------------------------------------
    # (source column, aggregations, fill missing aggregates with 0?)
    feature_specs = [
        ('payment_method_id', ['nunique', most_common], False),
        ('payment_plan_days', ['nunique', most_common], False),
        ('plan_list_price', ['nunique', most_common, 'mean', 'std'], False),
        ('actual_amount_paid', ['nunique', most_common, 'mean', 'std'], False),
        ('is_auto_renew', [pcet_of_zero, change_or_not, most_common], False),
        ('is_cancel', ['mean', change_or_not], False),
        ('discount_amount', [find_positive_pct, 'mean', 'sum'], True),
        ('discount_positive', [binary_has], False),
        ('free', ['mean', binary_has], True),
        ('length', ['mean', 'sum', 'std', 'first'], True),
        ('amtperday', ['mean', 'std'], True),
        ('transaction_date', ['first'], False),
        ('d_transaction', ['mean', 'std'], True),
        ('d_expire_date', ['mean', 'std', 'sum'], True),
        ('potential_churn', [binary_has], False),
    ]

    per_user = df.groupby('msno', as_index=False)
    data_frames = []
    for column, aggregations, fill_zero in feature_specs:
        aggregated = per_user.agg({column: aggregations})
        if fill_zero:
            aggregated = aggregated.fillna(0)
        # change_name_2 reads the names from the actual column labels, so the
        # result is always "<column>_<aggregation>" in column order.
        data_frames.append(change_name_2(aggregated))

    # Every frame holds exactly one row per msno, so the inner joins keep all users.
    df_merged = reduce(lambda left, right: pd.merge(left, right, on=['msno'], how='inner'),
                       data_frames)

    # Total membership days: expiry extensions plus the first purchase's length.
    df_merged['membership_days'] = df_merged['d_expire_date_sum'] + df_merged['length_first']
    df_merged = df_merged.drop(['d_expire_date_sum', 'length_first'], axis=1)

    return df_merged

In [7]:
# Build the per-user feature table from the raw transactions.
transaction_transformed = transaction_features(df=x_transaction)

In [8]:
# Write the features into the input directory; index=False keeps the row index out of the file.
transaction_transformed.to_csv(f"{INPUT_PATH}{OUTPUT_NAME}", index=False)