In [3]:
import pandas as pd
import numpy as np
from itertools import product
from datetime import datetime,date, timedelta
from google.cloud import storage
from google.cloud.storage import Blob
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
# Which data split this run of the notebook processes.
split="val" # Use "train", "val" or "test"

# Path prefix of the input CSVs; pandas reads them directly through gs:// URLs.
INPUT_PATH = "gs://kkbox-data/50_pct_undersample/"

# GCS client and bucket handle; the bucket is used when the padded array is
# uploaded at the end of the notebook.
client = storage.Client(project="ds5500")
bucket = client.get_bucket("kkbox-data")

# Process Transactions

In [4]:
# Read this split's transactions and its `*_members_transformed.csv` member table.
transactions_path = INPUT_PATH + "{}_transactions.csv".format(split)
members_path = INPUT_PATH + "{}_members_transformed.csv".format(split)
transactions = pd.read_csv(transactions_path)
members_df = pd.read_csv(members_path)

# Independent copy of the (user id, churn label) pairs.
y = members_df[["msno", "is_churn"]].copy()

In [5]:
# Keep only transactions whose user (msno) also appears in members_df.
# Membership is tested directly with isin rather than by first building a
# Python set difference; the same rows are kept.
is_known_user = transactions["msno"].isin(members_df["msno"])

# Users present in transactions but absent from members_df (kept for inspection).
msno_diff = transactions.loc[~is_known_user, "msno"].unique().tolist()

# .copy() hands later cells an independent frame, so adding or overwriting
# columns on it does not trigger SettingWithCopyWarning.
transactions = transactions.loc[is_known_user].copy()

In [6]:
# Parse the transaction date into a datetime column (the monthly key is later
# derived from it with the .dt accessor).
# Plain column assignment replaces the column together with its dtype, whereas
# `.loc[:, col] = ...` writes into the existing column in place on pandas >= 2.0
# and can leave it as object dtype.
# NOTE(review): no explicit format is passed, so this assumes the stored values
# are something pd.to_datetime infers correctly (e.g. ISO date strings). Raw
# YYYYMMDD integers would need format="%Y%m%d" — confirm against the CSV.
transactions["transaction_date"] = pd.to_datetime(transactions["transaction_date"])

# Discard the columns this notebook does not use.
transactions = transactions.drop(["payment_method_id","plan_list_price","membership_expire_date"],axis=1)

In [7]:
# Relabel the columns by position.
# NOTE(review): the assignment is positional, so it assumes the frame holds
# exactly these six columns in this order. Presumably the only real change is
# transaction_date -> date; confirm against the input file.
renamed_columns = [
    'msno',
    'payment_plan_days',
    'actual_amount_paid',
    'is_auto_renew',
    'date',
    'is_cancel',
]
transactions.columns = renamed_columns
transactions.columns

Index(['msno', 'payment_plan_days', 'actual_amount_paid', 'is_auto_renew',
       'date', 'is_cancel'],
      dtype='object')

### Log Normalize

In [8]:
std_scaler = preprocessing.StandardScaler()
max_scaler = preprocessing.MinMaxScaler()


def _log_standardize_clip_rescale(values):
    """Return `values` as an (n, 1) array after log, z-score, clip and min-max steps.

    Steps: natural log (with a .001 offset so zeros stay finite), standardize
    with `std_scaler`, cap at 2 on the standardized scale, then rescale with
    `max_scaler` (MinMaxScaler defaults). Both scalers are re-fit on every call.
    """
    logged = np.log(values.reshape(-1, 1) + .001)
    standardized = std_scaler.fit_transform(logged)
    # a_min is the column's own minimum, so only the upper tail is clipped.
    capped = np.clip(standardized, a_min=standardized.min(), a_max=2)
    return max_scaler.fit_transform(capped)


# NOTE(review): the scalers are fit on whichever split is loaded here, so
# train/val/test are each scaled with their own statistics — confirm intended.
for col in ["payment_plan_days","actual_amount_paid"]:
    transactions[col + "_norm"] = _log_standardize_clip_rescale(transactions[col].values)

In [9]:
# The raw payment columns now have *_norm counterparts; remove the originals.
raw_payment_columns = ["payment_plan_days", "actual_amount_paid"]
transactions = transactions.drop(columns=raw_payment_columns)

In [10]:
# Persist the normalized transactions for this split.
# The split name is part of the file name ("{split}_..." like the input files)
# so that train/val/test runs do not overwrite each other's output.
transactions.to_csv(INPUT_PATH + "{}_transactions_transformed.csv".format(split))

# Monthly Data
Compress the transaction sequences down to monthly data points, on the assumption that daily data points are too sparse.

In [11]:
# Month key as a zero-padded "YYYYMM" string (e.g. "201601").
# Padding matters: the key is sorted as a string further down, and unpadded keys
# such as "20161" / "201610" / "20162" sort out of calendar order, which would
# scramble the time axis of the resulting sequences.
transactions["year_month"] = transactions.date.dt.strftime("%Y%m")
transactions = transactions.drop(["date"],axis=1)

In [12]:
# Collapse to one row per (user, month): the flags are True if any transaction
# in the month set them, the normalized payment features are averaged.
transactions_monthly = (
    transactions
    .groupby(["msno", "year_month"])
    .agg({
        'is_auto_renew': 'any',
        'is_cancel': 'any',
        'payment_plan_days_norm': 'mean',
        'actual_amount_paid_norm': 'mean',
    })
    .reset_index()
    # 'any' produces booleans; cast the flags to 0/1 int32. astype replaces the
    # columns outright, whereas `.loc[:, col] = ...` writes in place on
    # pandas >= 2.0 and can leave a bool column as object dtype.
    .astype({"is_auto_renew": "int32", "is_cancel": "int32"})
)

In [13]:
# Unique users, sorted so the list order is deterministic and matches the msno
# ordering produced when the padded frame is sorted by ["msno", "year_month"]
# (i.e. users[i] lines up with row i of the reshaped array).
users = sorted(set(transactions_monthly.msno))
num_users = len(users)

# Keep only months from 2016-01 through 2017-01 (inclusive). The window is
# selected by value rather than by list position, so it stays correct even when
# the data does not start exactly at 2015-01.
FIRST_MONTH = (2016, 1)
LAST_MONTH = (2017, 1)


def _in_window(year_month):
    """Return True when a "YYYY" + month key (month zero-padded or not) is in the window."""
    year, month = year_month[:4], year_month[4:]
    if not (year.isdigit() and month.isdigit()):
        return False
    return FIRST_MONTH <= (int(year), int(month)) <= LAST_MONTH


# Plain string sort, the same ordering sort_values applies to year_month later.
# NOTE: a month with no transactions at all in this split will be absent, giving
# fewer than 13 entries.
dates = sorted(ym for ym in set(transactions_monthly.year_month) if _in_window(ym))

In [14]:
num_dates = len(dates)

# Build the full (user, month) grid, then left-join the monthly transactions
# onto it; months in which a user has no transactions are filled with 0.
user_month_grid = pd.DataFrame(list(product(users, dates)), columns=["msno","year_month"])
grid_with_features = user_month_grid.merge(
    transactions_monthly,
    how='left',
    on=["msno","year_month"],
)
padded_transactions_monthly = (
    grid_with_features
    .fillna(0)
    .sort_values(by=["msno","year_month"])
)

In [15]:
# Turn the long (user, month) table into a 3-D array: users x months x features.
# The first two columns (msno, year_month) are keys, not features.
padded_data = padded_transactions_monthly.iloc[:, 2:]
num_cols = padded_data.shape[1]
array_shape = (num_users, num_dates, num_cols)
padded_array = padded_data.values.reshape(array_shape)

# np.save appends the ".npy" extension to this name.
local_name = "{}_transactions_monthly_padded".format(split)
np.save(local_name, padded_array)
padded_array.shape

(21517, 13, 4)

In [16]:
# Upload the locally saved .npy file to the bucket under 50_pct_undersample/.
local_npy = "{}_transactions_monthly_padded.npy".format(split)
blob = Blob("50_pct_undersample/{}_transactions_monthly_padded.npy".format(split), bucket)
with open(local_npy, "rb") as npy_file:
    blob.upload_from_file(npy_file)