In [77]:
import pandas as pd
import numpy as np
from itertools import product
from datetime import datetime,date, timedelta
from google.cloud import storage
from google.cloud.storage import Blob
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
# Data partition handled by this run; it is substituted into every input and
# output file name below. Use "train", "val" or "test".
split = "test"

# Location of the input CSV files.
INPUT_PATH = "gs://kkbox-data/50_pct_undersample/"

# Storage client and bucket handle, used later to upload the generated arrays.
client = storage.Client(project="ds5500")
bucket = client.get_bucket("kkbox-data")

# Process User Logs

In [78]:
# Daily listening logs for the selected split.
user_logs = pd.read_csv(INPUT_PATH + "{}_user_logs.csv".format(split))

# Replace the column with plain `datetime.date` objects so it can be joined
# against the `date` values of the padded calendar built later.
# Direct column assignment swaps the whole column, independent of its old dtype.
# NOTE(review): assumes `date` is stored in a form `pd.to_datetime` parses
# directly (e.g. ISO strings); integer YYYYMMDD values would need an explicit
# format -- confirm against the CSV.
user_logs["date"] = pd.to_datetime(user_logs["date"]).dt.date

# Member attributes for the same split; `y` keeps the member id together with
# its churn label.
members_df = pd.read_csv(INPUT_PATH + "{}_members_transformed.csv".format(split))
y = members_df.loc[:, ["msno", "is_churn"]].copy()

In [79]:
# Keep only the log rows whose member id also appears in the members table.
# Ids that have logs but no member record:
msno_diff = list(set(user_logs.msno) - set(members_df.msno))
# `.copy()` yields an independent frame, so the column assignments made in later
# cells do not write into a slice of the original (SettingWithCopyWarning).
user_logs = user_logs[~user_logs["msno"].isin(msno_diff)].copy()

## Clip Invalid Counts

In [80]:
# Floor every daily activity column at 0 to remove invalid (negative) values and
# store the result in a new "<column>_clip" column.
# No upper bound, log transform or scaling is applied to the daily data; those
# steps are only performed on the weekly aggregates further down.
for col in ["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs"]:
    # Only a lower bound is needed: clipping at the column's own maximum never
    # changes a value.
    user_logs[col + "_clip"] = user_logs[col].clip(lower=0)


In [81]:
# Drop the unclipped activity columns so that only the "_clip" versions remain.
user_logs = user_logs.drop(
    columns=["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs"]
)

In [82]:
# Calendar window covered by the padded daily array.
start_date = date(2016, 7, 1)   # Earlier dates exist, but they introduce a lot of missing data for various users
end_date = date(2017, 1, 31)    # Last date we care about before evaluating churn

# Distinct member ids that have at least one log row.
users = list(set(user_logs["msno"]))
num_users = len(users)

In [83]:
# Every calendar day from start_date to end_date, inclusive.
dates = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
num_dates = len(dates)

# One row per (user, day). The left join attaches the logged values where they
# exist; days without a log row are filled with 0.
padded_df = pd.DataFrame(product(users, dates), columns=["msno", "date"])
padded_df = padded_df.merge(user_logs, how='left', on=["msno", "date"]).fillna(0)

# The later reshape to (users, days, features) requires exactly one row per
# (user, day). Duplicate (msno, date) rows in user_logs would multiply rows in
# the join, so fail here with a clear message rather than at the reshape.
expected_rows = len(users) * num_dates
assert len(padded_df) == expected_rows, (
    "padded_df has {} rows, expected {}; user_logs contains duplicate "
    "(msno, date) rows".format(len(padded_df), expected_rows)
)

# Order by user, then by day, so consecutive rows form one user's time series.
padded_df = padded_df.sort_values(by=["msno", "date"])

## EDA

## Reshape User Logs and Save

In [84]:
# Feature columns only: the first two columns of padded_df are the msno/date keys.
padded_df_data = padded_df.iloc[:, 2:]
num_cols = padded_df_data.shape[1]

# Rows are sorted by user and then by day, so the flat table folds directly into
# a (user, day, feature) array.
# NOTE(review): the array carries no member ids; axis 0 follows the sorted msno
# order -- confirm that consumers align the churn labels the same way.
padded_array = padded_df_data.values.reshape(num_users, num_dates, num_cols)

# np.save appends the ".npy" extension to the given name.
np.save("{}_user_logs_padded".format(split), padded_array)
padded_array.shape

(22411, 215, 7)

In [85]:
# Upload the locally saved daily array to the bucket.
daily_npy_path = "{}_user_logs_padded.npy".format(split)
blob = Blob("50_pct_undersample/{}_user_logs_padded.npy".format(split), bucket)
with open(daily_npy_path, "rb") as npy_file:
    blob.upload_from_file(npy_file)

# Aggregate by Week

In [86]:
# Read the daily logs again from the source CSV: the weekly features are built
# from the original count columns, which were dropped from `user_logs` when the
# daily array was prepared.
weekly_source_csv = INPUT_PATH + "{}_user_logs.csv".format(split)
user_logs = pd.read_csv(weekly_source_csv)

In [87]:
# Keep only the log rows whose member id also appears in the members table.
# Ids that have logs but no member record:
msno_diff = list(set(user_logs.msno) - set(members_df.msno))
# `.copy()` yields an independent frame, so the column assignments in the next
# cell do not write into a slice of the original (SettingWithCopyWarning).
user_logs = user_logs[~user_logs["msno"].isin(msno_diff)].copy()

In [88]:
# Floor every activity column at 0 to remove invalid (negative) counts.
# Only a lower bound is needed: clipping at the column's own maximum never
# changes a value.
for col in ["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs"]:
    user_logs[col] = user_logs[col].clip(lower=0)

# Parse the date column once and derive both representations from that result:
# plain `datetime.date` objects and the weekly period each row falls into.
log_dates = pd.to_datetime(user_logs["date"])
user_logs["date"] = log_dates.dt.date
user_logs["week"] = log_dates.dt.to_period('W')

# Sum the remaining columns per member and week.
# NOTE(review): rows are not restricted to a date window before aggregating, so
# a week that straddles the start of the window also includes the days before
# it -- confirm this is intended.
user_logs_group = user_logs.drop(["date"], axis=1)
user_logs_week = user_logs_group.groupby(["msno", "week"]).sum().reset_index()

## Standardize, clip, normalize

In [89]:
# Scale each weekly activity column independently:
#   1. floor at 0,
#   2. take the log (the small offset keeps zero counts finite),
#   3. standardize,
#   4. cap at +2 standard deviations,
#   5. min-max rescale.
# The cap keeps the resolution at the low end of the distribution: users far
# above 2 std are assumed not to churn, so the interesting spread is below it.
# NOTE(review): both scalers are re-fit on whichever split is being processed --
# confirm that val/test should not reuse the parameters fitted on train.
std_scaler = preprocessing.StandardScaler()
max_scaler = preprocessing.MinMaxScaler()

for feature in ["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs"]:
    weekly_totals = user_logs_week[feature].values.reshape(-1, 1)
    non_negative = np.clip(weekly_totals, a_min=0, a_max=weekly_totals.max())
    standardized = std_scaler.fit_transform(np.log(non_negative + .0001))
    capped = np.clip(standardized, a_min=standardized.min(), a_max=2)
    user_logs_week[feature + "_norm"] = max_scaler.fit_transform(capped)


In [90]:
# Drop the unscaled weekly totals so that only the "_norm" versions remain.
user_logs_week = user_logs_week.drop(
    columns=["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs"]
)

In [91]:
# Calendar window covered by the padded weekly array.
start_date = date(2016, 7, 1)   # Earlier dates exist, but they introduce a lot of missing data for various users
end_date = date(2017, 1, 31)    # Last date we care about before evaluating churn

# Distinct member ids present in the weekly aggregates.
users = list(set(user_logs_week["msno"]))
num_users = len(users)

# Every day in the window (inclusive), and the weekly period each day falls in.
window_days = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=offset) for offset in range(window_days)]
weeks = pd.to_datetime(pd.Series(dates)).dt.to_period('W')

In [92]:
# Distinct week labels as strings, in sorted (string) order.
# NOTE(review): presumably string order equals chronological order for these
# date-formatted labels -- confirm.
weeks_list = sorted(set(weeks.astype('str')))
weeks_list = weeks_list[:-1]  # remove last week which includes Feb
num_weeks = len(weeks_list)

In [93]:
# The week labels in weeks_list are strings, so convert the period column to
# strings as well before joining on it.
user_logs_week["week"] = user_logs_week["week"].astype('str')

# One row per (user, week). The left join attaches the scaled weekly values
# where they exist; weeks without data are filled with 0. The result is ordered
# by user, then by week.
padded_df = (
    pd.DataFrame(product(users, weeks_list), columns=["msno", "week"])
    .merge(user_logs_week, how='left', on=["msno", "week"])
    .fillna(0)
    .sort_values(by=["msno", "week"])
)

## EDA

## Reshape and Save

In [94]:
# Feature columns only: the first two columns of padded_df are the msno/week keys.
padded_df_data = padded_df.iloc[:, 2:]
num_cols = len(padded_df_data.columns)

# Rows are sorted by user and then by week, so the flat table folds directly
# into a (user, week, feature) array.
padded_array = padded_df_data.values.reshape(num_users, num_weeks, num_cols)

# np.save appends the ".npy" extension to the given name.
np.save("{}_user_logs_weekly_padded".format(split), padded_array)

# Move numpy file to GCS
blob = Blob("50_pct_undersample/{}_user_logs_weekly_padded.npy".format(split), bucket)
with open("{}_user_logs_weekly_padded.npy".format(split), "rb") as my_file:
    blob.upload_from_file(my_file)

# Last expression of the cell, so the notebook displays the array shape.
padded_array.shape