In [20]:
import pandas as pd
import numpy as np
from itertools import product
from datetime import datetime,date, timedelta
from google.cloud import storage
from google.cloud.storage import Blob
from sklearn import preprocessing
import seaborn as sns
# Where the undersampled split lives. The commented-out line is the local
# Windows copy of the same data; the active value points at Google Cloud Storage.
#INPUT_PATH = "D:\\Northeastern\\100 pct undersample split\\"
INPUT_PATH = "gs://kkbox-data/data_100_pct_undersample/"

# Storage client for the "ds5500" project and a handle on the "kkbox-data" bucket.
client = storage.Client(project="ds5500")
bucket = client.get_bucket("kkbox-data")

# Process User Logs

In [3]:
# Name of the split being processed; it is interpolated into the input file names.
split = "train"

# Per-user, per-day log rows. Only CSV columns 1-9 are read; column 0 is skipped
# (presumably a saved index - confirm against the file).
user_logs = pd.read_csv(
    INPUT_PATH + "X_{}_user_logs.csv".format(split),
    usecols=[1, 2, 3, 4, 5, 6, 7, 8, 9],
)

# Store dates as plain datetime.date objects. Plain column assignment replaces
# the column outright; the list-indexed `.loc[:, ["date"]] = ...` form instead
# writes into the existing column, which is the wrong tool for swapping a
# column's contents and dtype.
user_logs["date"] = pd.to_datetime(user_logs["date"]).dt.date

# Transformed member-level table for the same split.
members_df = pd.read_csv(INPUT_PATH + "X_{}_transformed.csv".format(split))

### Normalize

In [61]:
# Rescale every raw log column into [0, 1] and store it as "<column>_norm".
# The same MinMaxScaler object is re-fitted from scratch for each column.
scaler = preprocessing.MinMaxScaler()
log_features = ("num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs")
for feature in log_features:
    raw = user_logs[feature].values
    # Outlier cap: three standard deviations measured from zero (the mean is
    # not added). Anything below zero is floored to 0.
    ceiling = 3 * raw.std()
    clipped = np.clip(raw, a_min=0, a_max=ceiling)
    # fit_transform expects a 2-D array, hence the single-column reshape.
    user_logs[feature + "_norm"] = scaler.fit_transform(clipped.reshape(-1, 1))


In [63]:
# Discard the raw (un-normalized) log columns.
raw_features = ["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq", "total_secs"]
user_logs = user_logs.drop(columns=raw_features)

In [64]:
# Display the column labels currently present in user_logs.
user_logs.columns

Index(['msno', 'date', 'num_25_norm', 'num_50_norm', 'num_75_norm',
       'num_985_norm', 'num_100_norm', 'num_unq_norm', 'total_secs_norm'],
      dtype='object')

In [65]:
# Distinct user ids in sorted order. A set-derived list has an arbitrary order,
# whereas the padded frame built from these ids is later sorted by msno; keeping
# `users` sorted makes users[i] line up with the i-th user of that sorted frame.
# Assumes msno has no missing values (sorted() cannot compare NaN with str).
users = sorted(user_logs["msno"].unique())
num_users = len(users)

# Date window: earliest logged day through the fixed cut-off.
start_date = min(user_logs["date"])
end_date = date(2017,1,31) # Max date we care about before evaluating churn
# Left disabled: num_dates is not defined until the dates list is built.
#print(f"Num dates: {num_dates}; num_users: {num_users}; padded records to create: {num_dates*num_users}")

In [66]:
# Every calendar day from start_date through end_date, inclusive (hence the +1).
dates = [start_date + timedelta(days=offset) for offset in range((end_date - start_date).days + 1)]
num_dates = len(dates)

# One row per (user, day) pair; days without a log row get 0 in every feature column.
padded_df = pd.DataFrame(product(users, dates), columns=["msno","date"])
padded_df = padded_df.merge(user_logs, how='left', on=["msno","date"]).fillna(0)

# A left merge can only add rows when user_logs holds several rows for the same
# (msno, date). Catch that here with a clear message instead of letting a later
# reshape fail with an opaque size error.
expected_rows = len(users) * num_dates
if len(padded_df) != expected_rows:
    raise ValueError(
        "user_logs has duplicate (msno, date) rows: expected {} padded rows, got {}".format(
            expected_rows, len(padded_df)
        )
    )

# Order by user, then chronologically within each user.
padded_df = padded_df.sort_values(by=["msno","date"])

## Reshape User Logs and Save

In [67]:
# Turn the long (user, day) frame into a 3-D array of shape
# (num_users, num_dates, num_cols).
# Columns 0 and 1 are the msno/date keys; positions 2-8 hold the feature values.
padded_df_data = padded_df.iloc[:, 2:9]
num_cols = padded_df_data.shape[1]
target_shape = (num_users, num_dates, num_cols)
padded_array = padded_df_data.values.reshape(target_shape)

# np.save appends the ".npy" extension to this name.
output_stem = "{}_user_logs_padded".format(split)
np.save(output_stem, padded_array)
padded_array.shape

(24798, 762, 7)

In [68]:
# Upload the locally saved .npy file into the bucket's data_100_pct_undersample/ folder.
local_npy = "{}_user_logs_padded.npy".format(split)
blob = Blob("data_100_pct_undersample/{}_user_logs_padded.npy".format(split), bucket)
with open(local_npy, "rb") as npy_handle:
    blob.upload_from_file(npy_handle)