In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from sklearn import preprocessing
from sklearn import datasets
from google.cloud import storage

In [12]:
INPUT_PATH = "gs://kkbox-data/50_pct_undersample/"
split = "val"

# Read the members table for the selected split from the bucket prefix above.
members_csv = INPUT_PATH + "{}_members.csv".format(split)
X = pd.read_csv(members_csv)
X.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time
0,yNmChdogFy89InCTypzYXaJSkc5oav+mxXepVUJYQrw=,0,13,29,female,9,20070829
1,owa+kX0LbFIxpnV8fp060KwFOeZwq9NbbAPr42XToOA=,0,15,36,male,9,20060719
2,04ReIMkd3hm7kQUGX0hW6VYD3I6O2L5Lb7i66JC67/Y=,0,1,0,,7,20150819
3,FPFaTQIn4O2De6QbyLBLIyHlkiBh5kqRMNG81AGc/u4=,0,13,23,female,9,20060816
4,KUG2UcdQ4EJc3WRldKxE0W0T28Ik9tlkpxqyjfVMGpk=,0,13,20,,9,20141111


# Gender
We treat 'unknown' as a third gender category, on the assumption that a user's decision not to provide a gender may itself be relevant to our predictions.

In [13]:
# Collapse the raw labels to single-letter codes; a missing gender becomes
# its own "U" (unknown) category rather than being dropped or imputed.
X.loc[X["gender"] == "male","gender"] = "M"
X.loc[X["gender"] == "female","gender"] = "F"
X.loc[:,"gender"]=X["gender"].fillna("U")

# Pin the category order explicitly instead of inferring it from this split.
# With drop="first" the reference level "F" is omitted, so the encoded matrix
# always has exactly two columns in the order [M, U] -- even when a split
# happens to contain no rows for one of the categories. Any value other than
# F/M/U now raises a ValueError instead of silently shifting the columns.
enc_gender = preprocessing.OneHotEncoder(drop="first",categories=[["F", "M", "U"]])
gender_onehot = enc_gender.fit_transform(X.gender.values.reshape(-1,1)).toarray()
# Drop old gender column and merge new onehot encoded columns
X=X.drop(["gender"],axis=1)
X["gender_male"] = gender_onehot[:,0]
X["gender_unknown"] = gender_onehot[:,1]
X.head()

Unnamed: 0,msno,is_churn,city,bd,registered_via,registration_init_time,gender_male,gender_unknown
0,yNmChdogFy89InCTypzYXaJSkc5oav+mxXepVUJYQrw=,0,13,29,9,20070829,0.0,0.0
1,owa+kX0LbFIxpnV8fp060KwFOeZwq9NbbAPr42XToOA=,0,15,36,9,20060719,1.0,0.0
2,04ReIMkd3hm7kQUGX0hW6VYD3I6O2L5Lb7i66JC67/Y=,0,1,0,7,20150819,0.0,1.0
3,FPFaTQIn4O2De6QbyLBLIyHlkiBh5kqRMNG81AGc/u4=,0,13,23,9,20060816,0.0,0.0
4,KUG2UcdQ4EJc3WRldKxE0W0T28Ik9tlkpxqyjfVMGpk=,0,13,20,9,20141111,0.0,1.0


# Registration Date
We convert registration date into an integer containing the number of days since registration. Those values are then standardized.

In [14]:
# Parse the YYYYMMDD integer registration stamp into real timestamps.
reg_date = pd.to_datetime(X.registration_init_time, format="%Y%m%d")
# Reference ("training") date used when predicting February 2017: Jan 31, 2017
train_date = pd.to_datetime("20170131", format="%Y%m%d")
# Whole days elapsed between registration and the reference date.
reg_days = (train_date - reg_date).dt.days

# NOTE(review): scale() standardizes with the mean/std of the rows loaded in
# this run (the current split) -- confirm that is intended for non-train splits.
reg_days_scaled = preprocessing.scale(reg_days)
# Assign positionally: wrapping the result in a fresh pd.Series would align on
# index labels and yield NaN wherever X's index is not the default 0..n-1.
X["reg_days_scaled"] = np.asarray(reg_days_scaled)
X=X.drop(["registration_init_time"], axis=1)
X.head()

Unnamed: 0,msno,is_churn,city,bd,registered_via,gender_male,gender_unknown,reg_days_scaled
0,yNmChdogFy89InCTypzYXaJSkc5oav+mxXepVUJYQrw=,0,13,29,9,0.0,0.0,1.701712
1,owa+kX0LbFIxpnV8fp060KwFOeZwq9NbbAPr42XToOA=,0,15,36,9,1.0,0.0,2.080461
2,04ReIMkd3hm7kQUGX0hW6VYD3I6O2L5Lb7i66JC67/Y=,0,1,0,7,0.0,1.0,-1.014833
3,FPFaTQIn4O2De6QbyLBLIyHlkiBh5kqRMNG81AGc/u4=,0,13,23,9,0.0,0.0,2.054341
4,KUG2UcdQ4EJc3WRldKxE0W0T28Ik9tlkpxqyjfVMGpk=,0,13,20,9,0.0,1.0,-0.752694


# Registration Method
We one-hot encode the registration method, dropping the first category as the reference level.

In [15]:
# One-hot encode registered_via; drop="first" omits the first (sorted)
# category so it acts as the reference level.
enc_registration = preprocessing.OneHotEncoder(drop="first",categories="auto")
registration_onehot = enc_registration.fit_transform(X.registered_via.values.reshape(-1,1)).toarray()
# get_feature_names was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(enc_registration, "get_feature_names_out"):
    registration_cols = enc_registration.get_feature_names_out(["registered_via"])
else:
    registration_cols = enc_registration.get_feature_names(["registered_via"])
# Reuse X's index so the column-wise concat pairs rows one-to-one instead of
# aligning on a fresh 0..n-1 index (which would introduce NaN rows if X's
# index were ever not the default).
registration_df = pd.DataFrame(registration_onehot, columns=registration_cols, index=X.index)
X = pd.concat([X,registration_df], axis=1)
X = X.drop(["registered_via"],axis=1)
X.head()

Unnamed: 0,msno,is_churn,city,bd,gender_male,gender_unknown,reg_days_scaled,registered_via_4,registered_via_7,registered_via_9
0,yNmChdogFy89InCTypzYXaJSkc5oav+mxXepVUJYQrw=,0,13,29,0.0,0.0,1.701712,0.0,0.0,1.0
1,owa+kX0LbFIxpnV8fp060KwFOeZwq9NbbAPr42XToOA=,0,15,36,1.0,0.0,2.080461,0.0,0.0,1.0
2,04ReIMkd3hm7kQUGX0hW6VYD3I6O2L5Lb7i66JC67/Y=,0,1,0,0.0,1.0,-1.014833,0.0,1.0,0.0
3,FPFaTQIn4O2De6QbyLBLIyHlkiBh5kqRMNG81AGc/u4=,0,13,23,0.0,0.0,2.054341,0.0,0.0,1.0
4,KUG2UcdQ4EJc3WRldKxE0W0T28Ik9tlkpxqyjfVMGpk=,0,13,20,0.0,1.0,-0.752694,0.0,0.0,1.0


# City
We one-hot encode the city, dropping the first category as the reference level.

In [16]:
# One-hot encode city; drop="first" omits the first (sorted) category so it
# acts as the reference level.
enc_city = preprocessing.OneHotEncoder(drop="first",categories="auto")
city_onehot = enc_city.fit_transform(X.city.values.reshape(-1,1)).toarray()
# get_feature_names was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(enc_city, "get_feature_names_out"):
    city_cols = enc_city.get_feature_names_out(["city"])
else:
    city_cols = enc_city.get_feature_names(["city"])
# Reuse X's index so the column-wise concat pairs rows one-to-one instead of
# aligning on a fresh 0..n-1 index (which would introduce NaN rows if X's
# index were ever not the default).
city_df = pd.DataFrame(city_onehot, columns=city_cols, index=X.index)
X = pd.concat([X,city_df], axis=1)
X = X.drop(["city"],axis=1)
X.head()

Unnamed: 0,msno,is_churn,bd,gender_male,gender_unknown,reg_days_scaled,registered_via_4,registered_via_7,registered_via_9,city_3,...,city_13,city_14,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22
0,yNmChdogFy89InCTypzYXaJSkc5oav+mxXepVUJYQrw=,0,29,0.0,0.0,1.701712,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,owa+kX0LbFIxpnV8fp060KwFOeZwq9NbbAPr42XToOA=,0,36,1.0,0.0,2.080461,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,04ReIMkd3hm7kQUGX0hW6VYD3I6O2L5Lb7i66JC67/Y=,0,0,0.0,1.0,-1.014833,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,FPFaTQIn4O2De6QbyLBLIyHlkiBh5kqRMNG81AGc/u4=,0,23,0.0,0.0,2.054341,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,KUG2UcdQ4EJc3WRldKxE0W0T28Ik9tlkpxqyjfVMGpk=,0,20,0.0,1.0,-0.752694,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Birthday

We standardize the `bd` values. Others have indicated that this feature is predictive, despite containing strange, negative values.

In [17]:
# Standardize bd (zero mean, unit variance over the loaded rows) and
# overwrite the raw column with the scaled values.
bd_scaled = preprocessing.scale(X["bd"])
X["bd"] = bd_scaled
X.head()

Unnamed: 0,msno,is_churn,bd,gender_male,gender_unknown,reg_days_scaled,registered_via_4,registered_via_7,registered_via_9,city_3,...,city_13,city_14,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22
0,yNmChdogFy89InCTypzYXaJSkc5oav+mxXepVUJYQrw=,0,0.570167,0.0,0.0,1.701712,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,owa+kX0LbFIxpnV8fp060KwFOeZwq9NbbAPr42XToOA=,0,0.898509,1.0,0.0,2.080461,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,04ReIMkd3hm7kQUGX0hW6VYD3I6O2L5Lb7i66JC67/Y=,0,-0.790109,0.0,1.0,-1.014833,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,FPFaTQIn4O2De6QbyLBLIyHlkiBh5kqRMNG81AGc/u4=,0,0.28873,0.0,0.0,2.054341,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,KUG2UcdQ4EJc3WRldKxE0W0T28Ik9tlkpxqyjfVMGpk=,0,0.148012,0.0,1.0,-0.752694,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Write the transformed members table back under the same prefix as the input,
# omitting the DataFrame index from the file.
transformed_csv = INPUT_PATH + "{}_members_transformed.csv".format(split)
X.to_csv(transformed_csv, index=False)