In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from sklearn import preprocessing
from sklearn import datasets

In [2]:
# Load the training features and labels from the
# "50 pct undersample final data" folder.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which looks
# like a row index written out by an earlier to_csv call. It is kept as a
# regular column through every step below -- confirm whether that is intended.
X_train = pd.read_csv(
    "D:\\Northeastern\\50 pct undersample final data\\X_train.csv"
)
y_train = pd.read_csv(
    "D:\\Northeastern\\50 pct undersample final data\\y_train.csv"
)

# Gender
We treat 'unknown' as a third gender category, on the assumption that a user's decision not to provide a gender may itself be relevant to our predictions.

In [3]:
# Collapse the raw labels to single-letter codes; a missing gender becomes its
# own "U" (unknown) category rather than being dropped or imputed.
X_train.loc[X_train["gender"] == "male", "gender"] = "M"
X_train.loc[X_train["gender"] == "female", "gender"] = "F"
X_train.loc[:, "gender"] = X_train["gender"].fillna("U")

# Pin the category list instead of letting the encoder infer it from the data.
# The positional column picks below are only valid when the encoded columns are
# exactly [M, U] (F is the dropped baseline); with categories="auto" a data set
# lacking one of the three codes would raise an IndexError or mislabel a column.
enc_gender = preprocessing.OneHotEncoder(drop="first", categories=[["F", "M", "U"]])
gender_onehot = enc_gender.fit_transform(X_train.gender.values.reshape(-1, 1)).toarray()

# Drop old gender column and merge new onehot encoded columns
X_train = X_train.drop(["gender"], axis=1)
X_train["gender_male"] = gender_onehot[:, 0]
X_train["gender_unknown"] = gender_onehot[:, 1]
X_train.head()

Unnamed: 0,msno,city,bd,registered_via,registration_init_time,gender_male,gender_unknown
0,ilhEBEcFTKNUTeI7JQWCLCYzBpwQMRUm7l/xWEoxe7w=,4,32,3,20130923,1.0,0.0
1,VQJZ++zT7tUAUxqEoLuqL2nd0nQQjmcVnzcxst6mU9Y=,1,0,7,20100917,0.0,1.0
2,XHnJ2CiOAcphBxBrP1nxuZD/HRPueBpQ2x2SRKOvOH0=,1,0,7,20110217,0.0,1.0
3,3jm/VESznMiIIO51lhm1ixdM8+6MG5w0rPKVcubEFYc=,1,0,7,20150916,0.0,1.0
4,tZjyBxs9dtl07Izqja+akspMHEYIzJCe5mvZKIlQpps=,13,32,7,20100427,1.0,0.0


# Registration Date
We convert the registration date into an integer: the number of days elapsed between registration and the training cutoff date. Those values are then standardized.

In [4]:
# Parse the YYYYMMDD registration stamps into timestamps.
reg_date = pd.to_datetime(X_train.registration_init_time, format="%Y%m%d")
# Assuming here that our training date for predicting February 2017 is Jan 31, 2017
train_date = pd.to_datetime("20170131", format="%Y%m%d")
# Whole days between registration and the training date.
reg_days = (train_date - reg_date).dt.days

# Assign the scaled array directly: it is already in X_train's row order.
# Wrapping it in a fresh pd.Series would make the assignment align on index
# labels, which yields NaN whenever X_train's index is not exactly 0..n-1.
reg_days_scaled = preprocessing.scale(reg_days)
X_train["reg_days_scaled"] = reg_days_scaled
X_train = X_train.drop(["registration_init_time"], axis=1)
X_train.head()

Unnamed: 0,msno,city,bd,registered_via,gender_male,gender_unknown,reg_days_scaled
0,ilhEBEcFTKNUTeI7JQWCLCYzBpwQMRUm7l/xWEoxe7w=,4,32,3,1.0,0.0,-0.364365
1,VQJZ++zT7tUAUxqEoLuqL2nd0nQQjmcVnzcxst6mU9Y=,1,0,7,0.0,1.0,0.667749
2,XHnJ2CiOAcphBxBrP1nxuZD/HRPueBpQ2x2SRKOvOH0=,1,0,7,0.0,1.0,0.524452
3,3jm/VESznMiIIO51lhm1ixdM8+6MG5w0rPKVcubEFYc=,1,0,7,0.0,1.0,-1.041514
4,tZjyBxs9dtl07Izqja+akspMHEYIzJCe5mvZKIlQpps=,13,32,7,1.0,0.0,0.80168


# Registration Method
We one-hot encode the registration method (`registered_via`), dropping the first category so that it serves as the baseline.

In [5]:
# One-hot encode `registered_via`; drop="first" omits the first category so it
# acts as the baseline.
enc_registration = preprocessing.OneHotEncoder(drop="first", categories="auto")
registration_onehot = enc_registration.fit_transform(
    X_train.registered_via.values.reshape(-1, 1)
).toarray()

# scikit-learn 1.0 deprecated get_feature_names and 1.2 removed it in favour of
# get_feature_names_out; prefer the new method and fall back on older installs.
if hasattr(enc_registration, "get_feature_names_out"):
    registration_cols = enc_registration.get_feature_names_out(["registered_via"])
else:
    registration_cols = enc_registration.get_feature_names(["registered_via"])

# Build the frame on X_train's own index so the column-wise concat lines rows up
# by position even if the index is not a plain 0..n-1 range.
registration_df = pd.DataFrame(
    registration_onehot, columns=registration_cols, index=X_train.index
)
X_train = pd.concat([X_train, registration_df], axis=1)
X_train = X_train.drop(["registered_via"], axis=1)
X_train.head()

Unnamed: 0,msno,city,bd,gender_male,gender_unknown,reg_days_scaled,registered_via_4,registered_via_7,registered_via_9
0,ilhEBEcFTKNUTeI7JQWCLCYzBpwQMRUm7l/xWEoxe7w=,4,32,1.0,0.0,-0.364365,0.0,0.0,0.0
1,VQJZ++zT7tUAUxqEoLuqL2nd0nQQjmcVnzcxst6mU9Y=,1,0,0.0,1.0,0.667749,0.0,1.0,0.0
2,XHnJ2CiOAcphBxBrP1nxuZD/HRPueBpQ2x2SRKOvOH0=,1,0,0.0,1.0,0.524452,0.0,1.0,0.0
3,3jm/VESznMiIIO51lhm1ixdM8+6MG5w0rPKVcubEFYc=,1,0,0.0,1.0,-1.041514,0.0,1.0,0.0
4,tZjyBxs9dtl07Izqja+akspMHEYIzJCe5mvZKIlQpps=,13,32,1.0,0.0,0.80168,0.0,1.0,0.0


# City
We one-hot encode the city, dropping the first category so that it serves as the baseline.

In [10]:
# One-hot encode `city`; drop="first" omits the first category so it acts as
# the baseline.
enc_city = preprocessing.OneHotEncoder(drop="first", categories="auto")
city_onehot = enc_city.fit_transform(X_train.city.values.reshape(-1, 1)).toarray()

# scikit-learn 1.0 deprecated get_feature_names and 1.2 removed it in favour of
# get_feature_names_out; prefer the new method and fall back on older installs.
if hasattr(enc_city, "get_feature_names_out"):
    city_cols = enc_city.get_feature_names_out(["city"])
else:
    city_cols = enc_city.get_feature_names(["city"])

# Build the frame on X_train's own index so the column-wise concat lines rows up
# by position even if the index is not a plain 0..n-1 range.
city_df = pd.DataFrame(city_onehot, columns=city_cols, index=X_train.index)
X_train = pd.concat([X_train, city_df], axis=1)
X_train = X_train.drop(["city"], axis=1)
X_train.head()

Unnamed: 0,msno,bd,gender_male,gender_unknown,reg_days_scaled,registered_via_4,registered_via_7,registered_via_9,city_3,city_4,...,city_13,city_14,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22
0,ilhEBEcFTKNUTeI7JQWCLCYzBpwQMRUm7l/xWEoxe7w=,32,1.0,0.0,-0.364365,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,VQJZ++zT7tUAUxqEoLuqL2nd0nQQjmcVnzcxst6mU9Y=,0,0.0,1.0,0.667749,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,XHnJ2CiOAcphBxBrP1nxuZD/HRPueBpQ2x2SRKOvOH0=,0,0.0,1.0,0.524452,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3jm/VESznMiIIO51lhm1ixdM8+6MG5w0rPKVcubEFYc=,0,0.0,1.0,-1.041514,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tZjyBxs9dtl07Izqja+akspMHEYIzJCe5mvZKIlQpps=,32,1.0,0.0,0.80168,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Birthday

We standardize the `bd` values. Others have indicated that this feature is predictive, despite the strange (including negative) values it contains.

In [18]:
# Swap the raw `bd` values for their standardized version (column position is
# unchanged), then preview the result.
X_train = X_train.assign(bd=preprocessing.scale(X_train["bd"]))
X_train.head()

Unnamed: 0,msno,bd,gender_male,gender_unknown,reg_days_scaled,registered_via_4,registered_via_7,registered_via_9,city_3,city_4,...,city_13,city_14,city_15,city_16,city_17,city_18,city_19,city_20,city_21,city_22
0,ilhEBEcFTKNUTeI7JQWCLCYzBpwQMRUm7l/xWEoxe7w=,0.697941,1.0,0.0,-0.364365,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,VQJZ++zT7tUAUxqEoLuqL2nd0nQQjmcVnzcxst6mU9Y=,-0.778092,0.0,1.0,0.667749,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,XHnJ2CiOAcphBxBrP1nxuZD/HRPueBpQ2x2SRKOvOH0=,-0.778092,0.0,1.0,0.524452,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3jm/VESznMiIIO51lhm1ixdM8+6MG5w0rPKVcubEFYc=,-0.778092,0.0,1.0,-1.041514,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tZjyBxs9dtl07Izqja+akspMHEYIzJCe5mvZKIlQpps=,0.697941,1.0,0.0,0.80168,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Persist the transformed feature matrix next to the input files; index=False
# keeps the DataFrame index out of the written CSV.
X_train.to_csv(
    "D:\\Northeastern\\50 pct undersample final data\\X_train_transformed.csv",
    index=False,
)