In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import xgboost
from sklearn.neural_network import MLPRegressor

In [6]:
sales_df = pd.read_csv('data/wish_sales_explore.csv')
sales_df.head(3)

Unnamed: 0,price,units_sold,rating,rating_count,badges_count,badge_product_quality,product_variation_inventory,merchant_rating_count,merchant_rating,merchant_has_profile_picture,...,rating_one_count,size_m,size_other,size_s,size_xs,log_units_sold,tag_summer,tag_women's fashion,tag_sexy,tag_tank
0,16.0,100,3.76,54,0,0,50,568,4.128521,0,...,9.0,1,0,0,0,4.60517,1,1,0,0
1,8.0,20000,3.45,6135,0,0,50,17752,3.899673,0,...,1077.0,0,0,0,1,9.903488,1,1,1,0
2,8.0,100,3.57,14,0,0,1,295,3.989831,0,...,3.0,0,0,0,1,4.60517,1,1,1,0


## Split the dataset into train, test, and validation

In [7]:
y = sales_df["log_units_sold"]
X = sales_df.drop(["units_sold","log_units_sold"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

### Normalize the numerical features

In [8]:
X_train.columns

Index(['price', 'rating', 'rating_count', 'badges_count',
       'badge_product_quality', 'product_variation_inventory',
       'merchant_rating_count', 'merchant_rating',
       'merchant_has_profile_picture', 'rating_five_count',
       'rating_four_count', 'rating_three_count', 'rating_two_count',
       'rating_one_count', 'size_m', 'size_other', 'size_s', 'size_xs',
       'tag_summer', 'tag_women's fashion', 'tag_sexy', 'tag_tank'],
      dtype='object')

In [10]:
scaler = MinMaxScaler()
num_col =[ 'price', 'rating', 'rating_count', 'badges_count',
       'product_variation_inventory','merchant_rating_count', 'merchant_rating','rating_five_count',
       'rating_four_count', 'rating_three_count', 'rating_two_count',
       'rating_one_count']
scaler.fit(X_train[num_col]) #fir the min_max scalar on the train dataset 

MinMaxScaler()

In [11]:
def minmax_on_dataset(scaler, df, num_col):
    num_scale = scaler.transform(df[num_col])

    num_scale_df = pd.DataFrame(num_scale, columns = num_col)
    cat_df = df.drop(num_col, axis=1)

    num_scale_df.reset_index(drop=True,inplace=True)
    cat_df.reset_index(drop=True,inplace=True)

    result_df = pd.concat([num_scale_df, cat_df], axis=1)
    return result_df

In [12]:
X_train_prep = minmax_on_dataset(scaler, X_train, num_col)
X_train_prep.head()

Unnamed: 0,price,rating,rating_count,badges_count,product_variation_inventory,merchant_rating_count,merchant_rating,rating_five_count,rating_four_count,rating_three_count,...,badge_product_quality,merchant_has_profile_picture,size_m,size_other,size_s,size_xs,tag_summer,tag_women's fashion,tag_sexy,tag_tank
0,0.307692,0.935,0.002266,0.0,1.0,0.013009,0.6261,0.003577,0.000723,0.000981,...,0,0,0,0,1,0,1,1,0,1
1,0.384615,0.74,0.032106,0.0,1.0,0.025416,0.670925,0.032189,0.024085,0.029768,...,0,0,0,0,1,0,1,1,0,1
2,0.423077,0.74,0.011907,0.0,1.0,0.086989,0.664667,0.012071,0.008911,0.010468,...,0,1,0,0,1,0,1,1,1,0
3,0.192308,0.75,0.000193,0.0,1.0,0.381,0.687441,0.000179,0.000241,0.0,...,0,0,0,0,1,0,1,1,1,0
4,0.5,0.7625,0.008051,0.0,1.0,0.0619,0.577017,0.008315,0.007707,0.005888,...,0,0,0,0,1,0,1,1,0,0


In [13]:
X_test_prep = minmax_on_dataset(scaler, X_test, num_col)
X_val_prep = minmax_on_dataset(scaler, X_val, num_col)

Our goal is to predict the ```log_units_sold```. This is a regression problem. We will start with building and comparing 3 different kinds of models: 
- RandomForest
- XGBoost
- Neural Network

### RandomForest

In [14]:
regressor = RandomForestRegressor(n_estimators=20, random_state=1)
regressor.fit(X_train_prep, y_train)
y_pred = regressor.predict(X_val_prep)

print('Root Mean Squared Error:', 
      np.sqrt(metrics.mean_squared_error(np.exp(y_val), np.exp(y_pred))))

Root Mean Squared Error: 4859.771296882556


### XGBoost

In [32]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train_prep, y_train, eval_set=[(X_val_prep, y_val)], eval_metric='rmse',early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val_prep)
print('Root Mean Squared Error:', 
      np.sqrt(metrics.mean_squared_error(np.exp(y_val), np.exp(y_pred))))

[0]	validation_0-rmse:4.73246
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:3.40658
[2]	validation_0-rmse:2.46812
[3]	validation_0-rmse:1.82942
[4]	validation_0-rmse:1.39315
[5]	validation_0-rmse:1.11359
[6]	validation_0-rmse:0.92833
[7]	validation_0-rmse:0.82684
[8]	validation_0-rmse:0.77031
[9]	validation_0-rmse:0.73921
[10]	validation_0-rmse:0.72096
[11]	validation_0-rmse:0.71930
[12]	validation_0-rmse:0.71908
[13]	validation_0-rmse:0.72193
[14]	validation_0-rmse:0.72224
Stopping. Best iteration:
[12]	validation_0-rmse:0.71908

Root Mean Squared Error: 6485.270819590707


### Neural Network

In [30]:
nn_reg = MLPRegressor(hidden_layer_sizes=(100,300,),random_state = 1, max_iter=500)
nn_reg.fit(X_train_prep, y_train)
y_pred = nn_reg.predict(X_val_prep)

print('Root Mean Squared Error:', 
      np.sqrt(metrics.mean_squared_error(np.exp(y_val), np.exp(y_pred))))

Root Mean Squared Error: 8401.490276628996
