In [48]:
pip install xgboost

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

In [50]:
# Read the explored Wish sales table and preview its first three rows.
sales_df = pd.read_csv(filepath_or_buffer='data/wish_sales_explore.csv')
sales_df.head(n=3)

Unnamed: 0,price,units_sold,rating,rating_count,badges_count,badge_product_quality,product_variation_inventory,merchant_rating_count,merchant_rating,merchant_has_profile_picture,...,rating_one_count,size_m,size_other,size_s,size_xs,log_units_sold,tag_summer,tag_women's fashion,tag_sexy,tag_tank
0,16.0,100,3.76,54,0,0,50,568,4.128521,0,...,9.0,1,0,0,0,4.60517,1,1,0,0
1,8.0,20000,3.45,6135,0,0,50,17752,3.899673,0,...,1077.0,0,0,0,1,9.903488,1,1,1,0
2,8.0,100,3.57,14,0,0,1,295,3.989831,0,...,3.0,0,0,0,1,4.60517,1,1,1,0


## Split the dataset into train and test sets (validation is done with cross-validation on the train set)

In [51]:
# Target is the log-transformed units sold; both the raw and the log target
# columns are removed from the feature matrix.
y = sales_df["log_units_sold"]
X = sales_df.drop(columns=["units_sold", "log_units_sold"])

# Hold out 20% of the rows as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Normalize the numerical features

In [52]:
# Show the feature columns available in the training split.
X_train.columns

Index(['price', 'rating', 'rating_count', 'badges_count',
       'badge_product_quality', 'product_variation_inventory',
       'merchant_rating_count', 'merchant_rating',
       'merchant_has_profile_picture', 'rating_five_count',
       'rating_four_count', 'rating_three_count', 'rating_two_count',
       'rating_one_count', 'size_m', 'size_other', 'size_s', 'size_xs',
       'tag_summer', 'tag_women's fashion', 'tag_sexy', 'tag_tank'],
      dtype='object')

In [53]:
# Columns that get min-max scaling; every other feature column is left unscaled.
num_col = [
    'price', 'rating', 'rating_count', 'badges_count',
    'product_variation_inventory', 'merchant_rating_count', 'merchant_rating',
    'rating_five_count', 'rating_four_count', 'rating_three_count',
    'rating_two_count', 'rating_one_count',
]

# Fit the min-max scaler on the training split only; the fitted scaler is
# reused for the test split.
scaler = MinMaxScaler()
scaler.fit(X_train[num_col])

MinMaxScaler()

In [54]:
def minmax_on_dataset(scaler, df, num_col):
    """Scale the `num_col` columns of `df` with an already-fitted scaler.

    Parameters
    ----------
    scaler : object exposing ``transform``; called on ``df[num_col]``.
    df : pandas.DataFrame holding both the columns to scale and the others.
    num_col : list of column names to pass through the scaler.

    Returns
    -------
    pandas.DataFrame with the scaled `num_col` columns first, followed by the
    remaining columns of `df` unchanged. The row index is reset to 0..n-1, so
    the original index of `df` is not preserved.
    """
    scaled_values = scaler.transform(df[num_col])
    scaled_part = pd.DataFrame(scaled_values, columns=num_col)

    # Drop the original index so both parts line up row-by-row when concatenated.
    untouched_part = df.drop(columns=num_col).reset_index(drop=True)

    return pd.concat([scaled_part, untouched_part], axis=1)

In [55]:
# Apply the fitted scaler to the training features and preview the result.
X_train_prep = minmax_on_dataset(scaler=scaler, df=X_train, num_col=num_col)
X_train_prep.head()

Unnamed: 0,price,rating,rating_count,badges_count,product_variation_inventory,merchant_rating_count,merchant_rating,rating_five_count,rating_four_count,rating_three_count,...,badge_product_quality,merchant_has_profile_picture,size_m,size_other,size_s,size_xs,tag_summer,tag_women's fashion,tag_sexy,tag_tank
0,0.076923,0.6,0.000723,0.0,0.040816,0.000319,0.584422,0.000433,0.000723,0.000547,...,0,0,0,0,0,1,1,1,1,1
1,0.884615,0.8025,0.036734,0.0,1.0,0.004863,0.740604,0.03992,0.036127,0.017496,...,0,0,0,1,0,0,0,0,0,0
2,0.307692,0.645,0.003905,0.0,1.0,0.016448,0.571332,0.002511,0.004817,0.003554,...,0,0,0,0,1,0,1,1,0,0
3,0.269231,0.6675,0.000289,0.0,0.387755,0.000618,0.546965,0.000173,0.000241,0.000547,...,0,0,0,0,1,0,1,1,1,0
4,0.384615,0.5725,0.002362,0.0,1.0,0.0002,0.524554,0.001299,0.001927,0.00328,...,0,0,1,0,0,0,1,1,1,0


In [56]:
# Transform the test features with the scaler that was fitted on the training split.
X_test_prep = minmax_on_dataset(scaler=scaler, df=X_test, num_col=num_col)

Our goal is to predict `log_units_sold`, which makes this a regression problem. We will start by building and comparing three different kinds of models:
- RandomForest
- XGBoost
- Neural Network

In [57]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    `scores` must provide ``.mean()`` and ``.std()``. Nothing is returned.
    """
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    # Each value is computed right before its line is printed.
    for label, compute in report:
        print(label, compute(scores))

### RandomForest

In [58]:
# Random forest baseline. The split criterion is left at its default (squared
# error): the 'mse' alias was deprecated in scikit-learn 1.0 and later removed,
# so passing it explicitly fails on current releases.
regressor = RandomForestRegressor(n_estimators=20, random_state=4)
regressor.fit(X_train_prep, y_train)

# The scorer returns negated MSE per fold, so flip the sign before taking the root.
regressor_scores = cross_val_score(regressor, X_train_prep, y_train, scoring='neg_mean_squared_error')

# sqrt(MSE) is the root mean squared error (RMSE) of each fold, not the MAE.
regressor_rmse_scores = np.sqrt(-regressor_scores)
display_scores(regressor_rmse_scores)

Scores: [0.74716593 0.68297109 0.68868995 0.73966999 0.70063254]
Mean: 0.711825899665639
Standard deviation: 0.026523100723128953


### XGBoost

In [59]:
# XGBoost baseline with the library's default hyperparameters.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train_prep, y_train)

# The scorer returns negated MSE per fold, so flip the sign before taking the root.
xgb_scores = cross_val_score(xgb_reg, X_train_prep, y_train, scoring='neg_mean_squared_error')

# sqrt(MSE) is the root mean squared error (RMSE) of each fold, not the MAE.
xgb_rmse_scores = np.sqrt(-xgb_scores)
display_scores(xgb_rmse_scores)

Scores: [0.74245291 0.67270865 0.7498293  0.75267925 0.72631802]
Mean: 0.7287976270134355
Standard deviation: 0.029500005977797412


### Neural Network

In [60]:
# Multi-layer perceptron baseline with two hidden layers (100 and 300 units).
nn_reg = MLPRegressor(hidden_layer_sizes=(100, 300,), random_state=1, max_iter=500)
nn_reg.fit(X_train_prep, y_train)

# The scorer returns negated MSE per fold, so flip the sign before taking the root.
nn_scores = cross_val_score(nn_reg, X_train_prep, y_train, scoring='neg_mean_squared_error')

# sqrt(MSE) is the root mean squared error (RMSE) of each fold, not the MAE.
nn_rmse_scores = np.sqrt(-nn_scores)
display_scores(nn_rmse_scores)

Scores: [0.95516538 0.84125619 0.9774098  0.9324119  0.86176173]
Mean: 0.9136010005381238
Standard deviation: 0.05305486496409064


The cross-validation scores (RMSE on `log_units_sold`, lower is better) suggest that RandomForest has the best performance. However, in general, XGBoost should perform better than RandomForest. In our next notebook, we will check whether hyperparameter tuning can improve XGBoost's performance.