# Import Libraries and Datasets

In [1]:
!pip install xgboost --quiet

In [2]:
from xgboost import XGBRegressor

In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Read the source CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run on another machine without editing it.
csv_path = "/Users/salihburakgurhan/GitHub/Hellium10 and Keepa.csv"
data = pd.read_csv(csv_path)

# Show the first rows as the cell output.
data.head()

Unnamed: 0.1,Unnamed: 0,URL,ASIN,Title_x,Brand,Fulfillment,Category,BSR,Subcategory,Price,...,Number of Images,Variation Count,Sales to Reviews,Buy Box 🚚: Current,Title_y,Sales Rank: Current,Sales Rank: 90 days avg.,Sales Rank: Drops last 90 days,Sales Rank: Subcategory Sales Ranks,Bought in past month
0,0,https://amazon.com/dp/B0933BVK6T,B0933BVK6T,Apple AirTag,Apple,Amazon,Electronics,1,Finders,27.99,...,6,0,0.23,$ 27.99,Apple AirTag,1.0,3,33,# 1 | Top 1% | Item Finders,10000
1,1,https://amazon.com/dp/B0932QJ2JZ,B0932QJ2JZ,Apple AirTag 4 Pack,Apple,Amazon,Electronics,3,Finders,84.99,...,6,0,0.36,$ 84.99,Apple AirTag 4 Pack,3.0,3,98,# 2 | Top 1% | Item Finders,10000
2,2,https://amazon.com/dp/B0CHWRXH8B,B0CHWRXH8B,Apple AirPods Pro (2nd Generation) Wireless Ea...,Apple,Amazon,Electronics,2,Earbud Headphones,189.0,...,6,1,4.48,$ 189.00,Apple AirPods Pro (2nd Generation) Wireless Ea...,2.0,5,100,# 1 | Top 1% | Earbud & In-Ear Headphones,10000
3,3,https://amazon.com/dp/B07PXGQC1Q,B07PXGQC1Q,Apple AirPods (2nd Generation) Wireless Ear Bu...,Apple,Amazon,Electronics,4,Earbud Headphones,99.0,...,6,0,0.06,$ 99.00,Apple AirPods (2nd Generation) Wireless Ear Bu...,4.0,5,105,# 2 | Top 1% | Earbud & In-Ear Headphones,10000
4,4,https://amazon.com/dp/B09BKHHL5H,B09BKHHL5H,Surge Protector USB Outlet Extender - 5-Outlet...,Lvetek,FBA,Electronics,11,Surge Protectors,12.98,...,8,0,1.93,$ 12.98,Surge Protector USB Outlet Extender - 5-Outlet...,13.0,41,197,# 1 | Top 1% | Surge Protectors,10000


# Data Manipulation

In [5]:
# Columns removed from the raw frame before any further processing.
drop_columns = [
    'Buy Box 🚚: Current',
    'Title_y',
    'Sales Rank: Current',
    'Sales Year Over Year (%)',
    'Sales to Reviews',
    'Price Trend (90 days) (%)',
    'Sales Trend (90 days) (%)',
]

# `drop` returns a new frame; `data` itself is left untouched.
df = data.drop(columns=drop_columns)

In [6]:
# Count fully duplicated rows, remove them, then count again as a check.
print("Before: ", df.duplicated().sum())
df = df.drop_duplicates()
print("After: ", df.duplicated().sum())

Before:  0
After:  0


In [7]:
# List the columns that remain after the drop, to pick modelling features from.
df.columns

Index(['Unnamed: 0', 'URL', 'ASIN', 'Title_x', 'Brand', 'Fulfillment',
       'Category', 'BSR', 'Subcategory', 'Price', 'Monthly Sales',
       'Monthly Revenue', 'Review Count', 'Reviews Rating', 'Seller',
       'Number of Active Sellers', 'Last Year Sales', 'Size Tier', 'Length',
       'Width', 'Height', 'Weight', 'Storage Fee (Jan - Sep)',
       'Storage Fee (Oct - Dec)', 'Best Sales Period', 'Age (Month)',
       'Number of Images', 'Variation Count', 'Sales Rank: 90 days avg.',
       'Sales Rank: Drops last 90 days', 'Sales Rank: Subcategory Sales Ranks',
       'Bought in past month'],
      dtype='object')

In [8]:
# Keep only the columns used from here on: the ASIN identifier, the
# candidate features, and "Monthly Sales".
df = df[[
    'ASIN',
    'Fulfillment',
    'BSR',
    'Price',
    'Review Count',
    'Size Tier',
    'Sales Rank: Drops last 90 days',
    'Bought in past month',
    'Monthly Sales',
]]

In [9]:
# Impute missing values:
#   - "Review Count": column mean
#   - "Size Tier":    the literal category "FBM"
# A single DataFrame-level `fillna` with a per-column mapping is used and the
# result is reassigned. Calling `df[col].fillna(..., inplace=True)` is chained
# assignment through an in-place method: it emits a FutureWarning on recent
# pandas 2.x and does not modify `df` at all under Copy-on-Write (pandas 3).
df = df.fillna({
    "Review Count": df["Review Count"].mean(),
    "Size Tier": "FBM",
})

In [10]:
# Check dtypes and non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4184 entries, 0 to 4183
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ASIN                            4184 non-null   object 
 1   Fulfillment                     4184 non-null   object 
 2   BSR                             4184 non-null   int64  
 3   Price                           4184 non-null   float64
 4   Review Count                    4184 non-null   float64
 5   Size Tier                       4184 non-null   object 
 6   Sales Rank: Drops last 90 days  4184 non-null   int64  
 7   Bought in past month            4184 non-null   int64  
 8   Monthly Sales                   4184 non-null   int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 294.3+ KB


In [11]:
# Drop the ASIN identifier, then one-hot encode the remaining
# object-typed columns ("Fulfillment" and "Size Tier").
df_2 = pd.get_dummies(df.drop(columns="ASIN"))

In [12]:
# Inspect the encoded frame: numeric columns plus the generated dummy columns.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4184 entries, 0 to 4183
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BSR                             4184 non-null   int64  
 1   Price                           4184 non-null   float64
 2   Review Count                    4184 non-null   float64
 3   Sales Rank: Drops last 90 days  4184 non-null   int64  
 4   Bought in past month            4184 non-null   int64  
 5   Monthly Sales                   4184 non-null   int64  
 6   Fulfillment_-                   4184 non-null   bool   
 7   Fulfillment_Amazon              4184 non-null   bool   
 8   Fulfillment_FBA                 4184 non-null   bool   
 9   Fulfillment_FBM                 4184 non-null   bool   
 10  Size Tier_FBM                   4184 non-null   bool   
 11  Size Tier_Large Oversize        4184 non-null   bool   
 12  Size Tier_Large Standard-Size   41

# Train - Test Split

In [13]:
# Target is "Monthly Sales"; every other column of the encoded frame is a feature.
y = df_2["Monthly Sales"]
X = df_2.drop(columns="Monthly Sales")

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Print the shape of each split as a quick sanity check.
for _label, _part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{_label}.shape", _part.shape)


X_train.shape (3347, 16)
X_test.shape (837, 16)
y_train.shape (3347,)
y_test.shape (837,)


In [15]:
def check_metrics(pred, y_true=None):
    """Print RMSE, R2 and MAPE for a set of predictions.

    Parameters
    ----------
    pred : array-like
        Predicted target values.
    y_true : array-like, optional
        Ground-truth values to compare against. When omitted, the
        notebook-level ``y_test`` is used, which keeps existing
        ``check_metrics(pred)`` calls working unchanged.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    if y_true is None:
        y_true = y_test

    # RMSE is taken as the square root of the MSE rather than via
    # `squared=False`: that keyword is deprecated in scikit-learn 1.4 and
    # removed in 1.6, while this form works on every version.
    rmse = round(float(np.sqrt(mean_squared_error(y_true, pred))), 2)
    r2 = round(r2_score(y_true, pred), 2)
    # Scale to percent first and round afterwards; rounding the fraction and
    # then multiplying by 100 can print artefacts such as 7.000000000000001.
    mape_pct = round(float(mean_absolute_percentage_error(y_true, pred)) * 100, 1)

    print(f"RMSE: {rmse}")
    print(f"R2_Score: {r2}")
    print(f"MAPE: %{mape_pct}")

# Model

In [16]:
# Base regressor with library-default hyper-parameters; it is passed as the
# estimator to the randomized search in the next cell.
xgb = XGBRegressor()

In [17]:
# Candidate values from which the randomized search samples combinations.
params = {
    "n_estimators" : [50, 100, 200, 400, 800, 1000, 2000],
    "max_depth" : [2, 4, 6, 8, 10, 20, 50],
    "learning_rate" : [0.01, 0.05, 0.10, 0.20, 0.50, 0.7, 0.15]
}

# 10 sampled candidates x 5-fold CV, seeded so the sampled candidates are repeatable.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Tune on the training split only. Fitting the search on the full X, y lets
# the held-out test rows influence hyper-parameter selection, so the test
# metrics reported afterwards would be optimistically biased.
random_search.fit(X_train, y_train)

print("Best, params: ", random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=800; total time=   1.0s
[CV] END .learning_rate=0.5, max_depth=20, n_estimators=2000; total time=   1.3s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=800; total time=   0.9s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=800; total time=   1.0s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time=   0.1s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time=   0.1s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time=   0.1s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time=   0.1s
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time=   0.1s
[CV] END .learning_rate=0.5, max_depth=20, n_estimators=2000; total time=   1.7s
[CV] END .learning_rate=0.5, max_depth=20, n_estimators=2000; total time=   1.7s
[CV] END .learning_rate=0.5, max_depth=20, n_est

In [18]:
# Final model with explicitly chosen hyper-parameters.
# NOTE(review): n_estimators=31 and learning_rate=0.14 are not values from the
# `params` search grid, so these are not taken from
# `random_search.best_params_` — confirm this is intentional.
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=31,
    max_depth=4,
    learning_rate=0.14,
    random_state=42,
    n_jobs=-1,
)

# Train on the training split, then predict the held-out test split.
y_preds = xgb.fit(X_train, y_train).predict(X_test)

In [19]:
# Print each feature name next to the importance score the fitted model gave it.
importance = xgb.feature_importances_
for _feature, _score in zip(X_test.columns, importance):
    print(f"{_feature}:, {_score}")

BSR:, 0.0807504951953888
Price:, 0.011392793618142605
Review Count:, 0.005453013814985752
Sales Rank: Drops last 90 days:, 0.036321207880973816
Bought in past month:, 0.8513047695159912
Fulfillment_-:, 0.0
Fulfillment_Amazon:, 0.01038078311830759
Fulfillment_FBA:, 0.0
Fulfillment_FBM:, 0.004396948032081127
Size Tier_FBM:, 0.0
Size Tier_Large Oversize:, 0.0
Size Tier_Large Standard-Size:, 0.0
Size Tier_Medium Oversize:, 0.0
Size Tier_Small Oversize:, 0.0
Size Tier_Small Standard-Size:, 0.0
Size Tier_Special Oversize:, 0.0


In [20]:
# Report RMSE, R2 and MAPE of the final model's predictions on the test split.
check_metrics(y_preds)

RMSE: 478.47
R2_Score: 0.96
MAPE: %16.6


In [21]:
# Signed residuals (actual minus predicted) on the test split.
dif = y_test.sub(y_preds)

# Mean residual: a positive value means the model under-predicts on average.
dif.mean()

4.50654472615414