### Import Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import datetime
from datetime import date
from sklearn.metrics import r2_score


### Load Dataset

In [3]:
stockx_data = pd.read_excel("./StockX-Data-Contest-2019.xlsx",sheet_name="Raw Data")
color = pd.read_excel("./supplemental_data_colorway.xlsx",sheet_name="Sheet1")

In [4]:
# --- Data Cleaning ---
# calculate price premium: (sale price - retail price)/retail price
stockx_data["Price_Percentage_change"] = (stockx_data['Sale Price'] - stockx_data['Retail Price'])/stockx_data['Retail Price']

# calculate days since release: order data - release date
stockx_data["Days_Since_Release"] = (pd.to_datetime(stockx_data['Order Date']) - pd.to_datetime(stockx_data['Release Date']))/np.timedelta64('1','D')

# --- style variables ---
# style: v2
stockx_data["yeezy"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'Yeezy' in x.split("-") else 0)
# style: airjordan
stockx_data["airjordan"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'Jordan' in x.split("-") else 0)
# style: airforce
stockx_data["airforce"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'Force' in x.split("-") else 0)
# style: airmax90
stockx_data["airmax90"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if '90' in x.split("-") else 0)
# style: airmax97
stockx_data["airmax97"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if '97' in x.split("-") else 0)
# style: presto
stockx_data["presto"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'Presto' in x.split("-") else 0)
# style: vapormax
stockx_data["vapormax"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'VaporMax' in x.split("-") else 0)
# style: blazer
stockx_data["blazer"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'Blazer' in x.split("-") else 0)
# style: zoom
stockx_data["zoom"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'Zoom' in x.split("-") else 0)
# style: zoom
stockx_data["react"] = stockx_data['Sneaker Name'].apply(lambda x : 1 if 'React' in x.split("-") else 0)

# --- state variables ---
# creating the california variable
stockx_data["California"] = stockx_data["Buyer Region"].apply(lambda x : 1 if 'California' in x else 0)
# creating the new_york variable
stockx_data["New York"] = stockx_data["Buyer Region"].apply(lambda x : 1 if 'New York' in x else 0)
# creating the oregon variable
stockx_data["Oregon"] = stockx_data["Buyer Region"].apply(lambda x : 1 if 'Oregon' in x else 0)
# creating the florida variable
stockx_data["Florida"] = stockx_data["Buyer Region"].apply(lambda x : 1 if 'Florida' in x else 0)
# creating the texas variable
stockx_data["Texas"] = stockx_data["Buyer Region"].apply(lambda x : 1 if 'Texas' in x else 0)
# creating the other_state variable
above5pct_states = ["California", "New York", "Oregon", "Florida", "Texas"]
stockx_data["Other States"] = pd.Series(list(map(int,~stockx_data["Buyer Region"].isin(above5pct_states))))


In [5]:
stockx_data = stockx_data.merge(color, left_on = 'Sneaker Name', right_on = 'Style', how = 'left')
stockx_data = stockx_data.drop(columns = ['Order Date', 'Brand','Sneaker Name', 'Release Date', 'Buyer Region',
                                         'Website', 'Product Line', 'Style'])
stockx_data = stockx_data.fillna(0)

### Make Training and Testing Data

In [6]:
#import the train-test split
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.model_selection import train_test_split
y = stockx_data["Price_Percentage_change"]
X = stockx_data.drop(["Sale Price",'Price_Percentage_change', "Other States", "New York", "Texas"], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=808)

In [8]:
from sklearn.metrics import mean_squared_error as mse

### 1. Adaboost Regressor

In [9]:
from sklearn.ensemble import AdaBoostRegressor

In [10]:
#creating an AdaBoost instance
clf = AdaBoostRegressor(random_state=808)
#training the model
clf.fit(X_train,y_train)
pred_train=clf.predict(X_train)
pred_test=clf.predict(X_test)

In [11]:
rmse_train1 = np.sqrt(mse(y_train, pred_train))
rmse_test1 = np.sqrt(mse(y_test, pred_test))

print('Training error:',rmse_train1)
print('Testing error:', rmse_test1)

Training error: 1.6441434632330953
Testing error: 1.6486352819272658


In [12]:
r21 = r2_score(y_test, pred_test)
print(f'R-squared (R2) score: {r21}')

R-squared (R2) score: -0.19843873169476045


## 2. Tuned Adaboost Regressor

In [13]:
from sklearn.tree import DecisionTreeRegressor as dt

In [14]:
clf_tuned = AdaBoostRegressor(random_state=96, base_estimator=dt(random_state=101,criterion='squared_error',max_depth=4,max_leaf_nodes=15,min_samples_split=30),n_estimators=200,learning_rate=0.05)

In [15]:

clf_tuned.fit(X_train,y_train)
pred_train=clf_tuned.predict(X_train)
pred_test=clf_tuned.predict(X_test)



In [16]:
rmse_train2 = np.sqrt(mse(y_train, pred_train))
rmse_test2 = np.sqrt(mse(y_test, pred_test))

print('Training error:',rmse_train2)
print('Testing error:', rmse_test2)

Training error: 0.7114241192251485
Testing error: 0.715205217058389


In [17]:
r22 = r2_score(y_test, pred_test)
print(f'R-squared (R2) score: {r22}')

R-squared (R2) score: 0.7744577003583968


## 3. Gradient Boosting Regressor

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

In [19]:
#creating an GradientBoost instance
grb = GradientBoostingRegressor(random_state=101)
#training the model
grb.fit(X_train,y_train)
pred_train=grb.predict(X_train)
pred_test=grb.predict(X_test)

In [20]:
rmse_train3 = np.sqrt(mse(y_train, pred_train))
rmse_test3 = np.sqrt(mse(y_test, pred_test))

print('Training error:',rmse_train3)
print('Testing error:', rmse_test3)

Training error: 0.3745000865259489
Testing error: 0.37336734206898986


In [21]:
r23 = r2_score(y_test, pred_test)
print(f'R-squared (R2) score: {r23}')

R-squared (R2) score: 0.9385333827340063


## 4. XGBoost Regressor

In [22]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [23]:
import xgboost as xg

In [24]:
#creating an XGBoost instance
xgb = xg.XGBRegressor(random_state=101)
#training the model
xgb.fit(X_train,y_train)
pred_train=xgb.predict(X_train)
pred_test=xgb.predict(X_test)

In [26]:
xgb = xg.XGBRegressor(random_state=101, max_depth = 10)
#training the model
xgb.fit(X_train,y_train)
pred_train=xgb.predict(X_train)
pred_test=xgb.predict(X_test)
rmse_train4 = np.sqrt(mse(y_train, pred_train))
rmse_test4 = np.sqrt(mse(y_test, pred_test))

print('Training error:',rmse_train4)
print('Testing error:', rmse_test4)

Training error: 0.09822854012298676
Testing error: 0.1872981968194215


In [27]:
r24 = r2_score(y_test, pred_test)
print(f'R-squared (R2) score: {r24}')

R-squared (R2) score: 0.9845320111871657


In [28]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

X = stockx_data.drop(["Sale Price", 'Price_Percentage_change', "Other States", "New York", "Texas"], axis=1)

# Standardize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of the variance
X_pca = pca.fit_transform(X_scaled)

X_pca_df = pd.DataFrame(data=X_pca)
X_pca_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.412970,-1.328111,-1.311169,0.596513,-0.252531,2.920776,-1.407448,-1.953588,1.309648,0.571839,-0.871524,0.335676,0.481301,0.092134,-1.253521,-0.398610,-1.672678,-1.745897,3.019067,0.723876
1,0.548845,-1.832994,-0.320594,-0.002316,-1.220471,1.745614,-1.233464,-2.132382,1.846514,0.839227,-0.576741,0.000239,0.324575,0.109933,-0.542722,-0.156204,-2.507055,-1.242702,2.531003,0.908869
2,-0.629859,-1.747690,0.917176,-0.076270,-0.068997,-1.489230,-0.269612,-1.867079,2.598703,5.341688,3.701759,3.787111,-5.298174,19.823277,-4.065362,-0.302559,2.207974,-0.427654,0.351574,-1.050571
3,-0.315670,-0.774497,0.160701,0.096539,-0.590401,-1.616038,0.267813,-2.119834,1.081455,2.508953,4.085114,3.301514,0.561295,-2.662588,-0.944625,-0.113196,1.024592,-0.518176,-0.238494,-0.416557
4,-0.267802,-0.801606,0.092629,0.168490,-0.507406,-1.645254,0.230299,-1.973029,1.125650,2.525847,4.024626,3.241451,0.582549,-2.677222,-0.988786,0.015292,0.994405,-0.559089,-0.333687,-0.470237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99951,-0.170602,0.514127,-0.420810,-0.711458,1.024277,-0.150584,-0.785620,2.747464,1.529005,-0.217253,0.206262,0.897704,-0.164327,-0.261815,0.093771,-0.608278,-0.258051,0.702085,1.200536,-0.875054
99952,-0.207518,0.360930,-0.450364,-0.741410,1.069236,0.070012,-0.671225,-0.285355,-0.142382,1.194491,-1.137398,-0.066538,0.345988,0.008159,-0.993798,0.429576,-0.531570,0.224616,0.365926,-1.000834
99953,-0.334761,0.337433,-0.420034,-0.704958,1.035798,0.028851,-0.511069,1.035440,0.483329,0.046428,-0.360650,0.020922,0.159823,-0.153200,-0.673344,0.692741,0.130563,-1.072404,-0.759687,-0.435302
99954,-0.168849,0.367004,-0.401619,-0.761220,1.042511,0.095983,-0.596533,-0.761811,-0.448447,0.984386,-0.950371,0.106359,0.355595,0.063816,-0.896432,-0.284839,-0.748607,0.500290,0.539342,-1.180844


In [29]:
from sklearn.metrics import mean_squared_error

X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca_df, y, test_size=0.33, random_state=808)

cart_model = xg.XGBRegressor(random_state=101, max_depth = 10)
cart_model.fit(X_train_pca, y_train)

cart_predictions_pca = cart_model.predict(X_test_pca)
cart_mse_pca = mean_squared_error(y_test, cart_predictions_pca)
rmse_pca = np.sqrt(cart_mse_pca)
r_squared_pca = r2_score(y_test, cart_predictions_pca)

print(f"Random Forest Model MSE with PCA: {cart_mse_pca}")
print(f"Root Mean Squared Error (RMSE): {rmse_pca}")
print(f"R-squared: {r_squared_pca}")

Random Forest Model MSE with PCA: 0.053911438464220675
Root Mean Squared Error (RMSE): 0.23218836849467864
R-squared: 0.9762289931867558


## Summary

In [30]:
mdata = {'Model': ["AdaBoost","Tuned AdaBoost", "GradientBoost","XGBoost"],
        "Training RMSE": [rmse_train1,rmse_train2,rmse_train3,rmse_train4],
        "Test RMSE": [rmse_test1,rmse_test2,rmse_test3,rmse_test4],
        "OSR^2": [r21,r22,r23,r24]} 
pd.DataFrame.from_dict(mdata)

Unnamed: 0,Model,Training RMSE,Test RMSE,OSR^2
0,AdaBoost,1.644143,1.648635,-0.198439
1,Tuned AdaBoost,0.711424,0.715205,0.774458
2,GradientBoost,0.3745,0.373367,0.938533
3,XGBoost,0.098229,0.187298,0.984532


## Conclusion :
**Gradient Boost and Xgboost are doing better even without Hyperparameter Tuning. We can use Gradient Boost Regressor as Testing Error is least i.e it performs the best on  unseen Data**