In [228]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline

In [229]:
# Read the cleaned dataset CSV into a DataFrame.
DATASET_CSV = "data/main_dataset_cleaned.csv"
df = pd.read_csv(DATASET_CSV)

In [230]:
# Display the loaded frame as the cell output to inspect its columns and rows.
# NOTE(review): the "Unnamed: 0" column in the output looks like a row index
# that was written into the CSV -- confirm it is not meant to be a feature.
df

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...
6430,45,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,718125.53,0,56.47,3.969,192.330854,8.667


In [231]:
# Feature matrix: everything except the target.
# "Unnamed: 0" is dropped as well: in the displayed frame it is just the running
# row number (0, 1, 2, ...) that was saved into the CSV. Keeping it would let the
# model learn from row position, and it would have to be supplied at inference
# time where no such number exists. errors="ignore" keeps this cell working if
# the CSV is later re-exported without that column.
X = df.drop(columns=["Weekly_Sales", "Unnamed: 0"], errors="ignore")

# Target vector: the weekly sales figure to be predicted.
y = df["Weekly_Sales"]

In [232]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [233]:
# Fit the scaler on the training split only, and store the scaled arrays under
# their own names instead of overwriting X_train / X_test.
#
# Why: the final Pipeline below contains this same `scaler` and re-fits it when
# the pipeline is fitted. If X_train had already been replaced by its scaled
# version, the pipeline's scaler would be fitted on data that is already in
# [0, 1], and the pickled model would then mis-handle raw (unscaled) inputs.
# Keeping X_train / X_test raw means the pipeline learns the real feature
# ranges. It also makes this cell idempotent on re-run.
#
# The XGBoost models fitted directly on X_train in the following cells are
# tree-based, so they do not rely on the features being pre-scaled.
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [234]:
# Baseline regressor: squared-error objective, fixed seed, no other settings overridden.
xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
)

In [235]:
# Train the baseline model on the training split.
xgb.fit(X=X_train, y=y_train)

In [236]:
# Evaluate the baseline on both splits, predicting once per split.
xgb_test_pred = xgb.predict(X_test)
xgb_train_pred = xgb.predict(X_train)

# Printed order: test R2, test MAE, two blank lines, train R2, train MAE.
xgb_report = "\n".join(
    [
        f"{r2_score(y_test, xgb_test_pred)}",
        f"{mean_absolute_error(y_test, xgb_test_pred)}",
        "",
        "",
        f"{r2_score(y_train, xgb_train_pred)}",
        f"{mean_absolute_error(y_train, xgb_train_pred)}",
    ]
)
print(xgb_report)

0.9483190088218927
72426.3792720634


0.9926403511281451
32498.733942835683


In [237]:
# Candidate values for each XGBoost hyperparameter explored by the randomized search.
params = dict(
    # Ensemble size and step size
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    # Tree complexity
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5],
    # Row / column sampling per tree
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    # Split penalty and L1 / L2 regularisation
    gamma=[0, 0.1, 0.3],
    reg_alpha=[0, 0.01, 0.1],
    reg_lambda=[1, 1.5, 2],
)

In [None]:
# Randomized hyperparameter search over `params` with 5-fold cross-validation.
# Candidates are ranked by negated MSE, all CPU cores are used, and the best
# configuration is refitted on the full training data (refit=True).
# The number of sampled candidates is left at the library default.
search_estimator = XGBRegressor(random_state=42)

rcv = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=params,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    refit=True,
    random_state=42,
)

In [239]:
# Run the hyperparameter search on the training split.
rcv.fit(X=X_train, y=y_train)

In [240]:
# Best mean cross-validated score found by the search. The search was set up
# with scoring="neg_mean_squared_error", so this value is a negated MSE
# (closer to zero is better).
rcv.best_score_

np.float64(-15577025183.015665)

In [241]:
# Evaluate the refitted best estimator on both splits, predicting once per split.
rcv_test_pred = rcv.predict(X_test)
rcv_train_pred = rcv.predict(X_train)

# Printed order: test R2, test MAE, two blank lines, train R2, train MAE.
rcv_report = "\n".join(
    [
        f"{r2_score(y_test, rcv_test_pred)}",
        f"{mean_absolute_error(y_test, rcv_test_pred)}",
        "",
        "",
        f"{r2_score(y_train, rcv_train_pred)}",
        f"{mean_absolute_error(y_train, rcv_train_pred)}",
    ]
)
print(rcv_report)

0.9537656530682757
67990.56876126476


0.9885002144215074
37046.55696526627


In [242]:
# Hyperparameter combination that achieved the best cross-validated score.
rcv.best_params_

{'subsample': 1.0,
 'reg_lambda': 1,
 'reg_alpha': 0.01,
 'n_estimators': 500,
 'min_child_weight': 5,
 'max_depth': 7,
 'learning_rate': 0.05,
 'gamma': 0,
 'colsample_bytree': 0.6}

In [243]:
# Final regressor, hard-coded to the hyperparameters reported by rcv.best_params_
# so it can be rebuilt without re-running the search.
tuned_params = {
    "subsample": 1,
    "reg_lambda": 1,
    "reg_alpha": 0.01,
    "n_estimators": 500,
    "min_child_weight": 5,
    "max_depth": 7,
    "learning_rate": 0.05,
    "gamma": 0,
    "colsample_bytree": 0.6,
}

xgb_regressor = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    **tuned_params,
)

In [244]:
# Bundle preprocessing and the tuned regressor into one estimator so that
# scaling and prediction travel together.
# NOTE(review): fitting the pipeline re-fits `scaler`, so the X passed to
# model.fit / model.predict should be the raw, unscaled features -- confirm.
pipeline_steps = [
    ("scaler", scaler),
    ("regressor", xgb_regressor),
]
model = Pipeline(steps=pipeline_steps)

In [245]:
# Fit every pipeline step (scaler, then regressor) on the training split.
model.fit(X=X_train, y=y_train)

In [246]:
# Evaluate the fitted pipeline on both splits, predicting once per split.
model_test_pred = model.predict(X_test)
model_train_pred = model.predict(X_train)

# Printed order: test R2, test MAE, two blank lines, train R2, train MAE.
model_report = "\n".join(
    [
        f"{r2_score(y_test, model_test_pred)}",
        f"{mean_absolute_error(y_test, model_test_pred)}",
        "",
        "",
        f"{r2_score(y_train, model_train_pred)}",
        f"{mean_absolute_error(y_train, model_train_pred)}",
    ]
)
print(model_report)

0.9537656530682757
67990.56876126476


0.9885002144215074
37046.55696526627


In [247]:
import pickle as pkl
from pathlib import Path

# Persist the fitted pipeline to disk.
# - The output directory is created first so a fresh checkout does not fail
#   with FileNotFoundError.
# - The file is opened in a `with` block so the handle is flushed and closed
#   deterministically (the bare open(...) call previously leaked it).
# NOTE: pickle files can execute arbitrary code when loaded -- only load this
# artifact from a trusted location.
model_path = Path("models/model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as model_file:
    pkl.dump(model, model_file)