In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv('/content/drive/MyDrive/datasets/DSCS_Winter23_HW2/train_kaggle.csv')
df_test = pd.read_csv('/content/drive/MyDrive/datasets/DSCS_Winter23_HW2/test_kaggle.csv')
result = df_test['id']
result = pd.DataFrame(result)
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_train['Date'] = pd.to_datetime(df_train['Date'])

In [3]:
df_test.head()

Unnamed: 0,id,Store_id,SKU_id,Date,Promo,Demand,Regular_Price,Promo_Price
0,0,1,1,2016-05-23,1.0,,128.98,119.6
1,1,1,1,2016-05-24,,,128.98,
2,2,1,1,2016-05-25,,,131.7,
3,3,1,1,2016-05-26,,,131.7,
4,4,1,1,2016-05-27,,,131.7,


In [4]:
# df_train['Day_of_week'] = df_train['Date'].dt.dayofweek
# df_train['Year'] = df_train['Date'].apply(lambda time: time.year)
df_train['Month'] = df_train['Date'].apply(lambda time: time.month)
df_train['Day'] = df_train['Date'].apply(lambda time: time.day)
df_train['Promo'].fillna(0, inplace=True)
df_train['Promo_Price'].fillna(df_train['Regular_Price'], inplace=True)
df_train['Price'] = df_train['Promo_Price'] * df_train['Promo'] + df_train['Regular_Price'] * (1 - df_train['Promo'])

# df_test['Day_of_week'] = df_test['Date'].dt.dayofweek
# df_test['Year'] = df_test['Date'].apply(lambda time: time.year)
df_test['Month'] = df_test['Date'].apply(lambda time: time.month)
df_test['Day'] = df_test['Date'].apply(lambda time: time.day)
df_test['Promo'].fillna(0, inplace=True)
df_test['Promo_Price'].fillna(df_test['Regular_Price'], inplace=True)
df_test['Price'] = df_test['Promo_Price'] * df_test['Promo'] + df_test['Regular_Price'] * (1 - df_test['Promo'])

In [5]:
df_train.drop(['Date', 'Promo_Price', 'Regular_Price'], axis=1, inplace=True)
df_test.drop(['Demand', 'id', 'Date', 'Promo_Price', 'Regular_Price'], axis=1, inplace=True)

In [6]:
df_train.head()

Unnamed: 0,Store_id,SKU_id,Promo,Demand,Month,Day,Price
0,1,1,0.0,22,1,1,163.78
1,1,1,0.0,41,2,1,163.78
2,1,1,0.0,35,3,1,163.78
3,1,1,0.0,72,4,1,163.78
4,1,1,0.0,25,5,1,163.78


In [7]:
X_train = df_train.drop(['Demand'], axis=1)
y_train = df_train['Demand']
X_test = df_test

In [8]:
scaler = StandardScaler()

X_train[['Store_id', 'Price', 'Month', 'Day']] = scaler.fit_transform(X_train[['Store_id', 'Price', 'Month', 'Day']])
X_test[['Store_id', 'Price', 'Month', 'Day']] = scaler.transform(X_test[['Store_id', 'Price', 'Month', 'Day']])

# X_train['Year'] = X_train['Year'] - 2015
# X_test['Year'] = X_test['Year'] - 2015

X_train['SKU_id'] = X_train['SKU_id'] - 1
X_test['SKU_id'] = X_test['SKU_id'] - 1

X_train[['Promo']] = X_train[['Promo']].astype(int)
X_test[['Promo']] = X_test[['Promo']].astype(int)

y_train = np.log(y_train)
y_train = y_train.replace(-np.inf, 0)
X_train

Unnamed: 0,Store_id,SKU_id,Promo,Month,Day,Price
0,-1.660707,0,0,-1.414025,-1.547807,2.252293
1,-1.660707,0,0,-1.128535,-1.547807,2.252293
2,-1.660707,0,0,-0.843045,-1.547807,2.252293
3,-1.660707,0,0,-0.557556,-1.547807,2.252293
4,-1.660707,0,0,-0.272066,-1.547807,2.252293
...,...,...,...,...,...,...
86911,2.391235,1,0,-0.272066,0.313604,0.189914
86912,2.391235,1,0,-0.272066,0.423099,0.189914
86913,2.391235,1,0,-0.272066,0.532593,0.189914
86914,2.391235,1,0,-0.272066,0.642088,0.189914


In [9]:
y_train

0        3.091042
1        3.713572
2        3.555348
3        4.276666
4        3.218876
           ...   
86911    2.197225
86912    1.098612
86913    2.564949
86914    1.098612
86915    0.000000
Name: Demand, Length: 86916, dtype: float64

In [10]:
num = [10, 20, 50]
RNG_SEED = 888
N_JOBS = 2
N_FOLDS = 5
SCORING_METRIC = 'neg_root_mean_squared_error'

In [11]:
train_models = []
train_results = []
train_model_names = []
train_metrics = []

In [12]:
train_models.append(('XGB', XGBRegressor(random_state=RNG_SEED, n_jobs=N_JOBS, objective='reg:squarederror', tree_method='gpu_hist')))

In [13]:
for name, model in train_models:
    kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RNG_SEED)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=SCORING_METRIC, n_jobs=2, verbose=1)
    train_results.append(cv_results)
    train_model_names.append(name)
    train_metrics.append(cv_results.mean())
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
    print(model)
print ('Average metrics (' + SCORING_METRIC + ') from all models:', np.mean(train_metrics))

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


XGB: -0.637618 (0.006781)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=2, num_parallel_tree=None, predictor=None,
             random_state=888, ...)
Average metrics (neg_root_mean_squared_error) from all models: -0.6376176383202268


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    5.7s finished


In [14]:
tune_results = []
tune_model_names = []

In [15]:
tune_model1 = XGBRegressor(random_state=RNG_SEED, n_jobs=N_JOBS, objective='reg:squarederror', tree_method='gpu_hist')
tune_model_names.append('XGB_1')
paramGrid1 = dict(n_estimators=range(100, 501, 100),
                  max_depth=np.array([3, 6, 9]),
                  min_child_weight=np.array([1, 2, 3]))

kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RNG_SEED)
grid1 = GridSearchCV(estimator=tune_model1, param_grid=paramGrid1, scoring=SCORING_METRIC, cv=kfold, n_jobs=N_JOBS, verbose=1)
grid_result1 = grid1.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result1.best_score_, grid_result1.best_params_))
tune_results.append(grid_result1.cv_results_['mean_test_score'])
means = grid_result1.cv_results_['mean_test_score']
stds = grid_result1.cv_results_['std_test_score']
params = grid_result1.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best: -0.609904 using {'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 100}
-0.718218 (0.005175) with: {'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100}
-0.694816 (0.005794) with: {'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200}
-0.683947 (0.005756) with: {'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 300}
-0.676601 (0.005961) with: {'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 400}
-0.670612 (0.006032) with: {'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500}
-0.718218 (0.005175) with: {'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100}
-0.694891 (0.005848) with: {'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 200}
-0.684038 (0.005750) with: {'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 300}
-0.676612 (0.006226) with: {'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 400}
-0.670692 (0.006685) with: {'max_depth': 3, 'min_child_weight':

In [16]:
BEST_MAX_DEPTH = grid_result1.best_params_['max_depth']
BEST_MIN_CHILD_WEIGHT = grid_result1.best_params_['min_child_weight']
BEST_N_ESTIMATORS = grid_result1.best_params_['n_estimators']
print('Best max_depth parameter:', BEST_MAX_DEPTH)
print('Best min_child_weight parameter:', BEST_MIN_CHILD_WEIGHT)
print('Best n_estimators parameter:', BEST_N_ESTIMATORS)

Best max_depth parameter: 9
Best min_child_weight parameter: 3
Best n_estimators parameter: 100


In [17]:
tune_model2 = XGBRegressor(n_estimators=BEST_N_ESTIMATORS, max_depth=BEST_MAX_DEPTH, min_child_weight=BEST_MIN_CHILD_WEIGHT,
                            random_state=RNG_SEED, n_jobs=N_JOBS, objective='reg:squarederror', tree_method='gpu_hist')
tune_model_names.append('XGB_2')
paramGrid2 = dict(subsample=np.array([0.7, 0.8, 0.9, 1.0]),
                  colsample_bytree=np.array([0.7, 0.8, 0.9, 1.0]))

kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RNG_SEED)
grid2 = GridSearchCV(estimator=tune_model2, param_grid=paramGrid2, scoring=SCORING_METRIC, cv=kfold, n_jobs=N_JOBS, verbose=1)
grid_result2 = grid2.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result2.best_score_, grid_result2.best_params_))
tune_results.append(grid_result2.cv_results_['mean_test_score'])
means = grid_result2.cv_results_['mean_test_score']
stds = grid_result2.cv_results_['std_test_score']
params = grid_result2.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best: -0.609904 using {'colsample_bytree': 1.0, 'subsample': 1.0}
-0.621161 (0.006983) with: {'colsample_bytree': 0.7, 'subsample': 0.7}
-0.619992 (0.005396) with: {'colsample_bytree': 0.7, 'subsample': 0.8}
-0.616410 (0.006581) with: {'colsample_bytree': 0.7, 'subsample': 0.9}
-0.616069 (0.005430) with: {'colsample_bytree': 0.7, 'subsample': 1.0}
-0.621161 (0.006983) with: {'colsample_bytree': 0.8, 'subsample': 0.7}
-0.619992 (0.005396) with: {'colsample_bytree': 0.8, 'subsample': 0.8}
-0.616410 (0.006581) with: {'colsample_bytree': 0.8, 'subsample': 0.9}
-0.616069 (0.005430) with: {'colsample_bytree': 0.8, 'subsample': 1.0}
-0.618989 (0.005768) with: {'colsample_bytree': 0.9, 'subsample': 0.7}
-0.615755 (0.004885) with: {'colsample_bytree': 0.9, 'subsample': 0.8}
-0.613294 (0.005364) with: {'colsample_bytree': 0.9, 'subsample': 0.9}
-0.611254 (0.005994) with: {'colsample_bytree': 0.9, 'subsample': 1.0}
-0.618103 (0.006309) 

In [18]:
BEST_COLSAMPLE_BYTREE = grid_result2.best_params_['colsample_bytree']
BEST_SUBSAMPLE = grid_result2.best_params_['subsample']
print('Best colsample_bytree parameter:', BEST_COLSAMPLE_BYTREE)
print('Best subsample parameter:', BEST_SUBSAMPLE)

Best colsample_bytree parameter: 1.0
Best subsample parameter: 1.0


In [19]:
final_model = XGBRegressor(n_estimators=BEST_N_ESTIMATORS, max_depth=BEST_MAX_DEPTH, min_child_weight=BEST_MIN_CHILD_WEIGHT,
                           colsample_bytree=BEST_COLSAMPLE_BYTREE, subsample=BEST_SUBSAMPLE, random_state=RNG_SEED,
                           n_jobs=N_JOBS, objective='reg:squarederror', tree_method='gpu_hist')
final_model.fit(X_train, y_train)
print(final_model)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=9, max_leaves=None,
             min_child_weight=3, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=2, num_parallel_tree=None, predictor=None,
             random_state=888, ...)


In [27]:
test_predictions =  final_model.predict(X_test)

In [28]:
test_predictions = np.exp(test_predictions)
test_predictions = np.round(test_predictions)
result['Demand'] = test_predictions
result.to_csv('submission.csv', index=False)
result

Unnamed: 0,id,Demand
0,0,330.0
1,1,128.0
2,2,94.0
3,3,92.0
4,4,83.0
...,...,...
5965,5965,2.0
5966,5966,2.0
5967,5967,7.0
5968,5968,5.0
