## setup

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn
%matplotlib inline

# project paths
project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    csv_path = os.path.join(file_path, filename)
    return pd.read_csv(csv_path, parse_dates=date_cols)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)

### Read Data

In [2]:
train = read_data("TRAIN.CSV", date_cols=["Date"])
test = read_data("TEST_FINAL.csv", date_cols=["Date"])
submission = read_data("SAMPLE.csv")

In [3]:
from prepare import prepare_data
X_train1, y_train1, X_test1, full_pipe = prepare_data(train, test)
#length of df
n = len(X_train1)
# training set
X_train = X_train1[:int(n*0.7)]
y_train = y_train1[:int(n*0.7)]
# validation set
X_valid = X_train1[int(n*0.7):]
y_valid = y_train1[int(n*0.7):]
# test set
X_test = X_test1.copy()

In [None]:
param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    gbm = lgb.train(param, dtrain)

In [6]:
import optuna
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline

def objective(trial):
    # hyperparameter setting
    num_leaves = trial.suggest_int('num_leaves', 2, 256)
    reg_alpha = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    reg_lambda = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    learning_rate = trial.suggest_loguniform('learning_rate',0.001,0.2)
    
    lgbm = make_pipeline(full_pipe, LGBMRegressor(random_state=42, n_jobs=-1))
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_valid)
    msle = mean_squared_log_error(y_valid, y_pred)
    return msle

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=60)

[32m[I 2021-09-18 23:43:47,595][0m A new study created in memory with name: no-name-004664af-99ce-4423-b2f0-bcdf4790f3a1[0m
[32m[I 2021-09-18 23:43:48,323][0m Trial 0 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 229, 'reg_alpha': 2.335634945840647, 'learning_rate': 0.15755334891853467}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:43:49,042][0m Trial 1 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 136, 'reg_alpha': 3.3837383441711726, 'learning_rate': 0.009755453699452615}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:43:49,785][0m Trial 2 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 111, 'reg_alpha': 7.046689195177098, 'learning_rate': 0.02714148089052181}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:43:50,530][0m Trial 3 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 108, 'reg_alpha'

[32m[I 2021-09-18 23:44:12,437][0m Trial 33 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 162, 'reg_alpha': 7.370456251287568e-06, 'learning_rate': 0.020637849356277153}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:44:13,214][0m Trial 34 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 231, 'reg_alpha': 2.5777860052716494e-08, 'learning_rate': 0.04127659341491296}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:44:13,956][0m Trial 35 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 108, 'reg_alpha': 7.640864570501905, 'learning_rate': 0.15600101849332385}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:44:14,679][0m Trial 36 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 80, 'reg_alpha': 1.1692353989881262e-05, 'learning_rate': 0.07973393267764392}. Best is trial 0 with value: 0.10199453047681575.[0

In [7]:
print('Minimum mean squared error: ' + str(study.best_value))
print('Best parameter: ' + str(study.best_params))

Minimum mean squared error: 0.10199453047681575
Best parameter: {'num_leaves': 229, 'reg_alpha': 2.335634945840647, 'learning_rate': 0.15755334891853467}


In [12]:
def objective(trial):
    # hyperparameter setting
    num_leaves = trial.suggest_int('num_leaves', 2, 256)
    reg_alpha = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    reg_lambda = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    #learning_rate = trial.suggest_loguniform('learning_rate',0.0001,0.4)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    n_estimators = trial.suggest_int('n_estimators',100,1000)
    
    lgbm = make_pipeline(full_pipe, LGBMRegressor(random_state=42, n_jobs=-1))
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_valid)
    msle = mean_squared_log_error(y_valid, y_pred)
    return msle

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=150)

[32m[I 2021-09-18 23:58:39,758][0m A new study created in memory with name: no-name-3815d3c1-d1a1-48ce-838d-f0faa4dffa4c[0m
[32m[I 2021-09-18 23:58:40,434][0m Trial 0 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 49, 'reg_alpha': 6.659965675575887e-08, 'min_child_samples': 40, 'n_estimators': 902}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:58:41,146][0m Trial 1 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 177, 'reg_alpha': 5.488947567094909e-06, 'min_child_samples': 90, 'n_estimators': 658}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:58:41,879][0m Trial 2 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 192, 'reg_alpha': 0.04939721625889302, 'min_child_samples': 38, 'n_estimators': 835}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:58:42,574][0m Trial 3 finished with value: 0.10199453047681575 and parameters: 

[32m[I 2021-09-18 23:59:03,017][0m Trial 32 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 59, 'reg_alpha': 0.28213123971192483, 'min_child_samples': 13, 'n_estimators': 454}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:03,800][0m Trial 33 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 131, 'reg_alpha': 0.8797198792497036, 'min_child_samples': 12, 'n_estimators': 612}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:04,512][0m Trial 34 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 136, 'reg_alpha': 9.646684558907696, 'min_child_samples': 29, 'n_estimators': 659}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:05,231][0m Trial 35 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 3, 'reg_alpha': 5.43476573009405e-08, 'min_child_samples': 58, 'n_estimators': 920}. Best is trial 0 with value: 0.

[32m[I 2021-09-18 23:59:25,875][0m Trial 64 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 182, 'reg_alpha': 4.2469635724005925e-07, 'min_child_samples': 43, 'n_estimators': 946}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:26,575][0m Trial 65 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 36, 'reg_alpha': 2.350201801957768e-08, 'min_child_samples': 40, 'n_estimators': 916}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:27,283][0m Trial 66 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 167, 'reg_alpha': 0.0006812287318068353, 'min_child_samples': 60, 'n_estimators': 808}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:28,005][0m Trial 67 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 14, 'reg_alpha': 7.925892568812012e-08, 'min_child_samples': 49, 'n_estimators': 841}. Best is trial 0 wi

[32m[I 2021-09-18 23:59:49,301][0m Trial 96 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 229, 'reg_alpha': 8.990759685981494e-08, 'min_child_samples': 45, 'n_estimators': 953}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:50,010][0m Trial 97 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 221, 'reg_alpha': 9.645057808704842e-07, 'min_child_samples': 39, 'n_estimators': 787}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:50,734][0m Trial 98 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 242, 'reg_alpha': 4.658133324973902e-06, 'min_child_samples': 41, 'n_estimators': 965}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-18 23:59:51,453][0m Trial 99 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 34, 'reg_alpha': 1.0052336288944945e-08, 'min_child_samples': 37, 'n_estimators': 913}. Best is trial 0 w

[32m[I 2021-09-19 00:00:13,157][0m Trial 128 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 175, 'reg_alpha': 0.00022230973158669678, 'min_child_samples': 69, 'n_estimators': 862}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-19 00:00:13,883][0m Trial 129 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 255, 'reg_alpha': 2.0251546534337575e-05, 'min_child_samples': 62, 'n_estimators': 903}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-19 00:00:14,601][0m Trial 130 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 180, 'reg_alpha': 3.722992359781807e-05, 'min_child_samples': 73, 'n_estimators': 800}. Best is trial 0 with value: 0.10199453047681575.[0m
[32m[I 2021-09-19 00:00:15,343][0m Trial 131 finished with value: 0.10199453047681575 and parameters: {'num_leaves': 212, 'reg_alpha': 3.396970932440059e-08, 'min_child_samples': 50, 'n_estimators': 825}. Best is tri

In [13]:
print('Minimum mean squared log error: ' + str(study.best_value))
print('Best parameter: ' + str(study.best_params))

Minimum mean squared log error: 0.10199453047681575
Best parameter: {'num_leaves': 49, 'reg_alpha': 6.659965675575887e-08, 'min_child_samples': 40, 'n_estimators': 902}


In [19]:
from  sklearn.model_selection import cross_val_score

lgbm = make_pipeline(full_pipe, LGBMRegressor(num_leaves=49,
                                              reg_alpha=6.659965675575887e-08,
                                              min_child_samples=40,
                                              n_estimators=902,
                                              random_state=42,n_jobs=-1))
scores = cross_val_score(lgbm, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Scores: [0.10217816 0.07070072 0.05705155        nan 0.28774859]
Average score: nan


In [20]:
lgbm.fit(X_train1, y_train1)
submission['Sales'] = lgbm.predict(X_test)
save_dataframe(submission, "lgbm_optuna.csv")

In [22]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    # hyperparameter setting
    max_depth = trial.suggest_int("max_depth",5,40)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])
    n_estimators = trial.suggest_int("n_estimators",100,500)
    
    rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42))
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_valid)
    msle = mean_squared_log_error(y_valid, y_pred)
    return msle

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[32m[I 2021-09-19 00:24:06,997][0m A new study created in memory with name: no-name-592acede-5004-4a0c-ada1-0b333e862321[0m
[32m[I 2021-09-19 00:25:12,900][0m Trial 0 finished with value: 0.116745845740955 and parameters: {'max_depth': 27, 'min_samples_split': 4, 'min_samples_leaf': 13, 'max_features': 'auto', 'n_estimators': 129}. Best is trial 0 with value: 0.116745845740955.[0m
[32m[I 2021-09-19 00:26:20,951][0m Trial 1 finished with value: 0.116745845740955 and parameters: {'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 18, 'max_features': 'sqrt', 'n_estimators': 269}. Best is trial 0 with value: 0.116745845740955.[0m
[32m[I 2021-09-19 00:27:25,612][0m Trial 2 finished with value: 0.116745845740955 and parameters: {'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2', 'n_estimators': 250}. Best is trial 0 with value: 0.116745845740955.[0m
[32m[I 2021-09-19 00:28:30,367][0m Trial 3 finished with value: 0.116745845740955 

[32m[I 2021-09-19 00:57:50,971][0m Trial 31 finished with value: 0.116745845740955 and parameters: {'max_depth': 32, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'auto', 'n_estimators': 229}. Best is trial 0 with value: 0.116745845740955.[0m
[32m[I 2021-09-19 00:58:53,977][0m Trial 32 finished with value: 0.116745845740955 and parameters: {'max_depth': 35, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'n_estimators': 258}. Best is trial 0 with value: 0.116745845740955.[0m
[32m[I 2021-09-19 00:59:57,363][0m Trial 33 finished with value: 0.116745845740955 and parameters: {'max_depth': 37, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'n_estimators': 231}. Best is trial 0 with value: 0.116745845740955.[0m
[32m[I 2021-09-19 01:01:00,820][0m Trial 34 finished with value: 0.116745845740955 and parameters: {'max_depth': 26, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'auto', 'n_estimators': 183

In [23]:
print('Minimum mean squared log error: ' + str(study.best_value))
print('Best parameter: ' + str(study.best_params))

Minimum mean squared log error: 0.116745845740955
Best parameter: {'max_depth': 27, 'min_samples_split': 4, 'min_samples_leaf': 13, 'max_features': 'auto', 'n_estimators': 129}
