## [Rohlik2 Lama v6 weighted](https://www.kaggle.com/code/samvelkoch/rohlik2-lama-v6-weighted)

In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import requests
import joblib

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
import torch

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [5]:
train = pd.read_csv('../../data/sales_train.csv', parse_dates= ['date'])
test = pd.read_csv('../../data/sales_test.csv', parse_dates= ['date']  )
ss = pd.read_csv('../../data/solution.csv')
inventory = pd.read_csv('../../data/inventory.csv')
calendar  = pd.read_csv('../../data/calendar.csv', parse_dates= ['date'])
weights  = pd.read_csv('../../data/test_weights.csv')

In [6]:
Frankfurt_1 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Frankfurt_1"')
Prague_2 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Prague_2"')
Brno_1 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Brno_1"')
Munich_1 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Munich_1"')
Prague_3 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Prague_3"')
Prague_1 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Prague_1"')
Budapest_1 = calendar.query('date >= "2020-08-01 00:00:00" and warehouse =="Budapest_1"')

def process_calendar(df):
    """
    Обрабатывает календарный датафрейм, добавляя новые колонки:
    - days_to_holiday
    - days_to_shops_closed
    - day_after_closing
    - long_weekend
    - weekday
    """
    # Убеждаемся, что даты отсортированы
    df = df.sort_values('date').reset_index(drop=True)
    
    # 1. days_to_holiday
    df['next_holiday_date'] = df.loc[df['holiday'] == 1, 'date'].shift(-1)
    df['next_holiday_date'] = df['next_holiday_date'].bfill()
    df['days_to_holiday'] = (df['next_holiday_date'] - df['date']).dt.days
    df.drop(columns=['next_holiday_date'], inplace=True)
    
    # 2. days_to_shops_closed
    df['next_shops_closed_date'] = df.loc[df['shops_closed'] == 1, 'date'].shift(-1)
    df['next_shops_closed_date'] = df['next_shops_closed_date'].bfill()
    df['days_to_shops_closed'] = (df['next_shops_closed_date'] - df['date']).dt.days
    df.drop(columns=['next_shops_closed_date'], inplace=True)
    
    # 3. day_after_closing
    df['day_after_closing'] = (
        (df['shops_closed'] == 0) & (df['shops_closed'].shift(1) == 1)
    ).astype(int)
    
    # 4. long_weekend
    df['long_weekend'] = (
        (df['shops_closed'] == 1) & (df['shops_closed'].shift(1) == 1)
    ).astype(int)
    
    # 5. weekday
    df['weekday'] = df['date'].dt.weekday  # 0 (понедельник) - 6 (воскресенье)
    
    return df


# Список датафреймов
dfs = ['Frankfurt_1', 'Prague_2', 'Brno_1', 'Munich_1', 'Prague_3', 'Prague_1', 'Budapest_1']

# Применяем функцию ко всем датафреймам и собираем их в список
processed_dfs = [process_calendar(globals()[df]) for df in dfs]

# Конкатенируем все датафреймы в один
calendar_extended = pd.concat(processed_dfs).sort_values('date').reset_index(drop=True)

In [7]:
train_calendar = train.merge(calendar_extended, on=['date', 'warehouse'], how='left')
train_inventory = train_calendar.merge(inventory, on=['unique_id', 'warehouse'], how='left')
train_data = train_inventory.merge(weights, on=['unique_id'], how='left')

test_calendar = test.merge(calendar_extended, on=['date', 'warehouse'], how='left')
test_data = test_calendar.merge(inventory, on=['unique_id', 'warehouse'], how='left')

In [8]:
train_data = train_data.drop(columns=['availability'])

train_data.head()

Unnamed: 0,unique_id,date,warehouse,total_orders,sales,sell_price_main,type_0_discount,type_1_discount,type_2_discount,type_3_discount,...,day_after_closing,long_weekend,weekday,product_unique_id,name,L1_category_name_en,L2_category_name_en,L3_category_name_en,L4_category_name_en,weight
0,4845,2024-03-10,Budapest_1,6436.0,16.34,646.26,0.0,0.0,0.0,0.0,...,0,0,6,2375,Croissant_35,Bakery,Bakery_L2_18,Bakery_L3_83,Bakery_L4_1,1.925596
1,4845,2021-05-25,Budapest_1,4663.0,12.63,455.96,0.0,0.0,0.0,0.0,...,1,0,1,2375,Croissant_35,Bakery,Bakery_L2_18,Bakery_L3_83,Bakery_L4_1,1.925596
2,4845,2021-12-20,Budapest_1,6507.0,34.55,455.96,0.0,0.0,0.0,0.0,...,0,0,0,2375,Croissant_35,Bakery,Bakery_L2_18,Bakery_L3_83,Bakery_L4_1,1.925596
3,4845,2023-04-29,Budapest_1,5463.0,34.52,646.26,0.20024,0.0,0.0,0.0,...,0,0,5,2375,Croissant_35,Bakery,Bakery_L2_18,Bakery_L3_83,Bakery_L4_1,1.925596
4,4845,2022-04-01,Budapest_1,5997.0,35.92,486.41,0.0,0.0,0.0,0.0,...,0,0,4,2375,Croissant_35,Bakery,Bakery_L2_18,Bakery_L3_83,Bakery_L4_1,1.925596


In [9]:
train_data.dropna(subset=['sales'], inplace=True)

In [10]:
train_data.dtypes

unique_id                          int64
date                      datetime64[ns]
warehouse                         object
total_orders                     float64
sales                            float64
sell_price_main                  float64
type_0_discount                  float64
type_1_discount                  float64
type_2_discount                  float64
type_3_discount                  float64
type_4_discount                  float64
type_5_discount                  float64
type_6_discount                  float64
holiday_name                      object
holiday                            int64
shops_closed                       int64
winter_school_holidays             int64
school_holidays                    int64
days_to_holiday                  float64
days_to_shops_closed             float64
day_after_closing                  int32
long_weekend                       int32
weekday                            int32
product_unique_id                  int64
name            

In [11]:
train_data = train_data.sort_values(['unique_id', 'date'])
#train_data = train_data.set_index('date')
train_data.head()

Unnamed: 0,unique_id,date,warehouse,total_orders,sales,sell_price_main,type_0_discount,type_1_discount,type_2_discount,type_3_discount,...,day_after_closing,long_weekend,weekday,product_unique_id,name,L1_category_name_en,L2_category_name_en,L3_category_name_en,L4_category_name_en,weight
3910978,0,2022-07-18,Budapest_1,5289.0,3.97,710.89,0.0,0.0,0.0,0.0,...,0,0,0,0,Plum_0,Fruit and vegetable,Fruit and vegetable_L2_0,Fruit and vegetable_L3_0,Fruit and vegetable_L4_0,2.535369
3910983,0,2022-07-19,Budapest_1,5255.0,73.36,710.89,0.0,0.0,0.0,0.0,...,0,0,1,0,Plum_0,Fruit and vegetable,Fruit and vegetable_L2_0,Fruit and vegetable_L3_0,Fruit and vegetable_L4_0,2.535369
3910959,0,2022-07-20,Budapest_1,5334.0,558.09,710.89,0.0,0.0,0.45045,0.0,...,0,0,2,0,Plum_0,Fruit and vegetable,Fruit and vegetable_L2_0,Fruit and vegetable_L3_0,Fruit and vegetable_L4_0,2.535369
3910961,0,2022-07-21,Budapest_1,5459.0,14.03,710.89,0.0,0.0,0.45045,0.0,...,0,0,3,0,Plum_0,Fruit and vegetable,Fruit and vegetable_L2_0,Fruit and vegetable_L3_0,Fruit and vegetable_L4_0,2.535369
3910956,0,2022-07-22,Budapest_1,5461.0,558.53,710.89,0.0,0.0,0.45045,0.0,...,0,0,4,0,Plum_0,Fruit and vegetable,Fruit and vegetable_L2_0,Fruit and vegetable_L3_0,Fruit and vegetable_L4_0,2.535369


In [18]:
task = Task('reg')

N_THREADS = 4
N_FOLDS = 8
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 3600*100

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [19]:
%%time

automl = TabularAutoML(
    task = task,
    # timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    #general_params = {'use_algos':[['linear_l2', 'lgb', 'lgb_tuned']]},
    selection_params={'mode' : 0},
    tuning_params = {'max_tuning_time': 3600},
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
)

out_of_fold_predictions = automl.fit_predict(
    train_data,
    roles = {
        'target': 'sales',
        #'drop': ['unique_id']
        'weights': 'weight'
    }, 
    verbose = 2
)

[17:58:40] Stdout logging level is INFO2.
[17:58:40] Task: reg

[17:58:40] Start automl preset with listed constraints:
[17:58:40] - time: 3600.00 seconds
[17:58:40] - CPU: 4 cores
[17:58:40] - memory: 16 GB

[17:58:40] [1mTrain data shape: (4007367, 30)[0m



[17:58:57] Layer [1m1[0m train process start. Time left 3583.01 secs
[18:00:42] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[18:00:42] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[18:02:44] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[18:05:24] Time limit exceeded after calculating fold 1

[18:05:25] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-5848.615590942035[0m
[18:05:25] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[18:05:25] Time left 3195.06 secs

[18:07:25] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[18:07:25] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
[18:09:54] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
[18:12:20] Time limit exceeded after calculating fold 1

[18:12:20] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m-1541

Optimization Progress:   1%|          | 1/101 [02:44<4:34:07, 164.48s/it, best_trial=0, best_value=-1.39e+3]

[18:15:05] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[18:15:05] The set of hyperparameters [1m{'feature_fraction': 0.6872700594236812, 'num_leaves': 244, 'bagging_fraction': 0.8659969709057025, 'min_sum_hessian_in_leaf': 0.24810409748678125, 'reg_alpha': 2.5361081166471375e-07, 'reg_lambda': 2.5348407664333426e-07}[0m
 achieve -1385.5925 mse
[18:15:05] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...





[18:15:06] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m =====
[18:19:00] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m =====
[18:22:51] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m =====
[18:26:43] Time limit exceeded after calculating fold 2

[18:26:43] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m-1379.6366958947[0m
[18:26:43] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[18:26:44] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[18:26:44] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m =====
[18:35:34] Time limit exceeded after calculating fold 0

[18:35:34] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m-3151.861746469119[0m
[18:35:34] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[18:35:34] Time left 1385.61 secs

[18:35:34]

In [20]:
joblib.dump(automl, 'model.pkl')

['model.pkl']

In [21]:
test_predictions = automl.predict(test_data)

In [22]:
test_predictions.data

array([[ 21.013128],
       [107.50994 ],
       [100.87147 ],
       ...,
       [132.39137 ],
       [610.4416  ],
       [ 29.529753]], dtype=float32)

In [23]:
submission = pd.DataFrame({
    'id': ss.id.values,
    'sales_hat': test_predictions.data[:, 0],
})

In [24]:
submission

Unnamed: 0,id,sales_hat
0,1226_2024-06-03,21.013128
1,1226_2024-06-11,107.509941
2,1226_2024-06-13,100.871468
3,1226_2024-06-15,95.317230
4,1226_2024-06-09,18.273584
...,...,...
47016,4572_2024-06-03,60.370441
47017,3735_2024-06-04,117.256508
47018,3735_2024-06-03,132.391373
47019,2129_2024-06-03,610.441589


In [32]:
submission.to_csv('lama_v6_weighted.csv', index = False)

In [37]:
submission['sales_hat'].values

array([ 21.013128, 107.50994 , 100.87147 , ..., 132.39137 , 610.4416  ,
        29.529753], dtype=float32)

In [38]:
print(f'MAE score: {mean_absolute_error(train_data["sales"].values, out_of_fold_predictions.data[:, 0])}')

ValueError: Found input variables with inconsistent numbers of samples: [4007367, 47021]