## [Rohlik2 Lama v6 weighted](https://www.kaggle.com/code/samvelkoch/rohlik2-lama-v6-weighted)

In [83]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import requests
import joblib

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
import torch

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [84]:
# Shared run settings: seed, parallelism and cross-validation fold count.
RANDOM_STATE = 42
N_THREADS = 12  # passed to torch below
N_FOLDS = 8
TEST_SIZE = 0.2
TIMEOUT = 100 * 3600

# Pin torch's thread count and seed NumPy's global RNG.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

In [85]:
# Load the input tables. `date` is parsed to datetime for the sales and
# calendar tables; the remaining tables are read with default settings.
calendar = pd.read_csv('../../data/calendar.csv', parse_dates=['date'])
inventory = pd.read_csv('../../data/inventory.csv')
weights = pd.read_csv('../../data/test_weights.csv')
ss = pd.read_csv('../../data/solution.csv')
train = pd.read_csv('../../data/sales_train.csv', parse_dates=['date'])
test = pd.read_csv('../../data/sales_test.csv', parse_dates=['date'])

In [86]:
# One calendar slice per warehouse, restricted to rows dated 2020-08-01 or
# later. The date filter is applied once and then narrowed by warehouse.
_recent_calendar = calendar[calendar['date'] >= "2020-08-01 00:00:00"]

Frankfurt_1 = _recent_calendar[_recent_calendar['warehouse'] == "Frankfurt_1"]
Prague_2 = _recent_calendar[_recent_calendar['warehouse'] == "Prague_2"]
Brno_1 = _recent_calendar[_recent_calendar['warehouse'] == "Brno_1"]
Munich_1 = _recent_calendar[_recent_calendar['warehouse'] == "Munich_1"]
Prague_3 = _recent_calendar[_recent_calendar['warehouse'] == "Prague_3"]
Prague_1 = _recent_calendar[_recent_calendar['warehouse'] == "Prague_1"]
Budapest_1 = _recent_calendar[_recent_calendar['warehouse'] == "Budapest_1"]

def _days_until_next_event(df, flag_col):
    """Return the number of days from each row to the nearest upcoming event.

    An event is a row where ``df[flag_col] == 1``. The nearest event at or
    after the row's own date is used, so event rows themselves get 0. Rows
    with no later event get NaN.

    ``df`` must already be sorted by ``date`` in ascending order.
    """
    # Keep the date only on event rows (NaT elsewhere), then back-fill so
    # every row sees the date of the closest event at or after it.
    event_dates = df['date'].where(df[flag_col] == 1)
    return (event_dates.bfill() - df['date']).dt.days


def process_calendar(df):
    """
    Add calendar-derived feature columns to a single warehouse's calendar.

    The input is expected to have a datetime ``date`` column and 0/1
    ``holiday`` and ``shops_closed`` columns. The input frame is not
    modified; a new frame sorted by ``date`` with a fresh index is returned.

    Added columns:
    - days_to_holiday: days until the nearest holiday on or after the row's
      date (0 on a holiday, NaN after the last holiday in the frame)
    - days_to_shops_closed: same, for days when shops are closed
    - day_after_closing: 1 if shops are open today and were closed on the
      previous row, else 0
    - long_weekend: 1 if shops are closed today and were closed on the
      previous row, else 0
    - weekday: day of week, 0 (Monday) through 6 (Sunday)
    """
    # Make sure rows are in chronological order; the shift/back-fill logic
    # below depends on it. sort_values returns a copy, so the caller's frame
    # is left untouched.
    df = df.sort_values('date').reset_index(drop=True)

    # 1-2. Distance to the next holiday / shop closure.
    df['days_to_holiday'] = _days_until_next_event(df, 'holiday')
    df['days_to_shops_closed'] = _days_until_next_event(df, 'shops_closed')

    # Closure flag of the previous row; NaN for the first row, which compares
    # unequal to 1 and therefore yields 0 in both features below.
    closed_yesterday = df['shops_closed'].shift(1) == 1

    # 3. day_after_closing: open today, closed on the previous row.
    df['day_after_closing'] = ((df['shops_closed'] == 0) & closed_yesterday).astype(int)

    # 4. long_weekend: closed today and on the previous row.
    df['long_weekend'] = ((df['shops_closed'] == 1) & closed_yesterday).astype(int)

    # 5. weekday: 0 (Monday) - 6 (Sunday)
    df['weekday'] = df['date'].dt.weekday

    return df


# Per-warehouse calendar frames, keyed by the name of the variable that holds
# each one. Order matters: it is the concatenation order used below.
_warehouse_frames = {
    'Frankfurt_1': Frankfurt_1,
    'Prague_2': Prague_2,
    'Brno_1': Brno_1,
    'Munich_1': Munich_1,
    'Prague_3': Prague_3,
    'Prague_1': Prague_1,
    'Budapest_1': Budapest_1,
}

# Names of the frames, in processing order
dfs = list(_warehouse_frames)

# Run the feature builder on every warehouse frame and collect the results
processed_dfs = []
for _name in dfs:
    processed_dfs.append(process_calendar(_warehouse_frames[_name]))

# Stack all warehouses into one frame, ordered by date, with a fresh index
_stacked = pd.concat(processed_dfs)
_stacked = _stacked.sort_values('date')
calendar_extended = _stacked.reset_index(drop=True)

In [87]:
# Left-join calendar features and inventory attributes onto both splits.
# Only the training frame additionally receives the per-`unique_id` weights.
_calendar_keys = ['date', 'warehouse']
_inventory_keys = ['unique_id', 'warehouse']

train_calendar = pd.merge(train, calendar_extended, how='left', on=_calendar_keys)
train_inventory = pd.merge(train_calendar, inventory, how='left', on=_inventory_keys)
train_data = pd.merge(train_inventory, weights, how='left', on=['unique_id'])

test_calendar = pd.merge(test, calendar_extended, how='left', on=_calendar_keys)
test_data = pd.merge(test_calendar, inventory, how='left', on=_inventory_keys)

In [None]:
# Remove `availability` from the training frame.
# NOTE(review): presumably because it is not known for the test rows — confirm.
train_data = train_data.drop('availability', axis=1)

train_data.head()

In [89]:
# Rows without a `sales` value cannot be used as training targets; discard them.
train_data = train_data.dropna(subset=['sales'])

In [None]:
# Display the column dtypes of the training frame.
train_data.dtypes

In [None]:
# Order the training rows by product id, then chronologically within each id.
train_data = train_data.sort_values(by=['unique_id', 'date'])
# Disabled alternative, kept for reference:
# train_data = train_data.set_index('date')
train_data.head()

In [92]:
# LightAutoML task definition; 'reg' selects a regression task.
task = Task('reg')

In [None]:
# Print the number of missing values in each training column.
print(train_data.isna().sum())

In [None]:
%%time

# Data-reader settings: parallel jobs, number of CV folds and the seed.
_reader_settings = {
    'n_jobs': N_THREADS,
    'cv': N_FOLDS,
    'random_state': RANDOM_STATE,
}

# Column roles: `sales` is the target, `weight` holds per-row sample weights.
# Disabled option kept for reference: 'drop': ['unique_id']
_column_roles = {
    'target': 'sales',
    'weights': 'weight',
}

# Disabled options kept for reference:
#   timeout = TIMEOUT
#   general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]}
automl = TabularAutoML(
    task=task,
    cpu_limit=N_THREADS,
    reader_params=_reader_settings,
    selection_params={'mode': 0},
    tuning_params={'max_tuning_time': 3600},
)

# Fit on the training frame; the return value holds out-of-fold predictions.
out_of_fold_predictions = automl.fit_predict(
    train_data,
    roles=_column_roles,
    verbose=2,
)

In [None]:
# Persist the fitted AutoML object to 'model.pkl' in the working directory.
joblib.dump(automl, 'model.pkl')

In [96]:
# Predict on the feature-enriched test frame with the fitted model.
test_predictions = automl.predict(test_data)

In [None]:
# Display the raw prediction values held by the prediction result.
test_predictions.data

In [98]:
# Build the submission: ids from the sample-solution file next to the first
# column of the test predictions.
# NOTE(review): assumes `ss` rows are in the same order as `test_data` — confirm.
_predicted_sales = test_predictions.data[:, 0]
_submission_ids = ss['id'].values
submission = pd.DataFrame({'id': _submission_ids, 'sales_hat': _predicted_sales})

In [None]:
# Display the submission frame.
submission

In [100]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('lama_v6_weighted.csv', index = False)

In [None]:
# Score the model on the training rows with its out-of-fold predictions.
# `submission["sales_hat"]` holds predictions for the *test* rows, which do
# not correspond to the rows of `train_data`, so it cannot be compared with
# `train_data["sales"]`.
_oof_values = out_of_fold_predictions.data[:, 0]
_true_sales = train_data["sales"].values
# Skip rows that received no out-of-fold prediction (NaN), if any.
_scored_rows = ~np.isnan(_oof_values)
print(f'MAE score: {mean_absolute_error(_true_sales[_scored_rows], _oof_values[_scored_rows])}')
