<a href="https://colab.research.google.com/github/blackcaer/SMF-training/blob/main/SMF_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Initializing
from google.colab import drive
drive.mount('/content/drive/')

"""gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')"""

!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd  # https://github.com/microsoft/LightGBM/issues/5914

!pip install lightgbm --config-settings=cmake.define.USE_GPU=ON

import lightgbm as lgb
from IPython.display import display

import os
import json
import time
import gc
import random
import psutil
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from datetime import datetime
from sys import getsizeof

from collections import defaultdict, Counter
from itertools import combinations

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

from scipy.fftpack import fft
from scipy.stats import zscore

from torch.cuda import get_device_name,is_available
#from os import cpu_count

#print(f"Liczba rdzeni procesora: {cpu_count()} (realnych 2x mniej prawdopodobnie)")
gpu_available = is_available()

if gpu_available:
    print("GPU jest dostępne.")
    print("Nazwa GPU:", get_device_name(0))
else:
    print("GPU nie jest dostępne.")
_features_added=False


Mounted at /content/drive/


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



GPU nie jest dostępne.


In [2]:
#@title Constants

ITEMS_PHSM_JSON = '/content/drive/My Drive/SMF_files/items_phsm.json'
PLAYER_COUNT_JSON = '/content/drive/My Drive/SMF_files/rust_player_count_interpolated.json'
TRAIN_PATH = '/content/drive/My Drive/SMF_files/train_set.csv'
VALID_PATH = '/content/drive/My Drive/SMF_files/validation_set.csv'

TRAIN_PATH_TEST = '/content/drive/My Drive/SMF_files/train_set_test.csv'
VALID_PATH_TEST = '/content/drive/My Drive/SMF_files/validation_set_test.csv'
ITEMS_PHSM_JSON_TEST = '/content/drive/My Drive/SMF_files/items_phsm_test.json'

SPIKES_TH=2
SPIKES_TH_TEST=1.5
SPIKES_PATH='/content/drive/My Drive/SMF_files/spikes_correction.csv'
SPIKES_PATH_TEST='/content/drive/My Drive/SMF_files/spikes_correction_test.csv'

MODEL_SAVES_PATH='/content/drive/MyDrive/SMF_files/model_saves'

ITEMS_TO_EXCLUDE=['Metal Tree Door'] # 'Metal Tree Door' - like one of the rarest items in the game, very few sales since 2018

In [3]:
#@title Params
TEST_MODE = 1
MAKE_SPIKES_CORR_FILE=0


In [4]:
#@title reducer

"""reducing.py
Author: Kirgsn, 2018

Use like this:
>>> import reducing
>>> df = reducing.Reducer().reduce(df)
"""
from joblib import Parallel, delayed
from fastprogress import master_bar, progress_bar

#__all__ = ['Reducer']

def measure_time_mem(func):
    def wrapped_reduce(self, df, *args, **kwargs):
        # pre
        mem_usage_orig = df.memory_usage().sum() / self.memory_scale_factor
        start_time = time.time()
        # exec
        ret = func(self, df, *args, **kwargs)
        # post
        mem_usage_new = ret.memory_usage().sum() / self.memory_scale_factor
        end_time = time.time()
        print(f'reduced df from {mem_usage_orig:.4f} MB '
              f'to {mem_usage_new:.4f} MB '
              f'in {(end_time - start_time):.2f} seconds')
        gc.collect()
        return ret
    return wrapped_reduce


class Reducer:
    """
    Class that takes a dict of increasingly big numpy datatypes to transform
    the data of a pandas dataframe into, in order to save memory usage.
    """
    memory_scale_factor = 1024**2  # memory in MB

    def __init__(self, conv_table=None, use_categoricals=True, n_jobs=-1):
        """
        :param conv_table: dict with np.dtypes-strings as keys
        :param use_categoricals: Whether the new pandas dtype "Categoricals"
                shall be used
        :param n_jobs: Parallelization rate
        """

        self.conversion_table = \
            conv_table or {'int': [np.int8, np.int16, np.int32, np.int64],
                           'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
                           'float': [np.float32, ]}
        self.null_int = {   np.int8:  pd.Int8Dtype,
                            np.int16: pd.Int16Dtype,
                            np.int32: pd.Int32Dtype,
                            np.int64: pd.Int64Dtype,
                            np.uint8: pd.UInt8Dtype,
                            np.uint16:pd.UInt16Dtype,
                            np.uint32:pd.UInt32Dtype,
                            np.uint64:pd.UInt64Dtype}

        self.use_categoricals = use_categoricals
        self.n_jobs = n_jobs

    def _type_candidates(self, k):
        for c in self.conversion_table[k]:
            i = np.iinfo(c) if 'int' in k else np.finfo(c)
            yield c, i

    @measure_time_mem
    def reduce(self, df, verbose=False):
        """Takes a dataframe and returns it with all data transformed to the
        smallest necessary types.

        :param df: pandas dataframe
        :param verbose: If True, outputs more information
        :return: pandas dataframe with reduced data types
        """
        ret_list = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(progress_bar(list(delayed(self._reduce)
                                                (df[c], c, verbose) for c in
                                                df.columns)))

        del df
        gc.collect()
        return pd.concat(ret_list, axis=1)

    def _reduce(self, s, colname, verbose):
        try:
            isnull = False
            # skip NaNs
            if s.isnull().any():
                isnull = True
            # detect kind of type
            coltype = s.dtype
            if np.issubdtype(coltype, np.integer):
                conv_key = 'int' if s.min() < 0 else 'uint'
            elif np.issubdtype(coltype, np.floating):
                conv_key = 'float'
                asint = s.fillna(0).astype(np.int64)
                result = (s - asint)
                result = np.abs(result.sum())
                if result < 0.01:
                    conv_key = 'int' if s.min() < 0 else 'uint'
            else:
                if isinstance(coltype, object) and self.use_categoricals:
                    # check for all-strings series
                    if s.apply(lambda x: isinstance(x, str)).all():
                        if verbose: print(f'convert {colname} to categorical')
                        return s.astype('category')
                if verbose: print(f'{colname} is {coltype} - Skip..')
                return s
            # find right candidate
            for cand, cand_info in self._type_candidates(conv_key):
                if s.max() <= cand_info.max and s.min() >= cand_info.min:
                    if verbose: print(f'convert {colname} to {cand}')
                    if isnull:
                        return s.astype(self.null_int[cand]())
                    else:
                        return s.astype(cand)

            # reaching this code is bad. Probably there are inf, or other high numbs
            print(f"WARNING: {colname} doesn't fit the grid with \nmax: {s.max()} "
                f"and \nmin: {s.min()}")
            print('Dropping it..')
        except Exception as ex:
            print(f'Exception for {colname}: {ex}')
            return s

def reduce_mem_usage(df):
  return Reducer().reduce(df)

In [5]:
#@title Helpers
def show_metrics(y_pred_train, y_train, y_pred_test, y_test, y_pred_naive, label="[no label]"):
  """Function to display metrics"""

  print('=' * 8 + " " + label + " " + '=' * 8)

  rmse_naive = mean_squared_error(y_test, y_pred_naive, squared=False)
  #rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
  rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)

  print(f'RMSE change: {(1 - rmse_test / rmse_naive) * -100:.2f}%   (naive->test)')


  results = {
      "Metric": ["Accuracy", "RMSE", "MAE", "MAPE", "Max Error", "Median Absolute Error"],
      "Train": calc_accuracy(y_pred_train, y_train, "Train accuracy:"),
      "Valid": calc_accuracy(y_pred_test, y_test, "Validation accuracy:"),
      "Naive": calc_accuracy(y_pred_naive, y_test, "Naive accuracy:")
  }

  df = pd.DataFrame(results)
  display(df)

def calc_accuracy(pred, actual, label=""):
  """ Returns accuracy metrics: accuracy, rmse, mae, mape, max_error, median_absolute_error """
  N=3
  errors = np.abs(pred - actual)
  accuracy = round(100 * (1 - np.mean(errors / actual)), N)
  max_error = round(np.max(errors), N)
  std_deviation = round(np.std(errors), N)
  mae = round(np.mean(errors), N)
  median_absolute_error = round(np.median(errors), N)
  mape = round(mean_absolute_percentage_error(actual, pred), N)
  rmse = round(mean_squared_error(actual, pred, squared=False), N)

  return accuracy, rmse, mae, mape, max_error, median_absolute_error


def _print_columns_info(df, show_type=1, show_minmax=0, first_x_cols=50):
    print("Column names:", df.columns)
    print("Columns number:", len(df.columns))

    lst = []
    cols = ['Column']
    if show_type:
        cols.append('Type')
    if show_minmax:
        cols.append('Min')
        cols.append('Max')

    for column in df.columns:
        row = {'Column': column}
        if show_type:
            row['Type'] = df[column].dtype
        if show_minmax:
            try:
                row['Min'] = df[column].min()
                row['Max'] = df[column].max()
            except TypeError as e:
                row['Min'] = None
                row['Max'] = None

        lst.append(row)

    summary_df = pd.DataFrame(lst, columns=cols)

    if first_x_cols:
        summary_df = summary_df[:first_x_cols]
    # Print the summary DataFrame
    with pd.option_context('display.max_columns', None):
        print(summary_df)

def print_importances(model,start=0,print_tab=False,name="",figsize=(10, 18),end=-1):
  importance = model.feature_importance(importance_type='gain')  # 'split' lub 'gain'
  feature_names = model.feature_name()

  trim=0
  importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
  importance_df = importance_df.sort_values(by='Importance', ascending=False)[start:end]

  # Wizualizacja
  print(f"Trimmed first {trim} features to see better")
  plt.figure(figsize=figsize)
  plt.barh(importance_df['Feature'], importance_df['Importance'])
  plt.xlabel('Importance')
  plt.title(f'Feature Importance {name}')
  plt.style.use('dark_background')
  plt.gca().invert_yaxis()
  plt.show()
  pd.set_option('display.max_rows', None)
  if print_tab:
    display(importance_df)

def aggregate_pricehistories(pricehistories):
    interpolated_histories = []
    max_date = max(df.index.max() for df in pricehistories)
    for df in pricehistories:
        df_resampled = df.resample('D').interpolate(method='linear')
        full_date_range = pd.date_range(start=df_resampled.index.min(), end=max_date, freq='D')
        df_reindexed = df_resampled.reindex(full_date_range)
        df_filled = df_reindexed.ffill()

        interpolated_histories.append(df_filled)

    all_data = pd.concat(interpolated_histories)

    sum = all_data.groupby(all_data.index).sum()
    med = all_data.groupby(all_data.index).median()
    mean = all_data.groupby(all_data.index).mean()

    #plot_pricehistory(interpolated_histories,end_names,"Test",reset_index=0)
    plot_pricehistory([med[:],mean[:]],['med','mean'],"Test")
    plot_pricehistory([sum[:]],['sum'],"Test",1)
    return sum,med,mean

def col_from_idx(df,idx_name,pos=None):
  df[idx_name] = df.index.get_level_values(idx_name)
  if pos is not None:
    cols = df.columns.tolist()
    cols.insert(pos, cols.pop(cols.index(idx_name)))
    df = df[cols]

def rmse_top_items_analysis(y_actual,y_pred,data_mapped,show_top_items=False):
  squared_errors = np.power(np.abs(y_actual - y_pred),2)

  data_mapped_with_errors = data_mapped.assign(squared_error=squared_errors.values)
  avg_rmse_per_item = np.sqrt(data_mapped_with_errors.groupby('name')['squared_error'].mean())

  top_item_rmse = avg_rmse_per_item.nlargest(10)
  if show_top_items:
    print("Top items with largest rmse:")
    print(top_item_rmse)

  items_to_exclude = top_item_rmse.index
  mask = ~data_mapped.index.get_level_values('name').isin(items_to_exclude)

  filtered_data_mapped = data_mapped[mask]
  filtered_y_actual = y_actual[mask]
  filtered_y_pred = y_pred[mask]
  rmse_without_top_items = mean_squared_error(filtered_y_actual, filtered_y_pred,squared=False)
  rmse_normal = mean_squared_error(y_actual, y_pred, squared=False)
  print(f'\nRMSE without top items: {rmse_without_top_items:.4f} (Change {(rmse_without_top_items/rmse_normal-1)*100:.2f}%)\n')
  return top_item_rmse

def plot_pricehistory(pricehistories:list,labels:list,title,day_interval=60,relative_x_axis=False,figsize=(16, 6)):
  plt.style.use('dark_background')
  plt.figure(figsize=figsize)
  ax = plt.gca()
  for i in range(len(pricehistories)):
    if relative_x_axis:
      days_from_start = (pricehistories[i].index - pricehistories[i].index[0]).days

      plt.plot(days_from_start, pricehistories[i].values, label=labels[i])
      ax.set_xticks(days_from_start[::day_interval])
      ax.set_xticklabels(days_from_start[::day_interval])
    else:
      plt.plot(pricehistories[i], label=labels[i])
      #ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
      ax.xaxis.set_major_locator(mdates.DayLocator(interval=day_interval))
      ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

  ax.grid(True)
  ax.grid(color='gray', linestyle='--', linewidth=0.5)
  plt.xticks(rotation=45)
  plt.xlabel('Date' if not relative_x_axis else 'Days from Start')
  plt.ylabel('Price')
  plt.title(title)
  plt.legend()
  plt.show()

def plot_predictions_vs_actuals(test_dataset, predictions, item_name,label_main="",label1='real',label2='pred'):
  item_data = test_dataset.xs(item_name, level='name',drop_level=False)

  if len(item_data)==0:
    print(f"{item_name} is not in given data")
    return

  item_actuals = item_data['price']
  item_predictions = predictions[item_data.index]

  # Połącz dane w DataFrame
  plot_df = pd.DataFrame({'date': item_data.index.get_level_values('date'), 'actual': item_actuals, 'predicted': item_predictions})

  rmse = mean_squared_error(item_actuals, item_predictions, squared=False)

  print(f'RMSE {item_name}: {rmse}')
  # Utwórz wykres
  plt.style.use('dark_background')
  plt.figure(figsize=(12, 6))
  plt.plot(plot_df['date'], plot_df['actual'], label=label1)
  plt.plot(plot_df['date'], plot_df['predicted'], label=label2)

  # Dodaj etykiety i tytuł
  plt.xticks(rotation=45)
  plt.xlabel('Data')
  plt.ylabel('Cena')
  plt.title(f'{item_name}: {label_main} {label1} vs {label2}')
  plt.legend()
  plt.show()



In [6]:
#@title DataPrepper
class DataPrepper:
    def __init__(self):
        self.val_df = None
        self.train_df = None
        self.items_phsm_df = None
        self.rust_player_count_df = None
        self.dataset: pd.DataFrame = None
        self.columns_to_drop = [
            'previewUrl', 'views', 'timeCreated', 'timeRefreshed', 'isAvailableOnStore', 'creatorName',
            'appId',
            'id', 'nameId','hasGlow', 'hasCutout','timeAccepted'] # 'hasGlow', 'hasCutout' - no gain, no splits
        self.desired_column_order = ['date', 'price', 'volume', 'name', 'playerCount', 'supplyTotalEstimated',
                                     'storePrice',
                                     'glowRatio',  'cutoutRatio', 'hasGlowSights', 'facepunchSkin',
                                     'itemType',
                                     'itemCollection',
                                     ]

    def _main(self):
        raise NotImplementedError("Func only for tests")
        self.load_data()

        self.preprocess_data()

        self.dataset = self.add_features(self.dataset)

        _print_columns_info(self.dataset)

        # self.train_df, self.val_df = train_test_split(self.dataset, test_size=0.2, shuffle=False, stratify=None)

        # self.save_dataset()

    def load_data(self):
        """
        Loads self.items_phsm_df and self.rust_player_count_df from files.
        """
        with open(ITEMS_PHSM_JSON if not TEST_MODE else ITEMS_PHSM_JSON_TEST, 'r') as f:
            items_phsm = json.load(f)

        with open(PLAYER_COUNT_JSON, 'r') as f:
            rust_player_count_json = json.load(f)

        items_phsm = [item for item in items_phsm if item['name'] not in ITEMS_TO_EXCLUDE]

        phsm_records = self.unfold_phsm(items_phsm)

        self.items_phsm_df = pd.DataFrame(phsm_records)

        self.rust_player_count_df = pd.DataFrame(rust_player_count_json)

        print("Loaded")

    def preprocess_data(self):
        """
        Prepares, merges data and assigns it to self.dataset
        """

        self.prep_player_count()
        self.prep_items_phsm()
        #display(self.dataset.head())
        #display(self.dataset.head())

        merged_df = pd.merge(self.items_phsm_df, self.rust_player_count_df, on='date',
                             how='left')

        ordered_df = merged_df.reindex(columns=self.desired_column_order)

        #ordered_df.drop(columns=['timeAccepted'], inplace=True,errors='ignore')  # Drop unused columns

        self.dataset = ordered_df

    @staticmethod
    def unfold_phsm(items_phsm):
        phsm_records = []
        for item in items_phsm:
            for phsm_entry in item['phsm']:
                record = {k: v for k, v in item.items() if k != 'phsm'}
                record.update(phsm_entry)
                phsm_records.append(record)
        return phsm_records

    def prep_player_count(self):
        self.rust_player_count_df = self.rust_player_count_df.rename(columns={'Date': 'date'})
        self.rust_player_count_df = self.rust_player_count_df.rename(columns={'Player_count': 'playerCount'})
        self.rust_player_count_df['date'] = pd.to_datetime(self.rust_player_count_df['date'])

        return self.rust_player_count_df

    def prep_items_phsm(self):
        self.items_phsm_df.rename(columns={'median': 'price'},inplace=True)
        self.items_phsm_df['date'] = pd.to_datetime(self.items_phsm_df['date'])
        self.items_phsm_df['timeAccepted'] = pd.to_datetime(self.items_phsm_df['timeAccepted'], errors='coerce')
        self.items_phsm_df['date'] = pd.to_datetime(self.items_phsm_df['date'])
        self.items_phsm_df.drop(columns=self.columns_to_drop,inplace=True)  # Delete unnecessary columns
        self.items_phsm_df = self.items_phsm_df.sort_values(by=['name', 'date'])


        self.items_phsm_df.reset_index(drop=True)

        def normalize_spikes(group, threshold):
            name = group.iloc[0]['name']
            prices = group['price'].to_numpy()
            corrections = []
            num_spikes = 0
            i = 3

            while i < len(group) - 3:
                current_price = prices[i]
                window = prices[i-4:i+4] # 4 before i and i + 3 after i, so indexes are correct
                median = np.median(window)

                if current_price > threshold * median:
                    num_spikes += 1
                    idx=group.index[i]
                    corrections.append((idx, np.mean(window)))  # index and new price
                i += 1

            if num_spikes:
                print(f"Prepared normalization for {num_spikes} spikes for {name}")

            return corrections

        if MAKE_SPIKES_CORR_FILE:
            corrections = self.items_phsm_df.groupby('name').apply(normalize_spikes, threshold=SPIKES_TH if not TEST_MODE else SPIKES_TH_TEST).sum()#.sum().tolist()

            corrections_df = pd.DataFrame(corrections, columns=['index', 'new_price'])
            corrections_df.to_csv(SPIKES_PATH if not TEST_MODE else SPIKES_PATH_TEST, index=False)
        #return self.items_phsm_df.groupby('name').apply(normalize_spikes,threshold=3)

        #return self.items_phsm_df

    @staticmethod
    def add_features(dataset,change_original=False):
        raise NotImplementedError("Run window with add_features implementation first") #Add features is often changed so it is implemented in other window so I don't have to create new object every time i change it



In [7]:
#@title Helpers 2
def split_dataset(X,y, test_size):

    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=421)

    groups = X.index.get_level_values('name')

    train_idx, test_idx = next(splitter.split(X, y, groups))

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    return X_train, X_test, y_train, y_test

def autosave_model(model):
  folder_path=MODEL_SAVES_PATH
  curr_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
  file_path = os.path.join(folder_path, f"lightgbm_model_{curr_time}.txt")
  model.save_model(file_path)

In [8]:
#@title Load


dp = DataPrepper()
dp.load_data()
dp.preprocess_data()

# reduce_mem_usage in other places fucks something up, doesn't give much and is slow
dp.items_phsm_df= reduce_mem_usage(dp.items_phsm_df)
dp.rust_player_count_df = reduce_mem_usage(dp.rust_player_count_df)

display(dp.dataset.head())

Loaded


reduced df from 0.4959 MB to 0.2274 MB in 8.02 seconds


reduced df from 0.0593 MB to 0.0445 MB in 0.45 seconds


Unnamed: 0,date,price,volume,name,playerCount,supplyTotalEstimated,storePrice,glowRatio,cutoutRatio,hasGlowSights,facepunchSkin,itemType,itemCollection
0,2024-05-30,6.06,6,Abyss Boots,123565,10236,1.49,0.0,0.0,False,False,Boots,Abyss
1,2024-05-31,2.16,25,Abyss Boots,130051,10236,1.49,0.0,0.0,False,False,Boots,Abyss
2,2024-06-01,2.07,17,Abyss Boots,125705,10236,1.49,0.0,0.0,False,False,Boots,Abyss
3,2024-06-02,2.08,13,Abyss Boots,122606,10236,1.49,0.0,0.0,False,False,Boots,Abyss
4,2024-06-03,1.96,22,Abyss Boots,115998,10236,1.49,0.0,0.0,False,False,Boots,Abyss


In [9]:
#@title Update prices from file (spikes)

def update_prices_from_file(df, correction_file):
    # Wczytanie poprawek
    corrections = pd.read_csv(correction_file)

    # Ensure the correction DataFrame has appropriate types and indices
    corrections.set_index('index', inplace=True)

    # Update prices in the main DataFrame using the corrections
    df=df.copy()
    df.update(corrections[['new_price']].rename(columns={'new_price': 'price'}))

    return df

names=['Nocturnal Beast AR','Wooden Base Box','Bone Armor Bandana']  #Snakebite SAR

corrections = pd.read_csv(SPIKES_PATH if not TEST_MODE else SPIKES_PATH_TEST)
itemsphsm_before=dp.items_phsm_df.copy()     # !!!!!!!!!!!!

itemsphsm_after = update_prices_from_file(itemsphsm_before, SPIKES_PATH if not TEST_MODE else SPIKES_PATH_TEST)

itemsphsm_after.set_index(['name','date'],inplace=True)
itemsphsm_after.sort_index(inplace=True)

names=set()
for i in corrections[['index']].values:
  names.add(itemsphsm_before.loc[i].reset_index()['name'][0])
names=list(names)

itemsphsm_before.set_index(['name','date'],inplace=True)
itemsphsm_before.sort_index(inplace=True)

for name in names[::20]:
  b=itemsphsm_before.loc[name][['price']]
  a=itemsphsm_after.loc[name][['price']]
  #plot_pricehistory([b,a],['before','after'],f'spikes {name}',figsize=(12,6))


In [10]:
#@title Def add_features


def add_features(dataset):
  to_interpolate = ['price', 'volume', 'playerCount']
  bool_columns = []#[ 'facepunchSkin', 'is_weekend']#'hasGlowSights','hasGlow', 'hasCutout',
  int_columns = ['year', 'month', 'day', 'volume', 'playerCount', 'supplyTotalEstimated', 'weekday']
  categorical_columns = ['itemType', 'itemCollection']

  columns_to_keep = ['price', 'volume', 'playerCount', 'supplyTotalEstimated', 'storePrice', 'itemType', 'itemCollection'] # , 'glowRatio', 'cutoutRatio', 'facepunchSkin'

  dataset = dataset[columns_to_keep]

  original_column_order = dataset.columns.tolist()

  # Interpolating data
  interpolated_data = dataset.groupby('name')[to_interpolate].apply(
      lambda group: group.reset_index('name', drop=True).asfreq('D').interpolate(method='linear'))

  not_interpolated_data = dataset.groupby('name')[dataset.columns.difference(to_interpolate)].apply(
      lambda group: group.reset_index('name', drop=True).asfreq('D'))

  dataset = pd.concat([interpolated_data, not_interpolated_data], axis=1)[original_column_order]

  def compute_features(group):
    lags_price = [2,3, 4, 5, 6, 7, 14, 21]#[2, 3, 4, 5, 6, 7, 14, 21]
    lags_vol = [1, 2, 3, 7]#[1, 2, 3, 7]
    lags_player_count=[0,7,30]
    windows_price = [3, 7, 14, 21]
    windows_volume = [3, 21]
    windows_player_count=[7,14]
    ewma_windows = [7, 14, 21, 30]

    features = {}

    for lag in lags_price:
      features[f'price_lag{lag}'] = group['price'].shift(lag)

    for lag in lags_vol:
      features[f'vol_lag{lag}'] = group['volume'].shift(lag)

    for lag in lags_player_count:
      lag=lag+1   # lag+1 because rolling takes current data too, that'd be a data leak
      for window in windows_player_count:

        if len(group)+lag >= window:  # Ensure enough data points for rolling operations
          roll = group[['playerCount']]['playerCount'].shift(lag).rolling(window=window)
          features[f'playerCount_roll_mean_{window}_lag_{lag-1}'] = roll.mean()
        else:
          features[f'playerCount_roll_mean_{window}_lag_{lag-1}'] = np.nan

    for window in windows_price:
      if len(group) >= window:  # Ensure enough data points for rolling operations
          roll = group[['price']].shift(1).rolling(window=window)
          features.update({
            f'price_roll_mean_{window}': roll['price'].mean(),
            f'price_roll_std_{window}': roll['price'].std(),
            f'price_roll_var_{window}': roll['price'].var(),
            f'price_roll_sum_{window}': roll['price'].sum(),
          })
      else:
        features.update({
            f'price_roll_mean_{window}': np.nan,
            f'price_roll_std_{window}':np.nan,
            f'price_roll_var_{window}':np.nan,
            f'price_roll_sum_{window}': np.nan,
        })

    for window in windows_volume:
      if len(group) >= window:  # Ensure enough data points for rolling operations
          roll = group[['volume']].shift(1).rolling(window=window)
          features.update({
            f'vol_roll_mean_{window}': roll['volume'].mean(),
          })
      else:
        features.update({
            f'vol_roll_mean_{window}': np.nan,
        })

    features['price_trend'] = np.arange(len(group)) * group['price'].shift(1).pct_change().fillna(0)
    features['volume_trend'] = np.arange(len(group)) * group['volume'].shift(1).pct_change().fillna(0)

    for window in ewma_windows:
      features[f'price_ewma_{window}'] = group['price'].shift(1).ewm(span=window, adjust=False).mean()
      #features[f'vol_ewma_{window}'] = group['volume'].shift(1).ewm(span=window, adjust=False).mean()

    features['price_cumsum'] = group['price'].shift(1).cumsum()
    features['volume_cumsum'] = group['volume'].shift(1).cumsum()

    collection_group = group.groupby('itemCollection',observed=True)

    for window in windows_price:
      if len(group) >= window:
        # Rolling mean for price and volume across the collection
        collection_rolling = collection_group[['price', 'volume']].shift(1).rolling(window=window)
        features.update({
            f'collection_price_roll_mean_{window}': collection_rolling['price'].mean(),
            f'collection_price_roll_std_{window}': collection_rolling['price'].std(),
            f'collection_price_roll_var_{window}': collection_rolling['price'].var(),
            f'collection_price_roll_sum_{window}': collection_rolling['price'].sum(),
        })
      else:
        features.update({
            f'collection_price_roll_mean_{window}': np.nan,
            f'collection_price_roll_std_{window}': np.nan,
            f'collection_price_roll_var_{window}': np.nan,
            f'collection_price_roll_sum_{window}': np.nan,
        })

    # Średnia cena i wolumen dla całej kolekcji (tylko z przeszłych rekordów)
    collection_mean = collection_group[['price', 'volume']].shift(1).expanding().mean()
    features['collection_price_mean'] = collection_mean['price']
    features['collection_vol_mean'] = collection_mean['volume']

    return pd.concat([group, pd.DataFrame(features, index=group.index)], axis=1)

  dataset = dataset.groupby('name', group_keys=False).apply(compute_features)

  dates = dataset.index.get_level_values('date')
  dataset['year'] = dates.year
  dataset['month'] = dates.month
  dataset['day'] = dates.day
  dataset['weekday'] = dates.weekday
  #dataset['is_weekend'] = (dataset['weekday'] >= 5)
  dataset['d_from_nyear'] = (dates - pd.to_datetime(dates.year.astype(str) + '-01-01')).days
  dataset['d_to_june'] = (pd.to_datetime(dates.year.astype(str) + '-06-01') - dates).days


  dataset[bool_columns] = dataset[bool_columns].astype(bool)
  #int_columns_present = [col for col in int_columns if col in dataset.columns]
  dataset[int_columns] = dataset[int_columns].fillna(0).astype(int)
  dataset[categorical_columns] = dataset[categorical_columns].astype('category')

  dataset.dropna(subset=['itemType'], inplace=True)  # Drop interpolated rows, subset is whatever is not interpolated

  return dataset

dp.add_features = add_features


In [11]:
#@title Split
"""dataset = dp.dataset.copy()
dataset.set_index(['name','date'],inplace=True)
dataset = dp.add_features(dataset)

X_whole = dataset.drop(columns=['price'])
y_whole = dataset['price'].copy()

X_train, X_test, y_train, y_test = split_dataset(X_whole,y_whole)

train_lgb_dataset = lgb.Dataset(X_train, label=y_train,free_raw_data=0)
test_lgb_dataset = lgb.Dataset(X_test, label=y_test,free_raw_data=0)

# For convenience:
train_dataset=train_lgb_dataset.data.copy()       # Same jak bym dał do tego X_train i y_train chyba?
train_dataset.loc[:,'price']=train_lgb_dataset.label

test_dataset=test_lgb_dataset.data.copy()
test_dataset.loc[:,'price']=test_lgb_dataset.label"""
pass


In [12]:
#@title params
const_params={
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'metric': ['mape','mae'],

    'seed': 42,
    'num_threads':4,   # 5K dart rounds x3, 4 threads are the best, 30% faster than 8 or -1 and 8% faster than second fastest number score
    'device': 'gpu' if gpu_available else 'cpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'verbose': 1,

    'learning_rate': 0.1,
    'num_boost_round': 500#1500
    }

model_params=const_params.copy()
if TEST_MODE:
  model_params.update({
      'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 10,
      'lambda_l1': 15, 'lambda_l2': 15,
      'min_data_in_leaf': 5, 'min_gain_to_split': 0.1, 'max_depth': 6,
      'num_leaves': 12, 'max_bin':400,
  })
else:
  model_params.update({
      'feature_fraction': 0.8, 'bagging_fraction': 0.9, 'bagging_freq': 20,
      'lambda_l1': 50, 'lambda_l2': 5,
      'min_data_in_leaf': 30, 'min_gain_to_split': 2, 'max_depth': 9,
      'num_leaves': 80, 'max_bin':512,
  })

In [13]:
#@title add features, split test dataset
dataset = dp.dataset.copy()
dataset.set_index(['name', 'date'], inplace=True)
dataset = dp.add_features(dataset)

_features_added=True


X_whole = dataset.drop(columns=['price'])
y_whole = dataset['price'].copy()

X, X_test, y, y_test = split_dataset(X_whole,y_whole,test_size=0.15)

#X, X_tmp, y, y_tmp = split_dataset(X_whole,y_whole,test_size=0.3)
#X_valid, X_test, y_valid, y_test = split_dataset(X_whole,y_whole,test_size=0.3)

del X_whole,y_whole



In [14]:
#@title Learning with cv
def learnCV(X,y,n_splits):
  all_y_pred_train,all_y_train = [],[]
  all_y_pred_test,all_y_test = [],[]
  all_y_pred_naive = []
  tscv = TimeSeriesSplit(n_splits=n_splits)

  for train_index, test_index in tscv.split(X):
      X_train, X_test = X.iloc[train_index], X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      train_lgb_dataset = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
      test_lgb_dataset = lgb.Dataset(X_test, label=y_test, free_raw_data=False)

      model = lgb.train(
          model_params,
          train_lgb_dataset,
          valid_sets=[test_lgb_dataset],
          valid_names=['test'],
          callbacks=[
              lgb.log_evaluation(period=50),
              lgb.early_stopping(stopping_rounds=100, min_delta=[0.0008,0.001])
          ]
      )

      y_pred_train = pd.Series(model.predict(X_train, num_iteration=model.best_iteration), index=X_train.index)
      y_pred_test = pd.Series(model.predict(X_test, num_iteration=model.best_iteration), index=X_test.index)

      y_pred_naive = pd.Series(X_test['price_lag2'].fillna(X_test['storePrice']).fillna(3).to_numpy(), index=X_test.index)

      all_y_pred_train.append(y_pred_train)
      all_y_train.append(y_train)
      all_y_pred_test.append(y_pred_test)
      all_y_test.append(y_test)
      all_y_pred_naive.append(y_pred_naive)

  del X_train, X_test,y_train, y_test,train_lgb_dataset,test_lgb_dataset, y_pred_train,y_pred_test,y_pred_naive
  gc.collect()

  y_pred_train_concat = pd.concat(all_y_pred_train)
  y_train_concat = pd.concat(all_y_train)
  y_pred_test_concat = pd.concat(all_y_pred_test)
  y_test_concat = pd.concat(all_y_test)
  y_pred_naive_concat = pd.concat(all_y_pred_naive)
  return model,(y_pred_train_concat, y_train_concat, y_pred_test_concat, y_test_concat, y_pred_naive_concat)


n_splits = 5

gc.collect()
memory = psutil.virtual_memory().used

model,datasets = learnCV(X, y, n_splits)

memory2 = psutil.virtual_memory().used

print(f"RAM difference: {(memory2 -memory ) / (1024 ** 2):.2f} MB")


if not TEST_MODE:
  autosave_model(model)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11989
[LightGBM] [Info] Number of data points in the train set: 875, number of used features: 73
[LightGBM] [Info] Start training from score 1.450000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.109237	test's l1: 0.223157
[100]	test's mape: 0.108951	test's l1: 0.219632
[150]	test's mape: 0.107777	test's l1: 0.215614
[200]	test's mape: 0.107777	test's l1: 0.215614
Early stopping, best iteration is:
[129]	test's mape: 0.108001	test's l1: 0.215953




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18703
[LightGBM] [Info] Number of data points in the train set: 1745, number of used features: 73
[LightGBM] [Info] Start training from score 1.420000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.127739	test's l1: 0.375524
[100]	test's mape: 0.12329	test's l1: 0.358573
[150]	test's mape: 0.122473	test's l1: 0.354141
[200]	test's mape: 0.121467	test's l1: 0.347381
[250]	test's mape: 0.120292	test's l1: 0.341895
[300]	test's mape: 0.119736	test's l1: 0.337977
[350]	test's mape: 0.119824	test's l1: 0.337186
Early stopping, best iteration is:
[263]	test's mape: 0.119346	test's l1: 0.337709
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info]



[50]	test's mape: 0.0822302	test's l1: 0.280382
[100]	test's mape: 0.0750563	test's l1: 0.248934
[150]	test's mape: 0.0751353	test's l1: 0.249537
Early stopping, best iteration is:
[69]	test's mape: 0.0753327	test's l1: 0.249778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24763
[LightGBM] [Info] Number of data points in the train set: 3485, number of used features: 74
[LightGBM] [Info] Start training from score 1.950000
Training until validation scores don't improve for 100 rounds




[50]	test's mape: 0.0928533	test's l1: 0.212624
[100]	test's mape: 0.0929158	test's l1: 0.21239
Early stopping, best iteration is:
[37]	test's mape: 0.0931593	test's l1: 0.214737
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24858




[LightGBM] [Info] Number of data points in the train set: 4355, number of used features: 74
[LightGBM] [Info] Start training from score 2.120000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.0974933	test's l1: 0.197083
[100]	test's mape: 0.0970719	test's l1: 0.191046
[150]	test's mape: 0.0972349	test's l1: 0.189888
Early stopping, best iteration is:
[64]	test's mape: 0.096826	test's l1: 0.193105
RAM difference: 281.90 MB


In [15]:
show_metrics(*datasets, label="mae")
y_pred_train, y_pred_valid, y_pred_naive = datasets[0],datasets[2],datasets[-1]
sum=0
for d in datasets:
  sum+=getsizeof(y_pred_train)
print(round(sum/1024/1024,2),"MB")
round(getsizeof(y)/1024/1024,2)
#848MB predicted size

RMSE change: -0.56%   (naive->test)


Unnamed: 0,Metric,Train,Valid,Naive
0,Accuracy,93.01,90.059,89.518
1,RMSE,0.335,0.425,0.428
2,MAE,0.149,0.242,0.235
3,MAPE,0.07,0.099,0.105
4,Max Error,7.145,7.249,7.52
5,Median Absolute Error,0.069,0.138,0.14


1.15 MB


0.15

In [18]:
#@title Utils
"""test_names = X_test.index.get_level_values('name').unique()
train_names = X_train.index.get_level_values('name').unique()
all_names = test_names.union(train_names)
unique_dataset = dp.dataset.loc[dp.dataset.groupby('name')['price'].idxmax()]
sorted_dataset_price = unique_dataset.sort_values(by='price', ascending=False)"""
#display(sorted_dataset_price.head(5))
pass

#pd.set_option('display.max_columns', None)
#print("\nTest most freq:")
#display(X_test.groupby('name').size().sort_values(ascending=False).head(5))
#print("\nTrain most freq:")
#display(X_train.groupby('name').size().sort_values(ascending=False).head(5))

In [None]:
#@title Display pricehistories
names=['Labyrinth Door']
#names=[n for n in all_names if "Whiteout" in n]
#names=['Legacy Kevlar Helmet','Whiteout Helmet']
print(names)

phs=[]
end_names=[]
for name in names:
  try:
    ph = dp.dataset.loc[name][['price']]
  except KeyError:
    print(name," not in dataset")
    continue
  phs.append(ph)
  end_names.append(name)
  #pricehistory2 = test_dataset.loc[name][['price']]
if len(phs):
  plot_pricehistory(phs,end_names,"test")
  plot_pricehistory(phs,end_names,"test",relative_x_axis=True)


['Labyrinth Door']
Labyrinth Door  not in dataset


In [None]:
#@title Grid search results plotter
from collections import defaultdict, Counter
from itertools import combinations

def show_grid_search_results(gsearch, top_frac=0.2, dp_styled_res=False,show_overall_heatmap=True):
    results = pd.DataFrame(gsearch.cv_results_)
    selected_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
    results = results[selected_columns].sort_values(by='rank_test_score')
    styled_results = results.style.background_gradient(subset=['rank_test_score'], cmap='coolwarm')
    styled_results = styled_results.format({'mean_test_score': '{:.3f}', 'std_test_score': '{:.3f}'})

    if dp_styled_res:
        display(styled_results)

    top_records_num = int(len(results) * top_frac)
    top_results = results.nsmallest(top_records_num, 'rank_test_score')

    # Count parameter occurrences
    param_counts = Counter()
    for params in top_results['params']:
        for param, value in params.items():
            param_key = f"{param}={value}"
            param_counts[param_key] += 1

    # Filter out parameters that occur in all top records
    filtered_params = {k: v for k, v in param_counts.items() if v != top_records_num}

    # Convert Counter to DataFrame
    param_counts_df = pd.DataFrame.from_dict(filtered_params, orient='index', columns=['frequency']).sort_values('frequency', ascending=False)

    # Extract unique parameter names
    param_labels = param_counts_df.index
    param_names = [label.split('=')[0] for label in param_labels]
    unique_params = list(set(param_names))

    # Assign colors to unique parameter names
    colors = plt.cm.get_cmap('tab10', len(unique_params))
    color_map = {param: colors(i) for i, param in enumerate(unique_params)}

    # Plot bar chart of parameter frequencies
    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(param_counts_df.index, param_counts_df['frequency'], color=[color_map[param] for param in param_names])
    ax.set_title(f'Częstość argumentów w {top_records_num}/{len(results)} ({100*top_frac:.0f}%) najlepszych wynikach')
    ax.set_xlabel('Parametry')
    ax.set_ylabel('Częstość')
    ax.set_xticks(np.arange(len(param_labels)))
    ax.set_xticklabels(param_labels, rotation=45, ha='right', fontsize=10, color='black')

    for tick, param_name in zip(ax.get_xticklabels(), param_names):
        tick.set_color(color_map[param_name])

    plt.tight_layout()
    plt.show()

    if show_overall_heatmap:
      param_correlations = defaultdict(int)
      for params in top_results['params']:
          param_items = [f"{param}={value}" for param, value in params.items()]
          for combo in combinations(param_items, 2):
              param_correlations[combo] += 1

      correlation_matrix = pd.DataFrame(0, index=param_labels, columns=param_labels)
      for (param1, param2), count in param_correlations.items():
          correlation_matrix.loc[param1, param2] = count
          correlation_matrix.loc[param2, param1] = count

      correlation_matrix = correlation_matrix.loc[param_counts_df.index, param_counts_df.index]
      plt.figure(figsize=(9, 8))
      sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.0f', cbar=True, linewidths=0.5)
      plt.title('Częstości występowania par parametrów')
      plt.xlabel('Parametry')
      plt.ylabel('Parametry')
      plt.xticks(rotation=45, ha='right')
      plt.yticks(rotation=0)
      for tick, param_name in zip(plt.gca().get_xticklabels(), param_names):
          tick.set_color(color_map[param_name])
      for tick, param_name in zip(plt.gca().get_yticklabels(), param_names):
          tick.set_color(color_map[param_name])
      plt.tight_layout()
      plt.show()

    # Individual heatmaps for each pair of parameter types in filtered data
    filtered_param_names = list(set(param_names))

    for param1, param2 in combinations(filtered_param_names, 2):
        param1_values = sorted({params.get(param1) for params in top_results['params'] if param1 in params})
        param2_values = sorted({params.get(param2) for params in top_results['params'] if param2 in params})

        correlation_matrix = pd.DataFrame(0, index=param1_values, columns=param2_values)

        for params in top_results['params']:
            if param1 in params and param2 in params:
                correlation_matrix.loc[params[param1], params[param2]] += 1

        plt.figure(figsize=(8, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.0f', cbar=True, linewidths=0.5)
        plt.title(f'Heatmap for {param1} vs {param2}')
        plt.xlabel(param2)
        plt.ylabel(param1)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()


In [None]:
#@title Grid search
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from datetime import datetime

# Uzyskanie bieżącego czasu
start_time = datetime.now()

def hyperparameter_optimization(X, y):
    lgb_estimator = lgb.LGBMRegressor(
        objective='mae',
        boosting_type='gbdt',
        metric='mape',
        device='gpu' if gpu_available else 'cpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        seed=42,
        num_threads=4,
        verbose=-1,
        learning_rate=0.1,
    )

    param_grid = {
        'feature_fraction': [0.9,0.8,0.7],#[0.9,0.5],
        'bagging_fraction': [0.9],
        'bagging_freq': [20,25],
        'verbose':[-1],
        'min_data_in_leaf': [30],
        'min_gain_to_split': [0.1],
        'max_depth': [6,7,8,9],
        'lambda_l1': [15],#[10,15],
        'lambda_l2': [15],#[10,15],
        'learning_rate': [0.1],
        'num_leaves': [80],
        'num_boost_round': [200],
    }

    lgb_estimator = lgb.LGBMRegressor()

    gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=4,verbose=1, n_jobs=2, scoring='neg_median_absolute_error')#neg_mean_absolute_error
    gsearch.fit(X, y)

    print("Best parameters found by grid search are:", gsearch.best_params_)
    print("Best score found by grid search is:", -gsearch.best_score_)

    return gsearch

gsearch1 = hyperparameter_optimization(X_train, y_train)

#0.149 'feature_fraction': 0.6,  'max_depth': -1,  'min_gain_to_split': 0.1, 'num_leaves': 12,
# 0.143  'feature_fraction': 0.9, 'max_bin': 512, 'min_gain_to_split': 0.1, 'num_boost_round': 300, 'num_leaves': 26,
#'max_bin': 400, 'max_depth': 6, 'min_data_in_leaf': 40, 'min_gain_to_split': 0.1, 'num_leaves': 26,
#'max_bin': 512, 'max_depth': -1, 'min_data_in_leaf': 40, 'min_gain_to_split': 0.1,  'num_leaves': 26,

#cv=5
# 'max_bin': 400, 'max_depth': 10, 'min_data_in_leaf': 60, 'min_gain_to_split': 0.1, 'num_leaves': 26,
# 0.117

#mean: 0.14340med 0.211mean
#med:  0.1434med  0.212mean
print("Czas wykonania:", datetime.now()-start_time)

#0.15median 'bagging_freq': 10, 'lambda_l1': 1, 'lambda_l2': 1, 'max_depth': -1, 'min_data_in_leaf': 40, 'min_gain_to_split': 0.1, 'num_leaves': 9
#0.22mean   'bagging_freq': 10,  'lambda_l1': 1, 'lambda_l2': 1, 'max_depth': -1, 'min_data_in_leaf': 40, 'min_gain_to_split': 0.1,  'num_leaves': 9,



#0.256 sredni czas na 2foldy train

Fitting 4 folds for each of 24 candidates, totalling 96 fits




Best parameters found by grid search are: {'bagging_fraction': 0.9, 'bagging_freq': 20, 'feature_fraction': 0.8, 'lambda_l1': 15, 'lambda_l2': 15, 'learning_rate': 0.1, 'max_depth': 9, 'min_data_in_leaf': 30, 'min_gain_to_split': 0.1, 'num_boost_round': 200, 'num_leaves': 80, 'verbose': -1}
Best score found by grid search is: 0.08808318078513527
Czas wykonania: 1:05:39.645659


In [None]:
show_grid_search_results(gsearch1,0.3,False,True)

NameError: name 'show_grid_search_results' is not defined

In [None]:
#@title params & learn
const_params={
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'metric': ['mape','mae'],
    #'min_delta':[0.001,0.001],
    #'early_stopping_rounds':200,
    #'first_metric_only': "true",


    'seed': 42,
    'num_threads':4,   # 5K dart rounds x3, 4 threads are the best, 30% faster than 8 or -1 and 8% faster than second fastest number score
    'device': 'gpu' if gpu_available else 'cpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'verbose': 1,

    'learning_rate': 0.1,
    'num_boost_round': 500#1500
    }

model_params=const_params.copy()
if TEST_MODE:
  model_params.update({
      'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 10,
      'lambda_l1': 15, 'lambda_l2': 15,
      'min_data_in_leaf': 5, 'min_gain_to_split': 0.1, 'max_depth': 6,
      'num_leaves': 12, 'max_bin':400,
  })
else:
  model_params.update({
      'feature_fraction': 0.8, 'bagging_fraction': 0.9, 'bagging_freq': 20,
      'lambda_l1': 50, 'lambda_l2': 5,
      'min_data_in_leaf': 30, 'min_gain_to_split': 2, 'max_depth': 9,
      'num_leaves': 80, 'max_bin':512,
  })

model = lgb.train(model_params,
                  train_lgb_dataset,
                  valid_sets=[test_lgb_dataset],
                  valid_names=['test'],
                  callbacks=[lgb.log_evaluation(period=50),
                             lgb.early_stopping(stopping_rounds=100,min_delta=[0.0008,0.001],first_metric_only=False)
                             ])  #200 rounds:  L4 71s | T4 87s | CPU 110s

if not TEST_MODE:
  folder_path='/content/drive/MyDrive/SMF_files/model_saves'
  curr_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
  file_path = os.path.join(folder_path, f"lightgbm_model_{curr_time}.txt")
  model.save_model(file_path)

In [62]:
#@title Metrics
"""def show_metrics_standard(model, X_train, y_train, X_test, y_test, label="[no label]"):
    # Pobranie najlepszej liczby iteracji
    num_iteration = model.best_iteration

    print('='*8+" "+label+" "+'='*8)

    # Przewidywania za pomocą metody naiwnej
    y_pred_naive = pd.Series(X_test['price_lag2'].fillna(X_test['storePrice']).fillna(3).to_numpy(), index=X_test.index)
    rmse_naive = mean_squared_error(y_test, y_pred_naive, squared=False)

    # Przewidywania na danych treningowych
    y_pred_train = pd.Series(model.predict(X_train, num_iteration=num_iteration), index=X_train.index)
    rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)

    # Przewidywania na danych testowych
    y_pred = pd.Series(model.predict(X_test, num_iteration=num_iteration), index=X_test.index)
    rmse_test = mean_squared_error(y_test, y_pred, squared=False)

    # Wyświetlanie zmian RMSE w porównaniu do metody naiwnej
    print(f'RMSE change: {(1-rmse_test/rmse_naive)*-100:.2f}%   (naive->test)')

    # Wyświetlanie metryk
    results = {
        "Metric": ["Accuracy", "RMSE", "MAE", "MAPE", "Max Error", "Median Absolute Error"],
        "Train": calc_accuracy(y_pred_train, y_train, "Train accuracy:"),
        "Valid": calc_accuracy(y_pred, y_test, "Validation accuracy:"),
        "Naive": calc_accuracy(y_pred_naive, y_test, "Naive accuracy:")
    }

    df = pd.DataFrame(results)
    display(df)

    return y_pred, y_pred_train, y_pred_naive

y_pred, y_pred_train, y_pred_naive = show_metrics_standard(model, X_train, y_train, X_test, y_test, label="mae")

#y_pred,y_pred_train,y_pred_naive=show_metrics(model,"mae")
#y_pred2,y_pred_train2,_ = show_metrics(model2,"Model 2 reg_l1")"""
pass

In [16]:
#@title N-models params
const_params_n={
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'metric': ['mape','mae'],

    'seed': 42,
    'num_threads':4,   # 5K dart rounds x3, 4 threads are the best, 30% faster than 8 or -1 and 8% faster than second fastest number score
    'device': 'gpu' if gpu_available else 'cpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'verbose': 1,

    'learning_rate': 0.1,
    'num_boost_round': 500
    }

model_params_n=const_params_n.copy()
if TEST_MODE:
  model_params_n.update({
      'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 10,
      'lambda_l1': 15, 'lambda_l2': 15,
      'min_data_in_leaf': 5, 'min_gain_to_split': 0.1, 'max_depth': 6,
      'num_leaves': 12, 'max_bin':400,
  })
else:
  model_params_n.update({
      'feature_fraction': 0.8, 'bagging_fraction': 0.9, 'bagging_freq': 20,
      'lambda_l1': 50, 'lambda_l2': 5,
      'min_data_in_leaf': 30, 'min_gain_to_split': 2, 'max_depth': 9,
      'num_leaves': 80, 'max_bin':512,
  })

In [61]:
#@title N-models old
'''
def add_shifted_targets(dataset, max_n):
    """
    Tworzy zbiór danych z przesuniętymi targetami dla różnych horyzontów prognozy.
    """
    target_columns = []
    for n in range(1, max_n + 1):
        dataset[f'price_t+n_{n}'] = dataset.groupby('name')['price'].shift(-n)
        target_columns.append(f'price_t+n_{n}')

    #dataset.drop(columns=['price'],inplace=True)
    #dataset.rename(columns={'price': 'price_lag0'}, inplace=True)
    return dataset, target_columns

def train_models_for_forecast_horizon(X_train, ys_train, X_test, ys_test, model_params):
    """
    Trenuje modele dla różnych horyzontów prognozy.
    """
    models = {}
    for i, y_col in enumerate(ys_train):
        y_train_tmp = ys_train[y_col].copy().dropna()
        X_train_tmp = X_train.loc[y_train_tmp.index]#.drop(columns=ys_train)

        y_test_tmp = ys_test[y_col].copy().dropna()
        X_test_tmp = X_test.loc[y_test_tmp.index]#.drop(columns=y_test_tmp)

        train_lgb_dataset = lgb.Dataset(X_train_tmp, label=y_train_tmp, free_raw_data=1)
        test_lgb_dataset = lgb.Dataset(X_test_tmp, label=y_test_tmp, free_raw_data=1)

        model = lgb.train(model_params, train_lgb_dataset,
                          valid_sets=[test_lgb_dataset],
                          valid_names=[y_col],
                          callbacks=[lgb.log_evaluation(period=50),
                                     lgb.early_stopping(stopping_rounds=100,min_delta=[0.001,0.001],first_metric_only=False)
                                     ])
        del train_lgb_dataset, test_lgb_dataset, X_train_tmp, y_train_tmp, X_test_tmp, y_test_tmp
        gc.collect()

        #model.save_model(f'model_horizon_{i+1}.txt')
        models[f'model_horizon_{i+1}'] = model
        del model
        print(f'Model for horizon {i+1} trained.')

    return models

def show_metrics(models, X_test, ys_test):

    #Wyświetla dokładność modeli dla poszczególnych horyzontów prognozy oraz średnią dokładność.

    metrics = {'mape': [], 'mae': [], 'rmse': []}

    for i, y_col in enumerate(ys_test):

        y_test_tmp = ys_test[y_col].dropna()
        X_test_tmp = X_test.loc[y_test_tmp.index]#.drop(columns=y_columns)

        preds = models[f'model_horizon_{i+1}'].predict(X_test_tmp)

        mape = mean_absolute_percentage_error(y_test_tmp, preds)
        mae = mean_absolute_error(y_test_tmp, preds)
        rmse = np.sqrt(mean_squared_error(y_test_tmp, preds))

        metrics['mape'].append(mape)
        metrics['mae'].append(mae)
        metrics['rmse'].append(rmse)

        print(f'Metrics for horizon {i+1}: MAPE: {mape:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}')

    for metric in metrics:
        avg_metric = np.mean(metrics[metric])
        print(f'Average {metric.upper()}: {avg_metric:.4f}')

# Tworzenie targetów dla N dni do przodu
N = 7
dataset_n = dp.dataset.copy()
dataset_n.set_index(['name', 'date'], inplace=True)
dataset_n = dp.add_features(dataset_n)

dataset_n, target_columns_n = add_shifted_targets(dataset_n, N)

common_idx = dataset_n.index #dataset_n[target_columns_n].dropna().index
display(dataset_n.head())

X_filtered = dataset_n.loc[common_idx].drop(columns=target_columns_n)
y_filtered = dataset_n[target_columns_n].loc[common_idx]

X_train_n, X_test_n, y_train_n, y_test_n = split_dataset(X_filtered, y_filtered)

#display(X_test_n.head())
#display(y_test_n.head())

models = train_models_for_forecast_horizon(X_train_n, y_train_n, X_test_n, y_test_n, model_params_n)

show_metrics(models, X_test_n, y_test_n)
'''
pass

In [70]:
#@title eval_models_n

def get_test_index_TSS(X,n_splits):
  """ Last test index from TimeSeriesSplit was never used for learning, only for validation.
      Returns that test index."""
  tscv = TimeSeriesSplit(n_splits_n)
  tscv.split(X)
  test_index=None
  for train_index_tmp, test_index_tmp in tscv.split(X):
      test_index=test_index_tmp
  return test_index

def eval_models_n(models, X_test, ys_test):

    df = pd.DataFrame(columns=[ 'MAPE', 'MAE', 'RMSE', 'Count'])

    for i, y_col in enumerate(ys_test):

        y_test_tmp = ys_test[y_col].dropna()
        X_test_tmp = X_test.loc[y_test_tmp.index]

        preds = models[f'model_horizon_{i+1}'].predict(X_test_tmp)

        mape = mean_absolute_percentage_error(y_test_tmp, preds)
        mae = mean_absolute_error(y_test_tmp, preds)
        rmse = np.sqrt(mean_squared_error(y_test_tmp, preds))
        count = len(preds)

        df.loc[i] = {'MAPE': round(mape,4), 'MAE': round(mae,4), 'RMSE': round(rmse,4), 'Count': count}

        print(f'Metrics for horizon {i+1}: MAPE: {mape:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}')

    total_count = df['Count'].sum()

    weighted_avg_mape = (df['MAPE'] * df['Count']).sum() / total_count
    weighted_avg_mae = (df['MAE'] * df['Count']).sum() / total_count
    weighted_avg_rmse = (df['RMSE'] * df['Count']).sum() / total_count

    print(f"\nMetrics weighted mean: MAPE: {weighted_avg_mape:.4f}, MAE: {weighted_avg_mae:.4f}, RMSE: {weighted_avg_rmse:.4f}")

    #display(df)

    return df


In [None]:
#@title N-models

def add_shifted_targets(dataset, max_n):
    """
    Tworzy zbiór danych z przesuniętymi targetami dla różnych horyzontów prognozy.
    """
    target_columns = []
    for n in range(1, max_n + 1):
        dataset[f'price_t+n_{n}'] = dataset.groupby('name')['price'].shift(-n)
        target_columns.append(f'price_t+n_{n}')

    #dataset.drop(columns=['price'],inplace=True)
    #dataset.rename(columns={'price': 'price_lag0'}, inplace=True)
    return dataset, target_columns


def train_models_for_forecast_horizon(X_n, y_n, target_cols, model_params,n_splits=4):

    models = {}
    for i, y_col in enumerate(target_cols):
      target = y_n[y_col].copy().dropna()
      X_n_tmp = X_n.loc[target.index]
      #display(X_n_tmp)
      #display(target)
      #return
      model, datasets = learnCV(X_n_tmp,target,n_splits)

      models[f'model_horizon_{i+1}'] = model
      if not TEST_MODE:
        autosave_model(model)
      del model
      print(f'Model for horizon {i+1} trained.')

    return models


N = 7
n_splits_n=2

if not _features_added:
  raise RuntimeError("Features not added")

dataset_n = dataset.copy()
dataset_n, target_columns_n = add_shifted_targets(dataset_n, N)

X_n = dataset_n.drop(columns=target_columns_n)
y_n = dataset_n[target_columns_n]

#X_train_n, X_test_n, y_train_n, y_test_n = split_dataset(X_n, y_n)

models = train_models_for_forecast_horizon(X_n, y_n, target_columns_n, model_params_n,n_splits_n)


test_index = get_test_index_TSS(X,n_splits_n)


_=eval_models_n(models, X_n.iloc[test_index], y_n.iloc[test_index])

del test_index



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19447
[LightGBM] [Info] Number of data points in the train set: 1893, number of used features: 74
[LightGBM] [Info] Start training from score 1.380000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.127217	test's l1: 0.437352
[100]	test's mape: 0.106706	test's l1: 0.359862
[150]	test's mape: 0.106486	test's l1: 0.359546
Early stopping, best iteration is:
[88]	test's mape: 0.107068	test's l1: 0.361003




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25183
[LightGBM] [Info] Number of data points in the train set: 3785, number of used features: 75
[LightGBM] [Info] Start training from score 1.850000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.0965223	test's l1: 0.213964
[100]	test's mape: 0.0962871	test's l1: 0.213104
Early stopping, best iteration is:
[48]	test's mape: 0.096627	test's l1: 0.214434
Model for horizon 1 trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19255
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 74
[LightGBM] [Info] Start training from score 1.340000
Training until validation scores don't improve



[50]	test's mape: 0.186484	test's l1: 0.635358
[100]	test's mape: 0.182604	test's l1: 0.62068
[150]	test's mape: 0.178719	test's l1: 0.605013
[200]	test's mape: 0.179372	test's l1: 0.606629
Early stopping, best iteration is:
[123]	test's mape: 0.178791	test's l1: 0.605078




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25179
[LightGBM] [Info] Number of data points in the train set: 3718, number of used features: 75
[LightGBM] [Info] Start training from score 1.850000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.111608	test's l1: 0.254248
[100]	test's mape: 0.111993	test's l1: 0.255223
[150]	test's mape: 0.112132	test's l1: 0.254749
Early stopping, best iteration is:
[50]	test's mape: 0.111608	test's l1: 0.254248
Model for horizon 2 trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19043
[LightGBM] [Info] Number of data points in the train set: 1825, number of used features: 74
[LightGBM] [Info] Start training from score 1.320000




[50]	test's mape: 0.186428	test's l1: 0.632023
[100]	test's mape: 0.184143	test's l1: 0.621888
[150]	test's mape: 0.184926	test's l1: 0.624038
[200]	test's mape: 0.180968	test's l1: 0.610703
[250]	test's mape: 0.179707	test's l1: 0.60648
[300]	test's mape: 0.178958	test's l1: 0.603312
[350]	test's mape: 0.172165	test's l1: 0.579469
[400]	test's mape: 0.171114	test's l1: 0.575289
[450]	test's mape: 0.169876	test's l1: 0.570551
[500]	test's mape: 0.169549	test's l1: 0.569506
Did not meet early stopping. Best iteration is:
[412]	test's mape: 0.169695	test's l1: 0.569949




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25170
[LightGBM] [Info] Number of data points in the train set: 3650, number of used features: 75
[LightGBM] [Info] Start training from score 1.840000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.115253	test's l1: 0.255298
[100]	test's mape: 0.115114	test's l1: 0.25337
Early stopping, best iteration is:
[30]	test's mape: 0.115656	test's l1: 0.257027
Model for horizon 3 trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18843
[LightGBM] [Info] Number of data points in the train set: 1792, number of used features: 74
[LightGBM] [Info] Start training from score 1.300000
Training until validation scores don't improve fo



[50]	test's mape: 0.224768	test's l1: 0.758759
[100]	test's mape: 0.22428	test's l1: 0.756312
[150]	test's mape: 0.219743	test's l1: 0.739839
[200]	test's mape: 0.216198	test's l1: 0.726562
[250]	test's mape: 0.215258	test's l1: 0.723606
[300]	test's mape: 0.21376	test's l1: 0.718894
[350]	test's mape: 0.214832	test's l1: 0.722144
Early stopping, best iteration is:
[272]	test's mape: 0.213527	test's l1: 0.718095




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25169
[LightGBM] [Info] Number of data points in the train set: 3583, number of used features: 75
[LightGBM] [Info] Start training from score 1.840000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.121065	test's l1: 0.260699
[100]	test's mape: 0.121161	test's l1: 0.260406
Early stopping, best iteration is:
[28]	test's mape: 0.121266	test's l1: 0.262981
Model for horizon 4 trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18704
[LightGBM] [Info] Number of data points in the train set: 1759, number of used features: 74
[LightGBM] [Info] Start training from score 1.300000
Training until validation scores don't improve f



[50]	test's mape: 0.292375	test's l1: 0.970997
[100]	test's mape: 0.284042	test's l1: 0.945228
[150]	test's mape: 0.273222	test's l1: 0.908862
[200]	test's mape: 0.274849	test's l1: 0.914577
[250]	test's mape: 0.268818	test's l1: 0.893342
[300]	test's mape: 0.259673	test's l1: 0.86424
[350]	test's mape: 0.259865	test's l1: 0.864734
[400]	test's mape: 0.258746	test's l1: 0.86121
[450]	test's mape: 0.255646	test's l1: 0.850422
[500]	test's mape: 0.255382	test's l1: 0.849688
Did not meet early stopping. Best iteration is:
[433]	test's mape: 0.255591	test's l1: 0.850258
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25168
[LightGBM] [Info] Number of data points in the train set: 3516, number of used features: 75
[LightGBM] [Info] Start training from score 1.850000
Training until validation scores don't improve for 100 rounds




[50]	test's mape: 0.129593	test's l1: 0.282528
[100]	test's mape: 0.131317	test's l1: 0.285675
Early stopping, best iteration is:
[25]	test's mape: 0.130231	test's l1: 0.285518
Model for horizon 5 trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18534
[LightGBM] [Info] Number of data points in the train set: 1728, number of used features: 74
[LightGBM] [Info] Start training from score 1.290000
Training until validation scores don't improve for 100 rounds




[50]	test's mape: 0.319648	test's l1: 1.064
[100]	test's mape: 0.317914	test's l1: 1.05849
[150]	test's mape: 0.313303	test's l1: 1.04162
[200]	test's mape: 0.313062	test's l1: 1.04067
[250]	test's mape: 0.307642	test's l1: 1.02068
[300]	test's mape: 0.304627	test's l1: 1.01078
[350]	test's mape: 0.301627	test's l1: 1.00145
[400]	test's mape: 0.300694	test's l1: 0.998834
[450]	test's mape: 0.30117	test's l1: 1.00028
Early stopping, best iteration is:
[377]	test's mape: 0.300457	test's l1: 0.997449




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25170
[LightGBM] [Info] Number of data points in the train set: 3455, number of used features: 75
[LightGBM] [Info] Start training from score 1.860000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.13776	test's l1: 0.29669
[100]	test's mape: 0.137196	test's l1: 0.296617
Early stopping, best iteration is:
[48]	test's mape: 0.137784	test's l1: 0.296673
Model for horizon 6 trained.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18452
[LightGBM] [Info] Number of data points in the train set: 1697, number of used features: 74
[LightGBM] [Info] Start training from score 1.290000
Training until validation scores don't improve for



[50]	test's mape: 0.341365	test's l1: 1.13258
[100]	test's mape: 0.330422	test's l1: 1.09809
[150]	test's mape: 0.330777	test's l1: 1.09767
[200]	test's mape: 0.327956	test's l1: 1.08824
[250]	test's mape: 0.328472	test's l1: 1.08956
Early stopping, best iteration is:
[180]	test's mape: 0.326963	test's l1: 1.08556




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25164
[LightGBM] [Info] Number of data points in the train set: 3394, number of used features: 75
[LightGBM] [Info] Start training from score 1.870000
Training until validation scores don't improve for 100 rounds
[50]	test's mape: 0.138594	test's l1: 0.297354
[100]	test's mape: 0.1382	test's l1: 0.2978
Early stopping, best iteration is:
[43]	test's mape: 0.138549	test's l1: 0.297133


In [None]:
#@title N-iterative
def predict_iterative(dataset,item_name, N):
    if item_name not in dataset.index.get_level_values('name'):
        print(f"{item_name} is not in the given data")
        return None
    item_data = dataset.xs(item_name, level='name', drop_level=False)

    if len(item_data) == 0:
        print(f"{item_name} is not in the given data")
        return None

    N = min(N, len(item_data) - 1)

    item_actuals = item_data['price']
    item_1ahead_pred = y_pred[item_data.index]

    predictions = pd.DataFrame()
    predictions_perfect = pd.DataFrame()

    start_index = len(item_data) - N
    end_index = min(start_index + N, len(item_data))

    actual_prices = item_data.iloc[start_index:end_index]['price']
    bef_curr_row = item_data.iloc[[start_index - 1]]

    train_item_data = item_data.loc[:bef_curr_row.index[0]].copy()
    perfect_item_data = item_data.loc[:bef_curr_row.index[0]].copy()
    #display(train_item_data)
    huj_counter=0
    for i in range(start_index, end_index):
        new_row = bef_curr_row.reset_index(drop=False)
        new_row['date'] += pd.DateOffset(days=1)
        new_row['price'] = np.nan
        new_row.set_index(['name', 'date'], inplace=True)

        train_item_data = pd.concat([train_item_data, new_row], ignore_index=False)
        try:
          perfect_item_data = pd.concat([perfect_item_data, item_data.loc[new_row.index]], ignore_index=False)
        except KeyError as e:
          huj_counter+=1
          perfect_item_data=pd.concat([train_item_data, new_row], ignore_index=False)

        train_item_data = add_features(train_item_data)
        perfect_item_data = add_features(perfect_item_data)

        X = train_item_data.iloc[[-1]].drop(columns=['price'])
        X_perfect = perfect_item_data.iloc[[-1]].drop(columns=['price'])

        prediction = round(model.predict(X, num_iteration=model.best_iteration)[0], 2)
        prediction_perfect = round(model.predict(X_perfect, num_iteration=model.best_iteration)[0], 2)

        train_item_data.at[new_row.index[0], 'price'] = prediction
        perfect_item_data.at[new_row.index[0], 'price'] = prediction

        bef_curr_row = train_item_data.iloc[[-1]]

        predictions = pd.concat([predictions, pd.DataFrame({'price': [prediction]}, index=X.index)])
        predictions_perfect = pd.concat([predictions_perfect, pd.DataFrame({'price': [prediction_perfect]}, index=X.index)])

    print("huj_counter: ",huj_counter)
    actual_prices_in_period = item_data.iloc[start_index:end_index]['price']
    naive_prices_in_period = pd.Series(item_data['price_lag7'].fillna(X_test['storePrice']).fillna(3).iloc[start_index:end_index].to_numpy(), index=actual_prices_in_period.index)

    results = {
        "Metric": ["Accuracy", "RMSE", "MAE", "MAPE", "Max Error","Median Absolute Error"],
        "N-iter": calc_accuracy(predictions['price'], actual_prices_in_period),
        "N-iter_perfect": calc_accuracy(predictions_perfect['price'], actual_prices_in_period),
        "1ahead": calc_accuracy(item_1ahead_pred[start_index:end_index], actual_prices_in_period),
        "Naive": calc_accuracy(naive_prices_in_period, actual_prices_in_period),
        "start_index": [start_index] * 6,  # Adding start_index for each metric
        "end_index": [end_index] * 6      # Adding end_index for each metric
    }

    df = pd.DataFrame(results)

    return df, predictions, predictions_perfect, item_actuals, item_1ahead_pred, start_index, end_index


def plot_results(item_actuals, item_1ahead_pred, predictions, predictions_perfect, start_index, end_index, item_name):
  dates = item_actuals.index.get_level_values('date')
  sample_len = end_index - start_index + 1
  overhead = 15

  if len(dates) > 60:
      if sample_len + overhead <= 90:
          display_start_idx = -(sample_len + overhead)
      else:
          display_start_idx = -(sample_len + max(overhead, int(0.5 * sample_len)))
  else:
      display_start_idx = None

  plt.style.use('dark_background')
  plt.figure(figsize=(12, 6))
  plt.xticks(rotation=45)
  plt.plot(dates[display_start_idx:], item_actuals.values[display_start_idx:], label='actual')
  plt.plot(dates[display_start_idx:], item_1ahead_pred.values[display_start_idx:], label='prediction 1ahead')
  plt.plot(dates[start_index:end_index], predictions_perfect.values, label='prediction iter, perfect data', color='green')
  plt.plot(dates[start_index:end_index], predictions.values, label='prediction iter', color='red')
  plt.xlabel('Data')
  plt.ylabel('Cena')
  plt.title(f'{item_name}: Forecasting Results')
  plt.legend()
  plt.show()

def get_summaries(dataset,test_names, N, plot=True):
    summary_df = pd.DataFrame()
    for item_name in test_names:
        res=predict_iterative(dataset,item_name, N)
        if res is None:
            continue
        result, predictions, predictions_perfect, item_actuals, item_1ahead_pred, start_index, end_index = res
        if result is not None:
            result['item_name'] = item_name
            display(result)
            summary_df = pd.concat([summary_df, result], ignore_index=True)
            if plot:
                plot_results(item_actuals, item_1ahead_pred, predictions, predictions_perfect, start_index, end_index, item_name)
    return summary_df


def summarize_summary(summary_df):
    metrics = ['Accuracy', 'RMSE', 'MAE', 'MAPE', 'Max Error', 'Median Absolute Error']
    methods = ['N-iter_perfect', '1ahead', 'Naive']
    summary_stats = pd.DataFrame()

    for method in methods:
        grouped = summary_df[summary_df['Metric'].isin(metrics)].groupby('Metric',observed=True)

        # Initialize DataFrame to hold weighted metrics
        method_stats = pd.DataFrame(index=metrics)

        # Calculate weighted mean and weighted std for each metric
        for metric in metrics:
            metric_data = summary_df[summary_df['Metric'] == metric][method]
            weights = summary_df[summary_df['Metric'] == metric]['end_index'] - summary_df[summary_df['Metric'] == metric]['start_index']

            weighted_mean = np.average(metric_data, weights=weights)

            weighted_std = np.sqrt(np.average((metric_data - weighted_mean) ** 2, weights=weights))

            method_stats.loc[metric, f'Weighted Mean_{method}'] = weighted_mean
            method_stats.loc[metric, f'Weighted Std_{method}'] = weighted_std

        method_stats[f'Mean_{method}'] = grouped[method].mean()
        method_stats[f'Median_{method}'] = grouped[method].median()
        summary_stats = pd.concat([summary_stats, method_stats], axis=1)

    return summary_stats


In [None]:
#@title Launch N-iterative
N_days=200

random.seed(41)
item_names = ['Black Diamond Thompson', 'Acid Rock',*[random.choice(test_names) for _ in range(8)]]  # Add your actual test names here

summaries = get_summaries(test_dataset,item_names, N_days, plot=1)
final_summary = summarize_summary(summaries)

pd.set_option('display.max_columns', None)
display(final_summary)




#result, predictions, predictions_perfect, item_actuals, item_1ahead_pred,  start_index, end_index = foo('Black Diamond Thompson',30)
#result['item_name'] = item_name
#display(result)
#plot_results(item_actuals, item_1ahead_pred, predictions, predictions_perfect, start_index, end_index, item_name)

In [77]:
#@title plot predictions

tshow=X_test.copy()
tshow['pred1'] = y_pred

tshow['actual'] = y_test
for name in test_names[::9][:7]:
  phs=[]
  end_names=['actual','pred']

  try:
    ph = tshow.loc[name][['actual','pred1']]
    phs=[ph[['pred1']],ph[['actual']]]
  except KeyError as e:
    print(name," not in dataset")
    raise e
    continue

  end_names.append(name)

  plot_pricehistory(phs,end_names,name+' test')


NameError: name 'test_names' is not defined

In [None]:
trim=0
print_importances(model,trim,False,figsize=(10, 18))

In [None]:
for i in range(len(models)):
  trim=10
  print_importances(models[f"model_horizon_{i+1}"],0,False,name=f" N={i+1}'",figsize=(7, 2),end=10)


In [None]:
from google.colab import runtime
runtime.unassign()