<a href="https://colab.research.google.com/github/blackcaer/SMF-training/blob/main/SMFv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Initializing
from google.colab import drive
drive.mount('/content/drive/')

#!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd  # https://github.com/microsoft/LightGBM/issues/5914

!pip install lightgbm #--config-settings=cmake.define.USE_GPU=ON

!pip install scikit-optimize

import lightgbm as lgb
from IPython.display import display

#import os
import json
#import time
import gc
import random
import psutil
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import traceback
from prettytable import PrettyTable

from datetime import datetime
from functools import partial

from google.colab import files
from sys import getsizeof
from time import time
import pprint
#import joblib

from collections import defaultdict, Counter
from itertools import combinations

from sklearn import preprocessing
#from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import root_mean_squared_error,mean_absolute_error,mean_absolute_percentage_error,median_absolute_error
from sklearn.model_selection import train_test_split, GroupShuffleSplit#, GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

#from skopt import BayesSearchCV
#from skopt.callbacks import DeadlineStopper, DeltaYStopper
#from skopt.space import Real, Categorical, Integer

#from scipy.fftpack import fft
#from scipy.stats import zscore

import plotly.graph_objects as go

!pip install optuna
!pip install optuna-integration[lightgbm]
import optuna
from optuna.integration import LightGBMPruningCallback

from torch.cuda import get_device_name,is_available
#from os import cpu_count

#print(f"Liczba rdzeni procesora: {cpu_count()} (realnych 2x mniej prawdopodobnie)")
# Detect once whether a CUDA device is visible; `gpu_available` is read later
# when the LightGBM parameter dict chooses between 'gpu' and 'cpu'.
gpu_available = is_available()

if gpu_available:
    print("GPU available.")
    print("Name of GPU:", get_device_name(0))
else:
    print("GPU is not available.")


In [None]:
#@title Constants

# --- Input data on the mounted Google Drive ---
# Item price-history JSON; the *_TEST variant is loaded instead when TEST_MODE is truthy.
ITEMS_PHSM_JSON = '/content/drive/My Drive/SMF_files/items_phsm.json'
ITEMS_PHSM_JSON_TEST = '/content/drive/My Drive/SMF_files/items_phsm_test.json'

# Player-count JSON loaded by DataPrepper.load_data.
PLAYER_COUNT_JSON = '/content/drive/My Drive/SMF_files/rust_player_count_interpolated.json'

# Spike-correction settings. NOTE(review): not referenced in the visible part of
# the notebook - presumably consumed by a later cell; confirm before removing.
SPIKES_TH=2
SPIKES_TH_TEST=1.5
SPIKES_PATH='/content/drive/My Drive/SMF_files/spikes_correction.csv'
SPIKES_PATH_TEST='/content/drive/My Drive/SMF_files/spikes_correction_test.csv'

# --- Output folders (used by autosave_model / autosave_optimizer) ---
# NOTE(review): these use 'MyDrive' while the input paths above use 'My Drive';
# confirm both spellings resolve to the same mount in the target runtime.
MODEL_SAVES_PATH='/content/drive/MyDrive/SMF_files/model_saves'
OPT_SAVES_PATH="/content/drive/MyDrive/SMF_files/opt_saves/"

# Item names filtered out of the raw data in DataPrepper.load_data.
ITEMS_TO_EXCLUDE=['Metal Tree Door',]#'No Mercy Revolver - E Class','Neon Dragon Garage Door','Zipper Face'] # 'Metal Tree Door' - one of the rarest items in the game, very few sales since 2018

# Number of TimeSeriesSplit folds used for cross-validation.
N_SPLITS = 5
# Truthy value switches load_data to the *_TEST item file; also embedded in optimizer save names.
TEST_MODE = 0


In [None]:
#@title Helpers 1
def get_learning_summary(y_pred_train, y_train, y_pred_test, y_test, y_pred_naive, label="[no label]"):
  """Assemble a metrics table for train, validation and naive predictions.

  Each prediction/target pair is scored with `calc_accuracy`; the naive
  prediction is scored against the validation target `y_test`.

  :param label: accepted for call compatibility; not used.
  :return: DataFrame with columns "Metric", "Train", "Valid", "Naive".
  """
  metric_names = ["Accuracy", "RMSE", "MAE", "MAPE", "Max Error", "Median Absolute Error"]

  train_scores = calc_accuracy(y_pred_train, y_train, "Train accuracy:")
  valid_scores = calc_accuracy(y_pred_test, y_test, "Validation accuracy:")
  naive_scores = calc_accuracy(y_pred_naive, y_test, "Naive accuracy:")

  summary = pd.DataFrame({
      "Metric": metric_names,
      "Train": train_scores,
      "Valid": valid_scores,
      "Naive": naive_scores,
  })
  return summary

def calc_accuracy(pred, actual, label=""):
  """Compute error metrics for a prediction, each rounded to 3 decimals.

  :param pred: predicted values (array-like supporting `pred - actual`).
  :param actual: true values; used as the divisor for the accuracy figure,
      so zeros in `actual` produce inf/nan there.
  :param label: accepted for call compatibility; not used.
  :return: tuple in this exact order:
      (accuracy, rmse, mae, mape, max_error, median_absolute_error)
      where accuracy = 100 * (1 - mean(|pred - actual| / actual)).
  """
  N = 3  # number of decimals kept for every metric
  errors = np.abs(pred - actual)

  accuracy = round(100 * (1 - np.mean(errors / actual)), N)
  max_error = round(np.max(errors), N)
  mae = round(np.mean(errors), N)
  median_absolute_error = round(np.median(errors), N)
  mape = round(mean_absolute_percentage_error(actual, pred), N)
  rmse = round(root_mean_squared_error(actual, pred), N)

  return accuracy, rmse, mae, mape, max_error, median_absolute_error

def print_importances(model,start=0,print_tab=False,name="",figsize=(10, 18),importance_type='gain',end=-1):
  """Plot (and optionally display) feature importances of a trained model.

  Features are sorted by descending importance and then sliced with
  `[start:end]` before plotting.

  :param model: object exposing `feature_importance(importance_type=...)` and
      `feature_name()`.
  :param start: number of most-important features to skip (useful when a few
      dominant features flatten the rest of the chart).
  :param print_tab: if True, also display the sliced importance table.
  :param name: suffix appended to the plot title.
  :param figsize: matplotlib figure size.
  :param importance_type: 'split' or 'gain'.
  :param end: slice end. NOTE: the default of -1 leaves out the last (least
      important) feature; pass `end=None` to include every feature.

  Side effects: switches the matplotlib style to 'dark_background' (applied
  after this figure is created) and sets pandas' `display.max_rows` to None
  globally.
  """
  importance = model.feature_importance(importance_type=importance_type)  # 'split' or 'gain'
  feature_names = model.feature_name()

  importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
  importance_df = importance_df.sort_values(by='Importance', ascending=False)[start:end]

  # Visualization
  # Report the number of features actually skipped (previously always printed 0).
  print(f"Trimmed first {start} features to see better")
  plt.figure(figsize=figsize)
  plt.barh(importance_df['Feature'], importance_df['Importance'])
  plt.xlabel('Importance')
  plt.title(f'Feature Importance {name}')
  plt.style.use('dark_background')
  plt.gca().invert_yaxis()
  plt.show()
  pd.set_option('display.max_rows', None)
  if print_tab:
    display(importance_df)

def aggregate_pricehistories(pricehistories):
    """Aggregate several price histories into per-date sum, median and mean.

    Every history is resampled to daily frequency with linear interpolation,
    extended to the latest date found in any history and forward-filled, then
    all histories are stacked and grouped by date. Two charts are drawn with
    `plot_pricehistory` (median+mean, and sum).

    :param pricehistories: non-empty iterable of date-indexed pandas objects.
    :return: tuple (sum, median, mean), each indexed by date.
    :raises ValueError: if `pricehistories` is empty (from `max`).
    """
    max_date = max(df.index.max() for df in pricehistories)

    interpolated_histories = []
    for df in pricehistories:
        df_resampled = df.resample('D').interpolate(method='linear')
        full_date_range = pd.date_range(start=df_resampled.index.min(), end=max_date, freq='D')
        # Dates after this history's own last observation carry its last value forward.
        interpolated_histories.append(df_resampled.reindex(full_date_range).ffill())

    all_data = pd.concat(interpolated_histories)

    # Group once and reuse; `total` avoids shadowing the builtin `sum`.
    by_date = all_data.groupby(all_data.index)
    total = by_date.sum()
    med = by_date.median()
    mean = by_date.mean()

    plot_pricehistory([med, mean], ['med', 'mean'], "Test")
    plot_pricehistory([total], ['sum'], "Test", 1)
    return total, med, mean

def col_from_idx(df,idx_name,pos=None):
  """Copy an index level into a regular column of `df`, in place.

  :param df: DataFrame whose index has a level named `idx_name`; modified in place.
  :param idx_name: name of the index level; also used as the column name.
  :param pos: optional column position for the new column. Follows
      `list.insert` semantics (negative values count from the end,
      out-of-range values are clamped). When None the column is appended, or
      overwritten in place if it already exists.
  :return: the same `df` object, for convenience.
  """
  values = df.index.get_level_values(idx_name)

  if pos is None:
    df[idx_name] = values
    return df

  # The previous implementation built a reordered frame but only rebound the
  # local name, so the caller never saw the new column order. Inserting at the
  # requested location mutates the caller's frame as intended.
  if idx_name in df.columns:
    df.drop(columns=idx_name, inplace=True)
  n_cols = len(df.columns)
  loc = max(n_cols + pos, 0) if pos < 0 else min(pos, n_cols)
  df.insert(loc, idx_name, values)
  return df

def plot_pricehistory(pricehistories: list, labels: list, title, day_interval=30, relative_x_axis=False, figsize=(16, 6)):
    """Draw one line per price history on a single dark-themed Plotly figure.

    :param pricehistories: list of date-indexed pandas objects (`.index`, `.values`).
    :param labels: legend labels, one per entry of `pricehistories`.
    :param title: figure title.
    :param day_interval: spacing between x-axis ticks, in days.
    :param relative_x_axis: if True, x is "days since each series' first
        date" instead of calendar dates. Tick positions are then taken from
        the last series in the list.
    :param figsize: (width, height) in units of 100 pixels.
    """
    fig = go.Figure()

    # Stays None when there is nothing to plot, so the axis setup below
    # does not fail on an empty list.
    days_from_start = None

    for i, history in enumerate(pricehistories):
        if relative_x_axis:
            days_from_start = (history.index - history.index[0]).days
            x_values = days_from_start
        else:
            x_values = history.index

        fig.add_trace(go.Scatter(
            x=x_values,
            y=history.values,
            mode='lines',
            name=labels[i]
        ))

    if relative_x_axis:
        if days_from_start is not None:
            fig.update_xaxes(title_text='Days from Start', tickvals=days_from_start[::day_interval])
        else:
            fig.update_xaxes(title_text='Days from Start')
    else:
        fig.update_xaxes(title_text='Date', tickformat='%Y-%m-%d', dtick=f'{day_interval*86400000}')  # dtick in milliseconds

    fig.update_yaxes(title_text='Price')

    fig.update_layout(
        title=title,
        xaxis=dict(
            tickangle=45,
            showgrid=True,
            gridcolor='gray',
            gridwidth=0.5
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor='gray',
            gridwidth=0.5
        ),
        template='plotly_dark',
        legend=dict(
            x=0.01, y=0.99,
            bordercolor="Black",
            borderwidth=1
        ),
        autosize=False,
        width=figsize[0] * 100, height=figsize[1] * 100
    )

    fig.show()


"""reducing.py
Author: Kirgsn, 2018
"""
from joblib import Parallel, delayed
from fastprogress import master_bar, progress_bar

def measure_time_mem(func):
    """Decorator for methods of the form `method(self, df, ...) -> DataFrame`.

    Prints the memory footprint of `df` before the call and of the returned
    frame after it (scaled by `self.memory_scale_factor`), plus the elapsed
    wall-clock time, then triggers a garbage collection.

    :param func: the method to wrap; must return an object with `memory_usage()`.
    :return: wrapper with the same signature, name and docstring as `func`.
    """
    from functools import wraps  # local import keeps this block self-contained

    @wraps(func)  # preserve the wrapped method's __name__ / __doc__
    def wrapped_reduce(self, df, *args, **kwargs):
        # pre
        mem_usage_orig = df.memory_usage().sum() / self.memory_scale_factor
        start_time = time()
        # exec
        ret = func(self, df, *args, **kwargs)
        # post
        mem_usage_new = ret.memory_usage().sum() / self.memory_scale_factor
        end_time = time()
        print(f'reduced df from {mem_usage_orig:.4f} MB '
              f'to {mem_usage_new:.4f} MB '
              f'in {(end_time - start_time):.2f} seconds')
        gc.collect()
        return ret
    return wrapped_reduce


class Reducer:
    """
    Downcasts the columns of a pandas dataframe to the smallest dtype from a
    conversion table that can still hold their values, to save memory.

    Numeric columns are matched against increasingly large numpy dtypes;
    all-string object columns can optionally be turned into categoricals.
    """
    memory_scale_factor = 1024**2  # memory in MB

    def __init__(self, conv_table=None, use_categoricals=True, n_jobs=-1):
        """
        :param conv_table: dict with np.dtypes-strings as keys
                ('int', 'uint', 'float') and lists of candidate dtypes,
                ordered from smallest to largest, as values
        :param use_categoricals: Whether the pandas dtype "category" shall be
                used for all-string columns
        :param n_jobs: Parallelization rate (passed to joblib.Parallel)
        """

        self.conversion_table = \
            conv_table or {'int': [np.int8, np.int16, np.int32, np.int64],
                           'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
                           'float': [np.float32, ]}
        # Nullable pandas counterparts, used when an integer-valued column
        # contains missing values (plain numpy ints cannot hold NaN).
        self.null_int = {   np.int8:  pd.Int8Dtype,
                            np.int16: pd.Int16Dtype,
                            np.int32: pd.Int32Dtype,
                            np.int64: pd.Int64Dtype,
                            np.uint8: pd.UInt8Dtype,
                            np.uint16:pd.UInt16Dtype,
                            np.uint32:pd.UInt32Dtype,
                            np.uint64:pd.UInt64Dtype}

        self.use_categoricals = use_categoricals
        self.n_jobs = n_jobs

    def _type_candidates(self, k):
        """Yield (dtype, limits) pairs for conversion-table key `k`.

        `limits` is `np.iinfo` for integer keys and `np.finfo` otherwise.
        """
        for c in self.conversion_table[k]:
            i = np.iinfo(c) if 'int' in k else np.finfo(c)
            yield c, i

    @measure_time_mem
    def reduce(self, df, verbose=False):
        """Takes a dataframe and returns it with all data transformed to the
        smallest necessary types.

        Columns for which no candidate dtype fits are reported and left out
        of the result (see `_reduce`).

        :param df: pandas dataframe
        :param verbose: If True, outputs more information
        :return: pandas dataframe with reduced data types
        """
        # Nothing to convert; pd.concat would raise on an empty list.
        if len(df.columns) == 0:
            return df

        ret_list = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(progress_bar(list(delayed(self._reduce)
                                                (df[c], c, verbose) for c in
                                                df.columns)))
        del df
        gc.collect()
        return pd.concat(ret_list, axis=1)

    def _reduce(self, s, colname, verbose):
        """Downcast a single column.

        :param s: pandas Series (one column)
        :param colname: column name, used for log messages only
        :param verbose: If True, prints the chosen conversion
        :return: the converted Series; the unchanged Series when the dtype is
                not handled or an exception occurs; None when no candidate
                dtype fits (pd.concat drops None entries silently)
        """
        try:
            # Remember whether the column has missing values: integer targets
            # then need a nullable pandas dtype.
            isnull = bool(s.isnull().any())

            # detect kind of type
            coltype = s.dtype
            if np.issubdtype(coltype, np.integer):
                conv_key = 'int' if s.min() < 0 else 'uint'
            elif np.issubdtype(coltype, np.floating):
                conv_key = 'float'
                asint = s.fillna(0).astype(np.int64)
                # Sum the ABSOLUTE fractional parts. Summing signed parts lets
                # values such as 1.5 and -1.5 cancel out, which would wrongly
                # classify the column as integer-valued and truncate it.
                fractional_total = np.abs(s - asint).sum()
                if fractional_total < 0.01:
                    conv_key = 'int' if s.min() < 0 else 'uint'
            else:
                if self.use_categoricals:
                    # check for all-strings series
                    if s.apply(lambda x: isinstance(x, str)).all():
                        if verbose: print(f'convert {colname} to categorical')
                        return s.astype('category')
                if verbose: print(f'{colname} is {coltype} - Skip..')
                return s

            # find right candidate
            for cand, cand_info in self._type_candidates(conv_key):
                if s.max() <= cand_info.max and s.min() >= cand_info.min:
                    if verbose: print(f'convert {colname} to {cand}')
                    # Floats hold NaN natively; only integer targets need the
                    # nullable dtype. (Looking up a float candidate in
                    # `null_int` used to raise a KeyError that was swallowed
                    # below, leaving NaN-containing float columns unreduced.)
                    if isnull and conv_key != 'float':
                        return s.astype(self.null_int[cand]())
                    return s.astype(cand)

            # reaching this code is bad. Probably there are inf, or other high numbs
            print(f"WARNING: {colname} doesn't fit the grid with \nmax: {s.max()} "
                f"and \nmin: {s.min()}")
            print('Dropping it..')
            return None
        except Exception as ex:
            # Best effort: report the problem and keep the column unchanged.
            print(f'Exception for {colname}: {ex}')
            return s

def reduce_mem_usage(df):
  """Downcast `df` using a Reducer built with default settings and return the result."""
  reducer = Reducer()
  return reducer.reduce(df)

#====DataPrepper
class DataPrepper:
    """Loads the raw item price histories and player counts and merges them
    into a single flat frame (`self.dataset`) used for feature engineering.

    Typical use: `load_data()` -> `preprocess_data()` -> read `dataset`.
    """

    def __init__(self):
        self.val_df = None
        self.train_df = None
        self.items_phsm_df = None          # one row per (item, date) price-history entry
        self.rust_player_count_df = None   # one row per date with the player count
        self.dataset: pd.DataFrame = None  # merged result of preprocess_data()
        # Raw item attributes removed in prep_items_phsm.
        self.columns_to_drop = [
            'previewUrl', 'views', 'timeCreated', 'timeRefreshed', 'isAvailableOnStore', 'creatorName',
            'appId',
            'id', 'nameId','hasGlow', 'hasCutout','timeAccepted'] # 'hasGlow', 'hasCutout' - no gain, no splits
        # Column order of `self.dataset`; columns missing from the merged data
        # are created as NaN by reindex.
        self.desired_column_order = ['date', 'price', 'volume', 'name', 'playerCount', 'supplyTotalEstimated',
                                     'storePrice',
                                     'glowRatio',  'cutoutRatio', 'hasGlowSights', 'facepunchSkin',
                                     'itemType',
                                     'itemCollection',
                                     ]

    def load_data(self):
        """
        Loads self.items_phsm_df and self.rust_player_count_df from files.

        The item file is ITEMS_PHSM_JSON, or ITEMS_PHSM_JSON_TEST when
        TEST_MODE is truthy. Items named in ITEMS_TO_EXCLUDE are skipped.
        """
        with open(ITEMS_PHSM_JSON if not TEST_MODE else ITEMS_PHSM_JSON_TEST, 'r') as f:
            items_phsm = json.load(f)

        with open(PLAYER_COUNT_JSON, 'r') as f:
            rust_player_count_json = json.load(f)

        items_phsm = [item for item in items_phsm if item['name'] not in ITEMS_TO_EXCLUDE]

        phsm_records = self.unfold_phsm(items_phsm)

        self.items_phsm_df = pd.DataFrame(phsm_records)

        self.rust_player_count_df = pd.DataFrame(rust_player_count_json)

        print("Loaded")

    def preprocess_data(self):
        """
        Prepares, merges data and assigns it to self.dataset

        Player counts are left-joined onto the item rows by 'date', then the
        columns are arranged according to `self.desired_column_order`.
        """

        self.prep_player_count()
        self.prep_items_phsm()

        merged_df = pd.merge(self.items_phsm_df, self.rust_player_count_df, on='date',
                             how='left')

        ordered_df = merged_df.reindex(columns=self.desired_column_order)

        self.dataset = ordered_df

    @staticmethod
    def unfold_phsm(items_phsm):
        """Flatten items into one record per price-history entry.

        :param items_phsm: list of item dicts, each with a 'phsm' list of dicts
        :return: list of dicts combining the item's own fields (without
                'phsm') with the fields of one 'phsm' entry; entry fields win
                on key clashes
        """
        phsm_records = []
        for item in items_phsm:
            for phsm_entry in item['phsm']:
                record = {k: v for k, v in item.items() if k != 'phsm'}
                record.update(phsm_entry)
                phsm_records.append(record)
        return phsm_records

    def prep_player_count(self):
        """Rename the player-count columns to 'date' / 'playerCount' and parse
        'date' as datetime. Returns the prepared frame."""
        prepared = self.rust_player_count_df.rename(
            columns={'Date': 'date', 'Player_count': 'playerCount'})
        prepared['date'] = pd.to_datetime(prepared['date'])
        self.rust_player_count_df = prepared

        return self.rust_player_count_df

    def prep_items_phsm(self):
        """Clean the item frame: rename 'median' to 'price', parse dates, drop
        `self.columns_to_drop`, sort by (name, date) and renumber the rows.

        Safe to call more than once: columns that were already dropped are
        skipped instead of raising.
        """
        prepared = self.items_phsm_df.rename(columns={'median': 'price'})
        prepared['date'] = pd.to_datetime(prepared['date'])
        if 'timeAccepted' in prepared.columns:
            prepared['timeAccepted'] = pd.to_datetime(prepared['timeAccepted'], errors='coerce')

        # Delete unnecessary columns; errors='ignore' keeps the step re-runnable.
        prepared = prepared.drop(columns=self.columns_to_drop, errors='ignore')
        prepared = prepared.sort_values(by=['name', 'date'])

        # The result of reset_index must be kept - it is not an in-place call.
        self.items_phsm_df = prepared.reset_index(drop=True)

    @staticmethod
    def add_features(dataset,change_original=False):
        """Placeholder; the real implementation is assigned from another cell."""
        raise NotImplementedError("Run window with add_features implementation first") #Add features is often changed so it's implemented in other window so I don't have to create new object every time i change it

def getdata_dataset(x,name,showdata=False):
  """Collect summary statistics of the 'price' column of `x`.

  Besides overall statistics, prices are grouped by 'name' and the per-item
  mean / median / std are themselves summarised by mean and median.

  :param x: DataFrame with 'name' and 'price' available to `groupby` / column access.
  :param name: heading printed before the statistics when `showdata` is True.
  :param showdata: if True, print every statistic rounded to 2 decimals.
  :return: dict mapping statistic label to value.
  """
  prices = x['price']
  per_item = x.groupby('name')['price']

  stats = {}
  stats['df size'] = x.shape[0]
  stats['price mean'] = prices.mean()
  stats['price median'] = prices.median()
  stats['price std'] = prices.std()

  stats['price items mean mean'] = per_item.mean().mean()
  stats['price items mean median'] = per_item.mean().median()
  stats['price items median mean'] = per_item.median().mean()
  stats['price items median median'] = per_item.median().median()

  stats['price items std mean'] = per_item.std().mean()
  stats['price items std median'] = per_item.std().median()

  if not showdata:
    return stats

  print(name)
  for stat_label, stat_value in stats.items():
    print(stat_label,':',round(stat_value,2))
  return stats

def printdata_dataset(X,X_test):
  """Print a table comparing price statistics of the train and test sets.

  For every statistic from `getdata_dataset` the table shows the train value,
  the test value and the relative difference (train - test) / test in percent.
  """
  train_stats = getdata_dataset(X,"Train data: ")
  test_stats = getdata_dataset(X_test,"\nTest data: ")

  stat_names = list(train_stats.keys())
  train_values = list(train_stats.values())
  test_values = list(test_stats.values())
  relative_diffs = [(train_value - test_value) / test_value * 100
                    for train_value, test_value in zip(train_values, test_values)]

  table = PrettyTable()
  table.field_names = ["Key", "Value X", "Value X_test", "percent_diff"]

  for stat_name, train_value, test_value, diff in zip(stat_names, train_values, test_values, relative_diffs):
      table.add_row([stat_name, f"{train_value:.2f}", f"{test_value:.2f}", f"{diff:.2f}%"])

  print(table)

In [None]:
#@title Helpers 2, Def prepare_datasets
def split_dataset(X, y, test_size):
    """Chronological train/test split.

    Both `X` and `y` are sorted IN PLACE by the 'date' index level, then the
    first `int(len(X) * (1 - test_size))` rows become the training part and
    the remaining rows the test part.

    :return: X_train, X_test, y_train, y_test
    """
    for frame in (X, y):
        frame.sort_index(level='date', inplace=True)

    n_rows = len(X)
    cut = int(n_rows * (1 - test_size))

    positions = np.arange(n_rows)
    train_pos = positions[:cut]
    test_pos = positions[cut:]

    X_train = X.iloc[train_pos]
    X_test = X.iloc[test_pos]
    y_train = y.iloc[train_pos]
    y_test = y.iloc[test_pos]

    return X_train, X_test, y_train, y_test

def get_model_postfix(test_metrics,addidional_info=""):
  """Build a file-name postfix that encodes the key metrics of a model.

  :param test_metrics: metrics DataFrame laid out like the output of
      `eval_model`: column 0 is "Metric", column 1 holds the model's scores
      and column 2 the naive baseline's scores.
  :param addidional_info: free text appended at the end of the postfix.
  :return: string of the form
      "NMAPE<n>MAE<n>MedAE<n>MAPE<n><addidional_info>" where every value is
      the metric multiplied by 1000 and rounded.
  """
  MODEL_COL, NAIVE_COL = 1, 2

  def metric_value(metric_name, column):
    return test_metrics[test_metrics['Metric'] == metric_name].iloc[0, column]

  mae = metric_value('MAE', MODEL_COL)
  mape = metric_value('MAPE', MODEL_COL)
  # Previously read from the naive column, so "MedAE" in the file name did not
  # describe the model while MAE and MAPE did.
  median_ae = metric_value('Median Absolute Error', MODEL_COL)
  nmape = metric_value('MAPE', NAIVE_COL)  # naive baseline MAPE
  return f"NMAPE{round(nmape * 1000)}MAE{round(mae * 1000)}MedAE{round(median_ae * 1000)}MAPE{round(mape * 1000)}{addidional_info}"

def autosave_model(model,name="lgbm_af",postfix=""):
  """Save `model` under MODEL_SAVES_PATH with a descriptive, timestamped name.

  The file name is "<name>_<DATASET_DATA>_<YYYY-mm-dd_HH-MM>_<postfix>.txt",
  where DATASET_DATA is the module-level string describing the dataset.
  Saving is best effort: any error is printed with its traceback and not
  re-raised.

  :param model: object exposing `save_model(path)`.
  :param name: file-name prefix.
  :param postfix: file-name suffix, e.g. from `get_model_postfix`.
  """
  # Local import: the file-level `import os` is commented out, so `os` was
  # undefined here and every save failed with a swallowed NameError.
  import os

  try:
    folder_path=MODEL_SAVES_PATH
    os.makedirs(folder_path, exist_ok=True)  # first save into a fresh Drive folder
    curr_time = datetime.now().strftime('%Y-%m-%d_%H-%M')
    file_path = os.path.join(folder_path, f"{name}_{DATASET_DATA}_{curr_time}_{postfix}.txt")
    model.save_model(file_path)
  except Exception as e:
    print(f"Error saving model: {e}")
    traceback.print_exc()

def autosave_optimizer(opt):
  """Dump an optimizer object under OPT_SAVES_PATH with a timestamped name.

  The file name is
  "bayesian_optimizer_TEST<TEST_MODE>_<YYYY-mm-dd_HH-MM-SS>.txt" and the
  content is written with `joblib.dump`. Saving is best effort: any error is
  printed with its traceback and not re-raised.

  :param opt: the object to persist.
  """
  # Local imports: neither `os` nor the `joblib` module name is imported at
  # file level (only `Parallel`/`delayed` are), so both were undefined here.
  import os
  import joblib

  try:
    folder_path=OPT_SAVES_PATH
    os.makedirs(folder_path, exist_ok=True)
    curr_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_path = os.path.join(folder_path, f"bayesian_optimizer_TEST{TEST_MODE}_{curr_time}.txt")
    joblib.dump(opt, file_path)
  except Exception as e:
    print(f"Error saving optimizer: {e}")
    traceback.print_exc()

def create_naive_prediction(X_test,keys,verbose=True,fill_value=3):
  """Build a naive baseline prediction from an existing feature column.

  The first entry of `keys` that exists in `X_test` is used as the
  prediction, for example 'price_lag1'.

  :param X_test: DataFrame holding the candidate columns.
  :param keys: column names to try, in order of preference.
  :param verbose: if True, print which column was used.
  :param fill_value: replacement for missing values in the chosen column
      (there should not be many of them).
  :return: Series of predictions indexed like `X_test`.
  :raises RuntimeError: if none of `keys` is a column of `X_test`.
  """
  # Must start as None: otherwise the check below hits an unbound local
  # instead of raising the intended RuntimeError when no key matches.
  y_pred_naive = None

  for key in keys:
    try:
      column = X_test[key]
    except KeyError:
      continue
    y_pred_naive = pd.Series(column.fillna(fill_value).to_numpy(), index = X_test.index)
    if verbose:
      print(f"Created naive prediction from {key}")
    break

  if y_pred_naive is None:
    raise RuntimeError("Assigning value to y_pred_naive failed. Check your X_test dataset. Checked keys: "+str(keys))
  return y_pred_naive

#@title def eval_models

def get_last_index_TSS(X,n_splits):
    """Return the test indices of the final TimeSeriesSplit fold.

    The last test fold of a TimeSeriesSplit is never part of any training
    fold, only used for validation.

    :param X: data passed to `TimeSeriesSplit.split`.
    :param n_splits: number of splits.
    :return: array of positional test indices of the last fold.
    """
    tscv = TimeSeriesSplit(n_splits)
    test_index = None
    # Exhaust the generator; after the loop `test_index` holds the last fold.
    for _, test_index in tscv.split(X):
        pass
    return test_index

def eval_model(model,X,y,set_name,keys_naive):
    """Score a model and a naive baseline on the same data.

    :param model: object exposing `predict(X)`.
    :param X: feature frame; also the source of the naive prediction column.
    :param y: true target values.
    :param set_name: name of the column holding the model's scores.
    :param keys_naive: candidate columns for `create_naive_prediction`.
    :return: DataFrame with columns "Metric", `set_name` and "Naive".
    """
    model_pred = model.predict(X)
    naive_pred = create_naive_prediction(X,keys=keys_naive)

    metric_names = ["Accuracy", "RMSE", "MAE", "MAPE", "Max Error", "Median Absolute Error"]
    model_scores = calc_accuracy(model_pred, y, f"{set_name} accuracy:")
    naive_scores = calc_accuracy(naive_pred, y, "Naive accuracy:")

    table = {"Metric": metric_names}
    table.update({set_name: model_scores})
    table.update({"Naive": naive_scores})
    return pd.DataFrame(table)

DATASET_DATA=""
def prepare_datasets(dp,target_window=1,target_lag=0,test_size=0.15,columns_to_drop=None):
  """Build features and target from `dp.dataset` and split them chronologically.

  :param dp: object with a `dataset` frame (columns include 'name' and
      'date') and an `add_features(dataset, target_data)` callable that
      returns a frame containing a 'target' column.
  :param target_window: length of the rolling mean used as the target.
      It has to be present in `windows_price` inside add_features as well.
  :param target_lag: TARGET_LAG == X means that the rolling mean is rolled
      after X+1 days from the data point. If TARGET_LAG == 0, the mean is
      rolled starting on the next day after the data point.
  :param test_size: fraction of (date-sorted) rows used as the test set.
  :param columns_to_drop: less important feature columns to remove.
  :return: X, X_test, y, y_test

  Side effect: updates the module-level DATASET_DATA description string.
  """
  if columns_to_drop is None:
    columns_to_drop = []

  global DATASET_DATA
  DATASET_DATA=f"tw_{target_window}_tl{target_lag}"

  target_data = {'window':target_window,'lag':target_lag}
  dataset = dp.dataset.set_index(['name', 'date'])  # returns a new frame; dp.dataset is untouched
  dataset = dp.add_features(dataset,target_data)

  dataset = dataset.drop(columns=columns_to_drop)  # Drop less important features

  # Keep only rows that have a target. Reassigning (instead of an in-place
  # dropna on a Series taken from the frame) avoids mutating a column view.
  y_whole = dataset["target"].dropna()
  X_whole = dataset.drop(columns=['target']).loc[y_whole.index]

  return split_dataset(X_whole,y_whole,test_size=test_size)

In [None]:
#@title Load

# Load the raw files and build the merged dataset.
dp = DataPrepper()
dp.load_data()
dp.preprocess_data()

# Downcast the two source frames to save memory. dp.dataset was already
# assembled by preprocess_data() above, so these calls do not change it.
# Author's note: using reduce_mem_usage in other places alters dtypes in
# unwanted ways, saves little memory and is slow.
dp.items_phsm_df= reduce_mem_usage(dp.items_phsm_df)
dp.rust_player_count_df = reduce_mem_usage(dp.rust_player_count_df)

display(dp.dataset.head())

In [None]:
#@title Def add features
def add_features(dataset, target_data):
    """Build the feature matrix and the forecasting target for every item.

    Steps:
      1. Keep a fixed set of base columns.
      2. Per item ('name'), put the rows on a daily grid (`asfreq('D')`);
         price, volume and playerCount are linearly interpolated, the other
         columns are left as NaN on the inserted days.
      3. Per item, add lag, rolling, expanding, trend and EWMA features plus
         the 'target' column.
      4. Add calendar features, cast column dtypes and drop rows without an
         itemType.

    :param dataset: frame indexed by ('name', 'date') that contains the
        columns listed in `columns_to_keep`.
    :param target_data: dict with keys 'window' (length of the target rolling
        mean) and 'lag' (extra forward shift of the target).
    :return: new frame with the base columns, the features and 'target'.
    """
    target_window = target_data['window']
    target_lag = target_data['lag']

    to_interpolate = ['price', 'volume', 'playerCount']
    bool_columns = []  # currently empty, so the bool cast below has nothing to convert
    int_columns = ['year', 'month', 'day', 'volume', 'playerCount', 'supplyTotalEstimated', 'weekday']
    categorical_columns = ['itemType', 'itemCollection']

    columns_to_keep = ['price', 'volume', 'playerCount', 'supplyTotalEstimated', 'storePrice', 'itemType', 'itemCollection']

    dataset = dataset[columns_to_keep]
    original_column_order = dataset.columns.tolist()

    # Interpolating data
    # Each item is reindexed to daily frequency on its 'date' level; only the
    # columns in `to_interpolate` get values on the newly inserted days.
    interpolated_data = dataset.groupby('name')[to_interpolate].apply(
        lambda group: group.reset_index('name', drop=True).asfreq('D').interpolate(method='linear'))

    # Remaining columns get the same daily grid but no filling.
    not_interpolated_data = dataset.groupby('name')[dataset.columns.difference(to_interpolate)].apply(
        lambda group: group.reset_index('name', drop=True).asfreq('D'))

    dataset = pd.concat([interpolated_data, not_interpolated_data], axis=1)[original_column_order]

    def compute_features(group):
        # `group` holds the rows of a single item (see the groupby('name') call below).

        lags_price = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 35, 56]
        lags_vol = [1, 2, 3, 7, 14, 21]
        lags_player_count = [1, 7, 14, 21, 28]

        windows_price = [3, 7, 14, 21, 56, 84]
        windows_volume = [3, 7, 14, 21, 56]
        windows_player_count = [7, 14, 21, 28, 56]
        ewma_windows = [7, 14, 21, 30, 56]

        features = {}

        lag_type_col = [1, 7, 14, 28, 56]  # NOTE(review): not used anywhere in this function

        # Plain lags of price and volume.
        for lag in lags_price:
          features[f'price_lag{lag}'] = group['price'].shift(lag)

        for lag in lags_vol:
          features[f'vol_lag{lag}'] = group['volume'].shift(lag)

        # Rolling mean of playerCount, computed on the series shifted by lag+1;
        # the feature name carries the original lag value.
        for lag in lags_player_count:
          lag = lag + 1  # lag+1 to avoid data leakage with rolling
          for window in windows_player_count:

            if len(group) + lag >= window:  # Ensure enough data points for rolling
              roll = group[['playerCount']]['playerCount'].shift(lag).rolling(window=window)
              features[f'playerCount_roll_mean_{window}_lag_{lag - 1}'] = roll.mean()
            else:
              features[f'playerCount_roll_mean_{window}_lag_{lag - 1}'] = np.nan

        # Rolling price statistics; shift(1) keeps the current row's price out of its own window.
        for window in windows_price:
          if len(group) >= window:  # Ensure enough data points for rolling
              roll = group[['price']].shift(1).rolling(window=window)
              features.update({
                f'price_roll_mean_{window}': roll['price'].mean(),
                f'price_roll_std_{window}': roll['price'].std(),
                f'price_roll_var_{window}': roll['price'].var(),
                f'price_roll_sum_{window}': roll['price'].sum(),
              })
          else:
            features.update({
                f'price_roll_mean_{window}': np.nan,
                f'price_roll_std_{window}': np.nan,
                f'price_roll_var_{window}': np.nan,
                f'price_roll_sum_{window}': np.nan,
            })

        # Rolling mean of the previous rows' volume.
        for window in windows_volume:
          if len(group) >= window:  # Ensure enough data points for rolling
              roll = group[['volume']].shift(1).rolling(window=window)
              features.update({
                f'vol_roll_mean_{window}': roll['volume'].mean(),
              })
          else:
            features.update({
                f'vol_roll_mean_{window}': np.nan,
            })

        # NOTE(review): this groups the rows of ONE item by itemType, and the
        # rolling window is applied to the shifted frame rather than per group.
        # These features therefore look like near-duplicates of price_roll_*
        # and not statistics across all items of a type - confirm intent.
        item_type_group = group.groupby('itemType', observed=True)
        for window in windows_price:
            if len(group) >= window:  # Ensure enough data points for rolling
                roll = item_type_group[['price', 'volume']].shift(1).rolling(window=window)
                features[f'itemType_price_roll_mean_{window}'] = roll['price'].mean()
                features[f'itemType_price_roll_std_{window}'] = roll['price'].std()
                features[f'itemType_price_roll_var_{window}'] = roll['price'].var()
                features[f'itemType_price_roll_sum_{window}'] = roll['price'].sum()
            else:
                features[f'itemType_price_roll_mean_{window}'] = np.nan
                features[f'itemType_price_roll_std_{window}'] = np.nan
                features[f'itemType_price_roll_var_{window}'] = np.nan
                features[f'itemType_price_roll_sum_{window}'] = np.nan

        # NOTE(review): same pattern as the itemType block above, keyed by itemCollection.
        collection_group = group.groupby('itemCollection', observed=True)
        for window in windows_price:
          if len(group) >= window:  # Ensure enough data points for rolling
            roll = collection_group[['price', 'volume']].shift(1).rolling(window=window)
            features.update({
                f'collection_price_roll_mean_{window}': roll['price'].mean(),
                f'collection_price_roll_std_{window}': roll['price'].std(),
                f'collection_price_roll_var_{window}': roll['price'].var(),
                f'collection_price_roll_sum_{window}': roll['price'].sum(),
            })
          else:
            features.update({
                f'collection_price_roll_mean_{window}': np.nan,
                f'collection_price_roll_std_{window}': np.nan,
                f'collection_price_roll_var_{window}': np.nan,
                f'collection_price_roll_sum_{window}': np.nan,
            })

        # Expanding (all previous rows) mean of price and volume.
        collection_mean = collection_group[['price', 'volume']].shift(1).expanding().mean()
        features['collection_price_mean'] = collection_mean['price']
        features['collection_vol_mean'] = collection_mean['volume']

        # Previous row's percentage change scaled by the row position within the item.
        features['price_trend'] = np.arange(len(group)) * group['price'].shift(1).pct_change().fillna(0)
        features['volume_trend'] = np.arange(len(group)) * group['volume'].shift(1).pct_change().fillna(0)

        # Exponentially weighted mean of the previous rows' price.
        for window in ewma_windows:
            features[f'price_ewma_{window}'] = group['price'].shift(1).ewm(span=window, adjust=False).mean()

        # Target: the rolling mean at row t covers the `target_window` rows
        # ending at t-1; shifting it back by target_window + 1 + target_lag rows
        # makes the target of row t the mean price of rows
        # t+1+target_lag ... t+target_window+target_lag (rows are days after asfreq('D')).
        _target_price_roll_mean = group[['price']].shift(1).rolling(window=target_window)['price'].mean()
        features["target"] = _target_price_roll_mean.shift(-target_window - 1).shift(-target_lag)

        return pd.concat([group, pd.DataFrame(features, index=group.index)], axis=1)

    dataset = dataset.groupby('name', group_keys=False).apply(compute_features)

    # Calendar features derived from the 'date' index level.
    dates = dataset.index.get_level_values('date')
    dataset['year'] = dates.year
    dataset['month'] = dates.month
    dataset['day'] = dates.day
    dataset['weekday'] = dates.weekday
    #dataset['is_weekend'] = (dataset['weekday'] >= 5)
    dataset['d_from_nyear'] = (dates - pd.to_datetime(dates.year.astype(str) + '-01-01')).days  # days since Jan 1 of the same year
    dataset['d_to_june'] = (pd.to_datetime(dates.year.astype(str) + '-06-01') - dates).days  # days until Jun 1 of the same year (negative afterwards)

    dataset[bool_columns] = dataset[bool_columns].astype(bool)
    dataset[int_columns] = dataset[int_columns].fillna(0).astype(int)  # missing integer values become 0
    dataset[categorical_columns] = dataset[categorical_columns].astype('category')

    # Rows without an itemType are removed; presumably this includes the days
    # inserted by asfreq('D'), whose itemType was not filled - TODO confirm.
    dataset.dropna(subset=['itemType'], inplace=True)

    return dataset
# Override the placeholder on this DataPrepper instance with the implementation above.
dp.add_features = add_features

In [None]:
#@title Features to drop
# Feature columns removed from the dataset before training; passed to
# prepare_datasets as `columns_to_drop` ("less important features").
columns_to_drop = ['collection_price_roll_std_84',
                   'price_lag6',  'itemType_price_roll_std_14',
                   'price_lag3',  'itemType_price_roll_var_84',
                   'itemType_price_roll_var_7',
                   'collection_price_roll_sum_84',
                   'collection_price_roll_var_84',
                   'collection_price_roll_mean_84',
                   'collection_price_roll_sum_56',
                   'collection_price_roll_std_56',
                   'vol_lag3',  'price_lag2',
                   'weekday',  'price_lag4',
                   'collection_price_roll_var_56',
                   'price_roll_var_56',
                   'collection_price_roll_mean_56',
                   'collection_price_roll_sum_21',
                   'itemType_price_roll_std_3',
                   'itemType_price_roll_var_3',
                   'itemType_price_roll_std_7',
                   'itemType_price_roll_var_14',
                   'vol_lag21',  'vol_lag7',
                   'collection_price_roll_std_3',
                   'collection_price_roll_std_7',
                   'collection_price_roll_var_7',
                   'collection_price_roll_sum_7',
                   'collection_price_roll_mean_14',
                   'collection_price_roll_std_14',
                   'collection_price_roll_var_14',
                   'collection_price_roll_sum_14',
                   'collection_price_roll_mean_21',
                   'collection_price_roll_std_21',
                   'collection_price_roll_var_21']

In [None]:
# Build the train/test split. The positional arguments are target_window=7 and
# target_lag=28; test_size keeps its default.
X, X_test, y, y_test = prepare_datasets(dp,7,28,columns_to_drop=columns_to_drop)
#printdata_dataset(X,X_test)

In [None]:
#@title Optuna config
# NOTE(review): OPTUNA_N_WARMUP_STEPS is not referenced in the visible cells -
# presumably passed to a pruner where the study is created; confirm.
OPTUNA_N_WARMUP_STEPS = 20
# Study name; also determines the SQLite file name in STORAGE_NAME.
STUDY_NAME=f"optuna_study_v3"
STORAGE_NAME = f"sqlite:///{STUDY_NAME}.db"

# MAE scorer with the sign flipped (greater_is_better=False).
# NOTE(review): `scoring` is not used in the visible cells.
scoring = make_scorer(mean_absolute_error, greater_is_better=False)
# Fold indices are materialised once from the training frame X and reused as
# `folds` by every trial in `objective`.
cv_strategy = list(TimeSeriesSplit(N_SPLITS).split(X))

# Early-stopping patience (boosting rounds) used inside `objective`.
OPTUNA_EARLY_STOPPING = 80

In [None]:
#@title def Optuna functions
MAIN_METRIC='l1'  # alias mae probably doesnt work in optuna
def objective(trial):
    """Optuna objective: cross-validate one sampled LightGBM configuration.

    Samples hyper-parameters from `trial`, runs `lgb.cv` on the notebook-level
    `X` / `y` with the pre-computed `cv_strategy` folds and returns the lowest
    mean validation `MAIN_METRIC` reached over the boosting rounds.

    Args:
        trial: Optuna trial used both for sampling and by the pruning callback.

    Returns:
        The minimum of the per-round mean CV metric (lower is better).
    """
    param = {
        'boosting_type': 'gbdt',
        'objective': MAIN_METRIC,
        'metric': MAIN_METRIC,
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.5,log=True),
        'num_leaves': trial.suggest_int('num_leaves', 60, 800),
        'max_depth': trial.suggest_int('max_depth', 20, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95),
        'subsample_freq': trial.suggest_int('subsample_freq', 3, 24),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.7),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-7, 100.0,log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 0.0001,log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 32, 128),
        'min_split_gain': trial.suggest_categorical('min_split_gain', [0.00005,0.0001,0.001,0.005,0.01,0.1,0.5,2,4,10]),
        'max_bin': trial.suggest_int('max_bin', 2512, 8096),
        'seed': 40,
        'num_threads': -1,
        'device': 'gpu' if gpu_available else 'cpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'verbose': -1
    } #Nranges

    cv_results = lgb.cv(
        param,
        lgb.Dataset(X, label=y, free_raw_data=False),
        folds=cv_strategy,
        num_boost_round=800,
        return_cvbooster=False,
        callbacks=[LightGBMPruningCallback(trial, MAIN_METRIC),lgb.log_evaluation(period=100),
              lgb.early_stopping(stopping_rounds=OPTUNA_EARLY_STOPPING, min_delta=0.001)]
    )

    # The key must be built from MAIN_METRIC; the previous `{main-metric}` placeholder
    # evaluated the expression `main - metric` and raised NameError on every trial.
    best_score = min(cv_results[f'valid {MAIN_METRIC}-mean'])
    gc.collect()
    return best_score

def print_results(study, iteration=None):
  """Print the best hyper-parameters and best objective value of an Optuna study.

  Args:
    study: object exposing `best_params` and `best_value`.
    iteration: optional run number shown in the header. When None the
      "ITERATION" line is omitted (previously the builtin `iter` function
      was formatted into the header because no such variable was in scope).
  """
  print('='*10)
  if iteration is not None:
    print(f"ITERATION {iteration}")
  print("Best parameters:")
  pprint.pprint(study.best_params)
  print(f"Best MAE score: {study.best_value:.3f}")
  print('='*10)

def configure_storage(study,study_name,storage_name):
  """Attach an RDB storage to `study` and set its name, in place.

  A new `optuna.storages.RDBStorage` is created for the URL `storage_name`
  and written to the study's private `_storage` attribute; afterwards
  `study.study_name` is overwritten with `study_name`. Returns None.
  """
  # NOTE(review): this pokes a private Optuna attribute — confirm it is still
  # required when the study was already created with `storage=`.
  study._storage = optuna.storages.RDBStorage(url=storage_name)
  study.study_name = study_name

def save_study(study_name,name,destination="/content/drive/MyDrive/SMF_files/optuna_saves/"):
  """Copy the study database `<study_name>.db` into `destination` as `name`.

  Args:
    study_name: path prefix of the SQLite file; ".db" is appended.
    name: file name of the copy inside `destination`.
    destination: target directory; created if it does not exist. Defaults to
      the Drive folder that was previously hard-coded.

  Returns:
    The path of the written copy.

  Raises:
    OSError: if the source file is missing or the copy cannot be written
      (the former shell `cp` failed silently).
  """
  # Imported locally so the function does not depend on a module-level `import os`.
  import os
  import shutil

  source = f"{study_name}.db"
  os.makedirs(destination, exist_ok=True)
  target = os.path.join(destination, name)
  # shutil.copy avoids building a shell command from interpolated names
  # (spaces or shell metacharacters in a name no longer break the copy).
  shutil.copy(source, target)
  return target

def optuna_automated(test_postfix,iters=1,n_trials=100,iter_timeout=60*60*1):
  """Run several consecutive optimisation rounds on the persistent Optuna study.

  Each round loads (or creates) the study `STUDY_NAME` in `STORAGE_NAME`,
  optimises `objective`, copies the database via `save_study` and prints the
  current best result. A failing round is reported and the loop moves on.

  Args:
    test_postfix: string appended to the snapshot file name of every round.
    iters: number of rounds.
    n_trials: trial budget per round (passed to `study.optimize`).
    iter_timeout: timeout per round (passed to `study.optimize`).

  Returns:
    The study object of the last round that managed to create/load it, or
    None when no round got that far (previously this raised
    UnboundLocalError at the `return`).
  """
  study = None
  # `iteration` instead of `iter` so the builtin is not shadowed.
  for iteration in range(1, iters+1):
    print(f"Starting iteration {iteration}:")
    try:
      study = optuna.create_study(direction='minimize',storage=STORAGE_NAME,study_name=STUDY_NAME, load_if_exists=True,pruner=optuna.pruners.MedianPruner(n_warmup_steps=OPTUNA_N_WARMUP_STEPS))

      configure_storage(study,STUDY_NAME,STORAGE_NAME)

      study.optimize(objective, n_trials=n_trials, timeout=iter_timeout, show_progress_bar=True,gc_after_trial=True)

      # NOTE(review): the postfix lands after ".db" (e.g. "study_1.dbunknown") — confirm this naming is intended.
      save_study(STUDY_NAME,f"study_{iteration}.db"+test_postfix)

      print_results(study)
    except Exception as e:
      # Deliberately best-effort: report, show where it failed, and try the next round.
      print('='*10+f"An error occurred during iteration {iteration}: {e}")
      traceback.print_exc()
  return study


In [None]:
# String appended by optuna_automated to the file name of every saved study snapshot.
test_postfix="unknown"
# Run 10 optimisation rounds (300 trials and a 60*60*1 timeout each) and keep the returned study
# for the result plots and for model_params.update(study.best_params) further down.
study = optuna_automated(test_postfix,iters=10,n_trials=300,iter_timeout=60*60*1)

In [None]:
#@title Optuna results

# Standard Optuna diagnostics for `study` (requires the Optuna run cell to have produced it).
display(optuna.visualization.plot_optimization_history(study))
display(optuna.visualization.plot_parallel_coordinate(study))
display(optuna.visualization.plot_slice(study))
display(optuna.visualization.plot_param_importances(study))
# Importances as returned by Optuna, requested with normalize=0 (a falsy value).
display(optuna.importance.get_param_importances(study,normalize=0))

In [None]:
# Load a LightGBM booster from a model file instead of training one.
# NOTE(review): the path is hard-coded, and `model` is rebound by the learnCV training cell
# further down whenever that cell is run.
model_path = '/content/model.txt'
model = lgb.Booster(model_file=model_path)

In [None]:
#@title params
# Settings shared by every training run; tuned hyper-parameters are layered on top below.
const_params={
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'metric': ['mape','mae'],
    'seed': 42,
    # NOTE(review): the trailing remark reports 4 threads as fastest, yet the value is -1 — confirm which is intended.
    'num_threads':-1,   # 5K dart rounds x3, 4 threads are the best, 30% faster than 8 or -1 and 8% faster than second fastest number score
    'device': 'gpu' if gpu_available else 'cpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'verbose': -1,
    # lgb.train in learnCV is called without num_boost_round, so this value sets the round budget.
    'num_iterations': 1000,               # num_iterations
    }
# Shallow copy so the updates below do not modify const_params.
model_params=const_params.copy()

# Hand-entered hyper-parameter values; the next cell may overwrite them with study.best_params.
model_params.update({
    'colsample_bytree': 0.8455589519488225,
    'learning_rate': 0.24539178594231917,
    'max_bin': 4376,
    'max_depth': 87,
    'min_child_samples': 107,
    'min_split_gain': 0.0001,
    'num_leaves': 30,
    'reg_alpha': 7.469125945743011e-08,
    'reg_lambda': 0.004114222327399969,
    'subsample': 0.7815644514881227,
    'subsample_freq': 2})

## !!! Remember to check n-estimators/num_boost_round
# NOTE(review): this silences every UserWarning for the rest of the session, and the import
# sits mid-notebook rather than in the import cell.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Overwrite the hand-entered hyper-parameters with the best ones found by the Optuna study.
# NOTE(review): needs `study` from the Optuna run cell; on a fresh kernel that cell has to run first.
model_params.update(study.best_params)

In [None]:
#@title Def learnCV
NAIVE_KEYS=['price_roll_mean_7','price_lag1','price_lag2','price_lag3','storePrice']
def learnCV(model_params,X,y,n_splits,stopping_rounds):
  """Train one LightGBM model per TimeSeriesSplit fold and gather the fold results.

  Args:
    model_params: parameter dict forwarded to `lgb.train`.
    X, y: features and target, split positionally with `.iloc`.
    n_splits: number of TimeSeriesSplit folds.
    stopping_rounds: patience of the early-stopping callback.

  Returns:
    `(model, parts)` where `model` is the booster trained on the last fold
    and `parts` is the tuple `(y_pred_train, y_train, y_pred_test, y_test,
    y_pred_naive)`, each concatenated over all folds in fold order.
  """
  pred_fit_parts, true_fit_parts = [], []
  pred_val_parts, true_val_parts = [], []
  naive_parts = []

  splitter = TimeSeriesSplit(n_splits=n_splits)
  for fit_idx, val_idx in splitter.split(X):
      X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
      X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

      fit_set = lgb.Dataset(X_fit, label=y_fit, free_raw_data=False)
      val_set = lgb.Dataset(X_val, label=y_val, free_raw_data=False)

      fold_callbacks = [
          lgb.log_evaluation(period=100),
          lgb.early_stopping(stopping_rounds=stopping_rounds),
      ]
      model = lgb.train(
          model_params,
          fit_set,
          valid_sets=[val_set],
          valid_names=['test'],
          callbacks=fold_callbacks,
      )

      # Predictions keep the index of the rows they belong to.
      pred_fit = pd.Series(model.predict(X_fit, num_iteration=model.best_iteration), index=X_fit.index)
      pred_val = pd.Series(model.predict(X_val, num_iteration=model.best_iteration), index=X_val.index)

      # Baseline prediction for the validation rows of this fold.
      naive_val = create_naive_prediction(X_val,keys=NAIVE_KEYS,verbose=True)

      pred_fit_parts.append(pred_fit)
      true_fit_parts.append(y_fit)
      pred_val_parts.append(pred_val)
      true_val_parts.append(y_val)
      naive_parts.append(naive_val)

  # Release the per-fold objects of the last iteration before concatenating.
  del X_fit, X_val, y_fit, y_val, fit_set, val_set, pred_fit, pred_val, naive_val
  gc.collect()

  collected = (pred_fit_parts, true_fit_parts, pred_val_parts, true_val_parts, naive_parts)
  return model, tuple(pd.concat(parts) for parts in collected)



In [None]:
# Free-text tag that get_model_postfix receives together with the test metrics.
additional_info=""

# Train with time-series CV; `model` is the booster of the last fold, `datasets` the
# concatenated per-fold predictions/targets returned by learnCV.
stopping_rounds = 100
model,datasets = learnCV(model_params,X, y, N_SPLITS,stopping_rounds)
learning_summary=get_learning_summary(*datasets, label="mae")
display(learning_summary)
y_pred_train_concat, y_train_concat, y_pred_test_concat, y_test_concat, y_pred_naive_concat= datasets

if not TEST_MODE:
  # Hold-out evaluation and model autosave only happen outside TEST_MODE.
  metrics_test = eval_model(model,X_test,y_test,"Test",NAIVE_KEYS)
  print('Test metrics: ')
  display(metrics_test)
  autosave_model(model,postfix=get_model_postfix(metrics_test,additional_info))

# Rows selected by get_last_index_TSS, kept as the "Valid" slice for the metric cells below.
valid_idx = get_last_index_TSS(X,N_SPLITS)
X_valid = X.iloc[valid_idx]
y_valid = y.iloc[valid_idx]

if not TEST_MODE:
  # `metrics_test` exists only in this branch; printing it unconditionally raised
  # NameError on a fresh kernel with TEST_MODE enabled.
  print(get_model_postfix(metrics_test,additional_info))

In [None]:
# Show the model's metrics on three slices: hold-out test, the "Valid" slice and the full (X, y).
# NOTE(review): `metrics_test` is only assigned in the training cell when TEST_MODE is false
# (or by the cell below) — on a fresh kernel in TEST_MODE this first line fails.
display(metrics_test)
metrics_valid = eval_model(model,X_valid,y_valid,"Valid",NAIVE_KEYS)
display(metrics_valid)
metrics_train = eval_model(model,X,y,"Train",NAIVE_KEYS)
display(metrics_train)

In [None]:
# (Re)compute the hold-out test metrics for the current `model`, regardless of TEST_MODE.
metrics_test = eval_model(model,X_test,y_test,"Test",NAIVE_KEYS)
display(metrics_test)

In [None]:
# Show the feature importances of `model` using importance_type='gain'.
# NOTE(review): `trim` and the positional True are interpreted by print_importances (not shown here) — TODO confirm their meaning.
trim=0
print_importances(model,trim,True,figsize=(10, 18),importance_type='gain')

In [None]:
#@title Analyzer

class Analyzer():
  """Per-item error analysis of a fitted model on a feature frame and its target.

  On construction the class builds:
    * `ds`          – selected feature columns plus `target`, `pred`, `itemAE`, `itemAPE`;
    * `grouped`     – per-item aggregates (grouped by the index level 'name');
    * `item_counts` – number of rows per item, and `name_list` – the item names.

  X (and y) must carry an index level called 'name'; it is used for grouping
  and for selecting a single item.
  """
  def __init__(self,X,y,model=None,show=False):
    """Store the inputs and immediately build all derived tables.

    Args:
      X: feature DataFrame containing at least `selected_columns`.
      y: target aligned with X by index.
      model: fitted estimator exposing `predict(X)`; required.
      show: when True, print shapes and display the per-item row counts.
    """
    self.X = X
    self.y = y
    self.model = model
    self.selected_columns = ['volume', 'playerCount',
                    'supplyTotalEstimated', 'storePrice',
                    'itemType', 'itemCollection']
    self.create_dataset()
    self.group_data()
    self.add_info(show=show)

  def create_dataset(self):
    """Build `self.ds` with targets, predictions and per-row absolute errors.

    Raises:
      ValueError: if no model was given to the constructor.
    """
    if self.model is None:
      raise ValueError("Analyzer needs a fitted model to compute predictions; pass it as `model`.")
    self.ds = self.X[self.selected_columns].copy()
    self.ds['target'] = self.y
    # Use the instance's own model; the notebook-global `model` was used here before,
    # so the constructor argument was silently ignored.
    self.ds['pred'] = self.model.predict(self.X)

    self.ds['itemAE'] = (self.ds['target'] - self.ds['pred']).abs()
    self.ds['itemAPE'] = self.ds['itemAE'] / self.ds['target'].abs()

  def group_data(self):
    """Aggregate `self.ds` per item name into `self.grouped`."""
    self.grouped = self.ds.groupby('name').agg(
      itemType=('itemType', 'first'),
      itemCollection=('itemCollection', 'first'),

      volume=('volume', 'mean'),
      playerCount=('playerCount', 'mean'),
      supplyTotalEstimated=('supplyTotalEstimated', 'mean'),
      storePrice=('storePrice', 'mean'),

      itemMAE=('itemAE', 'mean'),
      itemMedAE=('itemAE', 'median'),
      itemMAPE=('itemAPE', 'mean'),
      size=('target', 'size')
    )

  def add_info(self,show=False):
    """Compute row counts per item and the list of item names; optionally display them."""
    self.item_counts = self.ds.groupby(['name']).size().reset_index(name='count')
    # The names, not the counts (which stay available in `item_counts['count']`).
    self.name_list = self.item_counts['name'].tolist()

    if show:
      print(f"X shape: {self.X.shape} y shape: {self.y.shape}")
      display(self.item_counts)

  def get_item_data(self,item_name):
    """Return the rows of `self.ds` for one item, or None (with a message) if it is absent."""
    if item_name not in self.ds.index.get_level_values('name'):
      print(f"{item_name} is not in the given data")
      return None
    return self.ds.xs(item_name, level='name', drop_level=True)

  def analyze(self):
    """Placeholder: currently only binds the dataset to a local name and returns None."""
    ds = self.ds

  def plot(self,item_name,plot_name=None):
    """Plot predicted vs. target series of one item via `plot_pricehistory`.

    Does nothing (besides the message from `get_item_data`) when the item is absent.
    """
    if plot_name is None:
      plot_name = f"plot {item_name}"
    item_data = self.get_item_data(item_name)
    if item_data is None:
      return
    plot_pricehistory([item_data['pred'],item_data['target']],['pred','target'],plot_name)


In [None]:
# Error analysis of `model` on the training/CV data (X, y).
analyzer = Analyzer(X,y,model,show=False)

In [None]:
# Same analysis on the hold-out data (X_test, y_test).
analyzer_test = Analyzer(X_test,y_test,model,show=False)

In [None]:
# Quick check of the hold-out set size (rows, columns).
X_test.shape

In [None]:
# Predicted vs. target series for one item, first on the training/CV data, then on the hold-out data.
item_name="Whiteout Kilt"
analyzer.plot(item_name)
analyzer_test.plot(item_name)


In [None]:
# Same pair of plots for a second item.
item_name="Forest Raiders Pants"
analyzer.plot(item_name)
analyzer_test.plot(item_name)


In [None]:
# Per-row analysis tables for two items, each from the training/CV analyzer and the hold-out analyzer.
# get_item_data prints a message and returns None when an item is missing from the data.
display(analyzer.get_item_data("Forest Raiders Pants"))
display(analyzer_test.get_item_data("Forest Raiders Pants"))
display(analyzer.get_item_data("Blackout Kilt"))
display(analyzer_test.get_item_data("Blackout Kilt"))

In [None]:
# Final cell: calls google.colab's runtime.unassign() once everything above has run.
# NOTE(review): presumably this releases the Colab VM so it stops consuming quota — anything not saved to Drive would be lost; confirm.
from google.colab import runtime
runtime.unassign()