In [None]:
# Goal: Build first model(s) for selected time-series data, improved after feedback
# 
# Version history (only major changes):
# 2023-10-01: (v 014) Added secondary features
# 2023-10-01: (v 012) After-feedback improvements
# 2023-09-25: (v 010) Initial creation from template v1.0

# Part 1: System checks, imports

## Jupyter-related magic

In [None]:
# Enable auto-reload of imported modules.
# Mode 2 reloads all (non-excluded) modules before each cell is executed, so
# edits to local .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## System info

In [None]:
# Get basic info about current system (shell commands; output is informational only):
#   nvidia-smi   - GPU / driver status
#   hostname     - machine name
#   uname -a     - kernel / OS details
#   df -kh /tmp  - disk usage of the filesystem that holds /tmp
!nvidia-smi
!hostname
!uname -a
!df -kh /tmp

In [None]:
# Check location and version of python
# NOTE(review): this reports the `python` found on the shell PATH, which is
# presumably the interpreter running this kernel - compare with sys.executable if in doubt.
!which python
!python -V

In [None]:
# Dump version of important packages - TODO
# Keeps only the `pip list` lines matching (case-insensitively) catb / scikit / nump / pand.
# NOTE(review): torch and tsfresh are also used by this notebook but are not matched by this filter.
!python -m pip list | grep -E -i "catb|scikit|nump|pand"

## Set main dir, check Colab

In [None]:
# Autodetect Google Colab
# The notebook is considered to run on Colab when the `google.colab` package is importable.
# Only the import failure is treated as "not Colab": a failing drive.mount() on Colab
# is a real error and is allowed to propagate instead of silently switching to local mode.
PATH_MOUNT = "/content/drive"  # Mount point for Google Drive (used on Colab only)
try:
    from google.colab import drive
except ImportError:
    TO_USE_COLAB = False
else:
    drive.mount(PATH_MOUNT)
    TO_USE_COLAB = True

print(f"{TO_USE_COLAB=}")

In [None]:
# Set main directory (root for all other paths)
import os

# Colab: project folder on the mounted Drive; otherwise: the current working directory.
PATH_MAIN_DIR = (
    f"{PATH_MOUNT}/MyDrive/<path-to-project TODO>"
    if TO_USE_COLAB
    else os.path.abspath(".")
)
# Stop right here if the root directory does not exist
assert os.path.isdir(PATH_MAIN_DIR)

print(f"Successfully checked: {PATH_MAIN_DIR=}")

## Imports

In [None]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import sys
import time
import torch

In [None]:
# Local imports
#sys.path.append(os.path.join(PATH_MAIN_DIR, '../src'))

# TODO
# import st_utils as stu
# import st_data_handlers as stdh
#import df_utils as dfu

# Part 2: Settings and switches

## Settings: data files, columns

In [None]:
# Paths
# Resolve the input data location relative to PATH_MAIN_DIR and validate it;
# each assert stops the notebook early if the directory / file is missing.

print(f"{PATH_MAIN_DIR=}")  # Already set and checked above

# Source data directory: "_data" folder one level above the main directory
DIR_DATA_SRC = os.path.join(PATH_MAIN_DIR, r'../_data/')
assert os.path.isdir(DIR_DATA_SRC)
print(f"Successfully checked: {DIR_DATA_SRC=}")

# Single CSV file with the source data (read with pd.read_csv in Part 4)
FNAME = os.path.join(DIR_DATA_SRC, '__2023-09-25T155258_transformed_data.csv')
assert os.path.isfile(FNAME)
print(f"Successfully checked: {FNAME=}")

In [None]:
# --- Original (non-percentage) domain ---
COL__ORIG__MAIN_CC = "PALUM"  # Main (target) time-series column (Commodity Code)
COLS__ORIG__OTHER_CC = ["PCOAL"]  # Additional feature columns (Commodity Code, etc.)
# Every column that is kept from the source file: target first, then the others
COLS__ORIG__ALL_GOOD = [COL__ORIG__MAIN_CC, *COLS__ORIG__OTHER_CC]

# --- Percentage domain ---
SUFFIX__PCT = "_pct"  # Appended to a column name to mark its percentage-change version
COL__PCT__MAIN_CC = f"{COL__ORIG__MAIN_CC}{SUFFIX__PCT}"

In [None]:
# Echo the column configuration, so it is visible in the saved notebook output
print(f"Columns in original domain: {COLS__ORIG__ALL_GOOD}, main column: '{COL__ORIG__MAIN_CC}'")
print(f"Main column in percentage domain: '{COL__PCT__MAIN_CC}'")

In [None]:
# Size of sliding window for test set:
# the last TEST_WINDOW_SIZE rows of the final dataframe are used as test points,
# each of them predicted one at a time in the train-test cycle.
TEST_WINDOW_SIZE = 24

## Settings: RANDOM_SEED, switchers

In [None]:
# This block contains "active" settings that control notebook execution.

# Initial random state, to be used in init_seeds, etc.
RANDOM_SEED = 42


# Settings for "fast-check" mode
# If switched on, the notebook is supposed to run very fast (for example, <1 minute) to reveal obvious bugs.
# NOTE(review): in the visible cells the flag is only recorded in LAUNCH_TAG; the row-dropping code is still commented out.
#IS_FAST_CHECK = True
IS_FAST_CHECK = False

# For IS_FAST_CHECK mode: randomly drop most rows (without shuffling). Different train/test sizes are vital for debugging.
# TODO-case1: multiple files
# N_ROWS_IN_FAST_MODE_FOR_TRAIN_TEST = (1001, 999)
# TODO-case2: single file
# N_ROWS_IN_FAST_MODE = 1000


# Section switchers
# NOTE(review): DO_XXX looks like a template placeholder; it is only recorded in LAUNCH_TAG.
DO_XXX = True


In [None]:
# Summarize major launch details in one string (examples: seed, data details, switchers, etc.)

# Ordered (key, value) fields; rendered as "key=value" and joined with ";"
launch_tag_fields = [
    ("FC", int(IS_FAST_CHECK)),
    ("seed", RANDOM_SEED),
    ("XXX", int(DO_XXX)),
    ("test_window", TEST_WINDOW_SIZE),
]
LAUNCH_TAG = ";".join(f"{key}={value}" for key, value in launch_tag_fields)

print(f"{LAUNCH_TAG=}")

# Part 3: Function definitions, start

In [None]:
# This part should contain only function definitions 

## Defs: Init seeds

In [None]:
# More info: https://pytorch.org/docs/stable/notes/randomness.html
def init_seeds(seed=42):
    """Seed all random number generators used by this notebook.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and,
    when CUDA is available, every GPU), stores the seed in the
    PYTHONHASHSEED environment variable and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed shared by all generators.

    Returns:
        None.
    """
    # Exported for child processes.
    # NOTE(review): presumably has no effect on string hashing of the already running interpreter.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # CPU-side generators
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # torch.use_deterministic_algorithms(True)   # Raises a CUBLAS error on some cases
    # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # Does not help for the error above

    # Nothing GPU-related to seed without CUDA
    if not torch.cuda.is_available():
        return

    # GPU-side generators (current device, then all devices)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN auto-tuning for reproducible kernel selection
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

## Everything is ready - go!

In [None]:
# Seed all random generators once, before any data processing or model training
print(f"Initialize modules with {RANDOM_SEED=}")
init_seeds(RANDOM_SEED)

In [None]:
# Start the launch timer
# (the elapsed time is reported by the last cell of the notebook)
print(f"{LAUNCH_TAG=}")
glob__start_time = time.time()  # Wall-clock start, seconds since the epoch

# Part 4: Data load and domain transformation

## Data: Do load

In [None]:
# Multiple files case
# (template code, kept for reference - not used in this notebook)
# df_train_X = pd.read_csv(FNAME_TRAIN_X, index_col=False)
# df_train_y = pd.read_csv(FNAME_TRAIN_Y, index_col=False)
# df_test_X = pd.read_csv(FNAME_TEST_X, index_col=False)
# df_test_y = pd.read_csv(FNAME_TEST_Y, index_col=False)
# print(df_train_X.shape, df_train_y.shape, df_test_X.shape, df_test_y.shape)

# if IS_FAST_CHECK:
#     dfu.drop_some_rows_inplace(df_train_X, df_train_y, n_final_rows=N_ROWS_IN_FAST_MODE_FOR_TRAIN_TEST[0], seed=RANDOM_SEED)
#     dfu.drop_some_rows_inplace(df_test_X, df_test_y, n_final_rows=N_ROWS_IN_FAST_MODE_FOR_TRAIN_TEST[1], seed=RANDOM_SEED)

# Single files case
# The first CSV column is used as the dataframe index
df_src = pd.read_csv(FNAME, index_col=0)
print(df_src.shape)

# NOTE(review): fast-check mode is not wired in yet; the frame loaded here is `df_src`
# (the template originally referred to `df_main`, which does not exist in this notebook).
# if IS_FAST_CHECK:
#     dfu.drop_some_rows_inplace(df_src, n_final_rows=N_ROWS_IN_FAST_MODE, seed=RANDOM_SEED)

In [None]:
# Preview the raw data exactly as loaded from the CSV file
df_src

In [None]:
# Leave only required columns
# .copy() gives an independent frame, so the columns added to `df` below never touch `df_src`
df = df_src[COLS__ORIG__ALL_GOOD].copy()
df

## Data: plot and transformation from original domain to percentage domain

In [None]:
# Plot main (target) column in original domain
df[COL__ORIG__MAIN_CC].plot(title=COL__ORIG__MAIN_CC, grid=True)

In [None]:
# Calculate columns in percentage domain:
# every column currently in df gets a "<name><SUFFIX__PCT>" companion holding
# its row-to-row relative change (the first row of each new column is NaN).
cols_orig = df.columns.to_list()
cols_pct_lag0 = []
for orig_name in cols_orig:
    pct_name = orig_name + SUFFIX__PCT
    # Guards against overwriting a column (e.g. when this cell is executed twice)
    assert pct_name not in df.columns
    df[pct_name] = df[orig_name].pct_change()
    cols_pct_lag0 += [pct_name]

cols_pct_lag0

In [None]:
# Plot main (target) column in percentage domain
# (row-to-row relative change of the main column, computed with pct_change above)
df[COL__PCT__MAIN_CC].plot(title=COL__PCT__MAIN_CC, grid=True)

In [None]:
# Preview df after adding the percentage-domain columns
df

In [None]:
# Column groups so far: original-domain columns and their percentage-domain (lag 0) companions
print(f"{cols_orig=}")
print(f"{cols_pct_lag0=}")

# Part 5: feature engineering

## Add lag features for pct columns

In [None]:
%%time
lags = [1, 2, 3, 4, 5]  #, 6, 7, 8, 9, 10]

cols_pct_lagged = []
for col in cols_pct_lag0:
    print(f"{col=}")
    for lag in lags:
        col_name = f"{col}_(t-{lag})"
        df[col_name] = df[col].shift(lag)
        cols_pct_lagged.append(col_name)

df = df.copy()  # defragment dataframe
df

In [None]:
# Print all types of columns:
# original domain, percentage domain (lag 0) and lagged percentage columns
print(f"{cols_orig=}")
print(f"{cols_pct_lag0=}")
print(f"{cols_pct_lagged=}")

In [None]:
# Set feature + target columns
# feature_cols is a COPY of the lagged-column list: it is extended and filtered
# later on, and those changes must not leak into cols_pct_lagged through a shared list object.
feature_cols = list(cols_pct_lagged)  # We should use features ONLY with lag > 0
target_col = COL__PCT__MAIN_CC  # Target: main column in percentage domain (lag 0)

## Drop NA rows

In [None]:
# Calculate expected number of rows after NA removal:
#  - pct_change() leaves NaN in the first row of every percentage column (1 row),
#  - the largest lag shifts those columns down by max(lags) further rows.
# NOTE(review): this presumes the source columns themselves contain no NaN - confirm for new data.
old_len = len(df)
expected_len = old_len - 1 - max(lags)

# Do drop na and check
# NOTE: the drop is in-place, so this cell is not idempotent - executing it a second
# time on its own fails the assert (nothing is left to drop, but rows are still "expected" to go).
df.dropna(axis='rows', inplace=True)
assert len(df) == expected_len, f"{len(df)=} vs {expected_len}"

In [None]:
# Preview df after the rows with NA values were dropped
df

## Add tsfresh features

In [None]:
# Goal: rename columns to remove "__" (unsupported by TSFresh)
import re
def replace_underscore(s: str):
    """Return `s` with every run of consecutive underscores collapsed into a single "_".

    Single underscores and all other characters are left untouched.
    """
    underscore_run = r"_{2,}"  # Two or more underscores in a row
    return re.sub(underscore_run, "_", s)

In [None]:
from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh import extract_features

def get_tsfresh_x_y(df_X: pd.DataFrame, df_y: pd.DataFrame, cols: list[str],
                    window_size: int = 7 * 24  # 1 week
                   ):
    """Build rolling-window tsfresh features and the labels aligned with them.

    The selected columns are treated as a single time series (constant id,
    time = row position) and rolled with
    ``min_timeshift == max_timeshift == window_size``; tsfresh features are
    then extracted for every rolled window. The result has
    ``len(df_X) - window_size`` rows (checked by an assert against the labels).

    Args:
        df_X: Source frame; rows must already be in chronological order.
        df_y: Target values aligned row-by-row with ``df_X`` (the caller in
            this notebook passes a Series).
        cols: Columns of ``df_X`` to extract features from. Names containing
            "__" are renamed (runs of underscores collapsed to one).
        window_size: Time shift used for rolling, in rows. Must be >= 0 and
            smaller than ``len(df_X)``.

    Returns:
        Tuple ``(df_features, df_labels)`` of equal length.
        ``df_labels[i]`` holds the value of ``df_y`` at position
        ``i + window_size``. NOTE: the labels are aligned by POSITION only -
        their index labels are those of the first rows of ``df_y``.

    Raises:
        AssertionError: On an invalid ``window_size``, a too small dataset,
            pre-existing "id"/"time" columns or a features/labels length mismatch.
    """

    assert window_size >= 0, "window_size must be non-negative"
    assert len(df_X) > window_size, "Too small dataset, tricky exceptions are possible!"

    df_X2 = df_X[cols].copy()

    # Rename columns to remove "__" (unsupported by TSFresh)
    rename_map = {c: replace_underscore(c) for c in cols if "__" in c}
    print(f"DBG: renaming map: {rename_map}")
    df_X2 = df_X2.rename(columns=rename_map)

    # Generate fake "id" (required for TSFresh)
    assert "id" not in df_X2.columns
    df_X2["id"] = 1  # Fake id

    # Generate fake "time" (required for TSFresh)
    assert "time" not in df_X2.columns
    df_X2["time"] = range(len(df_X2))

    # Generate tsfresh features (rather magical code):
    # roll the series into windows, then compute one feature row per window
    df_rolled = roll_time_series(df_X2, column_id="id", column_sort="time", min_timeshift=window_size, max_timeshift=window_size)
    df_features = extract_features(df_rolled, column_id="id", column_sort="time")

    # Prepare labels that are aligned with the features.
    # An explicit stop position is used instead of `[:-window_size]`, which
    # would return an empty result for window_size == 0.
    n_labels = len(df_y) - window_size
    df_labels = df_y.shift(-window_size).iloc[:n_labels]
    assert len(df_labels) == len(df_features)

    return df_features, df_labels

In [None]:
%%time
# Extract tsfresh features from the lagged feature columns.
# NOTE(review): window_size=7 is a magic number here (the function default is 7 * 24) - consider a named setting.
# Only the shape of df_tsf_labels is used below; the target is still taken from df[target_col].
df_tsf_features, df_tsf_labels = get_tsfresh_x_y(df, df[target_col], cols=feature_cols, window_size=7)
print(df_tsf_features.shape, df_tsf_labels.shape)

In [None]:
# Cut df to align with tsfresh rows:
# keep only the LAST len(df_tsf_features) rows of df, so both frames have the same number of rows.
# NOTE(review): presumably the dropped leading rows are those without a complete rolling window - see get_tsfresh_x_y.
print(f"Before: {df.shape}")

df = df.iloc[-len(df_tsf_features):, :]

print(f"After: {df.shape}")

In [None]:
# Append tsf_features to df
print(f"Before: {df.shape}")

assert set(df.columns) & set(df_tsf_features.columns) == set(), "Column conflict detected!"
# The tsfresh frame has its own index; re-index it with df's index so that the
# column-wise concat matches rows one-to-one (positions were aligned before this cell).
df = pd.concat([df, df_tsf_features.set_index(df.index)], axis="columns")  # Note: ignore_index=True will NOT work here (!)

# Rebind instead of `+=`: an in-place extend would also modify any other name
# bound to the same list object (e.g. cols_pct_lagged).
feature_cols = feature_cols + df_tsf_features.columns.to_list()
print(f"After: {df.shape}")

## Drop NA feature columns

In [None]:
# This NA removal may be skipped for some models, supporting NA values in features

In [None]:
print(f"Before: {len(feature_cols)=}")
# Keep only the feature columns without a single NA value (original order is preserved)
is_na_free = df[feature_cols].notna().all(axis="index")
feature_cols = is_na_free[is_na_free].index.to_list()
print(f"After: {len(feature_cols)=}")

## Remove trivial columns

In [None]:
%%time
print(f"Before: {len(feature_cols)=}")
for f in feature_cols.copy():
    val_counts = df[f].value_counts()
    if len(val_counts) == 1:
        print(f"Trivial feature removed:{f}")
        feature_cols.remove(f)
print(f"After: {len(feature_cols)=}")                         

In [None]:
# Check there are no duplicates in features
# (a repeated name would make df[feature_cols] contain the same column more than once)
assert len(set(feature_cols)) == len(feature_cols), "Duplicates detected!"

# Do train-test cycles (sliding window)

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from tqdm.notebook import tqdm

In [None]:
%%time

# Example of sliding window approach:
# df=[0, 1, 2, 3, 4] (len=5), TEST_WINDOW_SIZE = 2
# Expected train/test sets: 
#  1) [0, 1, 2], [3]
#  2) [0, 1, 2, 3], [4]

# Calculate possible values of test indices
test_range = range(len(df) - TEST_WINDOW_SIZE, len(df))  # (5 - 2, 5) -> [3, 4]

y_trues = []
y_preds = []
for i, i_test in enumerate(tqdm(list(test_range))):
    
    # Split into train-test
    train_data = df.iloc[0:i_test]
    test_data = df.iloc[i_test:i_test+1]

    #Separate features and target
    X_train = train_data[feature_cols]
    y_train = train_data[target_col]

    X_test = test_data[feature_cols]
    y_test = test_data[target_col]
    idx_test = test_data.index
    
    # Initialize and fit a model
    
    # Model 1: trivial prediction (last value from train set)
#     model = None
#     y_pred = train_data[COL__PCT__MAIN_CC].iloc[-1:]
    
#     # Model 2: linear regression model
#     model = LinearRegression()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
    
#     # Model 3: RF
    model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Model 4: CB
#     model = CatBoostRegressor(random_state=RANDOM_SEED, verbose=False)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
    
    # Convert prediction to scalar (to be sure)
    assert len(y_pred) == 1
    y_pred = y_pred[0]
    
    # Convert prediction from pct-domain to original domain
    prev_orig_value = train_data[COL__ORIG__MAIN_CC].iloc[-1:][0]  # Convert to scalar
    y_pred_orig = prev_orig_value * (1 + float(y_pred))
    y_true_orig = test_data[COL__ORIG__MAIN_CC].iloc[0]
    
    y_trues.append(y_true_orig)
    y_preds.append(y_pred_orig)

    # Calculate temp MAPE (for debug info)
    #mae_cur = mean_absolute_error([y_true_orig], [y_pred_orig])
    pred_err = y_pred_orig - y_true_orig
    mape_cur = mean_absolute_percentage_error([y_true_orig], [y_pred_orig])
    mape_avg = mean_absolute_percentage_error(y_trues, y_preds)
    print(f"{i=}, {idx_test=}, {pred_err=:.3f}, {mape_cur=:.3f}, {mape_avg=:.3f}")
    
# Calculate the average MAPE for the whole test window
mape = mean_absolute_percentage_error(y_trues, y_preds)
print(f"Average MAPE: {mape:.5f}")    

In [None]:
# VERSION 2 (fixed data leak from PCOAL lag0 feature)
# Average MAPE: 0.05870 - trivial prediction (take last value)
# Average MAPE: 0.05724 - LR
# Average MAPE: 0.05115 - RF
# Average MAPE: 0.05334 - CB

In [None]:
# Show the model left over from the LAST iteration of the train-test cycle
model

# Plot model importance (WARN: for last model only!)

In [None]:
# Bar chart of feature importances, most important first (model of the last iteration only)
if hasattr(model, 'feature_importances_'):
    
    # Prefer the feature names recorded by the estimator; not every estimator with
    # `feature_importances_` exposes `feature_names_in_`, so fall back to the
    # columns the model was trained on.
    importance_names = getattr(model, 'feature_names_in_', None)
    if importance_names is None:
        importance_names = feature_cols
    assert len(importance_names) == len(model.feature_importances_), "Feature names do not match importances!"

    feature_importance_tuples = [(k, v) for k, v in zip(importance_names, model.feature_importances_)]
    sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
    sorted_feature_names, sorted_importances = zip(*sorted_feature_importance_tuples)

    plt.xticks(rotation='vertical')
    plt.bar(x=sorted_feature_names, height=sorted_importances)
    plt.title("Feature imporatances")
else:
    print("No feature importances found")

# Plot results (for whole test period)

In [None]:
# Draw predictions and ground truth on a single chart
# (original domain; x axis = position inside the test window)
plt.plot(y_preds, "bo-", label="Pred")
plt.plot(y_trues, "gs--", label="True")
plt.legend()
plt.grid(True)

In [None]:
# Draw predictions vs ground truth
plt.scatter(x=y_preds, y=y_trues)

# Draw diagonal (perfect predictions lie on it).
# The bounds are the smallest / largest VALUE over both lists; min(list_a, list_b)
# would compare the lists themselves and return a whole list instead of a number.
val_min = min(min(y_preds), min(y_trues))
val_max = max(max(y_preds), max(y_trues))
plt.plot([val_min, val_max], [val_min, val_max], linestyle='-', color='lightblue', label='Diagonal')

plt.grid(True)
plt.gca().set_xlabel("Predictions")
plt.gca().set_ylabel("Ground truth")

# Finalize notebook

In [None]:
# Report wall-clock time since the launch timer was started (glob__start_time)
print(f"Elapsed notebook seconds: {time.time() - glob__start_time:.1f}")