# Notebook for training predictive models
### Import packages

In [1]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, BatchNormalization, Dropout, Reshape, LSTM
from tensorflow.keras.models import Model, load_model
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import date

import tensorflow as tf
import pandas as pd
import numpy as np
import random
import glob
import os

from predicted_sales_utils import split_match_ids, get_next_model_filename, euclidean_distance_loss, total_error_loss, define_regularizers, embedding_config, prepare_EL_input_data, prepare_LSTM_input_data, create_embeddings, smooth_predictions_xy, run_model, evaluate_model, print_column_variance
from predicted_sales_utils import prepare_df, add_can_be_sequentialized, extract_variables, load_tf_model

2024-04-23 19:53:53.105930: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Global variables

In [2]:
# Feature column lists (empty placeholders; the training cells assign the real lists)
numerical_cols = list()
categorical_cols = list()

# Hyperparameters read by the training functions
n_epochs, batch_size = 1, 32
sequence_length = 3    # Number of timesteps per LSTM input sequence

### Load frames

In [3]:
# Read data from parquet and perform basic filtering
def load_data(path='Predicted-sales.parquet'):
    """Read the monthly sales table from parquet and drop the 'Övrigt' rows.

    Args:
        path: Location of the parquet file. Defaults to the file the notebook
            has always used, so existing ``load_data()`` calls are unchanged.

    Returns:
        DataFrame without the rows whose 'Product' or 'Machine Sub Group'
        equals 'Övrigt'. The original index labels are kept.
    """
    sales_per_month_df = pd.read_parquet(path)

    # A row is kept only if neither column carries the 'Övrigt' label
    keep_rows = (
        (sales_per_month_df['Product'] != 'Övrigt')
        & (sales_per_month_df['Machine Sub Group'] != 'Övrigt')
    )

    return sales_per_month_df[keep_rows]

# Create a train, val, and test split based on all 'Machine ID's
def train_test_val_split_by_machine_id(df, test_size=0.2, val_size=0.2, random_state=42):
    """Split ``df`` into train/val/test frames by 'Machine ID'.

    The unique machine IDs are split, not the rows, so all rows of one machine
    end up in the same frame.

    Args:
        df: DataFrame with a 'Machine ID' column.
        test_size: Fraction of the machine IDs assigned to the test set.
        val_size: Fraction of the machine IDs assigned to the validation set.
        random_state: Seed forwarded to both ``train_test_split`` calls.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` -- note that validation comes
        before test.
    """
    machine_ids = df['Machine ID'].unique()

    # First carve off the combined test+val share of the machines
    holdout_fraction = test_size + val_size
    train_ids, holdout_ids = train_test_split(
        machine_ids, test_size=holdout_fraction, random_state=random_state
    )

    # train_test_split returns (remainder, test_size share): the second output
    # therefore receives the validation share of the held-out machines
    test_ids, val_ids = train_test_split(
        holdout_ids, test_size=val_size / holdout_fraction, random_state=random_state
    )

    # Select the rows belonging to each group of machines
    machine_col = df['Machine ID']
    train_df = df[machine_col.isin(train_ids)]
    test_df = df[machine_col.isin(test_ids)]
    val_df = df[machine_col.isin(val_ids)]

    return train_df, val_df, test_df

# Load the data (load_data already drops 'Övrigt' products and machine sub groups)
sales_per_month_df = load_data()

# Keep a single product and drop machines in the 'Övrigt' machine group.
# The equality filter on 'Product' also excludes 'Övrigt' products, so no
# separate check for that is needed here.
keep_rows = (
    (sales_per_month_df['Machine Group'] != 'Övrigt')
    & (sales_per_month_df['Product'] == 'Cloetta Kexchoklad')
)

# Take an explicit copy before adding a column: assigning onto a filtered
# slice triggers pandas' SettingWithCopyWarning
sales_per_month_df = sales_per_month_df.loc[keep_rows].copy()
sales_per_month_df['Product Name'] = sales_per_month_df['Product']

# Create the three DataFrames, split by 'Machine ID'
train_df, val_df, test_df = train_test_val_split_by_machine_id(sales_per_month_df)

In [4]:
# Display the column names available in the training split
train_df.columns

Index(['Product', 'Machine ID', 'Timestep', 'Sales (kr)', 'Price',
       'Number of Sales', 'Category', 'Year', 'Month', 'Sales Next Month (kr)',
       'New Product', 'Total Sales (kr)', 'Dryck Sales (kr)', 'Mat Sales (kr)',
       'Snacks Sales (kr)', 'Sport Sales (kr)', 'Bil Sales (kr)',
       'Naive Static (kr)', 'Error Naive Static (%)', 'Naive Static Plus (kr)',
       'Error Naive Static Plus (%)', 'Naive Sequential (kr)',
       'Error Naive Sequential (%)', 'Naive Sequential Plus (kr)',
       'Error Naive Sequential Plus (%)', 'Machine Group', 'Machine Sub Group',
       'Machine Model', 'Average Increase', 'Product Name'],
      dtype='object')

## Predictive model 1
### NN with Embedding layers
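Row-based model: each sample is a single feature vector (embedded categorical features concatenated with the numerical features).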

In [5]:
# Define the architecture of the neural network model with embeddings layers
def define_NN_model(numerical_input_shape, categorical_cols, l1=0, l2=0):
    """Build the (uncompiled) dense network with one embedding branch per categorical column.

    Args:
        numerical_input_shape: Number of numerical features per sample.
        categorical_cols: Categorical column names; each must be a key of
            ``embedding_config`` providing 'n_categories' and 'output_dim'.
        l1, l2: Passed to ``define_regularizers`` to build the kernel
            regularizer shared by the two hidden Dense layers.

    Returns:
        A Keras ``Model`` whose inputs are the categorical inputs (in the order
        of ``categorical_cols``) followed by 'numerical_input', and whose
        output is the single-unit 'output_layer'.
    """
    def _embedded_branch(col):
        # Layer names must not contain spaces, so use underscores instead
        slug = col.replace(" ", "_")
        branch_input = Input(shape=(1,), name=f'input_{slug}')
        cfg = embedding_config[col]
        embedded = Embedding(
            input_dim=cfg['n_categories'],
            output_dim=cfg['output_dim'],
            input_length=1,
            name=f'embedding_{slug}'
        )(branch_input)
        return branch_input, Flatten()(embedded)

    # One (input, flattened embedding) pair per categorical feature
    branches = [_embedded_branch(col) for col in categorical_cols]
    categorical_inputs = [branch_input for branch_input, _ in branches]
    categorical_flats = [flat for _, flat in branches]

    # Input for the numerical features
    numerical_input = Input(shape=(numerical_input_shape,), name='numerical_input')

    # Join the embeddings with the numerical features; without categorical
    # columns the numerical input feeds the dense layers directly
    if categorical_flats:
        features = Concatenate()(categorical_flats + [numerical_input])
    else:
        features = numerical_input

    # Two hidden layers sharing one regularizer, then the single-value output
    regularizer = define_regularizers(l1, l2)
    hidden = features
    for units in (64, 32):
        hidden = Dense(units, activation='relu', kernel_regularizer=regularizer)(hidden)
    prediction = Dense(1, name='output_layer')(hidden)  # Output layer 'Sales Next Month (kr)'

    return Model(inputs=categorical_inputs + [numerical_input], outputs=prediction)

# Train NN model
def train_NN_model(train_df, val_df, numerical_cols, categorical_cols, l1=0, l2=0, special_text=None):
    """Train the embedding-based dense network, save it, and write a text log next to it.

    The number of epochs and the batch size are read from the notebook-level
    ``n_epochs`` and ``batch_size`` variables at call time.

    Args:
        train_df: DataFrame with the training rows.
        val_df: DataFrame with the validation rows.
        numerical_cols: Names of the numerical feature columns.
        categorical_cols: Names of the categorical feature columns.
        l1, l2: Forwarded to ``define_NN_model`` for the kernel regularizer.
        special_text: Optional free-text note added to the log header.

    Returns:
        None. The model is saved under the name returned by
        ``get_next_model_filename("pSales_NN")`` and the log is written to the
        same path with a ``.txt`` extension.
    """
    # Prepare inputs
    X_train_input, y_train = prepare_EL_input_data(train_df, numerical_cols, categorical_cols)
    X_val_input, y_val = prepare_EL_input_data(val_df, numerical_cols, categorical_cols)

    # Define the model
    model = define_NN_model(len(numerical_cols), categorical_cols, l1, l2)

    # Compile the model
    model.compile(optimizer='adam', loss='mean_absolute_error')

    # Train the model
    history = model.fit(X_train_input, y_train, validation_data=(X_val_input, y_val), epochs=n_epochs, batch_size=batch_size, verbose=2)

    # Save the trained model to disk
    model_filename = get_next_model_filename("pSales_NN")
    model.save(model_filename)

    # Generate the corresponding txt filename
    output_txt_filename = os.path.splitext(model_filename)[0] + ".txt"

    # Write the log as UTF-8 explicitly: column names and notes may contain
    # non-ASCII (Swedish) characters, and the platform default encoding is not
    # guaranteed to be able to represent them
    with open(output_txt_filename, 'w', encoding='utf-8') as f:
        # Write some general info at the beginning of the file
        today_date = date.today().strftime("%Y-%m-%d")
        f.write(f"{today_date}\n")
        f.write(f"epochs={n_epochs}\n")
        f.write(f"numerical_cols={numerical_cols}\n")
        f.write(f"categorical_cols={categorical_cols}\n")
        # Regularisation strengths are only logged when they are non-zero
        if l1 != 0: f.write(f"l1={l1}\n")
        if l2 != 0: f.write(f"l2={l2}\n")
        if special_text: f.write(f"{special_text}\n")

        # Write the training results, one line per tracked metric
        f.write("\nTraining results:\n")
        for key, value in history.history.items():
            rounded_values = [round(v, 2) for v in value]
            f.write(f"{key}: {rounded_values}\n")

In [6]:
# Train the NN model with embedding layers
# numerical_cols = ['Price', 'Number of Sales', 'Category', 'New Product', 'Total Sales (kr)', 'Dryck Sales (kr)', 'Mat Sales (kr)', 'Snacks Sales (kr)', 'Sport Sales (kr)', 'Bil Sales (kr)']
n_epochs = 3

# Features fed through embedding layers
categorical_cols = [
    'Category',
    'Machine Group',
    'Month',
]

# Features fed to the network as plain numbers
numerical_cols = [
    'Price',
    'Number of Sales',
    'Total Sales (kr)',
    'Dryck Sales (kr)',
    'Mat Sales (kr)',
    'Snacks Sales (kr)',
    'Sport Sales (kr)',
    'Bil Sales (kr)',
    'Average Increase',
]

# train_NN_model(train_df, val_df, numerical_cols, categorical_cols, l2=1e-5)

### Evaluate model

In [7]:
# # Print test results
# model_names = [f'pSales_NN_v{i}' for i in range(5, 6)]
# for model_name in model_names:
#     error = evaluate_model(test_df, model_name)
#     print(f"{model_name}: {error}")

# Test different alpha values
# model_name = 'pSales_NN_v3'
# test_df = run_model(test_df, model_name)
# for alpha in [0.9, 0.93, 0.96, 0.99, 1]:
#     test_df = test_df.copy()
#     test_df = smooth_predictions_xy(test_df, model_name, alpha=alpha)
#     error = total_error_loss(test_df, model_name)
#     print(f"{round(alpha, 2)}: {error}")

# # Print column variance
# model_name = 'pSales_NN_v3'
# print_column_variance(test_df, model_name, 'Machine Group')

## Predictive model 2
### LSTM model
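Sequence-based model: each sample is a window of `sequence_length` timesteps of the numerical features, optionally combined with embedded categorical features.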

In [8]:
# Define the architecture of the LSTM model with embeddings layers
def define_LSTM_model(numerical_input_shape, categorical_cols, sequence_length, l1=0, l2=0):
    """Build the (uncompiled) LSTM network with one embedding branch per categorical column.

    Args:
        numerical_input_shape: Number of numerical features per timestep.
        categorical_cols: Categorical column names; each must be a key of
            ``embedding_config`` providing 'n_categories' and 'output_dim'.
        sequence_length: Number of timesteps in each numerical input sequence.
        l1, l2: Passed to ``define_regularizers`` to build the kernel
            regularizer shared by the two hidden Dense layers.

    Returns:
        A Keras ``Model`` whose inputs are the categorical inputs (in the order
        of ``categorical_cols``) followed by 'numerical_input', and whose
        output is the single-unit 'output_layer'.
    """
    def _embedded_branch(col):
        # Layer names must not contain spaces, so use underscores instead
        slug = col.replace(" ", "_")
        branch_input = Input(shape=(1,), name=f'input_{slug}')
        cfg = embedding_config[col]
        embedded = Embedding(
            input_dim=cfg['n_categories'],
            output_dim=cfg['output_dim'],
            input_length=1,
            name=f'embedding_{slug}'
        )(branch_input)
        return branch_input, Flatten()(embedded)

    # One (input, flattened embedding) pair per categorical feature
    branches = [_embedded_branch(col) for col in categorical_cols]
    categorical_inputs = [branch_input for branch_input, _ in branches]
    categorical_flats = [flat for _, flat in branches]

    # The numerical sequence is summarised by the LSTM's final output
    numerical_input = Input(shape=(sequence_length, numerical_input_shape), name='numerical_input')
    sequence_summary = LSTM(64, return_sequences=False, name='lstm_numerical')(numerical_input)

    # Join the embeddings with the LSTM output; without categorical columns
    # the LSTM output feeds the dense layers directly
    features = (
        Concatenate()(categorical_flats + [sequence_summary])
        if categorical_flats
        else sequence_summary
    )

    # Two hidden layers sharing one regularizer, then the single-value output
    regularizer = define_regularizers(l1, l2)
    hidden = features
    for units in (64, 32):
        hidden = Dense(units, activation='relu', kernel_regularizer=regularizer)(hidden)
    prediction = Dense(1, name='output_layer')(hidden)  # Output layer 'Sales Next Month (kr)'

    return Model(inputs=categorical_inputs + [numerical_input], outputs=prediction)

def train_LSTM_model(train_df, val_df, numerical_cols, categorical_cols, sequence_length, l1=0, l2=0, special_text=None):
    """Train the LSTM network, save it, and write a text log next to it.

    The number of epochs and the batch size are read from the notebook-level
    ``n_epochs`` and ``batch_size`` variables at call time.

    Args:
        train_df: DataFrame with the training rows.
        val_df: DataFrame with the validation rows.
        numerical_cols: Names of the numerical feature columns.
        categorical_cols: Names of the categorical feature columns.
        sequence_length: Number of timesteps per input sequence; used both for
            preparing the inputs and for the model's input shape.
        l1, l2: Forwarded to ``define_LSTM_model`` for the kernel regularizer.
        special_text: Optional free-text note added to the log header.

    Returns:
        None. The model is saved under the name returned by
        ``get_next_model_filename("pSales_LSTM")`` and the log is written to
        the same path with a ``.txt`` extension.
    """
    # Prepare inputs
    X_train_input, y_train = prepare_LSTM_input_data(train_df, numerical_cols, categorical_cols, sequence_length)
    X_val_input, y_val = prepare_LSTM_input_data(val_df, numerical_cols, categorical_cols, sequence_length)

    # Define the model
    model = define_LSTM_model(len(numerical_cols), categorical_cols, sequence_length, l1, l2)

    # Compile the model
    model.compile(optimizer='adam', loss='mean_absolute_error')

    # Train the model
    history = model.fit(X_train_input, y_train, validation_data=(X_val_input, y_val), epochs=n_epochs, batch_size=batch_size, verbose=2)

    # Save the trained model to disk
    model_filename = get_next_model_filename("pSales_LSTM")
    model.save(model_filename)

    # Generate the corresponding txt filename
    output_txt_filename = os.path.splitext(model_filename)[0] + ".txt"

    # Write the log as UTF-8 explicitly: column names and notes may contain
    # non-ASCII (Swedish) characters, and the platform default encoding is not
    # guaranteed to be able to represent them
    with open(output_txt_filename, 'w', encoding='utf-8') as f:
        # Write some general info at the beginning of the file
        today_date = date.today().strftime("%Y-%m-%d")
        f.write(f"{today_date}\n")
        f.write(f"epochs={n_epochs}\n")
        f.write(f"sequence_length={sequence_length}\n")
        f.write(f"numerical_cols={numerical_cols}\n")
        f.write(f"categorical_cols={categorical_cols}\n")
        # Regularisation strengths are only logged when they are non-zero
        if l1 != 0: f.write(f"l1={l1}\n")
        if l2 != 0: f.write(f"l2={l2}\n")
        if special_text: f.write(f"{special_text}\n")

        # Write the training results, one line per tracked metric
        f.write("\nTraining results:\n")
        for key, value in history.history.items():
            rounded_values = [round(v, 2) for v in value]
            f.write(f"{key}: {rounded_values}\n")

In [9]:
# small_df = test_df[(test_df['Machine ID'] == 997510210) & (test_df['Product'] == 'Vitamin Well Refresh') & (test_df['Month'] < 7)]

# sequence_length = 3
# categorical_cols = ['Category', 'Month']
# numerical_cols = ['Price', 'Number of Sales']

# X_small_input, y_small = prepare_LSTM_input_data(small_df, numerical_cols, categorical_cols, sequence_length)

# X_small_input

In [10]:
# tf.keras.backend.clear_session()

# Configuration for this LSTM run (no categorical features, three numerical ones)
n_epochs = 3
sequence_length = 3
categorical_cols = []
numerical_cols = [
    'Price',
    'Number of Sales',
    'Total Sales (kr)',
]

# Train, save and log the LSTM model with a small L2 penalty
train_LSTM_model(
    train_df,
    val_df,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    sequence_length=sequence_length,
    l2=1e-5,
)

2024-04-23 19:54:00.143460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38551 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:e3:00.0, compute capability: 8.0


Epoch 1/3


2024-04-23 19:54:03.485975: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401
2024-04-23 19:54:03.590518: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-04-23 19:54:03.592062: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x14dd440c6dc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-04-23 19:54:03.592079: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2024-04-23 19:54:03.596563: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-23 19:54:03.703868: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of th

67/67 - 4s - loss: 230.6442 - val_loss: 265.3925 - 4s/epoch - 60ms/step
Epoch 2/3
67/67 - 0s - loss: 171.4241 - val_loss: 193.8450 - 168ms/epoch - 3ms/step
Epoch 3/3
67/67 - 0s - loss: 133.6713 - val_loss: 154.5726 - 166ms/epoch - 2ms/step
Epoch 1/3
59/59 - 3s - loss: 234.7517 - val_loss: 276.1885 - 3s/epoch - 44ms/step
Epoch 2/3
59/59 - 0s - loss: 180.0569 - val_loss: 207.7946 - 153ms/epoch - 3ms/step
Epoch 3/3
59/59 - 0s - loss: 137.7166 - val_loss: 160.5589 - 154ms/epoch - 3ms/step
Epoch 1/3
46/46 - 3s - loss: 247.3797 - val_loss: 307.7054 - 3s/epoch - 55ms/step
Epoch 2/3
46/46 - 0s - loss: 228.2562 - val_loss: 265.6542 - 124ms/epoch - 3ms/step
Epoch 3/3
46/46 - 0s - loss: 177.9030 - val_loss: 224.7046 - 123ms/epoch - 3ms/step


In [13]:
# Print the value evaluate_model returns on test_df for LSTM model versions 4 to 6
model_names = ['pSales_LSTM_v' + str(version) for version in range(4, 7)]
for model_name in model_names:
    error = evaluate_model(test_df, model_name)
    print(f"{model_name}: {error}")

# Test different alpha values
# model_name = 'pSales_NN_v3'
# test_df = run_model(test_df, model_name)
# for alpha in [0.9, 0.93, 0.96, 0.99, 1]:
#     test_df = test_df.copy()
#     test_df = smooth_predictions_xy(test_df, model_name, alpha=alpha)
#     error = total_error_loss(test_df, model_name)
#     print(f"{round(alpha, 2)}: {error}")

# # Print column variance
# model_name = 'pSales_NN_v3'
# print_column_variance(test_df, model_name, 'Machine Group')

pSales_LSTM_v4: 37.26
pSales_LSTM_v5: 36.48
pSales_LSTM_v6: 45.71


### Visualize training results

In [None]:
# # Visualize training results
# model_name = 'NN_embedding_model_3'
# training_results = {
#     'loss': [2.0478146076202393, 2.0088889598846436, 2.0007753372192383, 1.9968146085739136, 1.9937269687652588, 1.9921172857284546, 1.990675687789917, 1.9893001317977905, 1.9881930351257324, 1.9875684976577759, 1.9872304201126099, 1.9865171909332275, 1.9859004020690918, 1.985435128211975, 1.9848004579544067, 1.983401894569397, 1.9824390411376953, 1.9820188283920288, 1.981824517250061, 1.9817743301391602],
#     'val_loss': [4.535243034362793, 4.51762580871582, 4.469428539276123, 4.436275482177734, 4.456634521484375, 4.815524578094482, 4.3103556632995605, 4.498797416687012, 4.790141582489014, 4.464589595794678, 4.674554347991943, 4.561259746551514, 4.533383369445801, 4.472135066986084, 4.466953754425049, 4.478504180908203, 4.723540782928467, 4.859069347381592, 4.496937274932861, 4.377903461456299]
# }

# visualize_training_results(training_results, model_name)