In [69]:
import os
import pickle


import pandas as pd
import sklearn
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [70]:
# Folders for the processed input CSVs and for the pickled models.
input_folder = "../data/"
model_folder = "../models/"

# Steel index: load, suffix every non-time column with "_steel_index" and turn
# the time values into "<value>-01" strings.
df = pd.read_csv(os.path.join(input_folder, "processed_steel_data.csv"))
new_column_names = {name: f"{name}_steel_index" for name in df.columns if name != "time"}
df = df.rename(columns=new_column_names)
df["time"] = df["time"].map(lambda period: f"{period}-01")

# Electricity price index: same treatment with the "_electricity_index" suffix.
df_electricity = pd.read_csv(os.path.join(input_folder, "processed_seletricity_price_index.csv"))
new_column_names = {name: f"{name}_electricity_index" for name in df_electricity.columns if name != "time"}
df_electricity = df_electricity.rename(columns=new_column_names)
df_electricity["time"] = df_electricity["time"].map(lambda period: f"{period}-01")

# Left join keeps every steel row, including months without an electricity value.
df = df.merge(df_electricity, on="time", how="left")

In [71]:
# Replace every cell equal to ':' with the value from the row above it in the
# same column (forward fill restricted to the replaced cells).
# NOTE(review): presumably ':' is the source's "not available" marker — confirm
# against the raw CSVs. The `method=` argument of `replace` is reported as
# deprecated in newer pandas releases; verify against the pinned pandas version.
df = df.replace(':', method='ffill')
# Bare last expression: renders the merged frame as the cell output.
df

Unnamed: 0,time,Germany_steel_index,Greece_steel_index,Italy_steel_index,Netherlands_steel_index,Sweden_steel_index,Germany_electricity_index
0,2013-01-01,111.5,106.3,108.3,113.3,103.7,118.2
1,2013-02-01,111.5,106.1,107.6,114.0,103.2,119.9
2,2013-03-01,111.5,105.4,108.4,113.0,101.1,119.7
3,2013-04-01,110.2,105,108.1,112.7,100.4,108.0
4,2013-05-01,109.7,104.9,107.7,111.5,100.9,87.1
...,...,...,...,...,...,...,...
124,2023-05-01,172.0,133.1,138.4,170.8,200.9,259.9
125,2023-06-01,166.6,131,134.6,165.6,201.1,302.1
126,2023-07-01,161.3,130,130.2,157.8,197.1,248.2
127,2023-08-01,158.9,129.6,126.6,155.3,189.3,300.1


In [72]:
def create_lag_features(data,label_column=["Germany"], feature_cols=["Germany"], horizon=3,lags=[1,2,3]):
    """
    Build a training frame of shifted feature columns plus a shifted target.

    Parameters:
    - data: pandas DataFrame holding every column named in `feature_cols`
      and `label_column`.
    - label_column: list with the name of the column used as the target.
    - feature_cols: names of the columns to create lagged copies of.
    - horizon: number of rows the target is shifted down by.
    - lags: for each value `i`, a column `<feature>_t-<i-1>` is created from
      the feature shifted down by `i + horizon` rows.

    Returns:
    - pandas DataFrame containing only the lagged columns followed by
      "target", with every row that has a missing value removed.
    """
    frame = data.copy()
    selected = []

    # Walk the (feature, lag) pairs feature by feature, lag by lag.
    for feature, lag in ((f, l) for f in feature_cols for l in lags):
        name = f'{feature}_t-{lag - 1}'
        frame[name] = data[feature].shift(lag + horizon)
        selected.append(name)

    # The target is the label column moved down by `horizon` rows.
    frame["target"] = frame[label_column].shift(horizon)
    selected.append("target")

    # Shifting leaves NaNs in the leading rows; drop them.
    return frame[selected].dropna()


def create_lag_features_forecast(data, feature_cols=["Germany"], lags=[0,1,2]):
    """
    Build the lagged feature frame used at prediction time (no target column).

    Parameters:
    - data: pandas DataFrame holding every column named in `feature_cols`.
    - feature_cols: names of the columns to create lagged copies of.
    - lags: for each value `i`, a column `<feature>_t-<i>` is created from the
      feature shifted down by `i` rows.

    Returns:
    - pandas DataFrame containing only the lagged columns, with every row
      that has a missing value removed.
    """
    frame = data.copy()
    selected = []

    # Walk the (feature, lag) pairs feature by feature, lag by lag.
    for feature, lag in ((f, l) for f in feature_cols for l in lags):
        name = f'{feature}_t-{lag}'
        frame[name] = data[feature].shift(lag)
        selected.append(name)

    return frame[selected].dropna()



In [73]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


def random_forest_time_series_prediction(data, target_column: str, feature_cols: list,n_estimators=100, max_depth=None, horizon=3, random_state=42):
    """
    Fit a Random Forest Regressor on lagged features and score it on a held-out tail.

    The lagged frame is built with `create_lag_features` (lags 1, 2 and 3 of the
    target and of every feature column). Its last `horizon` rows form the test
    set; everything before them is used for training.

    Parameters:
    - data: pandas DataFrame with a "time" column, `target_column` and every
      column listed in `feature_cols`.
    - target_column: the column to predict.
    - feature_cols: list of additional columns used (lagged) as features.
    - n_estimators: number of trees in the forest.
    - max_depth: maximum depth of the trees.
    - horizon: shift passed to `create_lag_features` and number of trailing
      rows held out for testing.
    - random_state: seed for random number generation.

    Returns:
    - tuple (model, predictions, lagged_data): the fitted model, its
      predictions for the held-out rows and the full lagged frame.

    Raises:
    - ValueError: if `horizon` is smaller than 1.
    """
    # With horizon == 0 the slice [:-0] would be empty and nothing could be trained.
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1, got {horizon}")

    time_column = data["time"]
    data = data.drop("time", axis=1)

    # Create lag features. The caller's horizon is forwarded; it used to be
    # hard-coded to 3, which silently ignored the argument.
    lagged_data = create_lag_features(
        data,
        label_column=[target_column],
        feature_cols=[target_column] + feature_cols,
        horizon=horizon,
        lags=[1, 2, 3],
    )

    # Split the data into train and test sets (the last `horizon` rows are the test set)
    train_data, test_data = lagged_data[:-horizon], lagged_data[-horizon:]

    # Look the timestamps up by index so they stay aligned with the rows that
    # survived the dropna() inside create_lag_features.
    test_time = np.array(time_column.loc[test_data.index])
    print(test_time)

    # Prepare features and target variables
    y_train = train_data["target"]
    X_train = train_data.drop("target", axis=1)

    y_test = test_data["target"]
    X_test = test_data.drop("target", axis=1)

    # Create and train the Random Forest model
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model (square root of the MSE, i.e. the RMSE)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print("Training completed!")
    print(f'Root Mean Squared Error on Test Set: {rmse}')

    return model, predictions, lagged_data



In [74]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def generate_upcoming_months(start_date, n):
    """
    Return the `n` month-steps that follow `start_date`.

    Parameters:
    - start_date: date string in "%Y-%m-%d" format.
    - n: number of upcoming months to generate.

    Returns:
    - list of `n` date strings in "%Y-%m-%d" format, one per month after
      `start_date` (empty when `n` is 0 or negative).

    Raises:
    - ValueError: if `start_date` does not match "%Y-%m-%d".
    """
    anchor = datetime.strptime(start_date, "%Y-%m-%d")

    # Offset from the fixed anchor instead of chaining +1 month steps:
    # relativedelta clamps to the last valid day of a month, so chaining from
    # e.g. Jan 31 would give Feb 28 and then stay on the 28th for every later month.
    return [
        (anchor + relativedelta(months=step)).strftime("%Y-%m-%d")
        for step in range(1, n + 1)
    ]

In [75]:
def zip_string(arr1, arr2):
    """
    Render two parallel sequences as "<arr1[i]>: <arr2[i]>" lines.

    One line is produced per position of `arr1`; `arr2` is indexed with the
    same positions. The result always ends with a newline.
    """
    rendered = []
    for position in range(len(arr1)):
        rendered.append(f"{arr1[position]}: {arr2[position]}")
    return "\n".join(rendered) + "\n"

In [76]:
# Static description of the data columns and of the "t-x" feature naming.
# Each sentence ends with a newline; previously the pieces were concatenated
# with no separator, so the sentences ran into each other
# ("...Percentage changes,The column...").
data_explanation = (
    "Short-term business statistics (STS) provide index data on various economic activities. Percentage changes,\n"
    "The column Germany_steel_index represents the STS for Basic iron and steel and ferro-alloys\n"
    "The column Germany_electricity_index represents the STS for Electricity\n"
    "The t-x where x represents the delay in the features e-g t-1 represents the previous months data in the given column\n"
)

In [77]:
def formulate_explanation_string(model, forecast_features, last_date, horizon, last_label_data):
    """
    Build a plain-text summary of the model's forecast.

    The text lists the predictions next to the `horizon` months following
    `last_date`, the most recent observed values, the model's feature
    importances and a short description of the data columns.

    Parameters:
    - model: fitted estimator exposing `feature_names_in_`,
      `feature_importances_` and `predict`.
    - forecast_features: feature rows passed to `model.predict`; needs at
      least `horizon` rows, because one prediction is printed per generated date.
    - last_date: "%Y-%m-%d" string; the forecast dates start one month after it.
    - horizon: number of upcoming months to list.
    - last_label_data: DataFrame whose first column holds the dates and whose
      second column holds the observed values to display.

    Returns:
    - the explanation as a single string.
    """
    feat_names = model.feature_names_in_
    feat_importance = model.feature_importances_

    predictions = model.predict(forecast_features)
    pred_dates = generate_upcoming_months(last_date, horizon)

    data_explanation = ""
    data_explanation += f"The model made the following predictions for the next {horizon} months:\n"
    data_explanation += zip_string(pred_dates, predictions)
    data_explanation += f"The previous {horizon} months had the following values:\n"
    data_explanation += zip_string(np.array(last_label_data.iloc[:, 0]), np.array(last_label_data.iloc[:, 1]))

    # The trailing newline was missing, which glued the first feature onto the heading line.
    data_explanation += "The model used the following features with the respective importances:\n"
    data_explanation += zip_string(feat_names, feat_importance)

    data_explanation += "Short-term business statistics (STS) provide index data on various economic activities. Percentage changes,\n"
    data_explanation += "The column Germany_steel_index represents the STS for Basic iron and steel and ferro-alloys\n"
    data_explanation += "The column Germany_electricity_index represents the STS for Electricity\n"
    data_explanation += "The t-x where x represents the delay in the features e-g t-1 represents the previous months data in the given column\n"

    return data_explanation

In [78]:
def use_model(df: pd.DataFrame, df_name="steel_index", type: str= "Forecast", target_col = "Germany_steel_index", horizon=3):
    """
    Load the cached model for this data set (or train and cache it), then print a forecast explanation.

    Parameters:
    - df: DataFrame with a "time" column, `target_col` and
      "Germany_electricity_index".
    - df_name: label used in the cached model's file name.
    - type: currently unused; kept for interface compatibility.
    - target_col: column to forecast.
    - horizon: number of months to forecast; also part of the cache file name.

    Returns:
    - the loaded or newly trained model.
    """
    # Set up parameters
    last_date = df["time"].max()
    # NOTE(review): the cache key does not include target_col or the feature
    # columns, so a call with a different target reuses a model trained for
    # another one — consider adding them to the file name.
    model_filename = f"{df_name}-{last_date}-{horizon}.pkl"
    model_path = os.path.join(model_folder, model_filename)
    feature_cols = ["Germany_electricity_index"]

    df = df[["time", target_col] + feature_cols]

    # Load or train the model
    if os.path.exists(model_path):
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that this notebook wrote itself.
        with open(model_path, 'rb') as file:
            model = pickle.load(file)
    else:
        model, _, _ = random_forest_time_series_prediction(df, target_column=target_col, feature_cols=feature_cols, horizon=horizon)
        # Make sure the target folder exists before writing the cache file.
        os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
        with open(model_path, 'wb') as file:
            pickle.dump(model, file)

    # NOTE(review): create_lag_features pairs each target with the values 1-3
    # rows before it, so each row below appears to predict the month right
    # after it rather than `horizon` distinct future months — confirm that the
    # dates attached in formulate_explanation_string are the intended ones.
    forecast_features = create_lag_features_forecast(df, [target_col] + feature_cols).tail(horizon)

    last_label_data = df[["time", target_col]].tail(horizon)
    print(formulate_explanation_string(model, forecast_features, last_date, horizon, last_label_data))

    return model
# Load (or train and cache) the model for the merged frame with a 3-month
# horizon; use_model also prints the forecast explanation shown below.
model = use_model(df, horizon=3)

The model made the following predictions for the next 3 months:
2023-10-01: 171.7189999999999
2023-11-01: 171.38899999999987
2023-12-01: 171.40699999999987
The previous 3 months had the following values:
2023-07-01: 161.3
2023-08-01: 158.9
2023-09-01: 157.0
The model used the following features with the respective importances:Germany_steel_index_t-0: 0.2839260308502419
Germany_steel_index_t-1: 0.310658020913513
Germany_steel_index_t-2: 0.3052224592438692
Germany_electricity_index_t-0: 0.0741619640668506
Germany_electricity_index_t-1: 0.021995389702004323
Germany_electricity_index_t-2: 0.004036135223521021
Short-term business statistics (STS) provide index data on various economic activities. Percentage changes,
The column Germany_steel_index represents the STS for Basic iron and steel and ferro-alloys
The column Germany_electricity_index represents the STS for Electricity
The t-x where x represents the delay in the features e-g t-1 represents the previous months data in the given colum

In [79]:
# Show the fitted forest's feature importances (one value per lagged input
# column, in the same order as model.feature_names_in_).
print(model.feature_importances_)

[0.28392603 0.31065802 0.30522246 0.07416196 0.02199539 0.00403614]


In [81]:

!pip install bokeh

Collecting bokeh
  Downloading bokeh-3.1.1-py3-none-any.whl (8.3 MB)
[K     |████████████████████████████████| 8.3 MB 101 kB/s eta 0:00:01     |█████████████                   | 3.4 MB 255 kB/s eta 0:00:20
[?25hCollecting Jinja2>=2.9
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 37 kB/s eta 0:00:01
Collecting xyzservices>=2021.09.1
  Downloading xyzservices-2023.10.1-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 581 kB/s eta 0:00:01
[?25hCollecting contourpy>=1
  Downloading contourpy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)
[K     |████████████████████████████████| 301 kB 3.6 MB/s eta 0:00:01
[?25hCollecting pillow>=7.1.0
  Downloading Pillow-10.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 3.5 MB/s eta 0:00:01
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_

In [None]:
from bokeh.plotting import figure, output_file, save

# Minimal bokeh example: plot five points and write the figure to a
# standalone HTML file.

# prepare some data (x/y coordinates of the five markers)
x = [1, 2, 3, 4, 5]
y = [4, 5, 5, 7, 2]

# set output to static HTML file; save() below uses this file name and title
output_file(filename="custom_filename.html", title="Static HTML file")

# create a new plot with a specific size (stretches horizontally up to 500 px, 250 px tall)
p = figure(sizing_mode="stretch_width", max_width=500, height=250)

# add a circle renderer (red markers of size 15); the returned renderer is kept in `circle`
circle = p.circle(x, y, fill_color="red", size=15)

# save the results to the file configured with output_file() above
save(p)

In [80]:
from bokeh.plotting import figure, show
from bokeh.sampledata.penguins import data
from bokeh.transform import factor_cmap, factor_mark
from bokeh.plotting import figure, output_file, save

# Scatter plot of the bokeh penguin sample data: flipper length against body
# mass, with one marker shape and colour per species.
SPECIES = sorted(data.species.unique())
MARKERS = ['hex', 'circle_x', 'triangle']

p = figure(title = "Penguin size", background_fill_color="#fafafa")
p.xaxis.axis_label = 'Flipper Length (mm)'
p.yaxis.axis_label = 'Body Mass (g)'

p.scatter("flipper_length_mm", "body_mass_g", source=data,
          legend_group="species", fill_alpha=0.4, size=12,
          marker=factor_mark('species', MARKERS, SPECIES),
          color=factor_cmap('species', 'Category10_3', SPECIES))

p.legend.location = "top_left"
p.legend.title = "Species"

# The original cell only referenced `output_file` without calling it, which is
# a no-op, so save() depended on output state configured by another cell.
# NOTE(review): the file name was chosen during review — adjust as needed.
output_file(filename="penguin_size.html", title="Penguin size")
save(p)

ModuleNotFoundError: No module named 'bokeh'