In [110]:
# Import libraries 
import os
import pandas as pd
import numpy as np
import json

import mlflow

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce

from datetime import datetime
from hts import HTSRegressor
import hts.functions
import collections
from hts.hierarchy import HierarchyTree
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter("ignore")

# settings
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, where style.use('seaborn') raises
# OSError. Try the original name first so older installs behave as before.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
plt.rcParams["figure.figsize"] = (20, 8)

In [123]:
# Load the unemployment-rate CSV; the AGS code columns are passed through
# `str` so they are kept as text rather than parsed as numbers.
df = pd.read_csv(
    'data_from_2010_to_2019_unemployment_rate.csv',
    converters=dict.fromkeys(('ags2', 'ags5'), str),
)
df.shape

(48120, 3)

In [79]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ags5,date,unemployment_rate
0,1001,2010-01-31,13.7
1,1001,2010-02-28,14.1
2,1001,2010-03-31,13.6
3,1001,2010-04-30,13.1
4,1001,2010-05-31,12.5


In [133]:
# Preview the last rows of the frame.
# NOTE(review): the recorded output of this cell shows an 'ags2' column, but
# that column is only created by the "Data Preparation" cell further down --
# the cell appears to have been executed out of order. Re-run the notebook
# top to bottom to confirm the displayed output.
df.tail()

Unnamed: 0,ags5,date,unemployment_rate,ags2
48115,16077,2019-08-31,7.0,16
48116,16077,2019-09-30,6.5,16
48117,16077,2019-10-31,6.5,16
48118,16077,2019-11-30,6.3,16
48119,16077,2019-12-31,6.5,16


## Data Preparation

In [124]:
# Add AGS 2
def get_ags2(x):
    """Return the leading two characters of ``x``.

    Used to derive the two-character ``ags2`` prefix from an ``ags5`` code.
    Inputs shorter than two characters are returned unchanged.
    """
    prefix_length = 2
    return x[:prefix_length]

# Derive the 'ags2' prefix column from 'ags5'. The source frame is `df`, the
# one loaded above; no notebook-level variable called `data` exists, so the
# previous `data['ags5']` only worked with leftover kernel state.
df['ags2'] = df['ags5'].apply(get_ags2)
df.head()

Unnamed: 0,ags5,date,unemployment_rate,ags2
0,1001,2010-01-31,13.7,1
1,1001,2010-02-28,14.1,1
2,1001,2010-03-31,13.6,1
3,1001,2010-04-30,13.1,1
4,1001,2010-05-31,12.5,1


## MLflow Experiment Setup

In [201]:
def _build_hierarchy_frame(data, agregate_col):
    """Build the wide hierarchical frame and the hierarchy mapping.

    Returns a tuple ``(hdf, cluster_set)``:

    * ``hdf`` is indexed by date (converted to datetime) and has one column
      per ``ags5`` value, one per distinct ``agregate_col`` value and a
      ``total`` column. The middle and top levels are sums of
      ``unemployment_rate``.
    * ``cluster_set`` maps each ``agregate_col`` value to the sorted list of
      its ``ags5`` members, plus ``'total'`` mapped to the list of
      ``agregate_col`` values.
    """
    # Filter data by relevant columns
    relevant_cols = ['ags5', 'unemployment_rate', 'date']
    relevant_cols.append(agregate_col)
    df = data[relevant_cols]

    # Bottom level: one column per ags5
    df_ags5 = df.pivot(index="date", columns="ags5", values="unemployment_rate")

    # Middle level: one column per aggregate. Only the value column is summed,
    # so the string 'ags5' column is not aggregated as a side effect.
    df_middle = (
        df.groupby(["date", agregate_col])["unemployment_rate"]
        .sum()
        .reset_index(drop=False)
        .pivot(index="date", columns=agregate_col, values="unemployment_rate")
    )

    # Top level: a single 'total' column
    df_total = (
        df.groupby("date")["unemployment_rate"]
        .sum()
        .to_frame()
        .rename(columns={"unemployment_rate": "total"})
    )

    # Join the three levels. The middle-level frame is `df_middle`; the
    # previous code joined an undefined `df_ags2` here.
    hdf = df_ags5.join(df_middle).join(df_total)

    # Set the index in datetime format
    hdf.index = pd.to_datetime(hdf.index)

    # Hierarchy mapping: aggregate -> its ags5 members. Sorting makes the
    # child order (and therefore the logged "Cluster Set") reproducible.
    cluster_set = df.groupby(agregate_col)['ags5'].apply(lambda x: sorted(set(x))).to_dict()

    # The keys are read before 'total' is inserted, so 'total' maps to the
    # aggregate-level nodes only.
    cluster_set['total'] = list(cluster_set.keys())

    return hdf, cluster_set


def _average_errors(predicted, actual, columns):
    """Return ``(mean MSE, mean RMSE)`` over ``columns``, computed per column."""
    total_mse = 0
    total_rmse = 0
    for column in columns:
        mse = mean_squared_error(y_true=actual[column].values, y_pred=predicted[column].values)
        total_mse += mse
        # RMSE is derived from the MSE instead of using the `squared=` keyword,
        # which newer scikit-learn releases deprecate and remove.
        total_rmse += np.sqrt(mse)
    return total_mse / len(columns), total_rmse / len(columns)


def train_heirarchical_cluster_model(data, agregate_col, params, cluster_type="ags2"):
    """Fit, evaluate and log one hierarchical forecasting run in MLflow.

    The long-format ``data`` is reshaped into a three-level hierarchy
    (``ags5`` -> ``agregate_col`` -> ``total``). The last ``time_steps`` rows
    are held out, an ``HTSRegressor`` is fitted on the rest, and the forecast
    for the held-out rows is scored per ``ags5`` series.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'ags5'``, ``'date'``,
        ``'unemployment_rate'`` and ``agregate_col``.
    agregate_col : str
        Column defining the middle level of the hierarchy.
    params : dict
        Must contain ``'model'``, ``'revision_method'`` and ``'time_steps'``.
        The whole dict is logged as MLflow params.
    cluster_type : str, default "ags2"
        Label logged as the ``"Cluster Type"`` MLflow param; it is not used
        for any computation.

    Returns
    -------
    The object returned by ``HTSRegressor.predict(steps_ahead=time_steps)``.
    The last ``time_steps`` rows are treated as the forecast for the
    held-out period.

    Raises
    ------
    ValueError
        If ``time_steps`` is not strictly between 0 and the number of dates.
    AssertionError
        If the forecast and the held-out set have a different number of rows.
    """
    # Generate a run name from explicit keys so it does not depend on the
    # insertion order (or value types) of `params`.
    run_name = f"hierarchical_{params['model']}_{params['revision_method']}"

    with mlflow.start_run(run_name=run_name):

        # Create a list of kreis
        kreis_list = list(data['ags5'].unique())

        # Generate the dataset from the cluster with the ags and total summation
        print("Generating the hierarchical dataset...")
        hdf, cluster_set = _build_hierarchy_frame(data, agregate_col)
        print("The dataset size is", hdf.shape)

        # --- Model fitting ---

        # Get the params
        model_type = params['model']
        rev_type = params['revision_method']
        time_steps = params['time_steps']

        # Both the training and the test split must be non-empty
        if not 0 < time_steps < len(hdf):
            raise ValueError(
                f"time_steps must be between 1 and {len(hdf) - 1}, got {time_steps!r}"
            )

        # Divide the data into train and test sets
        train_hdf = hdf.head(len(hdf) - time_steps)
        test_hdf = hdf.tail(time_steps)

        print(f"Fitting the model {model_type} with revision method {rev_type}.")

        # Fit the model
        hts_model = HTSRegressor(model=model_type, revision_method=rev_type, n_jobs=0)
        hts_model.fit(train_hdf, cluster_set)

        print(f"Predicting for the next {time_steps} time steps.")

        # Get the predictions
        preds = hts_model.predict(steps_ahead=time_steps)

        # --- Model evaluation ---

        # Keep only the rows for the forecast horizon
        actual_preds = preds.tail(time_steps)

        # Check if there are negative values in the predictions
        negative_pred = (actual_preds < 0).values.any()
        if negative_pred:
            print("There are negative values in the predictions.")
        else:
            print("No negative values found in the predictions")

        # Check if the prediction and test have the same size
        assert actual_preds.shape[0] == test_hdf.shape[0]

        # Average the per-kreis MSE / RMSE (bottom level only)
        average_mse, average_rmse = _average_errors(actual_preds, test_hdf, kreis_list)
        print("The average error is:", average_mse)

        # --- Log experiment details in MLflow ---

        # Log params
        mlflow.log_params(params)
        mlflow.log_param("Cluster Type", cluster_type)
        # NOTE(review): the full mapping can be long and MLflow limits the
        # length of param values -- confirm it fits on the tracking backend in
        # use, or log it as an artifact instead.
        mlflow.log_param("Cluster Set", cluster_set)

        # Log metrics
        mlflow.log_metric("mse", average_mse)
        mlflow.log_metric("rmse", average_rmse)

        negative_pred = 1 if negative_pred else 0
        mlflow.log_metric("negative_preds", negative_pred)

        return preds
        
        
        

## Model Testing and Parameter tuning

In [202]:
# Baseline configuration: 'sarimax' base models, 'BU' revision method and a
# 12-step hold-out / forecast horizon.
params = dict(model='sarimax', revision_method='BU', time_steps=12)

# Train, evaluate and log a single run with the baseline configuration
predictions = train_heirarchical_cluster_model(
    data=df,
    agregate_col='ags2',
    params=params,
    cluster_type="ags2",
)

Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:14<00:00, 28.39it/s]
Fitting models:  17%|███████████                                                     | 72/418 [00:00<00:00, 356.27it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 351.04it/s]


No negative values found in the predictions
The average error is: 0.16016703958156228


Revision methods supported by the model:

* **BU** — bottom-up approach,
* **AHP** — average historical proportions (top-down approach),
* **PHA** — proportions of historical averages (top-down approach),
* **FP** — forecasted proportions (top-down approach),
* **OLS** — optimal combination using OLS,
* **WLSS** — optimal combination using structurally weighted OLS,
* **WLSV** — optimal combination using variance-weighted OLS.

In [196]:
# Search grid: every base model is combined with every revision method
model_types = [
    'auto_arima',
    'sarimax',
]
revisions = [
    'BU',
    'AHP',
    'PHA',
    'FP',
    'OLS',
    'WLSS',
    'WLSV',
]

In [197]:
# Train and log one run per (model, revision method) pair, models outermost.
# `params` is updated in place, so after the loop it holds the last pair;
# `predictions` likewise keeps only the output of the last run.
for m, r in [(model, revision) for model in model_types for revision in revisions]:
    print(f"Model: {m} and Revision: {r}")

    # Point the shared params at the current combination
    params.update(model=m, revision_method=r)

    # Run the prediction model
    predictions = train_heirarchical_cluster_model(
        data=df,
        agregate_col='ags2',
        params=params,
    )

Model: auto_arima and Revision: BU
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [19:29<00:00,  2.80s/it]
Fitting models:   3%|█▉                                                              | 13/418 [00:00<00:03, 124.47it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:02<00:00, 206.02it/s]


No negative values found in the predictions
The average error is: 0.1707600377188778
Model: auto_arima and Revision: AHP
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [19:34<00:00,  2.81s/it]
Fitting models:  11%|███████▏                                                        | 47/418 [00:00<00:01, 227.70it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 245.72it/s]


No negative values found in the predictions
The average error is: 0.41566813827985644
Model: auto_arima and Revision: PHA
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [19:16<00:00,  2.77s/it]
Fitting models:   6%|███▉                                                            | 26/418 [00:00<00:01, 244.67it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 245.27it/s]


No negative values found in the predictions
The average error is: 0.45455251183218054
Model: auto_arima and Revision: FP
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [23:20<00:00,  3.35s/it]
Fitting models:   6%|███▊                                                            | 25/418 [00:00<00:01, 235.66it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 228.76it/s]


(16, 120)
There are negative values in the predictions.
The average error is: 25.978983421447687
Model: auto_arima and Revision: OLS
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [21:34<00:00,  3.10s/it]
Fitting models:   6%|███▉                                                            | 26/418 [00:00<00:01, 257.44it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 281.09it/s]


There are negative values in the predictions.
The average error is: 84.86883952201418
Model: auto_arima and Revision: WLSS
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [23:42<00:00,  3.40s/it]
Fitting models:   6%|███▋                                                            | 24/418 [00:00<00:01, 230.77it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 234.43it/s]


There are negative values in the predictions.
The average error is: 4.7193515930921786
Model: auto_arima and Revision: WLSV
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model auto_arima with revision method WLSV.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [20:44<00:00,  2.98s/it]
Fitting models:   7%|████▍                                                           | 29/418 [00:00<00:01, 267.31it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 234.19it/s]


There are negative values in the predictions.
The average error is: 100.80797780902124
Model: sarimax and Revision: BU
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method BU.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:13<00:00, 31.23it/s]
Fitting models:  21%|█████████████▏                                                  | 86/418 [00:00<00:00, 418.41it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:00<00:00, 434.47it/s]


No negative values found in the predictions
The average error is: 0.16016703958156228
Model: sarimax and Revision: AHP
Generating the hierarchical dataset...
The dataset size is (120, 418)


Fitting models:   0%|                                                                          | 0/418 [00:00<?, ?it/s]

Fitting the model sarimax with revision method AHP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:16<00:00, 25.50it/s]
Fitting models:  10%|██████                                                          | 40/418 [00:00<00:00, 390.43it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 389.45it/s]


No negative values found in the predictions
The average error is: 0.43353423050913853
Model: sarimax and Revision: PHA
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method PHA.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:13<00:00, 30.80it/s]
Fitting models:  10%|██████▍                                                         | 42/418 [00:00<00:00, 408.74it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 401.55it/s]


No negative values found in the predictions
The average error is: 0.47289509863730717
Model: sarimax and Revision: FP
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method FP.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:13<00:00, 31.03it/s]
Fitting models:  10%|██████                                                          | 40/418 [00:00<00:01, 372.36it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 401.33it/s]


(16, 120)
No negative values found in the predictions
The average error is: 26.527558187863697
Model: sarimax and Revision: OLS
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method OLS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:13<00:00, 30.58it/s]
Fitting models:  10%|██████▍                                                         | 42/418 [00:00<00:00, 390.16it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 390.35it/s]


There are negative values in the predictions.
The average error is: 86.52685123570778
Model: sarimax and Revision: WLSS
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method WLSS.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:14<00:00, 29.33it/s]
Fitting models:   7%|████▎                                                           | 28/418 [00:00<00:01, 279.01it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 247.71it/s]


There are negative values in the predictions.
The average error is: 4.790132076611419
Model: sarimax and Revision: WLSV
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model sarimax with revision method WLSV.


Fitting models: 100%|████████████████████████████████████████████████████████████████| 418/418 [00:15<00:00, 26.35it/s]
Fitting models:   9%|█████▋                                                          | 37/418 [00:00<00:01, 358.69it/s]

Predicting for the next 12 time steps.


Fitting models: 100%|███████████████████████████████████████████████████████████████| 418/418 [00:01<00:00, 265.39it/s]


There are negative values in the predictions.
The average error is: 103.25736327264667
