In [1]:
# Import libraries 
import os
import pandas as pd
import numpy as np
import json

import mlflow

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce

from datetime import datetime
from hts import HTSRegressor
import hts.functions
import collections
from hts.hierarchy import HierarchyTree
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# settings
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old "seaborn" alias, so use whichever name
# the installed version provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams["figure.figsize"] = (20, 8)

In [6]:
# Load the unemployment-rate table. The AGS code columns go through `str`
# converters (presumably to keep them as text codes rather than numbers).
unemployment_csv = './../Prakhar_drafts/data_from_2010_to_2019_unemployment_rate.csv'
ags_converters = {'ags2': str, 'ags5': str}
df = pd.read_csv(unemployment_csv, converters=ags_converters)
df.shape

(48120, 3)

In [2]:
# Read the PCA clusters (prepared by Amit) and attach them to the unemployment data
df = pd.read_csv('./../Prakhar_drafts/data_from_2010_to_2019_unemployment_rate.csv', converters={'ags2': str, 'ags5': str})
df2 = pd.read_csv('./../final_dfs/for_modeling/df_final_stationery.csv', converters={'ags2': str, 'ags5': str}) 

# Collapse df2 to one (cluster, ags5) row per kreis before merging: repeated
# ags5 rows in df2 would otherwise multiply the rows of df and create duplicate
# (date, ags5) pairs. `validate` makes the merge fail loudly if a kreis still
# maps to more than one cluster.
kreis_clusters = df2[['cluster', 'ags5']].drop_duplicates()
df = pd.merge(df, kreis_clusters, on='ags5', validate='many_to_one')

In [7]:
# Peek at the first five rows
df.head(n=5)

Unnamed: 0,ags5,date,unemployment_rate
0,1001,2010-01-31,13.7
1,1001,2010-02-28,14.1
2,1001,2010-03-31,13.6
3,1001,2010-04-30,13.1
4,1001,2010-05-31,12.5


In [8]:
# Peek at the last five rows
df.tail(n=5)

Unnamed: 0,ags5,date,unemployment_rate
48115,16077,2019-08-31,7.0
48116,16077,2019-09-30,6.5
48117,16077,2019-10-31,6.5
48118,16077,2019-11-30,6.3
48119,16077,2019-12-31,6.5


## Data Preparation

In [11]:
# Derive the ags2 column: the two leading characters of every ags5 code
def get_ags2(x):
    """Return the first two characters of ``x``."""
    return x[:2]

df['ags2'] = df['ags5'].map(get_ags2)
df.head()

Unnamed: 0,ags5,date,unemployment_rate,ags2
0,1001,2010-01-31,13.7,1
1,1001,2010-02-28,14.1,1
2,1001,2010-03-31,13.6,1
3,1001,2010-04-30,13.1,1
4,1001,2010-05-31,12.5,1


## MLflow Experiment Setup

In [55]:
def _build_hierarchical_frame(data, agregate_col):
    """Reshape the long-format data into the wide frame and node map used for fitting.

    Returns a tuple ``(hdf, cluster_set)``:

    * ``hdf`` is indexed by date (converted to datetime) and holds one column
      per ``ags5`` value, one column per ``agregate_col`` value (sum of the
      member rates) and a ``total`` column (sum over all rows of that date).
    * ``cluster_set`` maps every ``agregate_col`` value to the sorted list of
      its ``ags5`` members, plus ``'total'`` to the list of ``agregate_col``
      values.
    """
    long_df = data[['ags5', 'unemployment_rate', 'date', agregate_col]]

    # Bottom level: one column per kreis (ags5)
    df_ags5 = long_df.pivot(index="date", columns="ags5", values="unemployment_rate")

    # Middle level: aggregate only the rate column, so the string ags5 column
    # is never pulled into the sum.
    df_middle = (
        long_df.groupby(["date", agregate_col])["unemployment_rate"]
        .sum()
        .reset_index(drop=False)
        .pivot(index="date", columns=agregate_col, values="unemployment_rate")
    )

    # Top level: a single "total" column
    df_total = long_df.groupby("date")["unemployment_rate"].sum().to_frame(name="total")

    hdf = df_ags5.join(df_middle).join(df_total)
    hdf.index = pd.to_datetime(hdf.index)

    # Sorted members keep the node map identical between runs (plain set
    # iteration order of strings is not stable across interpreter sessions).
    cluster_set = (
        long_df.groupby(agregate_col)['ags5']
        .apply(lambda members: sorted(set(members)))
        .to_dict()
    )
    # The right-hand side is evaluated first, so 'total' lists only the middle-level keys.
    cluster_set['total'] = list(cluster_set.keys())

    return hdf, cluster_set


def _average_kreis_errors(actual, predicted, kreis_list):
    """Return ``(mean of per-kreis MSE, mean of per-kreis RMSE)`` over ``kreis_list``."""
    # One call yields the MSE of every kreis column. The RMSE is derived from
    # it, so the `squared=` keyword (deprecated in recent scikit-learn
    # releases) is not needed.
    per_kreis_mse = mean_squared_error(
        y_true=actual[kreis_list].values,
        y_pred=predicted[kreis_list].values,
        multioutput='raw_values',
    )
    return float(per_kreis_mse.mean()), float(np.sqrt(per_kreis_mse).mean())


def train_heirarchical_cluster_model(data, agregate_col, params, cluster_type="ags2"):
    """Fit a hierarchical forecaster, score it on a hold-out window and log the run to MLflow.

    The hierarchy is ``total -> agregate_col values -> ags5 values``. The last
    ``params['time_steps']`` rows of the wide frame are held out, the model is
    fitted on the remaining rows and the forecast for the held-out window is
    scored per ``ags5`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with the columns ``ags5``, ``date``,
        ``unemployment_rate`` and ``agregate_col``.
    agregate_col : str
        Column whose values form the middle level of the hierarchy.
    params : dict
        Must contain ``model``, ``revision_method``, ``time_steps``,
        ``growth``, ``seasonality_mode`` and ``seasonality_prior_scale``.
        All entries are logged as MLflow params.
    cluster_type : str, default "ags2"
        Free-text label logged as the ``Cluster Type`` param.

    Returns
    -------
    The object returned by ``HTSRegressor.predict``. Its last ``time_steps``
    rows are used as the forecast (assumes the result is row-ordered by time
    — TODO confirm).

    Raises
    ------
    ValueError
        If ``data`` holds no ``ags5`` values, or ``time_steps`` does not leave
        at least one row for both training and testing.

    Notes
    -----
    Logged metrics: ``mse`` and ``rmse`` (averages of the per-kreis values)
    and ``negative_preds`` (1 if any forecast value in any column is negative).
    The node map is stored as the artifact ``cluster_set.json``.
    """
    # Use every parameter value in the run name so that two runs with
    # different settings never share a name.
    run_name = 'hierarchical_' + '_'.join(str(value) for value in params.values())

    with mlflow.start_run(run_name=run_name):

        # Bottom-level series that are scored after forecasting
        kreis_list = list(data['ags5'].unique())
        if not kreis_list:
            raise ValueError("data contains no 'ags5' values to model.")

        print("Generating the hierarchical dataset...")
        hdf, cluster_set = _build_hierarchical_frame(data, agregate_col)
        print("The dataset size is", hdf.shape)

        # Unpack the run configuration
        model_type = params['model']
        rev_type = params['revision_method']
        time_steps = params['time_steps']
        growth = params['growth']
        seasonality_mode = params['seasonality_mode']
        seasonality_prior_scale = params['seasonality_prior_scale']

        if not 0 < time_steps < len(hdf):
            raise ValueError(
                f"time_steps must be between 1 and {len(hdf) - 1} "
                f"(the dataset has {len(hdf)} rows), got {time_steps}."
            )

        # Hold out the last `time_steps` rows for evaluation
        train_hdf = hdf.head(len(hdf) - time_steps)
        test_hdf = hdf.tail(time_steps)

        print(f"Fitting the model {model_type} with revision method {rev_type} and growths: {growth} and seasonality modes: {seasonality_mode} and seasonality prior scales: {seasonality_prior_scale}.")

        # NOTE(review): `growth` is only printed and logged; it is not passed to
        # HTSRegressor, so runs that differ only in `growth` fit the same model.
        # Confirm how the hts prophet wrapper expects growth to be configured
        # before forwarding it.
        hts_model = HTSRegressor(model=model_type, revision_method=rev_type,
                                 seasonality_mode=seasonality_mode,
                                 seasonality_prior_scale=seasonality_prior_scale,
                                 daily_seasonality=False, yearly_seasonality=True)

        hts_model.fit(train_hdf, cluster_set)

        print(f"Predicting for the next {time_steps} time steps.")
        preds = hts_model.predict(steps_ahead=time_steps)

        # The forecast window is the tail of the returned frame
        actual_preds = preds.tail(time_steps)

        # Flag negative forecasts in any column (all hierarchy levels)
        negative_pred = bool((actual_preds < 0).values.any())
        if negative_pred:
            print("There are negative values in the predictions.")
        else:
            print("No negative values found in the predictions")

        # The forecast and the hold-out window must cover the same number of rows
        assert actual_preds.shape[0] == test_hdf.shape[0], \
            "Prediction and test windows differ in length."

        # Average the per-kreis errors; rows are compared by position
        average_mse, average_rmse = _average_kreis_errors(test_hdf, actual_preds, kreis_list)
        print("The average error is:", average_mse)

        # Log params
        mlflow.log_params(params)
        mlflow.log_param("Cluster Type", cluster_type)

        # The node map lists every kreis, so its string form can exceed
        # MLflow's param-value length limit. Store it as a JSON artifact
        # instead, with keys and members stringified to be JSON-safe.
        mlflow.log_dict(
            {str(node): [str(member) for member in members]
             for node, members in cluster_set.items()},
            "cluster_set.json",
        )

        # Log metrics
        mlflow.log_metric("mse", average_mse)
        mlflow.log_metric("rmse", average_rmse)
        mlflow.log_metric("negative_preds", int(negative_pred))

        return preds
        
        
        

## Model Testing and Parameter tuning

In [56]:
# Baseline configuration for a single training run
params = dict(
    model='prophet',
    revision_method='BU',
    time_steps=12,
    growth="linear",
    seasonality_mode="additive",
    seasonality_prior_scale=10,
)

# Single-run example, left disabled:
#predictions = train_heirarchical_cluster_model(data=df,
#                                 agregate_col='ags2', 
#                                 params=params,
#                                 cluster_type="ags2")

Revision methods supported by the model:

* **BU** — bottom-up approach,
* **AHP** — average historical proportions (top-down approach),
* **PHA** — proportions of historical averages (top-down approach),
* **FP** — the forecasted proportions (top-down approach),
* **OLS** — the optimal combination using OLS,
* **WLSS** — optimal combination using structurally weighted OLS,
* **WLSV** — optimal combination using variance-weighted OLS.

In [57]:
# Search space: every combination of the values below is run
model_types = [
    'prophet',
]
revisions = [
    'BU',
    'AHP',
    'PHA',
]
growths = [
    'linear',
    'logistic',
]
seasonality_modes = [
    'additive',
    'multiplicative',
]
seasonality_prior_scales = [
    1,
    10,
    50,
]

In [58]:

# Enumerate the full grid up front, in the same order as nested loops would
# visit it (model, then revision, growth, seasonality mode, prior scale).
param_grid = [
    (model, revision, growth, mode, prior_scale)
    for model in model_types
    for revision in revisions
    for growth in growths
    for mode in seasonality_modes
    for prior_scale in seasonality_prior_scales
]

for m, r, g, sm, sp in param_grid:
    print(f"Model: {m} and Revision: {r} and growths: {g} and seasonality modes: {sm} and seasonality prior scales: {sp}")

    # Overwrite the shared params dict in place with the current combination
    params.update({
        'model': m,
        'revision_method': r,
        'growth': g,
        'seasonality_mode': sm,
        'seasonality_prior_scale': sp,
    })

    # Train, evaluate and log this combination; only the latest result is kept
    predictions = train_heirarchical_cluster_model(data=df,
                                                   agregate_col='ags2',
                                                   params=params)

Model: prophet and Revision: BU and growths: linear and seasonality modes: additive and seasonality prior scales: 1
Generating the hierarchical dataset...
The dataset size is (120, 418)
Fitting the model prophet with revision method BU and growths: linear and seasonality modes: additive and seasonality prior scales: 1.


Fitting models:  10%|█         | 1/10 [1:35:58<14:23:47, 5758.63s/it]


KeyboardInterrupt: 

NameError: name 'Prophet' is not defined