In [1]:
import os
import mlflow
import numpy as np
import pandas as pd

from math import sqrt
from joblib import delayed
from joblib import Parallel

from warnings import catch_warnings
from warnings import filterwarnings
from multiprocessing import cpu_count
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce
from datetime import datetime

from prophet import Prophet

Importing plotly failed. Interactive plots will not work.


# Data

In [2]:
df = pd.read_csv('./../final_dfs/for_modeling/df_final_date_wide_2007.csv', converters={'ags2': str, 'ags5': str} )
df.set_index('ags5', drop=True, inplace=True)
df = df.drop("Unnamed: 0", axis=1)
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
df = df[(df['date'] >= '2010-01-01') & (df['date'] < '2020-01-01')]
df = df.rename(columns={'date': 'ds', 'unemployment_rate': 'y'})

# 401 Model for each kreis

In [3]:
# one-step forecast
def last_obs(history):
    res = history.iloc[-12:]["y"]
    res2 = res
    res = res.append(res2)
    return res

In [4]:
config = ['linear', 15.0, True, 'additive']
last_obs(df)

ags5
16077    8.0
16077    8.0
16077    7.6
16077    7.2
16077    7.1
16077    7.0
16077    7.0
16077    7.0
16077    6.5
16077    6.5
16077    6.3
16077    6.5
16077    8.0
16077    8.0
16077    7.6
16077    7.2
16077    7.1
16077    7.0
16077    7.0
16077    7.0
16077    6.5
16077    6.5
16077    6.3
16077    6.5
Name: y, dtype: float64

In [5]:
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [6]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

In [7]:
(train_test_split(df, 24)[1] )

Unnamed: 0_level_0,ags2,ds,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,...,realized_short_time_work_companies,realized_short_time_work_people,underemployment_without_short_time _work,unemployment_benefit_entitled,unemployment_benefit_recipients,y,registerd_jobs,unemployed,employees_social_security_at_work,employees_social_security_at_residence
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16076,16,2019-01-01,4.0,6.0,9.0,99.0,158.0,131.0,48.0,773.0,...,,,3454.0,4494.0,1029.0,5.3,898,2633,29989,37861
16076,16,2019-02-01,6.0,4.0,7.0,-99.0,-99.0,-99.0,-99.0,-99.0,...,,,3435.0,4497.0,1042.0,5.2,901,2605,29965,37904
16076,16,2019-03-01,2.0,1.0,10.0,105.0,141.0,103.0,51.0,685.0,...,,,3313.0,4465.0,918.0,4.9,959,2451,30043,37979
16076,16,2019-04-01,2.0,3.0,5.0,121.0,117.0,135.0,44.0,682.0,...,,,3209.0,4434.0,822.0,4.6,912,2307,30129,38029
16076,16,2019-05-01,3.0,1.0,2.0,125.0,154.0,158.0,58.0,803.0,...,,,3091.0,4394.0,781.0,4.5,923,2216,30146,38023
16076,16,2019-06-01,1.0,0.0,2.0,125.0,154.0,158.0,59.0,803.0,...,,,3039.0,4327.0,775.0,4.4,987,2151,30145,38028
16076,16,2019-07-01,3.0,1.0,3.0,125.0,155.0,158.0,59.0,805.0,...,,,3098.0,4272.0,838.0,4.5,930,2192,30117,37888
16076,16,2019-08-01,4.0,1.0,3.0,125.0,155.0,158.0,59.0,806.0,...,,,3072.0,4158.0,849.0,4.5,829,2209,30208,38103
16076,16,2019-09-01,2.0,1.0,4.0,122.0,149.0,137.0,56.0,770.0,...,,,3043.0,4115.0,812.0,4.4,751,2149,30306,38262
16076,16,2019-10-01,4.0,1.0,2.0,122.0,149.0,138.0,57.0,771.0,...,,,3069.0,4069.0,828.0,4.4,725,2168,30278,38190


In [8]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    # split dataset
    train, test = train_test_split(data, n_test)
    # fit model and make forecast for history
    yhat = last_obs(train)
    # estimate prediction error
    error = measure_rmse(test['y'], yhat)
    return error

In [12]:
def run_model(data, n_test): 
    with mlflow.start_run(run_name="baseline"):
        ags5 = list(set(df.index.values.tolist()))
        rmse = []
        for i in ags5:
            df_ags5 = df.filter(regex=i, axis=0)
            current_rmse = walk_forward_validation(df_ags5, n_test)
            print(current_rmse)
            rmse.append(current_rmse)

        # Log params
        params = {"growth":"NONE"}

        mlflow.log_params(params)

        # Log metrics
        mlflow.log_metric('rmse', np.mean(rmse))
        res = pd.DataFrame(
                            {'ags5': ags5,
                             'rmse': rmse,
                            })
        return res

In [13]:
res = run_model(df, 24)

1.601821879402742
0.4476792006187764
0.31754264805429416
0.32015621187164245
0.5656854249492381
0.33166247903554
1.256317369669517
0.18257418583505533
0.4092676385936225
0.404145188432738
0.595119035711904
0.4462809279665294
0.2692582403567252
0.3541421560145962
0.3329164059239696
0.4377975178854564
0.5090841449767088
0.6794605703546504
0.15545631755148018
1.2773670837573148
1.2588023408515465
0.13228756555322954
0.26614532371118865
1.1414537514357146
0.45552167895721496
1.1916375287812986
1.168866972756096
0.7410578025138571
0.7874007874011809
1.255985668708047
0.43253130907869936
0.4092676385936226
0.18484227510682363
0.14433756729740646
0.3645773809037893
0.49201964730418374
0.7020802423275182
0.28577380332470403
0.5228129047119375
0.8497548666135039
0.5216160784842943
1.0214368964029708
0.7419007121351663
0.48476798574163293
0.6664583007710735
0.2843120351538663
0.7373940601876312
0.8440971508067068
0.4546060565661952
0.3082207001484488
0.7852812659593167
0.33478849044334436
1.1372

In [11]:
res

Unnamed: 0,ags5,rmse
0,12070,1.601822
1,7235,0.447679
2,9773,0.317543
3,9673,0.320156
4,8212,0.565685
...,...,...
396,3460,0.561249
397,9564,0.660177
398,9777,0.267706
399,9464,1.404902


In [23]:
#res.to_csv("naive_forcasting_rmse.csv")
