In [3]:
import os
import mlflow
import numpy as np
import pandas as pd

from math import sqrt
from joblib import delayed
from joblib import Parallel

from warnings import catch_warnings
from warnings import filterwarnings
from multiprocessing import cpu_count
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce
from datetime import datetime

from prophet import Prophet

Importing plotly failed. Interactive plots will not work.


# Data

In [4]:
df = pd.read_csv('./../final_dfs/for_modeling/df_final_date_wide_2007.csv', converters={'ags2': str, 'ags5': str} )
df.set_index('ags5', drop=True, inplace=True)
df = df.drop("Unnamed: 0", axis=1)
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
df = df[(df['date'] >= '2010-01-01') & (df['date'] < '2020-01-01')]
df = df.rename(columns={'date': 'ds', 'unemployment_rate': 'y'})

# 401 Model for each kreis

In [5]:
# one-step forecast
def last_obs(history):
    res = history.iloc[-12:]["y"]
    return res

In [6]:
config = ['linear', 15.0, True, 'additive']
last_obs(df)

ags5
16077    8.0
16077    8.0
16077    7.6
16077    7.2
16077    7.1
16077    7.0
16077    7.0
16077    7.0
16077    6.5
16077    6.5
16077    6.3
16077    6.5
Name: y, dtype: float64

In [7]:
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [8]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

In [57]:
(train_test_split(df, 12)[1] )

Unnamed: 0_level_0,ags2,ds,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,...,realized_short_time_work_companies,realized_short_time_work_people,underemployment_without_short_time _work,unemployment_benefit_entitled,unemployment_benefit_recipients,y,registerd_jobs,unemployed,employees_social_security_at_work,employees_social_security_at_residence
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16077,16,2019-01-01,7.0,4.0,5.0,93.0,106.0,97.0,38.0,486.0,...,,,4836.0,7762.0,1012.0,8.0,997,3675,27701,33710
16077,16,2019-02-01,4.0,1.0,4.0,-99.0,-99.0,-99.0,-99.0,-99.0,...,,,4842.0,7711.0,1005.0,8.0,1014,3673,27702,33723
16077,16,2019-03-01,2.0,1.0,2.0,101.0,84.0,77.0,34.0,425.0,...,,,4740.0,7685.0,927.0,7.6,1048,3489,27818,33784
16077,16,2019-04-01,3.0,0.0,1.0,111.0,95.0,111.0,39.0,423.0,...,,,4615.0,7602.0,822.0,7.2,1107,3319,27992,33885
16077,16,2019-05-01,7.0,0.0,2.0,112.0,101.0,123.0,44.0,482.0,...,,,4575.0,7495.0,763.0,7.1,1059,3214,27960,33890
16077,16,2019-06-01,5.0,0.0,1.0,111.0,95.0,120.0,42.0,475.0,...,,,4497.0,7387.0,767.0,7.0,1047,3140,27912,33883
16077,16,2019-07-01,1.0,3.0,4.0,111.0,96.0,120.0,42.0,475.0,...,,,4504.0,7294.0,821.0,7.0,1044,3146,27829,33779
16077,16,2019-08-01,3.0,3.0,2.0,111.0,96.0,121.0,42.0,477.0,...,,,4513.0,7188.0,842.0,7.0,990,3168,27980,33983
16077,16,2019-09-01,1.0,1.0,3.0,111.0,93.0,116.0,38.0,451.0,...,,,4390.0,7145.0,786.0,6.5,984,2929,28228,34292
16077,16,2019-10-01,4.0,1.0,3.0,111.0,94.0,116.0,38.0,452.0,...,,,4296.0,7010.0,791.0,6.5,886,2909,28190,34301


In [12]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    # split dataset
    train, test = train_test_split(data, n_test)
    # fit model and make forecast for history
    yhat = last_obs(train)
    # estimate prediction error
    error = measure_rmse(test['y'], yhat)
    return error

In [20]:
def run_model(data, n_test): 
    with mlflow.start_run(run_name="baseline"):
        ags5 = list(set(df.index.values.tolist()))
        rmse = []
        for i in ags5:
            df_ags5 = df.filter(regex=i, axis=0)
            current_rmse = walk_forward_validation(df_ags5, n_test)
            print(current_rmse)
            rmse.append(current_rmse)

        # Log params
        #params = {"growth":"NONE"}

        #mlflow.log_params(params)

        # Log metrics
        #mlflow.log_metric('rmse', np.mean(rmse))
        res = pd.DataFrame(
                            {'ags5': ags5,
                             'rmse': rmse,
                            })
        return res

In [None]:
res = run_model(df, 12)

0.539289656245448
0.20412414523193173
0.32015621187164245
0.17559422921421233
0.9596006113656521
0.35000000000000026
0.08164965809277261
0.05773502691896256
0.7488880646220324
0.1707825127659934
0.3354101966249685
0.17320508075688776
0.10801234497346433
0.8411301920630363
0.19364916731037085
0.1224744871391589
0.23452078799117157
0.4396968652757639
0.4453463071962463
0.5469613027140647
0.1581138830084192
0.36285901761795397
0.6788470618138768
0.5066228051190222
0.34278273002005216
0.6383572667401853
0.16329931618554527
0.14719601443879746
0.19364916731037082
0.4991659710623978
0.19148542155126763
0.3785938897200182
0.40620192023179774
0.20412414523193162
0.14142135623730953
0.06454972243679027
0.32914029430219166
0.45552167895721496
0.15545631755148037
0.31754264805429416
0.16583123951776998
0.09574271077563373
0.15275252316519458
0.25980762113533157
0.3883726732577013
0.2581988897471612
0.2753785273643052
0.47434164902525683
0.4368447474027052
0.15545631755148023
0.11902380714238091
0

In [19]:
print(res)

None
