In [1]:
import os
import mlflow
import numpy as np
import pandas as pd

from math import sqrt
from joblib import delayed
from joblib import Parallel

from warnings import catch_warnings
from warnings import filterwarnings
from multiprocessing import cpu_count
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce
from datetime import datetime

from prophet import Prophet

Importing plotly failed. Interactive plots will not work.


# Data

In [41]:
df = pd.read_csv('./../../final_dfs/for_modeling/df_final_date_long_2007.csv', converters={'ags2': str, 'ags5': str} )
# fix ags5 missing 0 
df['ags5'] = np.where(df.ags5.str.len() == 4, '0' + df['ags5'], df['ags5'])
df.set_index('ags5', drop=True, inplace=True)
df = df.drop("Unnamed: 0", axis=1)
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
df = df[df['variable'] == 'unemployment_rate']
df = df.rename(columns={'value': 'y'})
df

Unnamed: 0_level_0,ags2,variable,date,y
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01001,1,unemployment_rate,2007-05-01,12.7
01001,1,unemployment_rate,2007-06-01,12.2
01001,1,unemployment_rate,2007-07-01,12.5
01001,1,unemployment_rate,2007-08-01,12.3
01001,1,unemployment_rate,2007-09-01,11.7
...,...,...,...,...
16077,16,unemployment_rate,2021-01-01,7.3
16077,16,unemployment_rate,2021-02-01,7.3
16077,16,unemployment_rate,2021-03-01,7.2
16077,16,unemployment_rate,2021-04-01,6.9


In [53]:
df = pd.read_csv('./../../final_dfs/for_modeling/df_final_date_wide_2007.csv', converters={'ags2': str, 'ags5': str} )
# fix ags5 missing 0 
df['ags5'] = np.where(df.ags5.str.len() == 4, '0' + df['ags5'], df['ags5'])
df.set_index('ags5', drop=True, inplace=True)
#df = df.drop("Unnamed: 0", axis=1)
df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
df

Unnamed: 0_level_0,date,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,number_of_companies_domestic_staff,...,employees_social_security_at_residence,employees_social_security_at_work,realized_short_time_work_companies,realized_short_time_work_people,registerd_jobs,underemployment_without_short_time _work,unemployed,unemployment_benefit_entitled,unemployment_benefit_recipients,unemployment_rate
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01001,2021-03-01,13.0,6.0,16.0,41.0,15.0,148.0,163.0,241.0,0.0,...,,,,,745.0,,4709.0,11193.617211,,9.1
01001,2021-04-01,13.0,5.0,18.0,38.0,15.0,147.0,159.0,243.0,0.0,...,,,,,762.0,,4740.0,11111.999397,,9.1
01002,2021-03-01,12.0,3.0,27.0,135.0,34.0,362.0,405.0,653.0,0.0,...,,,,,2322.0,,11966.0,30985.965972,,8.6
01002,2021-04-01,9.0,3.0,25.0,126.0,35.0,354.0,388.0,647.0,0.0,...,,,,,2403.0,,12072.0,30847.303052,,8.7
01003,2021-03-01,10.0,2.0,28.0,81.0,74.0,296.0,265.0,670.0,0.0,...,,,,,2157.0,,10275.0,23703.798710,,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16075,2021-04-01,3.0,3.0,5.0,128.0,114.0,141.0,32.0,576.0,0.0,...,,,,,886.0,,2151.0,3102.771109,,5.0
16076,2021-03-01,4.0,3.0,5.0,124.0,153.0,187.0,59.0,788.0,1.0,...,,,,,667.0,,2618.0,3890.171929,,5.4
16076,2021-04-01,1.0,2.0,1.0,122.0,152.0,186.0,58.0,787.0,1.0,...,,,,,722.0,,2487.0,3874.523927,,5.1
16077,2021-03-01,4.0,0.0,4.0,114.0,100.0,158.0,42.0,455.0,1.0,...,,,,,749.0,,3222.0,6219.427429,,7.2


# 401 Model for each kreis

In [42]:
# one-step forecast
def last_obs(history):
    res = history.iloc[-12:]["y"]
    res2 = res
    res = res.append(res2)
    res3 = history.iloc[-12:-7]["y"]
    res = res.append(res3)    
    return res

In [43]:
df2 = df[df['date'] < '2019-01-01']
last_obs(df2)

ags5
16077    8.8
16077    8.6
16077    8.2
16077    8.0
16077    7.6
16077    7.3
16077    7.5
16077    7.4
16077    7.3
16077    7.3
16077    7.3
16077    7.4
16077    8.8
16077    8.6
16077    8.2
16077    8.0
16077    7.6
16077    7.3
16077    7.5
16077    7.4
16077    7.3
16077    7.3
16077    7.3
16077    7.4
16077    8.8
16077    8.6
16077    8.2
16077    8.0
16077    7.6
Name: y, dtype: float64

In [28]:
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))   

In [29]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    return data[:-n_test], data[-n_test:]

In [7]:
(train_test_split(df, 24)[1] )

Unnamed: 0_level_0,ags2,ds,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,...,realized_short_time_work_companies,realized_short_time_work_people,underemployment_without_short_time _work,unemployment_benefit_entitled,unemployment_benefit_recipients,y,registerd_jobs,unemployed,employees_social_security_at_work,employees_social_security_at_residence
ags5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16076,16,2019-01-01,4.0,6.0,9.0,99.0,158.0,131.0,48.0,773.0,...,,,3454.0,4494.0,1029.0,5.3,898,2633,29989,37861
16076,16,2019-02-01,6.0,4.0,7.0,-99.0,-99.0,-99.0,-99.0,-99.0,...,,,3435.0,4497.0,1042.0,5.2,901,2605,29965,37904
16076,16,2019-03-01,2.0,1.0,10.0,105.0,141.0,103.0,51.0,685.0,...,,,3313.0,4465.0,918.0,4.9,959,2451,30043,37979
16076,16,2019-04-01,2.0,3.0,5.0,121.0,117.0,135.0,44.0,682.0,...,,,3209.0,4434.0,822.0,4.6,912,2307,30129,38029
16076,16,2019-05-01,3.0,1.0,2.0,125.0,154.0,158.0,58.0,803.0,...,,,3091.0,4394.0,781.0,4.5,923,2216,30146,38023
16076,16,2019-06-01,1.0,0.0,2.0,125.0,154.0,158.0,59.0,803.0,...,,,3039.0,4327.0,775.0,4.4,987,2151,30145,38028
16076,16,2019-07-01,3.0,1.0,3.0,125.0,155.0,158.0,59.0,805.0,...,,,3098.0,4272.0,838.0,4.5,930,2192,30117,37888
16076,16,2019-08-01,4.0,1.0,3.0,125.0,155.0,158.0,59.0,806.0,...,,,3072.0,4158.0,849.0,4.5,829,2209,30208,38103
16076,16,2019-09-01,2.0,1.0,4.0,122.0,149.0,137.0,56.0,770.0,...,,,3043.0,4115.0,812.0,4.4,751,2149,30306,38262
16076,16,2019-10-01,4.0,1.0,2.0,122.0,149.0,138.0,57.0,771.0,...,,,3069.0,4069.0,828.0,4.4,725,2168,30278,38190


In [30]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    # split dataset
    train, test = train_test_split(data, n_test)
    # fit model and make forecast for history
    yhat = last_obs(train)
    # estimate prediction error
    error = measure_rmse(test['y'], yhat)
    return error

In [31]:
def run_model(data, n_test): 
    with mlflow.start_run(run_name="baseline"):
        ags5 = list(set(df.index.values.tolist()))
        rmse = []
        for i in ags5:
            df_ags5 = df.filter(regex=i, axis=0)
            current_rmse = walk_forward_validation(df_ags5, n_test)
            print(current_rmse)
            rmse.append(current_rmse)

        # Log params
        params = {"growth":"NONE"}

        mlflow.log_params(params)

        # Log metrics
        mlflow.log_metric('rmse', np.mean(rmse))
        res = pd.DataFrame(
                            {'ags5': ags5,
                             'rmse': rmse,
                            })
        return res

In [44]:
res = run_model(df, 29)

0.5159524169453267
0.6343609328319709
0.6085596335497508
0.8786745503980647
0.6499336836196815
0.506849634234985
0.39348136703902076
0.6805170448092112
0.35959747611403814
0.9305170976212727
0.5461463742157092
0.4853011862537638
0.5346607045271956
0.5753559617824611
0.49896444486694574
0.6156017521428764
1.1805901504960454
0.7291941702530502
0.5514870180108347
1.7479051501134448
0.5738556829269761
0.7549834435270749
0.8568184936347272
0.6800101419121984
0.6153216148122641
0.6099745613689487
0.6250517219977991
0.8072943316604139
0.5659901302904556
0.6192320048754497
0.4593848960932658
1.0357406145795651
0.7741513707457903
0.7647402083105717
0.7999999999999999
0.5426626563719763
0.5723514714723391
0.6443387414963526
0.7397576490380784
0.39740537802445325
0.48848816736454564
0.9273618495495702
0.4343048349433522
0.7568081795869118
0.8804387620914377
0.716071658906883
0.6346326659490946
0.5845717560154908
0.4590094281955587
0.8957755260685838
0.5474076839192095
0.9533968309373488
0.4649805

In [45]:
res

Unnamed: 0,ags5,rmse
0,03454,0.515952
1,08125,0.634361
2,05166,0.608560
3,08237,0.878675
4,12071,0.649934
...,...,...
396,05774,0.645141
397,08127,0.519615
398,05154,0.471608
399,05958,0.466831


In [23]:
#res.to_csv("naive_forcasting_rmse.csv")
