In [128]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [129]:
# Load the wide-format labor-market table. The AGS key columns are read as
# strings so that leading zeros (e.g. "01", "01001") are preserved.
df = pd.read_csv('./corona_data_sets/arbeitsmarktentwicklung.csv', converters={'ags2': str, 'ags5': str})
# Drop the columns that are not used further. `columns=` replaces the positional
# `axis` argument (`drop(labels, 1)`), which pandas 2.0+ rejects with a TypeError.
df = df.drop(columns=['kreis', '_id'])
print(df)

    ags2          bundesland   ags5  kr_ka_au_202001  kr_ka_au_202002  \
0     01  Schleswig-Holstein  01001                0                0   
1     01  Schleswig-Holstein  01002                5                4   
2     01  Schleswig-Holstein  01003                0                0   
3     01  Schleswig-Holstein  01004                0                0   
4     01  Schleswig-Holstein  01051                0                0   
..   ...                 ...    ...              ...              ...   
396   16           Thüringen  16073                0                5   
397   16           Thüringen  16074                0                3   
398   16           Thüringen  16075                5                7   
399   16           Thüringen  16076                3                0   
400   16           Thüringen  16077                0                3   

     kr_ka_au_202003  kr_ka_au_202004  kr_ka_au_202005  kr_ka_au_202006  \
0                185              758           

In [130]:
# Reshape from wide to long: every column named "kr_<rest>" is stacked into
# rows keyed by (ags2, ags5, bundesland); "<rest>" (e.g. "al_201901") goes into
# the new `variable` column and the cell value into `kr`. The resulting
# MultiIndex is then turned back into ordinary columns.
df = pd.wide_to_long(
    df,
    stubnames='kr',
    i=['ags2', 'ags5', 'bundesland'],
    j='variable',
    sep='_',
    suffix=r'\w+',
).reset_index()

In [131]:
# Distinct values currently present in the `variable` column.
{*df['variable']}

{'al_201901',
 'al_201902',
 'al_201903',
 'al_201904',
 'al_201905',
 'al_201906',
 'al_201907',
 'al_201908',
 'al_201909',
 'al_201910',
 'al_201911',
 'al_201912',
 'al_202001',
 'al_202002',
 'al_202003',
 'al_202004',
 'al_202005',
 'al_202006',
 'al_202007',
 'al_202008',
 'al_202009',
 'al_202010',
 'al_202011',
 'al_202012',
 'al_202101',
 'al_202102',
 'al_202103',
 'al_202104',
 'alga_201901',
 'alga_201902',
 'alga_201903',
 'alga_201904',
 'alga_201905',
 'alga_201906',
 'alga_201907',
 'alga_201908',
 'alga_201909',
 'alga_201910',
 'alga_201911',
 'alga_201912',
 'alga_202001',
 'alga_202002',
 'alga_202003',
 'alga_202004',
 'alga_202005',
 'alga_202006',
 'alga_202007',
 'alga_202008',
 'alga_202009',
 'alga_202010',
 'alga_202011',
 'alga_202012',
 'alq_201901',
 'alq_201902',
 'alq_201903',
 'alq_201904',
 'alq_201905',
 'alq_201906',
 'alq_201907',
 'alq_201908',
 'alq_201909',
 'alq_201910',
 'alq_201911',
 'alq_201912',
 'alq_202001',
 'alq_202002',
 'alq_202003',

In [132]:
# Each `variable` value is a combined label such as "ka_au_202001".
# The trailing six characters (the yyyymm part) go into `date` ...
df["date"] = df["variable"].map(lambda label: label[-6:])
# ... and the remainder becomes the variable code: remove any whitespace,
# cut off the last seven characters ("_" plus yyyymm) and upper-case the rest.
df["variable"] = df["variable"].map(lambda label: ''.join(label.split())[:-7].upper())

In [133]:
# Display the current state of the long-format frame for inspection.
df

Unnamed: 0,ags2,ags5,bundesland,variable,kr,date
0,01,01001,Schleswig-Holstein,KA_AU,0.000000,202001
1,01,01001,Schleswig-Holstein,KA_AU,0.000000,202002
2,01,01001,Schleswig-Holstein,KA_AU,185.000000,202003
3,01,01001,Schleswig-Holstein,KA_AU,758.000000,202004
4,01,01001,Schleswig-Holstein,KA_AU,70.000000,202005
...,...,...,...,...,...,...
89418,16,16077,Thüringen,RLB,6196.000000,202012
89419,16,16077,Thüringen,RLB,6236.000000,202101
89420,16,16077,Thüringen,RLB,6248.782315,202102
89421,16,16077,Thüringen,RLB,6219.427429,202103


In [134]:
# Parse the yyyymm strings into timestamps (first day of the given month).
df["date"] = pd.to_datetime(df["date"], format="%Y%m")
# The stub column `kr` holds the measurements; give it a descriptive name.
df = df.rename({"kr": "value"}, axis="columns")

In [135]:
# Distinct values currently present in the `variable` column.
{*df['variable']}

{'AL',
 'ALGA',
 'ALQ',
 'GA',
 'KA_AP',
 'KA_AU',
 'KA_RP',
 'KA_RU',
 'RLB',
 'SVB_AO',
 'SVB_WO',
 'UB'}

In [136]:
# Map the short variable codes to descriptive English labels.
di = {
    'AL': 'unemployed',
    'ALGA': 'unemployment_benefit_recipients',
    'ALQ': 'unemployment_rate',
    # NOTE(review): 'registerd_jobs' is misspelled ("registered"); kept unchanged
    # because consumers of the exported data may already filter on this label.
    'GA': 'registerd_jobs',
    'KA_AP': 'displayed_short_time_work_people',
    'KA_AU': 'displayed_short_time_work_companies',
    'KA_RP': 'realized_short_time_work_people',
    'KA_RU': 'realized_short_time_work_companies',
    'RLB': 'unemployment_benefit_entitled',
    'SVB_AO': 'employees_social_security_at_residence',
    # Previously two adjacent string literals were implicitly concatenated here,
    # producing 'employees_social_security_at_residenceemployees_social_security_at_work'.
    'SVB_WO': 'employees_social_security_at_work',
    # Previously contained a stray space: 'underemployment_without_short_time _work'.
    'UB': 'underemployment_without_short_time_work',
}
# Replace the codes in the `variable` column only; other columns are untouched.
df = df.replace({"variable": di})
df

Unnamed: 0,ags2,ags5,bundesland,variable,value,date
0,01,01001,Schleswig-Holstein,displayed_short_time_work_companies,0.000000,2020-01-01
1,01,01001,Schleswig-Holstein,displayed_short_time_work_companies,0.000000,2020-02-01
2,01,01001,Schleswig-Holstein,displayed_short_time_work_companies,185.000000,2020-03-01
3,01,01001,Schleswig-Holstein,displayed_short_time_work_companies,758.000000,2020-04-01
4,01,01001,Schleswig-Holstein,displayed_short_time_work_companies,70.000000,2020-05-01
...,...,...,...,...,...,...
89418,16,16077,Thüringen,unemployment_benefit_entitled,6196.000000,2020-12-01
89419,16,16077,Thüringen,unemployment_benefit_entitled,6236.000000,2021-01-01
89420,16,16077,Thüringen,unemployment_benefit_entitled,6248.782315,2021-02-01
89421,16,16077,Thüringen,unemployment_benefit_entitled,6219.427429,2021-03-01


In [138]:
# Export of the long-format frame is currently disabled; uncomment the line
# below to write it to CSV.
#df.to_csv('./final_dfs/labor_market_by_date.csv')