In [149]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [150]:
path = Path('./corona_data_sets') # use your path
all_files = list(path.rglob("*.csv"))

In [151]:
d = {}

for p in all_files:
    df = pd.read_csv(p, index_col=None, header=0, converters={'ags2': str, 'ags5': str})
    df = df.drop(['bundesland', '_id'], 1)
    d[p.stem] = df

In [152]:
d["firmenveraenderungen"]

Unnamed: 0,ags2,ags5,kreis,variable,d201901,d201902,d201903,d201904,d201905,d201906,...,d202007,d202008,d202009,d202010,d202011,d202012,d202101,d202102,d202103,d202104
0,01,01001,"Flensburg, Stadt",kr_firm_br_a_m,14,-99,12,9,12,12,...,15,15,15,15,15,15,15,15,15,15
1,01,01002,"Kiel, Landeshauptstadt",kr_firm_br_a_m,36,-99,23,31,34,33,...,34,35,35,35,35,35,35,34,34,35
2,01,01003,"Lübeck, Hansestadt",kr_firm_br_a_m,100,-99,80,80,88,88,...,74,75,74,74,75,73,73,73,74,74
3,01,01004,"Neumünster, Stadt",kr_firm_br_a_m,30,-99,22,23,28,28,...,28,28,28,28,28,29,29,29,29,28
4,01,01051,Dithmarschen,kr_firm_br_a_m,410,-99,362,383,411,413,...,385,383,380,383,388,387,387,387,386,387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10023,16,16074,Saale-Holzland-Kreis,kr_firm_liq_m,4,1,3,2,2,2,...,1,0,2,2,0,0,8,0,3,3
10024,16,16075,Saale-Orla-Kreis,kr_firm_liq_m,4,2,0,2,0,2,...,1,1,1,1,1,0,5,2,1,3
10025,16,16076,Greiz,kr_firm_liq_m,6,4,1,3,1,0,...,2,3,4,3,0,3,2,2,3,2
10026,16,16077,Altenburger Land,kr_firm_liq_m,4,1,1,0,0,0,...,1,1,0,0,1,1,3,0,0,1


## firmenveraenderungen - company changes

In [153]:
company_changes = d["firmenveraenderungen"]
filter_col = [col for col in company_changes if col.startswith('d20')]
company_changes["value"] = company_changes[filter_col].mean(axis=1)
company_changes = company_changes.drop(filter_col, 1)
company_changes['value'] = company_changes['value'].astype(float)
#company_changes = company_changes.pivot_table(index=['ags2','ags5'], columns='variable', values='value')
company_changes


Unnamed: 0,ags2,ags5,kreis,variable,value
0,01,01001,"Flensburg, Stadt",kr_firm_br_a_m,5.821429
1,01,01002,"Kiel, Landeshauptstadt",kr_firm_br_a_m,24.464286
2,01,01003,"Lübeck, Hansestadt",kr_firm_br_a_m,65.535714
3,01,01004,"Neumünster, Stadt",kr_firm_br_a_m,18.428571
4,01,01051,Dithmarschen,kr_firm_br_a_m,359.250000
...,...,...,...,...,...
10023,16,16074,Saale-Holzland-Kreis,kr_firm_liq_m,1.750000
10024,16,16075,Saale-Orla-Kreis,kr_firm_liq_m,1.285714
10025,16,16076,Greiz,kr_firm_liq_m,1.964286
10026,16,16077,Altenburger Land,kr_firm_liq_m,1.214286


In [154]:
#company_changes.pivot_table(index=['ags2','ags5'], columns='variable', values='value')


In [155]:
#company_changes.to_csv('./corona_data_sets_modified/firmenveraenderungen.csv')


## ausbildungssituation - training situation

In [156]:
training_situation = d["ausbildungssituation"]
training_situation.rename(columns={"kr_schuel":"number_of_students"}, inplace=True)
training_situation = training_situation[["ags2","ags5","kreis","number_of_students"]]
training_situation

Unnamed: 0,ags2,ags5,kreis,number_of_students
0,01,01001,"Flensburg, Stadt",11610
1,01,01002,"Kiel, Landeshauptstadt",23898
2,01,01003,"Lübeck, Hansestadt",20478
3,01,01004,"Neumünster, Stadt",10380
4,01,01051,Dithmarschen,13158
...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,8772
397,16,16074,Saale-Holzland-Kreis,7190
398,16,16075,Saale-Orla-Kreis,7441
399,16,16076,Greiz,8403


In [157]:
#training_situation.to_csv('./corona_data_sets_modified/training_situation.csv')


## krankenhausdaten - hospital data


In [158]:
hospital_data = d["krankenhausdaten"]
hospital_data.rename(columns={"kr_kh":"number_of_hospitals",
                              "kr_kh_bett": "number_of_hospital_beds", 
                             "kr_kh_bett_ew":"number_of_hospital_beds_adj", 
                             "kr_khp": "hospital_patiants"}, 
                     inplace=True)
hospital_data = hospital_data[["ags2","ags5","kreis","number_of_hospitals","number_of_hospital_beds","number_of_hospital_beds_adj","hospital_patiants"]]
hospital_data

Unnamed: 0,ags2,ags5,kreis,number_of_hospitals,number_of_hospital_beds,number_of_hospital_beds_adj,hospital_patiants
0,01,01001,"Flensburg, Stadt",3,860,9.6,18796
1,01,01002,"Kiel, Landeshauptstadt",16,2498,10.1,53016
2,01,01003,"Lübeck, Hansestadt",13,1975,9.1,53406
3,01,01004,"Neumünster, Stadt",5,693,8.7,19792
4,01,01051,Dithmarschen,3,656,4.9,32195
...,...,...,...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,3,994,9.3,30316
397,16,16074,Saale-Holzland-Kreis,3,654,7.9,20517
398,16,16075,Saale-Orla-Kreis,1,120,1.5,22511
399,16,16076,Greiz,2,403,4.1,29220


In [159]:
#hospital_data.to_csv('./corona_data_sets_modified/hospital_data.csv')

## private_finanzen - private finance

In [160]:
d["private_finanzen"]

Unnamed: 0,ags2,ags5,kreis,kr_ein_md,kr_schu_qu,kr_hh_eink_kl1,kr_hh_eink_kl2,kr_hh_eink_kl3,kr_hh_eink_kl4,kr_hh_eink_kl5,kr_hh_eink_kl6,kr_kk_hh,kr_kk_ew
0,01,01001,"Flensburg, Stadt",2986,16.0,13209,12385,13092,5441,3096,3497,34496,19556
1,01,01002,"Kiel, Landeshauptstadt",3304,12.1,22126,38523,39483,18567,11162,7841,35246,19612
2,01,01003,"Lübeck, Hansestadt",3036,15.1,22881,30159,32571,15805,10999,9039,37219,20820
3,01,01004,"Neumünster, Stadt",2842,17.9,7208,13291,10435,3812,3201,2805,38141,19561
4,01,01051,Dithmarschen,2914,12.8,11806,11575,17816,9873,7834,7676,46945,23486
...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,2466,8.3,12237,13267,19810,5653,3271,2547,39242,20959
397,16,16074,Saale-Holzland-Kreis,2288,7.6,5131,9216,14445,5254,3446,2379,42956,20631
398,16,16075,Saale-Orla-Kreis,2253,7.6,7388,7994,17082,4532,2501,1632,40511,20610
399,16,16076,Greiz,2233,7.8,11098,10354,18760,5324,3370,2287,40784,21275


In [161]:
private_finance = d["private_finanzen"]
private_finance.rename(columns={"kr_ein_md":"median_income",
                              "kr_kk_hh": "purchasing_power_per_household", 
                              "kr_kk_ew": "purchasing_power_per_person", 
                              "kr_schu_qu":"debtor_quota", 
                             "kr_hh_eink_kl1": "household_in_income_calss_1",
                             "kr_hh_eink_kl2": "household_in_income_calss_2",
                             "kr_hh_eink_kl3": "household_in_income_calss_3",                                
                             "kr_hh_eink_kl4": "household_in_income_calss_4",
                             "kr_hh_eink_kl5": "household_in_income_calss_5",
                             "kr_hh_eink_kl6": "household_in_income_calss_6"}, 
                     inplace=True)
private_finance = private_finance[["ags2","ags5","kreis","median_income","purchasing_power_per_household","purchasing_power_per_person"
                                   ,"debtor_quota","household_in_income_calss_1", "household_in_income_calss_2", "household_in_income_calss_3",
                                  "household_in_income_calss_4", "household_in_income_calss_5", "household_in_income_calss_6"]]
private_finance

Unnamed: 0,ags2,ags5,kreis,median_income,purchasing_power_per_household,purchasing_power_per_person,debtor_quota,household_in_income_calss_1,household_in_income_calss_2,household_in_income_calss_3,household_in_income_calss_4,household_in_income_calss_5,household_in_income_calss_6
0,01,01001,"Flensburg, Stadt",2986,34496,19556,16.0,13209,12385,13092,5441,3096,3497
1,01,01002,"Kiel, Landeshauptstadt",3304,35246,19612,12.1,22126,38523,39483,18567,11162,7841
2,01,01003,"Lübeck, Hansestadt",3036,37219,20820,15.1,22881,30159,32571,15805,10999,9039
3,01,01004,"Neumünster, Stadt",2842,38141,19561,17.9,7208,13291,10435,3812,3201,2805
4,01,01051,Dithmarschen,2914,46945,23486,12.8,11806,11575,17816,9873,7834,7676
...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,2466,39242,20959,8.3,12237,13267,19810,5653,3271,2547
397,16,16074,Saale-Holzland-Kreis,2288,42956,20631,7.6,5131,9216,14445,5254,3446,2379
398,16,16075,Saale-Orla-Kreis,2253,40511,20610,7.6,7388,7994,17082,4532,2501,1632
399,16,16076,Greiz,2233,40784,21275,7.8,11098,10354,18760,5324,3370,2287


In [162]:
private_finance.to_csv('./corona_data_sets_modified/private_finance.csv')

## arbeitsmarktentwicklung - labor market development

In [163]:
labor_market_development = d["arbeitsmarktentwicklung"]
filter_col = [col for col in labor_market_development if col.startswith('kr_al_')]
labor_market_development["mean_unemployed"] = labor_market_development[filter_col].mean(axis=1)
filter_col = [col for col in labor_market_development if col.startswith('kr_alq_')]
labor_market_development["mean_unemployment_rate"] = labor_market_development[filter_col].mean(axis=1)
filter_col = [col for col in labor_market_development if col.startswith('kr_ga_')]
labor_market_development["mean_registerd_jobs"] = labor_market_development[filter_col].mean(axis=1)
filter_col = [col for col in labor_market_development if col.startswith('kr_alga_')]
labor_market_development["mean_unemployment_benefit_recipients"] = labor_market_development[filter_col].mean(axis=1)
filter_col = [col for col in labor_market_development if col.startswith('kr_svb_wo_')]
labor_market_development["employees_social_security_at_residence"] = labor_market_development[filter_col].mean(axis=1)
filter_col = [col for col in labor_market_development if col.startswith('kr_svb_ao_')]
labor_market_development["employees_social_security_at_work"] = labor_market_development[filter_col].mean(axis=1)

labor_market_development = labor_market_development[["ags2","ags5","kreis","mean_unemployed","mean_unemployment_rate","mean_registerd_jobs"
                                   ,"mean_unemployment_benefit_recipients","employees_social_security_at_residence", "employees_social_security_at_work"]]
labor_market_development

Unnamed: 0,ags2,ags5,kreis,mean_unemployed,mean_unemployment_rate,mean_registerd_jobs,mean_unemployment_benefit_recipients,employees_social_security_at_residence,employees_social_security_at_work
0,01,01001,"Flensburg, Stadt",4494.357143,8.614286,770.500000,1283.500000,32548.333333,44834.333333
1,01,01002,"Kiel, Landeshauptstadt",11290.750000,8.128571,2615.571429,2531.833333,92457.666667,126471.333333
2,01,01003,"Lübeck, Hansestadt",9083.571429,7.864286,2430.035714,2466.583333,80897.166667,100578.666667
3,01,01004,"Neumünster, Stadt",3689.357143,8.578571,1146.178571,941.750000,30258.166667,41293.166667
4,01,01051,Dithmarschen,4128.964286,5.925000,969.214286,1403.750000,49071.500000,43052.000000
...,...,...,...,...,...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,3049.107143,5.678571,1126.642857,1169.375000,41676.166667,35465.166667
397,16,16074,Saale-Holzland-Kreis,1937.750000,4.428571,1112.357143,823.166667,34155.666667,27027.166667
398,16,16075,Saale-Orla-Kreis,2068.678571,4.825000,1045.821429,964.625000,33840.666667,29995.666667
399,16,16076,Greiz,2469.928571,5.028571,735.035714,1009.041667,37722.666667,29908.666667


In [164]:
#labor_market_development.to_csv('./corona_data_sets_modified/labor_market_development.csv')

## haushalte - households

In [165]:
households = d["haushalte"]
households.rename(columns={"kr_hh_1p":"households_of_1_person",
                            "kr_hh_2p":"households_of_2_person",
                            "kr_hh_3p":"households_of_3_person",
                            "kr_hh_4p":"households_of_4_person",
                             "kr_hh_5p":"households_of_5_person_or_more",
                             "kr_hh_kind": "household_with_kids",
                             "kr_hh_k_00u03": "household_with_kids_under_3",
                             "kr_hh_k_03u06": "household_with_kids_over_3_under_6",
                             "kr_hh_k_06u10": "household_with_kids_over_6_under_10",
                             "kr_hh_k_10u15": "household_with_kids_over_10_under_15",
                             "kr_hh_k_15u18": "household_with_kids_over_15_under_18",
                             "kr_hh_dinks": "household_with_double_income_no_kids"}, 
                     inplace=True)
households = households[["ags2","ags5","kreis",
                                   "households_of_1_person","households_of_2_person","households_of_3_person"
                                   ,"households_of_4_person","households_of_5_person_or_more", "household_with_kids", 
                                   "household_with_kids_under_3","household_with_kids_over_3_under_6", "household_with_kids_over_6_under_10",
                                   "household_with_kids_over_10_under_15","household_with_kids_over_15_under_18","household_with_double_income_no_kids"]]
households

Unnamed: 0,ags2,ags5,kreis,households_of_1_person,households_of_2_person,households_of_3_person,households_of_4_person,households_of_5_person_or_more,household_with_kids,household_with_kids_under_3,household_with_kids_over_3_under_6,household_with_kids_over_6_under_10,household_with_kids_over_10_under_15,household_with_kids_over_15_under_18,household_with_double_income_no_kids
0,01,01001,"Flensburg, Stadt",25107,16482,5165,3534,332,9586,1783,1599,2109,2371,1529,5347
1,01,01002,"Kiel, Landeshauptstadt",68879,41792,13932,10623,2019,27434,4954,4487,5586,7062,4083,14410
2,01,01003,"Lübeck, Hansestadt",57781,41983,12226,8290,1078,22583,3908,3668,4719,5792,3664,13044
3,01,01004,"Neumünster, Stadt",14953,17073,4749,3428,441,9004,1396,1422,1906,2445,1540,5073
4,01,01051,Dithmarschen,15617,39929,7318,3236,546,9795,1374,1474,2065,2803,1807,8248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,23337,22553,6558,3658,692,10132,1553,1659,2239,2434,1526,6329
397,16,16074,Saale-Holzland-Kreis,12729,16264,6573,3542,761,9855,1458,1626,2174,2414,1451,4546
398,16,16075,Saale-Orla-Kreis,14015,18458,5379,2723,568,7782,1133,1246,1722,2003,1120,4793
399,16,16076,Greiz,19192,21603,6498,3345,567,9263,1187,1573,1965,2412,1386,5744


In [166]:
#households.to_csv('./corona_data_sets_modified/households.csv')

## point_of_interest - point of interest

In [167]:
point_of_interest = d["point_of_interest"]
point_of_interest.rename(columns={"kr_fris":"number_of_hairdressers",
                            "kr_rest":"number_of_restaurants",
                            "kr_shops":"number_of_shops",
                            "kr_gewerbe":"number_of_companies",
                            "kr_mall":"number_of_shopping_malls",
                             "kr_vereine":"number_of_clubs",
                             "kr_poi_1":"number_of_education_places",
                             "kr_poi_2":"number_of_shopping_places",
                             "kr_poi_3":"number_of_free_time_places",
                             "kr_poi_4":"number_of_tourism_places",
                             "kr_poi_5":"number_of_food_and_drink_places",
                             "kr_poi_6":"number_of_public_institution_places",
                             "kr_poi_7":"number_of_traffic_places",
                             "kr_poi_8":"number_of_health_places",
                             "kr_poi_9":"number_of_business_places"}, 
                     inplace=True)
point_of_interest = point_of_interest[["ags2","ags5","kreis",
                                   "number_of_hairdressers","number_of_restaurants","number_of_shops"
                                   ,"number_of_companies","number_of_shopping_malls", "number_of_clubs", 
                                   "number_of_education_places","number_of_shopping_places", "number_of_free_time_places",
                                   "number_of_tourism_places","number_of_tourism_places","number_of_food_and_drink_places",
                                    "number_of_public_institution_places","number_of_traffic_places","number_of_health_places","number_of_business_places"]]
point_of_interest

Unnamed: 0,ags2,ags5,kreis,number_of_hairdressers,number_of_restaurants,number_of_shops,number_of_companies,number_of_shopping_malls,number_of_clubs,number_of_education_places,number_of_shopping_places,number_of_free_time_places,number_of_tourism_places,number_of_tourism_places.1,number_of_food_and_drink_places,number_of_public_institution_places,number_of_traffic_places,number_of_health_places,number_of_business_places
0,01,01001,"Flensburg, Stadt",82,131,361,5856,6,185,93,754,62,32,32,145,37,115,378,941
1,01,01002,"Kiel, Landeshauptstadt",213,343,621,14017,9,611,238,1543,173,62,62,393,107,185,1106,2189
2,01,01003,"Lübeck, Hansestadt",219,277,674,12357,14,491,162,1521,115,102,102,361,87,193,904,1921
3,01,01004,"Neumünster, Stadt",81,115,296,4639,7,120,57,616,50,14,14,126,34,109,272,762
4,01,01051,Dithmarschen,155,250,386,8984,3,248,110,1118,121,326,326,292,62,195,476,1375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,16,16073,Saalfeld-Rudolstadt,131,192,364,6641,4,253,109,1001,33,120,120,272,135,137,407,1249
397,16,16074,Saale-Holzland-Kreis,103,104,219,5078,0,227,100,667,19,59,59,144,89,127,273,917
398,16,16075,Saale-Orla-Kreis,122,128,262,5600,0,182,104,860,23,69,69,196,95,133,286,1142
399,16,16076,Greiz,132,140,279,6686,0,277,113,1002,22,57,57,194,64,159,367,1209


In [168]:
#point_of_interest.to_csv('./corona_data_sets_modified/point_of_interest.csv', index = False)