In [47]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from pathlib import Path
from functools import reduce

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [48]:
# Load the raw company-changes table. The AGS region keys are read as strings
# so that leading zeros (e.g. '01', '01001') survive the import.
# NOTE(review): the displayed data contains -99 (e.g. column d201902), which
# looks like a missing-value sentinel — confirm; it is kept unchanged here.
df = pd.read_csv(
    './corona_data_sets/firmenveraenderungen.csv',
    converters={'ags2': str, 'ags5': str},
)
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is deprecated since pandas 1.3 and raises a TypeError on pandas 2.x.
df = df.drop(columns=['kreis', '_id', 'bundesland'])
df

Unnamed: 0,ags2,ags5,variable,d201901,d201902,d201903,d201904,d201905,d201906,d201907,...,d202007,d202008,d202009,d202010,d202011,d202012,d202101,d202102,d202103,d202104
0,01,01001,kr_firm_br_a_m,14,-99,12,9,12,12,12,...,15,15,15,15,15,15,15,15,15,15
1,01,01002,kr_firm_br_a_m,36,-99,23,31,34,33,33,...,34,35,35,35,35,35,35,34,34,35
2,01,01003,kr_firm_br_a_m,100,-99,80,80,88,88,88,...,74,75,74,74,75,73,73,73,74,74
3,01,01004,kr_firm_br_a_m,30,-99,22,23,28,28,28,...,28,28,28,28,28,29,29,29,29,28
4,01,01051,kr_firm_br_a_m,410,-99,362,383,411,413,415,...,385,383,380,383,388,387,387,387,386,387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10023,16,16074,kr_firm_liq_m,4,1,3,2,2,2,1,...,1,0,2,2,0,0,8,0,3,3
10024,16,16075,kr_firm_liq_m,4,2,0,2,0,2,0,...,1,1,1,1,1,0,5,2,1,3
10025,16,16076,kr_firm_liq_m,6,4,1,3,1,0,1,...,2,3,4,3,0,3,2,2,3,2
10026,16,16077,kr_firm_liq_m,4,1,1,0,0,0,3,...,1,1,0,0,1,1,3,0,0,1


In [49]:
# Reshape wide -> long: every monthly column d<YYYYMM> becomes one row whose
# numeric suffix is stored in 'date' and whose cell value is stored in 'd'.
# The identifier columns are then moved out of the index into regular columns.
df = pd.wide_to_long(
    df,
    stubnames='d',
    i=['ags2', 'ags5', 'variable'],
    j='date',
).reset_index()

In [50]:
# The observations sit in the stub column 'd'; give it a descriptive name.
df = df.rename(columns=dict(d="value"))

In [51]:
# Distinct raw variable codes present in the 'variable' column.
set(df['variable'].unique())

{'kr_firm_br_99_m',
 'kr_firm_br_a_m',
 'kr_firm_br_b_m',
 'kr_firm_br_c_m',
 'kr_firm_br_d_m',
 'kr_firm_br_e_m',
 'kr_firm_br_f_m',
 'kr_firm_br_g_m',
 'kr_firm_br_h_m',
 'kr_firm_br_i_m',
 'kr_firm_br_j_m',
 'kr_firm_br_k_m',
 'kr_firm_br_l_m',
 'kr_firm_br_m_m',
 'kr_firm_br_n_m',
 'kr_firm_br_o_m',
 'kr_firm_br_p_m',
 'kr_firm_br_q_m',
 'kr_firm_br_r_m',
 'kr_firm_br_s_m',
 'kr_firm_br_t_m',
 'kr_firm_br_u_m',
 'kr_firm_liq_m',
 'kr_firm_loesch_m',
 'kr_firm_neug_m'}

In [52]:
# Raw variable code -> readable column name used for the rest of the notebook.
# NOTE(review): 'number_of_companies_real_estat' looks truncated, but it is a
# runtime column name that written CSVs may depend on, so it is left as is.
di = {
    "kr_firm_br_99_m": "number_of_companies_unknown_sector",
    "kr_firm_br_a_m": "number_of_companies_agriculture",
    "kr_firm_br_b_m": "number_of_companies_mining",
    "kr_firm_br_c_m": "number_of_companies_manufacturing",
    "kr_firm_br_d_m": "number_of_companies_energy",
    "kr_firm_br_e_m": "number_of_companies_water_and_sewage",
    "kr_firm_br_f_m": "number_of_companies_construction",
    "kr_firm_br_g_m": "number_of_companies_repair_motor_vehicles",
    "kr_firm_br_h_m": "number_of_companies_transport",
    "kr_firm_br_i_m": "number_of_companies_hospitality",
    "kr_firm_br_j_m": "number_of_companies_communication",
    "kr_firm_br_k_m": "number_of_companies_financial_and_insurance",
    "kr_firm_br_l_m": "number_of_companies_real_estat",
    "kr_firm_br_m_m": "number_of_companies_technical_services",
    "kr_firm_br_n_m": "number_of_companies_economic_services",
    "kr_firm_br_o_m": "number_of_companies_administration",
    "kr_firm_br_p_m": "number_of_companies_education",
    "kr_firm_br_q_m": "number_of_companies_health_and_social_services",
    "kr_firm_br_r_m": "number_of_companies_arts_entertainment",
    "kr_firm_br_s_m": "number_of_companies_rendering_other_services",
    "kr_firm_br_t_m": "number_of_companies_domestic_staff",
    "kr_firm_br_u_m": "number_of_companies_extraterritorial",
    "kr_firm_liq_m": "number_of_company_liquidations",
    "kr_firm_loesch_m": "number_of_company_deletions",
    "kr_firm_neug_m": "number_of_start_ups",
}

# Translate the codes in the 'variable' column only; codes that are not in the
# mapping pass through unchanged.
df = df.assign(variable=df["variable"].replace(di))
df

Unnamed: 0,ags2,ags5,variable,date,value
0,01,01001,number_of_companies_agriculture,201901,14
1,01,01001,number_of_companies_agriculture,201902,-99
2,01,01001,number_of_companies_agriculture,201903,12
3,01,01001,number_of_companies_agriculture,201904,9
4,01,01001,number_of_companies_agriculture,201905,12
...,...,...,...,...,...
280779,07,07000,number_of_company_liquidations,202012,0
280780,07,07000,number_of_company_liquidations,202101,0
280781,07,07000,number_of_company_liquidations,202102,0
280782,07,07000,number_of_company_liquidations,202103,0


In [53]:
# Persist the long-format table as CSV (without the row index).
# Create the target directory first: DataFrame.to_csv does not create missing
# parent directories and raises OSError when the folder is absent.
long_csv_path = Path('./final_dfs/for_modeling/company_changes_by_date_long.csv')
long_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(long_csv_path, index=False)

In [54]:
# Back to wide layout: one row per (ags2, ags5, date) and one column per
# entry of 'variable', filled from 'value'.
df = df.pivot(
    columns="variable",
    values="value",
    index=["ags2", "ags5", "date"],
)

In [55]:
# Turn the (ags2, ags5, date) index levels back into ordinary columns.
df = df.reset_index()
df

variable,ags2,ags5,date,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,number_of_companies_domestic_staff,number_of_companies_economic_services,...,number_of_companies_real_estat,number_of_companies_rendering_other_services,number_of_companies_repair_motor_vehicles,number_of_companies_technical_services,number_of_companies_transport,number_of_companies_unknown_sector,number_of_companies_water_and_sewage,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups
0,01,01001,201901,34.0,14.0,108.0,131.0,264.0,0.0,130.0,...,279.0,1089.0,866.0,619.0,105.0,183.0,12.0,15.0,6.0,5.0
1,01,01001,201902,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,11.0,2.0,15.0
2,01,01001,201903,34.0,12.0,87.0,132.0,207.0,0.0,246.0,...,293.0,568.0,797.0,524.0,103.0,68.0,6.0,6.0,3.0,14.0
3,01,01001,201904,43.0,9.0,119.0,98.0,170.0,0.0,215.0,...,207.0,876.0,678.0,404.0,72.0,60.0,9.0,9.0,2.0,14.0
4,01,01001,201905,44.0,12.0,146.0,150.0,242.0,0.0,281.0,...,315.0,966.0,923.0,614.0,116.0,46.0,10.0,8.0,2.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11251,16,16077,202012,115.0,99.0,158.0,44.0,457.0,1.0,246.0,...,100.0,1212.0,728.0,385.0,134.0,110.0,26.0,2.0,1.0,6.0
11252,16,16077,202101,115.0,99.0,158.0,44.0,459.0,1.0,246.0,...,99.0,1212.0,729.0,384.0,134.0,110.0,26.0,0.0,3.0,2.0
11253,16,16077,202102,115.0,99.0,158.0,43.0,457.0,1.0,245.0,...,97.0,1210.0,722.0,385.0,134.0,110.0,26.0,0.0,0.0,5.0
11254,16,16077,202103,114.0,100.0,158.0,42.0,455.0,1.0,245.0,...,96.0,1208.0,723.0,388.0,133.0,108.0,26.0,4.0,0.0,4.0


In [56]:
# Parse the YYYYMM period codes into timestamps (first day of each month,
# as the displayed output shows).
df = df.assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m'))
df

variable,ags2,ags5,date,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,number_of_companies_domestic_staff,number_of_companies_economic_services,...,number_of_companies_real_estat,number_of_companies_rendering_other_services,number_of_companies_repair_motor_vehicles,number_of_companies_technical_services,number_of_companies_transport,number_of_companies_unknown_sector,number_of_companies_water_and_sewage,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups
0,01,01001,2019-01-01,34.0,14.0,108.0,131.0,264.0,0.0,130.0,...,279.0,1089.0,866.0,619.0,105.0,183.0,12.0,15.0,6.0,5.0
1,01,01001,2019-02-01,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,...,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,11.0,2.0,15.0
2,01,01001,2019-03-01,34.0,12.0,87.0,132.0,207.0,0.0,246.0,...,293.0,568.0,797.0,524.0,103.0,68.0,6.0,6.0,3.0,14.0
3,01,01001,2019-04-01,43.0,9.0,119.0,98.0,170.0,0.0,215.0,...,207.0,876.0,678.0,404.0,72.0,60.0,9.0,9.0,2.0,14.0
4,01,01001,2019-05-01,44.0,12.0,146.0,150.0,242.0,0.0,281.0,...,315.0,966.0,923.0,614.0,116.0,46.0,10.0,8.0,2.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11251,16,16077,2020-12-01,115.0,99.0,158.0,44.0,457.0,1.0,246.0,...,100.0,1212.0,728.0,385.0,134.0,110.0,26.0,2.0,1.0,6.0
11252,16,16077,2021-01-01,115.0,99.0,158.0,44.0,459.0,1.0,246.0,...,99.0,1212.0,729.0,384.0,134.0,110.0,26.0,0.0,3.0,2.0
11253,16,16077,2021-02-01,115.0,99.0,158.0,43.0,457.0,1.0,245.0,...,97.0,1210.0,722.0,385.0,134.0,110.0,26.0,0.0,0.0,5.0
11254,16,16077,2021-03-01,114.0,100.0,158.0,42.0,455.0,1.0,245.0,...,96.0,1208.0,723.0,388.0,133.0,108.0,26.0,4.0,0.0,4.0


In [57]:
# NOTE(review): the export of the wide table is currently disabled, so this cell
# does nothing when run — re-enable it or delete the cell.
#df.to_csv('./final_dfs/company_changes_by_date_wide.csv', index = False)