Build long and wide formats of the company-changes time-series data

In [1]:
import pandas as pd
import datetime

In [2]:
# Load the semicolon-separated raw export. The 'id' column is consumed as the
# index and then discarded in favour of a fresh 0..n-1 index.
df = (
    pd.read_csv('firmenveraenderungen_2007_2021.csv', sep=';', index_col='id')
    .reset_index(drop=True)
)

In [3]:
df.head(2)

Unnamed: 0,ags2,bundesland,ags5,kreis,variable,d200701,d200702,d200703,d200704,d200705,...,d202009,d202010,d202011,d202012,d202101,d202102,d202103,d202104,d202105,d202106
0,1,Schleswig-Holstein,1001,"Flensburg, Stadt",kr_firm_loesch_m,4,5,7,5,4,...,9,11,8,15,8,8,13,13,9,6
1,1,Schleswig-Holstein,1001,"Flensburg, Stadt",kr_firm_neug_m,5,12,19,11,21,...,10,11,15,19,18,13,16,18,7,13


In [4]:
df.shape

(1203, 179)

In [5]:
# Lookup table translating the raw codes found in the 'variable' column into
# descriptive English names; long_format() applies it via DataFrame.replace.
# Only the last three keys (liq / loesch / neug) occur in the file loaded above
# (see the set(df['variable']) output further down).
# NOTE(review): "number_of_companies_real_estat" looks like a truncated
# "real_estate". It is left unchanged here because the name may already be
# used by other files in the project -- confirm before renaming.
di = {"kr_firm_br_99_m":"number_of_companies_unknown_sector",
    "kr_firm_br_a_m":"number_of_companies_agriculture",
    "kr_firm_br_b_m":"number_of_companies_mining",
    "kr_firm_br_c_m":"number_of_companies_manufacturing",
    "kr_firm_br_d_m":"number_of_companies_energy",
    "kr_firm_br_e_m":"number_of_companies_water_and_sewage",
    "kr_firm_br_f_m":"number_of_companies_construction",
    "kr_firm_br_g_m":"number_of_companies_repair_motor_vehicles",
    "kr_firm_br_h_m":"number_of_companies_transport",
    "kr_firm_br_i_m":"number_of_companies_hospitality",
    "kr_firm_br_j_m":"number_of_companies_communication",
    "kr_firm_br_k_m":"number_of_companies_financial_and_insurance",
    "kr_firm_br_l_m":"number_of_companies_real_estat",
    "kr_firm_br_m_m":"number_of_companies_technical_services",
    "kr_firm_br_n_m":"number_of_companies_economic_services",
    "kr_firm_br_o_m":"number_of_companies_administration",
    "kr_firm_br_p_m":"number_of_companies_education",
    "kr_firm_br_q_m":"number_of_companies_health_and_social_services",
    "kr_firm_br_r_m":"number_of_companies_arts_entertainment",
    "kr_firm_br_s_m":"number_of_companies_rendering_other_services",
    "kr_firm_br_t_m":"number_of_companies_domestic_staff",
    "kr_firm_br_u_m":"number_of_companies_extraterritorial",
    "kr_firm_liq_m":"number_of_company_liquidations",
    "kr_firm_loesch_m":"number_of_company_deletions",
    "kr_firm_neug_m":"number_of_start_ups"}

In [6]:
set(df['variable'])

{'kr_firm_liq_m', 'kr_firm_loesch_m', 'kr_firm_neug_m'}

## format

### long

In [7]:
def long_format(df, variable_names=None):
    """Reshape the raw company-changes table into long format.

    The raw table holds one row per (ags2, ags5, variable) and one column per
    month named ``d<YYYYMM>`` (e.g. ``d200701``). The result holds one row per
    (ags2, ags5, variable, month).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with the columns ``ags2``, ``bundesland``, ``ags5``,
        ``kreis``, ``variable`` and the monthly ``d<YYYYMM>`` columns.
        It is not modified.
    variable_names : mapping, optional
        Maps raw ``variable`` codes to readable names. When omitted, the
        notebook-level dictionary ``di`` is used, which is the original
        behaviour. Codes missing from the mapping are kept as they are.

    Returns
    -------
    pandas.DataFrame
        Columns ``ags2``, ``ags5``, ``variable``, ``date`` (datetime64, first
        day of the month) and ``value``.
    """
    if variable_names is None:
        # Fall back to the notebook-level lookup table, resolved at call time.
        variable_names = di

    # The textual region names are redundant with the ags2 / ags5 codes.
    trimmed = df.drop(columns=['kreis', 'bundesland'])

    # Wide -> long: every d<YYYYMM> column becomes a row; the numeric suffix
    # ends up in the new 'date' index level.
    long_df = pd.wide_to_long(
        trimmed, stubnames='d', i=['ags2', 'ags5', 'variable'], j='date'
    )

    # Turn the index levels back into columns and give the stub column a
    # meaningful name.
    long_df = long_df.reset_index(drop=False)
    long_df = long_df.rename(columns={"d": "value"})

    # Translate the raw variable codes into readable names.
    long_df = long_df.replace({"variable": variable_names})

    # The suffix is a YYYYMM number such as 200701; parse it into a timestamp.
    long_df['date'] = pd.to_datetime(long_df['date'], format='%Y%m')
    return long_df

In [8]:
df_long = long_format(df)

In [9]:
#df_long

In [10]:
#df_long.to_csv('./final_dfs/company_changes_long.csv', index=False)

### wide

In [11]:
def wide_format(df):
    """Pivot a long-format frame into wide format.

    Expects the columns ``ags2``, ``ags5``, ``date``, ``variable`` and
    ``value``. Returns one row per (ags2, ags5, date) with one column per
    distinct ``variable``, holding the matching ``value``. ``DataFrame.pivot``
    raises if a (ags2, ags5, date, variable) combination occurs more than once.
    """
    key_columns = ["ags2", "ags5", "date"]

    # Long -> wide: spread the 'variable' values out into separate columns,
    # then bring the key columns back out of the index.
    wide = df.pivot(index=key_columns, columns="variable", values="value").reset_index(drop=False)

    # Leaves an already-datetime column untouched; values that are not yet
    # datetimes are parsed as YYYYMM.
    wide = wide.assign(date=pd.to_datetime(wide['date'], format='%Y%m'))
    return wide

In [12]:
df_wide = wide_format(df_long)

In [13]:
#df_wide

In [14]:
#df_wide.to_csv('./final_dfs/company_changes_wide.csv', index = False)

## merge

In [15]:
# Parse 'date' while reading so it is datetime64, matching df_wide / df_long.
# Read as plain strings, the concat below would produce a 'date' column that
# mixes Timestamp objects with str values.
df_final_wide = pd.read_csv('../../final_dfs/for_modeling/df_final_date_wide.csv', parse_dates=['date'])
df_final_long = pd.read_csv('../../final_dfs/for_modeling/df_final_date_long.csv', parse_dates=['date'])

### wide

In [16]:
df_final_wide.shape

(11228, 40)

In [17]:
df_wide.shape

(69774, 6)

In [18]:
# Keep only the months before 2019-01-01 so that rows from this frame do not
# duplicate rows of df_final_wide when the two are concatenated.
# NOTE(review): this presumes df_final_wide starts at 2019-01 -- confirm.
df_wide = df_wide.loc[df_wide['date'] < datetime.datetime(year=2019, month=1, day=1)]

In [19]:
# Stack the pre-2019 rows on top of the existing final frame and renumber the
# index. Caution: this overwrites df_final_wide, so running the cell a second
# time without reloading appends the df_wide rows again.
df_final_wide = pd.concat((df_wide, df_final_wide), axis=0, ignore_index=True)

In [20]:
df_final_wide

Unnamed: 0,ags2,ags5,date,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,...,employees_social_security_at_residence,employees_social_security_at_residenceemployees_social_security_at_work,realized_short_time_work_companies,realized_short_time_work_people,registerd_jobs,underemployment_without_short_time _work,unemployed,unemployment_benefit_entitled,unemployment_benefit_recipients,unemployment_rate
0,1,1001,2007-01-01 00:00:00,4.0,2.0,5.0,,,,,...,,,,,,,,,,
1,1,1001,2007-02-01 00:00:00,5.0,3.0,12.0,,,,,...,,,,,,,,,,
2,1,1001,2007-03-01 00:00:00,7.0,6.0,19.0,,,,,...,,,,,,,,,,
3,1,1001,2007-04-01 00:00:00,5.0,1.0,11.0,,,,,...,,,,,,,,,,
4,1,1001,2007-05-01 00:00:00,4.0,2.0,21.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68967,16,16077,2020-12-01,2.0,1.0,6.0,115.0,99.0,158.0,44.0,...,,,,,702.0,4147.0,3015.0,6196.000000,979.0,6.7
68968,16,16077,2021-01-01,0.0,3.0,2.0,115.0,99.0,158.0,44.0,...,,,,,681.0,,3287.0,6236.000000,,7.3
68969,16,16077,2021-02-01,0.0,0.0,5.0,115.0,99.0,158.0,43.0,...,,,,,680.0,,3270.0,6248.782315,,7.3
68970,16,16077,2021-03-01,4.0,0.0,4.0,114.0,100.0,158.0,42.0,...,,,,,749.0,,3222.0,6219.427429,,7.2


### long

In [21]:
df_final_long.shape

(370207, 5)

In [22]:
df_long.shape

(209322, 5)

In [23]:
# Keep only the months before 2019-01-01 so that rows from this frame do not
# duplicate rows of df_final_long when the two are concatenated.
# NOTE(review): this presumes df_final_long starts at 2019-01 -- confirm.
df_long = df_long.loc[df_long['date'] < datetime.datetime(year=2019, month=1, day=1)]

In [24]:
# Stack the pre-2019 rows on top of the existing final frame and renumber the
# index. Caution: this overwrites df_final_long, so running the cell a second
# time without reloading appends the df_long rows again.
df_final_long = pd.concat((df_long, df_final_long), axis=0, ignore_index=True)

In [25]:
df_final_long

Unnamed: 0,ags2,ags5,variable,date,value
0,1,1001,number_of_company_deletions,2007-01-01 00:00:00,4.0
1,1,1001,number_of_company_deletions,2007-02-01 00:00:00,5.0
2,1,1001,number_of_company_deletions,2007-03-01 00:00:00,7.0
3,1,1001,number_of_company_deletions,2007-04-01 00:00:00,5.0
4,1,1001,number_of_company_deletions,2007-05-01 00:00:00,4.0
...,...,...,...,...,...
543434,7,7000,number_of_company_liquidations,2020-12-01,0.0
543435,7,7000,number_of_company_liquidations,2021-01-01,0.0
543436,7,7000,number_of_company_liquidations,2021-02-01,0.0
543437,7,7000,number_of_company_liquidations,2021-03-01,0.0


In [26]:
# df_final_wide.to_csv('../../final_dfs/for_modeling/df_final_date_wide_2007.csv')
# df_final_long.to_csv('../../final_dfs/for_modeling/df_final_date_long_2007.csv')