# FBiH Process Raw Data (Non Political)

Purpose is to read in data, and reshape it if necessary.

In [1]:
import pandas as pd
import numpy as np

DATA_DIR = "../data/FBiH/Raw/"
SAVE_DIR = "../data/FBiH/Proccessed/"

## Gross Average Wages

In [2]:
gaw = pd.read_csv(
    DATA_DIR + "Place.csv",
    encoding="iso-8859-1",
    delimiter=";",
    header=1,
)
# gaw

Looks like some of the names are incorrect even after using the encoding the data was provided in. Might make it difficult for merging with political data however that is encoded.

So I can use longitudinal regression, I need to have the column be gross average wages and have a single record be a place and the year.

In [3]:
gaw = pd.melt(
    gaw,
    id_vars = "Canton-Municipality",
    var_name = "Year",
    value_name = "Gross Average Wage",
)
# gaw

Time to clean up the Year column as it contains info for it being the whole year.

In [4]:
gaw["Year"] = gaw["Year"].str.extract(r"(\d{4})")
# gaw

I also want to remove any columns that belong to Kantons for now as I just want to focus on municipalities. I will also rename the column too to reflect this change. Lastly, for easier merging I will lowercase the names of the municipality.

In [5]:
remove = "KANTON"
df_filter = gaw["Canton-Municipality"].str.contains(remove)
gaw = gaw[~df_filter]

gaw = gaw.rename(
    columns={"Canton-Municipality" : "Municipality"}
)
gaw["Municipality"] = gaw["Municipality"].apply(str.lower)
# gaw

For stata, the `Year` col should be an int or long. So I will convert it now to avoid headaches later.

Also it looks like the `Gross Average Wages` col has some strings in there too.

In [6]:
gaw.dtypes

Municipality          object
Year                  object
Gross Average Wage    object
dtype: object

In [7]:
[(col, gaw[col].apply(type).value_counts()) for col in gaw.columns]

[('Municipality',
  Municipality
  <class 'str'>    1580
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'str'>    1580
  Name: count, dtype: int64),
 ('Gross Average Wage',
  Gross Average Wage
  <class 'int'>    1501
  <class 'str'>      79
  Name: count, dtype: int64)]

In [8]:
gaw["Year"] = gaw["Year"].astype("int")

To see which records are str, uncomment and run below.

In [9]:
# with pd.option_context("display.max_rows", None):
#     print(gaw[gaw["Gross Average Wage"].apply(lambda x: isinstance(x, str))])

In [10]:
gaw.loc[136]["Gross Average Wage"]

'..'

Looks like `..` was placed and should be NaN to make it not a string col.

In [11]:
gaw["Gross Average Wage"] = gaw["Gross Average Wage"].replace("..", np.nan)

In [12]:
gaw.loc[136]["Gross Average Wage"]

nan

In [13]:
gaw["Gross Average Wage"] = pd.to_numeric(gaw["Gross Average Wage"])

In [14]:
[(col, gaw[col].apply(type).value_counts()) for col in gaw.columns]

[('Municipality',
  Municipality
  <class 'str'>    1580
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'int'>    1580
  Name: count, dtype: int64),
 ('Gross Average Wage',
  Gross Average Wage
  <class 'float'>    1580
  Name: count, dtype: int64)]

Saving this as a xls so datatypes are preserved and ready for stata!

In [15]:
gaw.to_excel(SAVE_DIR + "gross_average_wages.xlsx", index=False)

In [16]:
df = pd.read_excel(SAVE_DIR + "gross_average_wages.xlsx")
df

Unnamed: 0,Municipality,Year,Gross Average Wage
0,grad bihac,2005,824.0
1,bosanska krupa,2005,717.0
2,bosanski petrovac,2005,655.0
3,bu¸im,2005,677.0
4,cazin,2005,721.0
...,...,...,...
1575,drvar,2024,1492.0
1576,glamoc,2024,1708.0
1577,kupres,2024,2080.0
1578,livno,2024,2104.0


## Register of Business Entities per Canton and Municipality

In [17]:
r = pd.read_csv(
    DATA_DIR + "Registar.csv",
    encoding="iso-8859-1",
    delimiter=";",
    header=1,
)
# r

Going to drop any Kanton records and update column name.

In [18]:
remove = "KANTON"
df_filter = r["Canton-Municipality"].str.contains(remove)
r = r[~df_filter]

r = r.rename(
    columns={"Canton-Municipality" : "Municipality"}
)
r["Municipality"] = r["Municipality"].apply(str.lower)
# r

Now to reshape my table

In [19]:
r = pd.melt(
    r,
    id_vars = ["Municipality", "Business entity"],
    var_name = "Year_Sector",
    value_name = "Value",
)
# r

To further split year and sector cols

In [20]:
r[["Year", "Sector"]] = r["Year_Sector"].str.split(" ", n=1, expand=True)
r = r.drop(columns=["Year_Sector"])
r = r[["Municipality", "Business entity", "Year", "Sector", "Value"]]
# r

Now to pivot Business entity, but before doing so I need to make sure the values col does not have any str but rather nan vals.

In [21]:
[(col, r[col].apply(type).value_counts()) for col in r.columns]

[('Municipality',
  Municipality
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Business entity',
  Business entity
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Sector',
  Sector
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Value',
  Value
  <class 'int'>    32232
  <class 'str'>    30336
  Name: count, dtype: int64)]

Looks like there is, so the following is necessary.

In [22]:
r["Value"] = pd.to_numeric(r["Value"], errors="coerce")

In [23]:
[(col, r[col].apply(type).value_counts()) for col in r.columns]

[('Municipality',
  Municipality
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Business entity',
  Business entity
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Sector',
  Sector
  <class 'str'>    62568
  Name: count, dtype: int64),
 ('Value',
  Value
  <class 'float'>    62568
  Name: count, dtype: int64)]

In [24]:
r = r.pivot_table(
    index=["Municipality", "Year", "Sector"],
    columns="Business entity",
    values="Value"
)
r = r.reset_index()
r = r.rename_axis(None, axis=1)
# r

I want to have each sector be a column so I will combine all three biz entities into one value to avoid nans at a municipal level and reshape the table again.

In [25]:
r["Total Business Entities"] = r[["Crafts", "Legal entity", "Parts of legal entity"]].sum(axis=1)
r = r.drop(columns=["Crafts", "Legal entity", "Parts of legal entity"])
# r

Pivot again to get values for each sector and year.

In [26]:
r = r.pivot_table(
    index=["Municipality", "Year"],
    columns="Sector",
    values="Total Business Entities"
).reset_index().rename_axis(None, axis=1)
# r

Now to change year to be int and save as an excel file

In [27]:
r["Year"] = r["Year"].astype("int")

In [28]:
r.to_excel(SAVE_DIR + "businesses_per_sector.xlsx", index=False)

In [29]:
df = pd.read_excel(SAVE_DIR + "businesses_per_sector.xlsx")
df

Unnamed: 0,Municipality,Year,00-Unclassified according to activities CEA 1),"A-Agriculture, forestry and fishing",Activities of households as employers; undifferentiated goods and services-producing activities of households for own use,B-Minning and quarrying,C-Manufacturing,"D-Electricity, gas, steam and air conditioning supply","E-Water supply, sewerage, waste management and remediation activities",F-Construction,...,K-Financial and insurance activities,L-Real estate activities,"M-Professional, scientific and technical activities",N-Administrative and support service activities,O-Public administration and defence; compulsory social security,P-Education,Q-Human health and social work activities,"R-Arts, entertainment and recreation",S-Other service activities,Total
0,banovici,2012,,18,,7.0,75.0,2.0,4,19.0,...,13.0,3.0,31.0,20.0,31,31,23,56,106,930
1,banovici,2013,,21,,7.0,77.0,2.0,4,19.0,...,11.0,3.0,31.0,20.0,31,31,24,59,106,940
2,banovici,2014,,22,,7.0,76.0,2.0,4,20.0,...,11.0,4.0,31.0,19.0,31,30,24,65,110,946
3,banovici,2015,,24,,7.0,75.0,2.0,4,20.0,...,12.0,4.0,29.0,20.0,31,29,24,67,108,948
4,banovici,2016,0.0,24,0.0,6.0,77.0,2.0,3,19.0,...,12.0,4.0,30.0,20.0,31,31,27,69,111,960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,´ivinice,2019,0.0,102,0.0,8.0,262.0,9.0,18,99.0,...,51.0,9.0,102.0,58.0,28,68,56,136,266,2768
944,´ivinice,2020,0.0,100,0.0,6.0,280.0,9.0,19,114.0,...,55.0,8.0,100.0,62.0,28,67,55,136,276,2831
945,´ivinice,2021,0.0,90,0.0,5.0,301.0,11.0,22,124.0,...,60.0,9.0,101.0,61.0,28,71,66,141,274,2911
946,´ivinice,2022,,89,,5.0,316.0,11.0,22,133.0,...,64.0,9.0,106.0,69.0,28,68,71,138,300,2990


## 2013 Census Data

In [31]:
c = pd.read_csv(
    DATA_DIR + "Popis2013_Knjiga2.csv",
    encoding="iso-8859-1",
    delimiter=";",
    header=1,
)
# c

Going to drop any Kanton or Federation wide records.

In [32]:
remove = "KANTON"
df_filter = c["Area"].str.contains(remove)
c = c[~df_filter]
c = c.drop(labels=[0])
# c

Dropping the Sex column as it is redundant.

In [33]:
c = c.drop(columns="Sex")
# c

Now to rename Area to Municipality, lowercase the values in it, and add an int year column for 2013.

In [34]:
c = c.rename(columns={"Area" : "Municipality"})
c["Municipality"] = c["Municipality"].apply(str.lower)
c["Year"] = 2013
c = c[["Municipality", "Year", "Bosniak", "Croat", "Serb"]]
# c

Just to double check for nans.

In [35]:
[(col, c[col].apply(type).value_counts()) for col in c.columns]

[('Municipality',
  Municipality
  <class 'str'>    79
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'int'>    79
  Name: count, dtype: int64),
 ('Bosniak',
  Bosniak
  <class 'str'>    79
  Name: count, dtype: int64),
 ('Croat',
  Croat
  <class 'str'>    79
  Name: count, dtype: int64),
 ('Serb',
  Serb
  <class 'int'>    79
  Name: count, dtype: int64)]

Looks like there exists some nulls as some are str cols.

In [36]:
# with pd.option_context("display.max_rows", None):
#     print(c)

Going to assume that for those with missing values, there was no ethnicity present in the municipality so it simplifies my analysis later.

First to make the ".." into nans.

In [37]:
c["Bosniak"] = pd.to_numeric(c["Bosniak"], errors="coerce")
c["Croat"] = pd.to_numeric(c["Croat"], errors="coerce")

Then to fill the nan values with 0.

In [38]:
c = c.fillna(0)
c["Bosniak"] = c["Bosniak"].astype(int)
c["Croat"] = c["Croat"].astype(int)

In [39]:
[(col, c[col].apply(type).value_counts()) for col in c.columns]

[('Municipality',
  Municipality
  <class 'str'>    79
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'int'>    79
  Name: count, dtype: int64),
 ('Bosniak',
  Bosniak
  <class 'int'>    79
  Name: count, dtype: int64),
 ('Croat',
  Croat
  <class 'int'>    79
  Name: count, dtype: int64),
 ('Serb',
  Serb
  <class 'int'>    79
  Name: count, dtype: int64)]

In [40]:
c

Unnamed: 0,Municipality,Year,Bosniak,Croat,Serb
2,bihac,2013,49550,3265,910
3,bosanska krupa,2013,23578,66,1260
4,bosanski petrovac,2013,3179,26,3996
5,bu´im,2013,19207,8,1
6,cazin,2013,63463,320,29
...,...,...,...,...,...
85,drvar,2013,11,552,6420
86,glamoc,2013,1251,906,1679
87,kupres,2013,255,4474,318
88,livno,2013,4047,29273,438


In [41]:
c.to_excel(SAVE_DIR + "census_2013.xlsx", index=False)

In [42]:
df = pd.read_excel(SAVE_DIR + "census_2013.xlsx")
df

Unnamed: 0,Municipality,Year,Bosniak,Croat,Serb
0,bihac,2013,49550,3265,910
1,bosanska krupa,2013,23578,66,1260
2,bosanski petrovac,2013,3179,26,3996
3,bu´im,2013,19207,8,1
4,cazin,2013,63463,320,29
...,...,...,...,...,...
74,drvar,2013,11,552,6420
75,glamoc,2013,1251,906,1679
76,kupres,2013,255,4474,318
77,livno,2013,4047,29273,438


## Employement by Cantons and Municiipalities

In [43]:
e = pd.read_csv(
    DATA_DIR + "Zaposlenost.csv",
    encoding="iso-8859-1",
    delimiter=";",
    header=1,
)
e

Unnamed: 0,Canton-Municipality,2005 ? I-XII,2006 ? I-XII,2007 ? I-XII,2008 ? I-XII,2009 ? I-XII,2010 ? I-XII,2011 ? I-XII,2012 ? I-XII,2013 ? I-XII,...,2015 ? I-XII,2016 ? I-XII,2017 ? I-XII,2018 ? I-XII,2019 ? I-XII,2020 ? I-XII,2021 ? I-XII,2022 ? I-XII,2023 ? I-XII,2024 ? I-XII
0,UNSKO - SANSKI KANTON,32418,31169,33628,34634,33208,33067,32594,31683,31863,...,31853,32778,37364,37687,38220,37085,37712,38353,38385,38895
1,Grad Bihac,11185,11176,12053,12588,11899,12078,12032,11877,11989,...,11648,11855,13032,13495,13806,13354,13440,13512,13367,13491
2,Bosanska Krupa,2272,2285,2967,2989,2663,2790,2875,2804,2893,...,3097,3175,3447,3343,3378,3300,3503,3596,3666,3603
3,Bosanski Petrovac,1286,1142,1164,1279,1121,1054,1026,1048,1090,...,1078,1127,1366,1274,1319,1276,1372,1443,1410,1360
4,Bu¸im,1105,1135,1381,1446,1364,1382,1378,1181,1138,...,1228,1233,1525,1505,1449,1457,1492,1580,1618,1691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,Drvar,1074,1129,1121,1228,1175,1085,1112,1039,851,...,932,943,937,1007,1008,1034,1038,1054,1020,1081
85,Glamoc,495,508,539,631,682,684,646,570,507,...,540,523,490,510,507,512,537,546,519,628
86,Kupres,1004,1033,748,721,706,738,736,725,800,...,717,739,1272,1357,1449,1415,1436,1429,1420,979
87,Livno,4395,4414,3914,4008,3975,4022,4105,4166,4189,...,4063,4002,4217,4432,4490,4468,4495,4614,4736,4865


Melting like on gross average wages.

In [44]:
e = pd.melt(
    e,
    id_vars = "Canton-Municipality",
    var_name = "Year",
    value_name = "Employees"
)
e

Unnamed: 0,Canton-Municipality,Year,Employees
0,UNSKO - SANSKI KANTON,2005 ? I-XII,32418
1,Grad Bihac,2005 ? I-XII,11185
2,Bosanska Krupa,2005 ? I-XII,2272
3,Bosanski Petrovac,2005 ? I-XII,1286
4,Bu¸im,2005 ? I-XII,1105
...,...,...,...
1775,Drvar,2024 ? I-XII,1081
1776,Glamoc,2024 ? I-XII,628
1777,Kupres,2024 ? I-XII,979
1778,Livno,2024 ? I-XII,4865


Processing municipality and year cols like in gaw.

In [45]:
e = e.rename(
    columns={"Canton-Municipality" : "Municipality"}
)

remove = "KANTON"
df_filter = e["Municipality"].str.contains(remove)
e = e[~df_filter]

e["Municipality"] = e["Municipality"].apply(str.lower)
e

Unnamed: 0,Municipality,Year,Employees
1,grad bihac,2005 ? I-XII,11185
2,bosanska krupa,2005 ? I-XII,2272
3,bosanski petrovac,2005 ? I-XII,1286
4,bu¸im,2005 ? I-XII,1105
5,cazin,2005 ? I-XII,4908
...,...,...,...
1775,drvar,2024 ? I-XII,1081
1776,glamoc,2024 ? I-XII,628
1777,kupres,2024 ? I-XII,979
1778,livno,2024 ? I-XII,4865


In [46]:
e["Year"] = e["Year"].str.extract(r"(\d{4})")
e["Year"] = e["Year"].astype(int)
e

Unnamed: 0,Municipality,Year,Employees
1,grad bihac,2005,11185
2,bosanska krupa,2005,2272
3,bosanski petrovac,2005,1286
4,bu¸im,2005,1105
5,cazin,2005,4908
...,...,...,...
1775,drvar,2024,1081
1776,glamoc,2024,628
1777,kupres,2024,979
1778,livno,2024,4865


Now to double check for any nulls aka `..`.

In [47]:
[(col, e[col].apply(type).value_counts()) for col in e.columns]

[('Municipality',
  Municipality
  <class 'str'>    1580
  Name: count, dtype: int64),
 ('Year',
  Year
  <class 'int'>    1580
  Name: count, dtype: int64),
 ('Employees',
  Employees
  <class 'int'>    1422
  <class 'str'>     158
  Name: count, dtype: int64)]

Looks like there are 158 of these in Employee col.

In [51]:
# with pd.option_context("display.max_rows", None):
#     print(e[e["Employees"].apply(lambda x: isinstance(x, str))])

I will impute this with nan for now and handle dropping them when I merge. The timeline of my analysis may change depending on which years each table overlaps.

In [50]:
e["Employees"] = pd.to_numeric(e["Employees"], errors="coerce")
e

Unnamed: 0,Municipality,Year,Employees
1,grad bihac,2005,11185.0
2,bosanska krupa,2005,2272.0
3,bosanski petrovac,2005,1286.0
4,bu¸im,2005,1105.0
5,cazin,2005,4908.0
...,...,...,...
1775,drvar,2024,1081.0
1776,glamoc,2024,628.0
1777,kupres,2024,979.0
1778,livno,2024,4865.0


In [56]:
e.to_excel(SAVE_DIR + "employees.xlsx", index=False)

In [57]:
df = pd.read_excel(SAVE_DIR + "employees.xlsx")
df

Unnamed: 0,Municipality,Year,Employees
0,grad bihac,2005,11185.0
1,bosanska krupa,2005,2272.0
2,bosanski petrovac,2005,1286.0
3,bu¸im,2005,1105.0
4,cazin,2005,4908.0
...,...,...,...
1575,drvar,2024,1081.0
1576,glamoc,2024,628.0
1577,kupres,2024,979.0
1578,livno,2024,4865.0
