# Population dataset

This notebook aims to transform the population dataset to make it more usable

In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("../raw_data/population/population_all_years.csv", sep=";")
df

Unnamed: 0,Código do Município,Nome do Município,2008,2009,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1100015,Alta Floresta D'Oeste (RO),24577,24354,24228,24069,25728,25652,25578,25506,25437,23167,22945,22728,22516,21495
1,1100023,Ariquemes (RO),84581,85541,91570,92747,101269,102860,104401,105896,107345,106168,107863,109523,111148,96833
2,1100031,Cabixi (RO),6777,6695,6221,6132,6495,6424,6355,6289,6224,5438,5312,5188,5067,5363
3,1100049,Cacoal (RO),78263,78675,78959,79330,85863,86556,87226,87877,88507,84813,85359,85893,86416,86895
4,1100056,Cerejeiras (RO),16784,16622,16939,16852,18041,18013,17986,17959,17934,16444,16323,16204,16088,15890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,5222005,Vianópolis (GO),12699,12831,12644,12737,13227,13343,13456,13567,13675,13746,13863,13977,14088,14956
5566,5222054,Vicentinópolis (GO),6091,6093,7476,7576,7933,8053,8171,8286,8397,8611,8743,8873,9002,8768
5567,5222203,Vila Boa (GO),4461,4578,4847,4954,5246,5371,5495,5615,5731,6026,6171,6312,6451,4215
5568,5222302,Vila Propício (GO),5282,5359,5196,5244,5460,5520,5578,5635,5690,5758,5821,5882,5941,5815


In [3]:
# Split 'Município' into municipality and UF (state)
df['UF'] = df['Nome do Município'].str.extract(r'\((\w+)\)')
df['Nome do Município'] = df['Nome do Município'].str.split(' \(').str[0]

df

Unnamed: 0,Código do Município,Nome do Município,2008,2009,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,UF
0,1100015,Alta Floresta D'Oeste,24577,24354,24228,24069,25728,25652,25578,25506,25437,23167,22945,22728,22516,21495,RO
1,1100023,Ariquemes,84581,85541,91570,92747,101269,102860,104401,105896,107345,106168,107863,109523,111148,96833,RO
2,1100031,Cabixi,6777,6695,6221,6132,6495,6424,6355,6289,6224,5438,5312,5188,5067,5363,RO
3,1100049,Cacoal,78263,78675,78959,79330,85863,86556,87226,87877,88507,84813,85359,85893,86416,86895,RO
4,1100056,Cerejeiras,16784,16622,16939,16852,18041,18013,17986,17959,17934,16444,16323,16204,16088,15890,RO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,5222005,Vianópolis,12699,12831,12644,12737,13227,13343,13456,13567,13675,13746,13863,13977,14088,14956,GO
5566,5222054,Vicentinópolis,6091,6093,7476,7576,7933,8053,8171,8286,8397,8611,8743,8873,9002,8768,GO
5567,5222203,Vila Boa,4461,4578,4847,4954,5246,5371,5495,5615,5731,6026,6171,6312,6451,4215,GO
5568,5222302,Vila Propício,5282,5359,5196,5244,5460,5520,5578,5635,5690,5758,5821,5882,5941,5815,GO


In [4]:
df['2009'] = pd.to_numeric(df['2009'], errors='coerce')
df['2011'] = pd.to_numeric(df['2011'], errors='coerce')

In [5]:
df['2010'] = round((df['2009'] + df['2011'])/2)

In [6]:
df

Unnamed: 0,Código do Município,Nome do Município,2008,2009,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,UF,2010
0,1100015,Alta Floresta D'Oeste,24577,24354.0,24228.0,24069,25728,25652,25578,25506,25437,23167,22945,22728,22516,21495,RO,24291.0
1,1100023,Ariquemes,84581,85541.0,91570.0,92747,101269,102860,104401,105896,107345,106168,107863,109523,111148,96833,RO,88556.0
2,1100031,Cabixi,6777,6695.0,6221.0,6132,6495,6424,6355,6289,6224,5438,5312,5188,5067,5363,RO,6458.0
3,1100049,Cacoal,78263,78675.0,78959.0,79330,85863,86556,87226,87877,88507,84813,85359,85893,86416,86895,RO,78817.0
4,1100056,Cerejeiras,16784,16622.0,16939.0,16852,18041,18013,17986,17959,17934,16444,16323,16204,16088,15890,RO,16780.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,5222005,Vianópolis,12699,12831.0,12644.0,12737,13227,13343,13456,13567,13675,13746,13863,13977,14088,14956,GO,12738.0
5566,5222054,Vicentinópolis,6091,6093.0,7476.0,7576,7933,8053,8171,8286,8397,8611,8743,8873,9002,8768,GO,6784.0
5567,5222203,Vila Boa,4461,4578.0,4847.0,4954,5246,5371,5495,5615,5731,6026,6171,6312,6451,4215,GO,4712.0
5568,5222302,Vila Propício,5282,5359.0,5196.0,5244,5460,5520,5578,5635,5690,5758,5821,5882,5941,5815,GO,5278.0


In [7]:

# Generating the list of years using list comprehension
years = [str(year) for year in range(2008, 2023)]

# Melting the DataFrame
melted_df = pd.melt(df, id_vars=["Código do Município", "Nome do Município", "UF"], value_vars=years, var_name='Ano', value_name='Population')


# Sorting the DataFrame
melted_df = melted_df.sort_values(by=['UF', "Código do Município", 'Ano']).reset_index(drop=True)

melted_df


Unnamed: 0,Código do Município,Nome do Município,UF,Ano,Population
0,1200013,Acrelândia,AC,2008,11987
1,1200013,Acrelândia,AC,2009,12241.0
2,1200013,Acrelândia,AC,2010,12510.0
3,1200013,Acrelândia,AC,2011,12779.0
4,1200013,Acrelândia,AC,2012,13011
...,...,...,...,...,...
83545,1722107,Xambioá,TO,2018,11561
83546,1722107,Xambioá,TO,2019,11540
83547,1722107,Xambioá,TO,2020,11520
83548,1722107,Xambioá,TO,2021,11500


In [8]:
# drop columns
columns_to_drop = ['Nome do Município', 'UF']
melted_df = melted_df.drop(columns=columns_to_drop)
melted_df

Unnamed: 0,Código do Município,Ano,Population
0,1200013,2008,11987
1,1200013,2009,12241.0
2,1200013,2010,12510.0
3,1200013,2011,12779.0
4,1200013,2012,13011
...,...,...,...
83545,1722107,2018,11561
83546,1722107,2019,11540
83547,1722107,2020,11520
83548,1722107,2021,11500


In [11]:
# rename columns
melted_df = melted_df.rename(columns={
    'Population': 'População',
    'Código do Município': 'Código_IBGE'
})
melted_df

Unnamed: 0,Código_IBGE,Ano,População
0,1200013,2008,11987
1,1200013,2009,12241.0
2,1200013,2010,12510.0
3,1200013,2011,12779.0
4,1200013,2012,13011
...,...,...,...
83545,1722107,2018,11561
83546,1722107,2019,11540
83547,1722107,2020,11520
83548,1722107,2021,11500


In [12]:
file_path = "../transformed_data/population_per_municipality.csv"
melted_df.to_csv(file_path, index=False)