In [1]:
import pandas as pd
import numpy as np
import os
from decimal import Decimal
import pickle
from pathlib import Path

# SETUP

In [2]:
dir_tree_util_path = os.path.join("utils", "dir_tree.py")
exec(open(dir_tree_util_path).read())

# INPUTS

In [3]:
# Paths
path_dados_pib = PROJECT_DIRS['DADOS_GIT_BACEN_DIR']
# path_output = TODO
path_output = PROJECT_DIRS['DADOS_PIB_DIR']

In [4]:
path_dados_pib

PosixPath('/home/andcm/work/Projetos/infra-data/dados-git/dados-bacen')

# CARREGANDO p/MEMORIA OS DADOS

In [5]:
# df_pib mensal nominal
# fonte: Bacen. SGS, df_pib mensal - valores correntes, Cód 4380
df_pib = pd.read_csv(path_dados_pib / 'STP-20240827171150836.csv', sep=";", encoding='windows-1251', 
                  skipfooter=1, engine='python')

In [6]:
# df_ipca mensal
# fonte: Bacen. SGS, Índice nacional de preços ao consumidor-amplo (df_ipca), Cód 433
df_ipca = pd.read_csv(path_dados_pib / 'STP-20240910111755946.csv', sep=";", encoding='windows-1251', 
                  skipfooter=1, engine='python')

# TRATANDO OS DADOS

## IPCA

In [7]:
df_ipca

Unnamed: 0,Data,433 - Нndice nacional de preзos ao consumidor-amplo (IPCA) - Var. % mensal
0,01/1980,662
1,02/1980,462
2,03/1980,604
3,04/1980,529
4,05/1980,570
...,...,...
531,04/2024,038
532,05/2024,046
533,06/2024,021
534,07/2024,038


In [8]:
df_ipca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 2 columns):
 #   Column                                                                      Non-Null Count  Dtype 
---  ------                                                                      --------------  ----- 
 0   Data                                                                        536 non-null    object
 1   433 - Нndice nacional de preзos ao consumidor-amplo (IPCA) - Var. % mensal  536 non-null    object
dtypes: object(2)
memory usage: 8.5+ KB


### Tratando a coluna de data e passando para o índice

In [9]:
df_ipca.loc[0,'Data']

'01/1980'

In [10]:
type(df_ipca.loc[0,'Data'])

str

In [11]:
# Convertendo coluna de data para tipo datetime, e p/final do mês
df_ipca['Data'] = pd.to_datetime(df_ipca['Data'], format='%m/%Y') + pd.offsets.MonthEnd(0)
# df_ipca['Data'] = df_ipca['Data'].dt.to_period('M').dt.to_timestamp('M')

In [12]:
df_ipca.index = df_ipca['Data']

In [13]:
df_ipca = df_ipca.drop(columns=['Data'])

### Outros tratamentos

In [14]:
# Simplificando nome de coluna
df_ipca = df_ipca.rename(columns={'433 - Нndice nacional de preзos ao consumidor-amplo (IPCA) - Var. % mensal':'ipca_mensal_%'})

In [15]:
type(df_ipca.loc[df_ipca.index[0],'ipca_mensal_%']) # str

str

In [16]:
# Convertendo ipca de string para float
df_ipca['ipca_mensal_%'] = df_ipca['ipca_mensal_%'].str.replace(',','.')
df_ipca['ipca_mensal_%'] = df_ipca['ipca_mensal_%'].astype('float')

In [17]:
# Atribuindo dez/2009 = 1.00
df_ipca['ipca_mensal'] = 1 + df_ipca['ipca_mensal_%']/100
df_ipca['ipca_acum'] =  df_ipca['ipca_mensal'].cumprod()
scaling_date = df_ipca.loc[df_ipca.index == '2009-12-31']['ipca_acum'].iloc[0]
df_ipca['ipca_acum'] = df_ipca['ipca_acum'] / scaling_date

In [18]:
# removendo coluna não utilizada
df_ipca = df_ipca.drop(columns=['ipca_mensal'])

## PIB

In [19]:
df_pib

Unnamed: 0,Data,4380 - PIB mensal - Valores correntes (R$ milhхes) - R$ (milhхes)
0,01/1990,02
1,02/1990,04
2,03/1990,08
3,04/1990,07
4,05/1990,08
...,...,...
410,03/2024,"952.242,4"
411,04/2024,"977.372,0"
412,05/2024,"954.779,1"
413,06/2024,"952.995,6"


In [20]:
df_pib.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 2 columns):
 #   Column                                                             Non-Null Count  Dtype 
---  ------                                                             --------------  ----- 
 0   Data                                                               415 non-null    object
 1   4380 - PIB mensal - Valores correntes (R$ milhхes) - R$ (milhхes)  415 non-null    object
dtypes: object(2)
memory usage: 6.6+ KB


In [21]:
# simplificando nome de colunas
df_pib = df_pib.rename(columns={
'4380 - PIB mensal - Valores correntes (R$ milhхes) - R$ (milhхes)':'PIB_mensal'
})

In [22]:
# Alterando data para datetime e final do mês
df_pib['Data'] = pd.to_datetime(df_pib['Data'], format='%m/%Y') + pd.offsets.MonthEnd(0)

In [23]:
# jogando data para o índice
df_pib.index = df_pib['Data']
df_pib = df_pib.drop(columns=['Data'])

In [24]:
# Alterando PIB para float
df_pib['PIB_mensal'] = df_pib['PIB_mensal'].str.replace('.', '', regex=False)
df_pib['PIB_mensal'] = df_pib['PIB_mensal'].str.replace(',', '.', regex=False)
df_pib['PIB_mensal'] = df_pib['PIB_mensal'].astype('float')

# Calculando o PIB real

In [25]:
df_pib = df_pib.join(df_ipca)

In [26]:
df_pib['PIB_mensal_real'] = df_pib['PIB_mensal'] / df_pib['ipca_acum']

In [27]:
df_pib

Unnamed: 0_level_0,PIB_mensal,ipca_mensal_%,ipca_acum,PIB_mensal_real
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-31,0.2,67.55,0.000002,111533.244265
1990-02-28,0.4,75.73,0.000003,126937.056012
1990-03-31,0.8,82.39,0.000006,139192.999630
1990-04-30,0.7,15.52,0.000007,105430.985696
1990-05-31,0.8,7.59,0.000007,111992.336724
...,...,...,...,...
2024-03-31,952242.4,0.16,2.276373,418315.597437
2024-04-30,977372.0,0.38,2.285023,427729.540003
2024-05-31,954779.1,0.46,2.295534,415928.884720
2024-06-30,952995.6,0.21,2.300355,414281.949388


# Calculando a variação do PIB mensal

In [28]:
df_pib['PIB_mensal_real_%'] = df_pib['PIB_mensal_real'].pct_change() * 100

In [29]:
df_pib

Unnamed: 0_level_0,PIB_mensal,ipca_mensal_%,ipca_acum,PIB_mensal_real,PIB_mensal_real_%
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-31,0.2,67.55,0.000002,111533.244265,
1990-02-28,0.4,75.73,0.000003,126937.056012,13.810960
1990-03-31,0.8,82.39,0.000006,139192.999630,9.655135
1990-04-30,0.7,15.52,0.000007,105430.985696,-24.255540
1990-05-31,0.8,7.59,0.000007,111992.336724,6.223361
...,...,...,...,...,...
2024-03-31,952242.4,0.16,2.276373,418315.597437,7.294999
2024-04-30,977372.0,0.38,2.285023,427729.540003,2.250440
2024-05-31,954779.1,0.46,2.295534,415928.884720,-2.758906
2024-06-30,952995.6,0.21,2.300355,414281.949388,-0.395966


# Descartando colunas que não serão utilizadas

In [30]:
df_pib = df_pib.drop(columns={'PIB_mensal','ipca_mensal_%','ipca_acum','PIB_mensal_real'})

In [31]:
df_pib

Unnamed: 0_level_0,PIB_mensal_real_%
Data,Unnamed: 1_level_1
1990-01-31,
1990-02-28,13.810960
1990-03-31,9.655135
1990-04-30,-24.255540
1990-05-31,6.223361
...,...
2024-03-31,7.294999
2024-04-30,2.250440
2024-05-31,-2.758906
2024-06-30,-0.395966


In [32]:
# simplificando a data do index
df_pib.index = df_pib.index.date

# SALVANDO PARA PARQUET

In [33]:
df_pib.to_parquet(path_output / 'PIB-Bacen_mensal.parquet')

# SALVANDO PARA EXCEL

In [34]:
df_pib.to_excel(path_output / 'PIB-Bacen_mensal.xlsx')