# Import all datasets

In [35]:
import pandas as pd
import numpy as np

In [7]:
from import_budget_data import df_budget
from import_wb_api import df_gdp_monthly, df_inflation_rotated_monthly, df_interest_rotated_monthly
from import_bcb_api import df_macro_monthly
from import_bcb_currencies import df_currencies_monthly
from import_bcb_focus import df_focus_indicators_monthly
from import_imf_api import df_commodities


EntitySet (Endpoint): ExpectativasMercadoAnuais
EntityType: br.gov.bcb.olinda.servico.Expectativas.ExpectativaMercadoAnual
Properties: Indicador<str>, IndicadorDetalhe<str>, Data<str>, DataReferencia<str>, Media<float>, Mediana<float>, DesvioPadrao<float>, Minimo<float>, Maximo<float>, numeroRespondentes<int>, baseCalculo<int>


In [128]:
# Frames to combine. The first entry holds the target variable, so it is the
# left-hand side of every join and its rows define the rows of the result.
dataframes = [
    df_macro_monthly,              # Brazilian macro data (contains the target variable)
    df_gdp_monthly,                # GDP data from other countries
    df_inflation_rotated_monthly,  # inflation data from other countries
    df_interest_rotated_monthly,   # interest-rate data from other countries
    df_commodities,                # historical commodity prices
    df_budget,                     # Brazilian budget data
    df_focus_indicators_monthly,   # market-expectation indicators from the Brazilian Central Bank
    df_currencies_monthly,         # historical exchange rates - USD, EUR and CNY
]

# Give every frame the same index name so they can be joined on it.
# rename_axis() without inplace=True returns a renamed frame, so the objects
# imported from the other modules are not modified by this cell.
base_df, *other_dfs = (df.rename_axis('Time') for df in dataframes)

# Left-join each remaining frame onto the base frame, one at a time.
# how='left' keeps every 'Time' value of the base frame; values of the other
# frames that have no matching 'Time' in the base frame are not carried over.
merged_df = base_df
for other_df in other_dfs:
    merged_df = pd.merge(merged_df, other_df, on='Time', how='left')


In [129]:
# Preview the first rows of the merged frame as a quick check of the join result
merged_df.head()


Unnamed: 0_level_0,eco_fiscal_result_month,eco_fiscal_result_12months,eco_inflation_month,eco_inflation_12months,eco_interest_rate,eco_gross_debt_gdp_pre,eco_gross_debt_gdp_pos,eco_gross_debt_R$_pre,eco_gross_debt_R$_pos,eco_net_debt_gdp_%,...,exp_net_public_debt_y+2,exp_primary_result_y,exp_primary_result_y+1,exp_primary_result_y+2,exp_trade_balance_y,exp_trade_balance_y+1,exp_trade_balance_y+2,CNY,EUR,USD
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01,2629.62,,0.57,5.92,,,,751214.25,,,...,47.98,3.0,2.7,2.1,0.5,1.5,3.1,0.238588,1.8437,1.9711
2001-02,1053.24,,0.46,6.27,15.25,,,758881.03,,,...,47.0,3.0,2.7,2.2,-0.27,1.1,2.47,0.247551,1.89153,2.0452
2001-03,3884.62,,0.38,6.44,15.25,,,780864.83,,,...,47.0,3.0,2.7,2.2,-1.0,1.0,2.0,0.261659,1.90165,2.1616
2001-04,6500.83,,0.58,6.61,15.75,,,789558.26,,,...,47.0,3.0,3.0,2.2,-1.25,0.42,1.95,0.264464,1.94164,2.1847
2001-05,4011.68,,0.41,7.04,16.25,,,807300.23,,,...,48.0,3.0,3.0,2.3,-1.5,0.0,1.3,0.285699,2.00134,2.36


In [130]:
# Preview the last rows of the merged frame to see how complete the latest periods are
merged_df.tail()

Unnamed: 0_level_0,eco_fiscal_result_month,eco_fiscal_result_12months,eco_inflation_month,eco_inflation_12months,eco_interest_rate,eco_gross_debt_gdp_pre,eco_gross_debt_gdp_pos,eco_gross_debt_R$_pre,eco_gross_debt_R$_pos,eco_net_debt_gdp_%,...,exp_net_public_debt_y+2,exp_primary_result_y,exp_primary_result_y+1,exp_primary_result_y+2,exp_trade_balance_y,exp_trade_balance_y+1,exp_trade_balance_y+2,CNY,EUR,USD
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10,19455.64,-108401.95,0.24,4.82,12.75,81.95,73.74,8794598.43,7913479.78,58.26,...,66.0,-1.1,-0.8,-0.6,75.15,61.8,60.0,0.6912,5.3453,5.0575
2023-11,-38922.66,-130800.22,0.28,4.68,12.75,83.01,73.8,8967976.94,7972577.48,58.63,...,66.0,-1.1,-0.8,-0.6,78.4,67.2,61.4,0.6916,5.3856,4.9355
2023-12,,,0.56,4.62,12.25,,,,,,...,66.2,-1.5,-0.8,-0.6,81.3,70.5,66.59,0.6815,5.3516,4.8413
2024-01,,,,,11.75,,,,,,...,68.4,-0.8,-0.6,-0.5,78.45,70.0,71.5,0.6911,5.3805,4.9535
2024-02,,,,,11.25,,,,,,...,,,,,,,,,,


In [131]:
# Some data preparation
#
# Build the cleaned frame from merged_df in a single pass. merged_df itself is
# left untouched. Simpler alternatives for an individual column (for example a
# plain bfill or ffill of 'eco_interest_rate') are not used here.
cleanned_df = (
    merged_df
    .replace('', np.nan)                # empty-string cells become NaN
    .astype(float)                      # convert every column to float
    .interpolate(limit_area='inside')   # fill only NaNs that lie between two valid values
    .bfill(limit=None)                  # back-fill remaining NaNs, i.e. those at the beginning of each series
)




In [133]:
# Preview the last rows of the cleaned frame to see which trailing values are still missing
cleanned_df.tail()

Unnamed: 0_level_0,eco_fiscal_result_month,eco_fiscal_result_12months,eco_inflation_month,eco_inflation_12months,eco_interest_rate,eco_gross_debt_gdp_pre,eco_gross_debt_gdp_pos,eco_gross_debt_R$_pre,eco_gross_debt_R$_pos,eco_net_debt_gdp_%,...,exp_net_public_debt_y+2,exp_primary_result_y,exp_primary_result_y+1,exp_primary_result_y+2,exp_trade_balance_y,exp_trade_balance_y+1,exp_trade_balance_y+2,CNY,EUR,USD
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10,19455.64,-108401.95,0.24,4.82,12.75,81.95,73.74,8794598.43,7913479.78,58.26,...,66.0,-1.1,-0.8,-0.6,75.15,61.8,60.0,0.6912,5.3453,5.0575
2023-11,-38922.66,-130800.22,0.28,4.68,12.75,83.01,73.8,8967976.94,7972577.48,58.63,...,66.0,-1.1,-0.8,-0.6,78.4,67.2,61.4,0.6916,5.3856,4.9355
2023-12,,,0.56,4.62,12.25,,,,,,...,66.2,-1.5,-0.8,-0.6,81.3,70.5,66.59,0.6815,5.3516,4.8413
2024-01,,,,,11.75,,,,,,...,68.4,-0.8,-0.6,-0.5,78.45,70.0,71.5,0.6911,5.3805,4.9535
2024-02,,,,,11.25,,,,,,...,,,,,,,,,,


In [135]:
# Drop some columns, due to lack of data.
SPARSE_COLUMNS = [
    'exp_gross_public_debt_y+1',
    'exp_gross_public_debt_y',
    'exp_gross_public_debt_y+2',
    'ARG_inflation',
    'eco_gross_debt_gdp_pos',
    'eco_gross_debt_R$_pos',
]

# errors='ignore' makes this cell safe to run more than once: on a second run
# the columns are already gone, and the default errors='raise' would fail with
# a KeyError. NOTE(review): this also silences a misspelled column name, so
# keep the list above in sync with the real column labels.
cleanned_df = cleanned_df.drop(columns=SPARSE_COLUMNS, errors='ignore')


In [136]:

# Check for NaN values: count what is still missing in every column.
# (merged_df.info() is an alternative quick look at the uncleaned frame.)
nan_counts = cleanned_df.isna().sum()

# Rank the columns from most to fewest missing values once, then reuse the ranking
nan_counts_sorted = nan_counts.sort_values(ascending=False)
print(nan_counts_sorted)  # NaN count for each column

# Write the same ranking to a header-less CSV file in the working directory
nan_counts_sorted.to_csv('nan_counts.txt', header=False)





US_GDP                             6
UK_GDP                             6
eco_fiscal_result_month            3
eco_net_debt_gdp_%_federal_govt    3
CHN_interest                       3
                                  ..
exp_DIC_y+2                        1
exp_DIC_y+1                        1
exp_DIC_y                          1
USD                                1
eco_interest_rate                  0
Length: 78, dtype: int64


In [137]:
# Report the size of the cleaned frame (row count, then column count)
n_rows, n_cols = cleanned_df.shape
print(n_rows)
print(n_cols)

# Export the cleaned frame to CSV
cleanned_df.to_csv('../data/cleanned_df.csv')

278
78
