### EDA
# https://www.youtube.com/watch?v=7_Js8h709Dw
# https://pypi.org/project/ydata-profiling/


In [97]:
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
from statsmodels.tsa.stattools import adfuller


In [98]:
# Importing the dataset
df_BR_debt = pd.read_csv('../data/cleanned_df.csv')


In [99]:
df_BR_debt.shape

(278, 80)

In [100]:

columns_to_drop = ['eco_fiscal_result_12months', 'eco_inflation_month', 'eco_gross_debt_gdp_pre', 'eco_net_debt_gdp_%',
       'eco_net_debt_gdp_%_federal_govt','CAN_inflation',
       'UK_inflation', 'CAN_interest', 'CHI_interest', 'UK_interest', 
       'bud_fun_09_initial_value', 'bud_fun_10_initial_value', 'bud_fun_12_initial_value',
       'bud_group_personal_initial_value', 'bud_group_invest_initial_value',
       'bud_type_mandatory_initial_value', 'bud_type_amendments_initial_value',
       'exp_DIC_y+2', 'exp_GDP (%)_y+2', 'exp_US$_currency_y+2',
       'exp_basic_interest_rate_y+2', 'exp_inflation_y+2', 
       'exp_net_public_debt_y+2', 'exp_primary_result_y+2',
       'exp_trade_balance_y+2',
       'UK_GDP', 'CHI_inflation', #Second group
       'exp_DIC_y+1','exp_GDP (%)_y+1','exp_US$_currency_y+1','exp_basic_interest_rate_y+1',
       'exp_inflation_y+1','exp_net_public_debt_y+1','exp_primary_result_y+1',
       'exp_trade_balance_y+1','bud_fun_09_spent_value', 'bud_fun_10_spent_value',
       'bud_fun_12_spent_value','bud_group_invest_spent_value', 
       'bud_type_disc_initial_value', 'bud_type_amendments_spent_value','bud_type_disc_spent_value',
       # Third group
       'eco_fiscal_result_month', 'eco_inflation_12months','eco_gross_debt_R$_pre', 
       'eco_balance_payments_US$_M','CHN_GDP', 'US_GDP', 'CHN_inflation', 'EUZ_inflation', 
       'US_inflation', 'CHN_interest',
       'Euro area (19 countries)',
       'US_interest', 'Brent Crude Oil', 
       'exp_GDP (%)_y','exp_inflation_y', 
       'exp_net_public_debt_y',
       # Fourth group
       'eco_interest_rate','exp_primary_result_y', 'exp_basic_interest_rate_y','exp_US$_currency_y', 'Orange']

# Dropping columns
df_BR_debt.drop(columns_to_drop, axis=1, inplace=True)

# Setting date as index
df_BR_debt.set_index('date', inplace=True)

In [101]:
df_BR_debt.shape

(278, 16)

In [102]:
# Code to normalize the dataset

# Select numerical columns (excluding any time/date or categorical columns)
numerical_cols = df_BR_debt.select_dtypes(include=['float64', 'int64']).columns

# Standardization
# scaler = StandardScaler()
# df_standardized = df.copy()
# df_standardized[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Normalization
min_max_scaler = MinMaxScaler()
BR_param_norm = df_BR_debt.copy()
BR_param_norm[numerical_cols] = min_max_scaler.fit_transform(df_BR_debt[numerical_cols])


In [103]:
BR_param_norm.shape

(278, 16)

In [104]:
# This chunck of code is to check the stationarity of the dataset and eventually make it stationary

# Function to check stationarity using ADF test
def check_stationarity(data):
    # Remove missing or infinite values from the data
    clean_data = data.replace([np.inf, -np.inf], np.nan).dropna()
    
    # Check if there are still valid data points after removing missing or infinite values
    if len(clean_data) > 0:
        result = adfuller(clean_data)
        p_value = result[1]
        return p_value
    else:
        return np.nan  # Return NaN if all data points are missing or infinite

# track variables that need adjustment
variables_to_adjust = BR_param_norm.columns.tolist()  # Start with all columns

# Sometimes the variable needs to be differentiated more than once to achieve stationarity
# Dictionary to track the order of differentiation for each variable
differentiation_order = {column: 1 for column in variables_to_adjust if column != 'date'}

# Check stationarity of each variable iteratively until all are stationary
while variables_to_adjust:
    # Initialize list for next iteration
    variables_to_adjust_next = []  
    
    # Iterate over variables that need adjustment
    for column in variables_to_adjust:
        if column != BR_param_norm.index.name: # Code to execute if the column is not the index
            p_value = check_stationarity(BR_param_norm[column]) # Get the p_value for the ADF test
            if not np.isnan(p_value) and p_value > 0.05:
                order = differentiation_order[column]  # Get the current order of differentiation
                BR_param_norm[column] = BR_param_norm[column].diff(order).fillna(0)  # Take the difference with the current order
                differentiation_order[column] += 1  # Increment the order of differentiation for the next iteration
                variables_to_adjust_next.append(column)  # Add variable for further adjustment
            else:
                variables_to_adjust.remove(column)  # Remove variable if stationary
        
    # Update variables_to_adjust for next iteration
    variables_to_adjust = variables_to_adjust_next
    print("One more wave of adjustments done. Remaining variables to adjust:", len(variables_to_adjust))

# #Print variables that were adjusted to achieve stationarity
# if len(variables_to_adjust) > 0:
#     print("The following variables needed to be adjusted to achieve stationarity:")
#     print(variables_to_adjust)
# else:
#     print("All variables are already stationary.")

One more wave of adjustments done. Remaining variables to adjust: 16
One more wave of adjustments done. Remaining variables to adjust: 2
One more wave of adjustments done. Remaining variables to adjust: 1
One more wave of adjustments done. Remaining variables to adjust: 0


In [105]:
BR_param_norm.shape

(278, 16)

In [106]:
BR_param_norm.head()

Unnamed: 0_level_0,eco_net_debt_R$,eco_net_debt_R$_federal_govt,eco_GDP_R$_12_months,Coffee,Iron Ore,Meat index,Soybeans,Sugar,bud_group_personal_spent_value,bud_type_mandatory_spent_value,exp_DIC_y,exp_trade_balance_y,CNY,EUR,USD,eco_total_revenue
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2001-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001-02,0.00154,0.001741,0.0,0.006594,0.0,0.016707,-0.017165,-0.026579,0.0,0.0,0.0,-0.009244,0.01371,0.0098,0.017578,-0.019192
2001-03,0.002021,0.001652,0.0,-0.002477,0.0,0.062944,-0.006628,-0.02479,0.0,0.0,-0.007171,-0.008764,0.021579,0.002074,0.027612,0.008275
2001-04,0.000605,0.001101,0.000655,-0.002715,0.0,0.001111,-0.012416,-0.025651,0.0,0.0,-0.009778,-0.003001,0.00429,0.008194,0.00548,0.007732
2001-05,0.002808,0.002762,-0.000336,0.014764,0.0,0.032169,0.011328,0.03441,0.0,0.0,-0.022164,-0.003001,0.03248,0.012233,0.041585,-0.003013


In [107]:
BR_param_norm.to_csv('../data/BR_param_norm.csv', index=True)

In [108]:
BR_param_norm.columns

Index(['eco_net_debt_R$', 'eco_net_debt_R$_federal_govt',
       'eco_GDP_R$_12_months', 'Coffee', 'Iron Ore', 'Meat index', 'Soybeans',
       'Sugar', 'bud_group_personal_spent_value',
       'bud_type_mandatory_spent_value', 'exp_DIC_y', 'exp_trade_balance_y',
       'CNY', 'EUR', 'USD', 'eco_total_revenue'],
      dtype='object')

In [109]:
# Now, let´s create some other variables to be used in the models

# Exclude the 'index' column from feature engineering
#features = BR_param_norm.columns.drop(['date'])  # Exclude 'date' as well since it's not a variable to engineer
features = BR_param_norm.columns

# Initialize a list to hold the new feature DataFrames
new_features = []

# Create lag and rolling window features for all variables
for feature in features:
    for lag in [1, 2, 6, 12]:
        new_features.append(BR_param_norm[feature].shift(lag).rename(f'{feature}_lag_{lag}'))
    new_features.append(BR_param_norm[feature].rolling(window=12).mean().rename(f'{feature}_roll_mean_12'))
    new_features.append(BR_param_norm[feature].rolling(window=12).std().rename(f'{feature}_roll_std_12'))

# Concatenate all new features with the original DataFrame
BR_param_norm = pd.concat([BR_param_norm] + new_features, axis=1)


New features created using pd.concat, avoiding DataFrame fragmentation.
Total columns in the dataframe now: 112


In [69]:

profile = ProfileReport(BR_param_norm, title="Profiling Report")
#profile = ProfileReport(df_BR_debt, tsmode=True, sortby="Date Local")
profile.to_file('profile_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:  29%|██▊       | 6/21 [00:00<00:00, 32.78it/s, Describe variable:Iron Ore]                    


MemoryError: Unable to allocate 6.38 PiB for an array with shape (897592453862868,) and data type float64

In [16]:
df_BR_debt.columns

Index(['eco_net_debt_R$', 'eco_net_debt_R$_federal_govt',
       'eco_GDP_R$_12_months', 'Coffee', 'Iron Ore', 'Meat index', 'Soybeans',
       'Sugar', 'bud_group_personal_spent_value',
       'bud_type_mandatory_spent_value', 'exp_DIC_y', 'exp_trade_balance_y',
       'CNY', 'EUR', 'USD', 'eco_total_revenue'],
      dtype='object')