# **Exploratory Data Analisys: Companies**

## **Initial Setup**

### Install Packages

In [1]:
%pip install pandas -q
%pip install plotly -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Import libs

In [2]:
import os
import itertools
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

### Pandas Config

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Create a file path default

In [4]:
file_path_book = str(Path(os.getcwd()).parent.parent.parent / "data/book")

## Macroeconomic

### Load data

In [5]:
df_macroeconomic_book = pd.read_csv(file_path_book + "/macroeconomic_book.csv")
df_macroeconomic_book.head(5)

Unnamed: 0,date,selic,confidence,pib,incc,ipca,dolar,monthly_inflation,gdp_growth,dollar_growth,real_interest_rate,inflation_confidence_difference
0,2019-01-31,6.5,128.64,578214.5,0.49,3.78,3.6513,-0.111948,-0.015756,0.016459,2.72,-124.86
1,2019-02-28,6.5,139.39,576089.7,0.09,3.89,3.7379,0.029101,-0.003675,0.023718,2.61,-135.5
2,2019-03-31,6.5,125.53,601749.8,0.31,4.58,3.8961,0.177378,0.044542,0.042323,1.92,-120.95
3,2019-04-30,6.5,121.71,612918.4,0.38,4.94,3.9447,0.078603,0.01856,0.012474,1.56,-116.77
4,2019-05-31,6.5,117.01,615304.9,0.03,4.66,3.9401,-0.05668,0.003894,-0.001166,1.84,-112.35


In [18]:
df_macroeconomic_numeric_cols = df_macroeconomic_book.select_dtypes(include=["int", "number", "float64"])
df_macroeconomic_numeric_cols.head(5)

Unnamed: 0,selic,confidence,pib,incc,ipca,dolar,monthly_inflation,gdp_growth,dollar_growth,real_interest_rate,inflation_confidence_difference
0,6.5,128.64,578214.5,0.49,3.78,3.6513,-0.111948,-0.015756,0.016459,2.72,-124.86
1,6.5,139.39,576089.7,0.09,3.89,3.7379,0.029101,-0.003675,0.023718,2.61,-135.5
2,6.5,125.53,601749.8,0.31,4.58,3.8961,0.177378,0.044542,0.042323,1.92,-120.95
3,6.5,121.71,612918.4,0.38,4.94,3.9447,0.078603,0.01856,0.012474,1.56,-116.77
4,6.5,117.01,615304.9,0.03,4.66,3.9401,-0.05668,0.003894,-0.001166,1.84,-112.35


### Macroeconomic Analisys

#### Outlier Analysis

In [None]:
num_columns = len(df_macroeconomic_book)
num_rows = (num_columns + 2) // 2

subplot_titles = [str(col) for col in df_macroeconomic_book]

fig = sp.make_subplots(rows=num_rows, cols=3, subplot_titles=subplot_titles)

for i, column in enumerate(df_macroeconomic_book, start=1):
    row = (i - 1) // 3 + 1
    col = (i - 1) % 3 + 1

    trace = go.Box(y=df_macroeconomic_book[column], name=column, 
                   marker_color='lightseagreen', boxpoints='outliers', 
                   jitter=0.7, hoverinfo='y+text', 
                   text=())

    fig.add_trace(trace, row=row, col=col)

fig.update_layout(title_text='Boxplot of Numerical Variables', height=300*num_rows, showlegend=False, template='plotly_dark')
fig.show()

### Histogram

In [None]:
subplot_titles = [str(col) for col in df_macroeconomic_book]
columns_per_row = 3

num_rows = len(df_macroeconomic_book) // columns_per_row + (len(df_macroeconomic_book) % columns_per_row > 0)
fig = sp.make_subplots(rows=num_rows, cols=columns_per_row, subplot_titles=subplot_titles)

for i, column in enumerate(df_macroeconomic_book):
    row = i // columns_per_row + 1
    col = i % columns_per_row + 1
    
    fig.add_trace(go.Histogram(x=df_macroeconomic_book[column], name=column, marker_color='lightseagreen'), row=row, col=col)

fig.update_layout(title='Histograms by Column', height=300 * num_rows, showlegend=False, template='plotly_dark')
fig.show()

### Correlation and Dispersion

In [26]:

df_macroeconomic_numeric_cols_corr = df_macroeconomic_numeric_cols.corr()
color_scale = [[0, 'rgb(150, 245, 231)'], [0.5, 'rgb(100, 195, 181)'], [1, 'rgb(50, 145, 131)']]
fig = px.imshow(df_macroeconomic_numeric_cols_corr, text_auto=True, aspect="auto", width=1000, height=500, template="plotly_dark", title="Mapa de Calor: Correlação entre as variaveis", color_continuous_scale=color_scale)
fig.show()

In [None]:
fig = px.scatter_matrix(df_macroeconomic_numeric_cols_corr, dimensions=df_macroeconomic_numeric_cols_corr.columns, title='Gráficos de Dispersão para Todas as Colunas', template="plotly_dark",)
fig.update_traces(marker=dict(color='rgb(100, 195, 181)'))

fig.update_layout(width=4000, height=3000, grid=dict(xgap=0.1, ygap=0.5))

fig.show()

### Análise de Tendência

#### Taxa SELIC e Taxa de Juros Real:

In [42]:
df = df_macroeconomic_book
fig_selic = sp.make_subplots(specs=[[{"secondary_y": True}]])

fig_selic.add_trace(go.Scatter(x=df['date'], y=df['selic'], name='SELIC', line=dict(color='rgb(100, 195, 181)')), secondary_y=False)
fig_selic.add_trace(go.Scatter(x=df['date'], y=df['real_interest_rate'], name='Real Interest Rate', line=dict(color='rgb(255, 0, 0)')), secondary_y=True)

fig_selic.update_layout(title_text='SELIC Rate and Real Interest Rate Over Time', template='plotly_dark', legend=dict(font=dict(color='white')), height=600)

fig_selic.update_xaxes(title='Date', title_font=dict(color='white'), tickfont=dict(color='white'))
fig_selic.update_yaxes(title='SELIC Rate (%)', secondary_y=False, title_font=dict(color='white'), tickfont=dict(color='white'))
fig_selic.update_yaxes(title='Real Interest Rate (%)', secondary_y=True, title_font=dict(color='white'), tickfont=dict(color='white'))

fig_selic.show()


#### Confiança do Consumidor:

In [43]:
fig_confidence = px.line(df, x='date', y='confidence', title='Consumer Confidence',
                         color_discrete_sequence=['rgb(100, 195, 181)'], 
                         template='plotly_dark', height=600)
fig_confidence.update_xaxes(title='Date', title_font=dict(color='white'), tickfont=dict(color='white'))
fig_confidence.update_yaxes(title='Confidence Index', title_font=dict(color='white'), tickfont=dict(color='white'))

fig_confidence.show()


#### Crescimento do PIB:

In [44]:
fig_gdp = px.line(df, x='date', y='gdp_growth', title='GDP Growth Rate',
                  color_discrete_sequence=['rgb(100, 195, 181)'], 
                  template='plotly_dark', height=600)
fig_gdp.update_xaxes(title='Date', title_font=dict(color='white'), tickfont=dict(color='white'))
fig_gdp.update_yaxes(title='GDP Growth Rate (%)', title_font=dict(color='white'), tickfont=dict(color='white'))

fig_gdp.show()


#### Dólar e Inflação Mensal:

In [45]:
# Continuando do código anterior para Dólar vs. Inflação Mensal com o template 'plotly_dark'
fig_dollar_inflation = sp.make_subplots(specs=[[{"secondary_y": True}]])

# Adicionando a série de taxa de câmbio do dólar
fig_dollar_inflation.add_trace(
    go.Scatter(x=df['date'], y=df['dolar'], name='Dollar Exchange Rate', line=dict(color='rgb(100, 195, 181)')),
    secondary_y=False,
)

# Adicionando a série de inflação mensal
fig_dollar_inflation.add_trace(
    go.Scatter(x=df['date'], y=df['monthly_inflation'], name='Monthly Inflation', line=dict(color='rgb(255, 0, 0)')),
    secondary_y=True,
)

# Configurações de layout usando template 'plotly_dark'
fig_dollar_inflation.update_layout(
    title_text='Dollar Exchange Rate and Monthly Inflation Over Time',
    template='plotly_dark',
    legend=dict(font=dict(color='white')),
    height=600
)

# Atualizações para o eixo x e eixos y primários e secundários
fig_dollar_inflation.update_xaxes(title='Date', title_font=dict(color='white'), tickfont=dict(color='white'))
fig_dollar_inflation.update_yaxes(title='Dollar Exchange Rate (BRL/USD)', secondary_y=False, title_font=dict(color='white'), tickfont=dict(color='white'))
fig_dollar_inflation.update_yaxes(title='Monthly Inflation (%)', secondary_y=True, title_font=dict(color='white'), tickfont=dict(color='white'))

fig_dollar_inflation.show()


### Análise de Volatilidade:

In [57]:
import pandas as pd
import plotly.graph_objects as go

# Assumindo que 'df' é seu DataFrame e 'date' a coluna com datas
# df = seu_dataframe

# Definir o tamanho da janela para o cálculo do desvio padrão móvel
window_size = 2  # Por exemplo, uma janela de 12 meses

# Calcular o desvio padrão móvel da taxa de câmbio do dólar e da inflação
df['rolling_std_dollar'] = df['dolar'].rolling(window=window_size).std()
df['rolling_std_inflation'] = df['ipca'].rolling(window=window_size).std()

# Criar gráfico da volatilidade do dólar
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['date'], y=df['rolling_std_dollar'], name='Rolling Std Dev - Dollar', line=dict(color='royalblue', width=2)))

fig.update_layout(title='Volatility of Dollar Exchange Rate (Rolling Standard Deviation)',
                  xaxis_title='Date',
                  yaxis_title='Rolling Std Dev',
                  template='plotly_dark')


fig.add_trace(go.Scatter(x=df['date'], y=df['rolling_std_inflation'], name='Rolling Std Dev - Inflation', line=dict(color='tomato', width=2)))

fig.update_layout(title='Volatility of Monthly Inflation (Rolling Standard Deviation)',
                  xaxis_title='Date',
                  yaxis_title='Rolling Std Dev',
                  template='plotly_dark')

fig.show()


### Análise de Componentes Principais (PCA)

In [61]:
import plotly.graph_objs as go
import numpy as np

# Substitua 'pca' com o objeto PCA após ajustar seus dados.
explained_var = np.cumsum(pca.explained_variance_ratio_)

# Criação do gráfico de linha para a variância explicada acumulada
fig = go.Figure()

fig.add_trace(go.Scatter(x=list(range(1, len(explained_var)+1)), y=explained_var, mode='lines+markers',
                         line=dict(color='royalblue', width=2), marker=dict(color='lightseagreen')))

fig.update_layout(
    title='Cumulative Explained Variance by PCA Components',
    xaxis=dict(title='Number of Components'),
    yaxis=dict(title='Cumulative Explained Variance'),
    yaxis_tickformat='%',  # Formato percentual para o eixo y
    template='plotly_dark',  # Usando o template 'plotly_dark'
    showlegend=False,
    annotations=[
        dict(
            x=np.where(explained_var > 0.95)[0][0],
            y=0.95,
            xref="x",
            yref="y",
            text="95% cut-off",
            showarrow=True,
            arrowhead=7,
            ax=0,
            ay=-40
        )
    ]
)

# Mostrar gráfico
fig.show()


### Analise Grafica:

In [63]:
import plotly.express as px

# Supondo que 'df' é seu DataFrame e 'date' a coluna com datas
# df = seu_dataframe

# Gráfico de barras para a inflação mensal (IPCA) ao longo do tempo
fig_ipca_bar = px.bar(df, x='date', y='ipca', title='Monthly IPCA Over Time', labels={'ipca': 'IPCA (%)'})

# Personalização do layout para usar o template 'plotly_dark'
fig_ipca_bar.update_layout(template='plotly_dark', xaxis_title='Date', yaxis_title='IPCA (%)')

# Exibir o gráfico
fig_ipca_bar.show()


In [64]:
import plotly.express as px

# Supondo que 'df' é seu DataFrame e 'date' a coluna com datas e 'ipca' a coluna com a inflação mensal
df = df_macroeconomic_book

# Gráfico de linha para a tendência da inflação mensal
fig_inflation_trend = px.line(df, x='date', y='ipca', title='Monthly Inflation Trend', labels={'ipca': 'Monthly Inflation (%)'})

# Personalização do layout para usar o template 'plotly_dark'
fig_inflation_trend.update_layout(template='plotly_dark', xaxis_title='Date', yaxis_title='Monthly Inflation (%)')

# Exibir o gráfico
fig_inflation_trend.show()


### Inflação

In [65]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Suponha que 'df' é o seu DataFrame e tem uma coluna 'date' no formato de data e 'ipca' para inflação mensal
# df = seu_dataframe

# Converter 'date' para datetime se ainda não for o tipo certo
df['date'] = pd.to_datetime(df['date'])

# Calcular a inflação acumulada anual (exemplo simplificado, pode precisar de ajustes baseados na realidade dos dados)
df['year'] = df['date'].dt.year
df_annual_inflation = df.groupby('year')['ipca'].sum().reset_index()  # substituir sum() pela lógica de acumulação anual correta

# Criar um gráfico de barras com a inflação mensal
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(x=df['date'], y=df['ipca'], name='Monthly Inflation'),
    secondary_y=False,
)

# Adicionar a inflação acumulada anual em um gráfico de linha sobreposto
fig.add_trace(
    go.Scatter(x=df_annual_inflation['year'], y=df_annual_inflation['ipca'], name='Annual Inflation', marker=dict(color='red')),
    secondary_y=True,
)

# Atualizar layout do gráfico
fig.update_layout(title_text='Monthly and Annual Inflation Comparison',
                  template='plotly_dark')

fig.update_xaxes(title_text='Date')

fig.update_yaxes(title_text='Monthly Inflation (%)', secondary_y=False)
fig.update_yaxes(title_text='Annual Inflation (%)', secondary_y=True)

# Mostrar figura
fig.show()


#### Políticas Monetárias na Inflação:

In [67]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Suponha que 'df' é seu DataFrame e contém 'date' para datas, 'selic' para taxa de juros e 'ipca' para inflação
df = df_macroeconomic_book

# Criar uma figura com subplots compartilhados
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=("SELIC Rate", "IPCA (Inflation)"))

# Adicionar SELIC ao primeiro gráfico de linha
fig.add_trace(
    go.Scatter(x=df['date'], y=df['selic'], name="SELIC", mode='lines'),
    row=1, col=1
)

# Adicionar IPCA ao segundo gráfico de linha
fig.add_trace(
    go.Scatter(x=df['date'], y=df['ipca'], name="IPCA", mode='lines', line=dict(color='firebrick')),
    row=2, col=1
)

# Atualizar o layout
fig.update_layout(height=600, width=800, title_text="Impact of Monetary Policy on Inflation", template='plotly_dark')
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="SELIC Rate (%)", row=1, col=1)
fig.update_yaxes(title_text="IPCA (Inflation) (%)", row=2, col=1)

fig.show()


In [68]:
import pandas as pd
import statsmodels.api as sm
import plotly.express as px

# Suponha que 'df' é o seu DataFrame e contém 'selic' para a taxa de juros e 'ipca' para a inflação
# df = seu_dataframe

# Preparando os dados para a regressão
# Adicionando uma constante para o intercepto
X = sm.add_constant(df['selic'])  # Variável independente
y = df['ipca']  # Variável dependente

# Construindo o modelo de regressão
model = sm.OLS(y, X).fit()

# Mostrando o resumo do modelo
print(model.summary())

# Criando um DataFrame para a visualização dos resultados da regressão
df_regression = pd.DataFrame({'SELIC': df['selic'], 'IPCA': df['ipca'], 'IPCA_Predicted': model.predict(X)})

# Criando um gráfico de dispersão com uma linha de regressão
fig_regression = px.scatter(df_regression, x='SELIC', y='IPCA', trendline="ols", 
                            labels={"SELIC": "SELIC Rate (%)", "IPCA": "IPCA (Inflation) %"}, 
                            title="Scatter plot of SELIC vs IPCA with Regression Line")
# Adicionando a linha de previsão
fig_regression.add_scatter(x=df_regression['SELIC'], y=df_regression['IPCA_Predicted'], mode='lines', name='Predicted IPCA')

# Personalizando o layout do gráfico
fig_regression.update_layout(template='plotly_dark')

# Exibir o gráfico
fig_regression.show()


                            OLS Regression Results                            
Dep. Variable:                   ipca   R-squared:                       0.150
Model:                            OLS   Adj. R-squared:                  0.134
Method:                 Least Squares   F-statistic:                     9.506
Date:                Thu, 09 Nov 2023   Prob (F-statistic):            0.00322
Time:                        13:23:24   Log-Likelihood:                -136.61
No. Observations:                  56   AIC:                             277.2
Df Residuals:                      54   BIC:                             281.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.9253      0.746      5.264      0.0