# 8.3. Aplicação Prática: Regressão Linear

In [18]:
import numpy as np
import pandas as pd
import scipy.stats

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn import metrics 
import sklearn.metrics as metrics
from math import sqrt
import plotly.graph_objects as go

import warnings
# Silences every warning category for the rest of the session; this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Fetching the data used in the analysis

def consulta_bc(codigo_bcb):
    """Download one time series from the Banco Central do Brasil SGS API.

    Parameters
    ----------
    codigo_bcb : int or str
        Series code; it is interpolated into the request URL.

    Returns
    -------
    pandas.DataFrame
        The JSON payload as a frame indexed by the ``data`` column, which is
        parsed as day-first dates.
    """
    # HTTPS endpoint: the original plain-HTTP request could be tampered with
    # in transit.
    url = 'https://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(codigo_bcb)
    serie = pd.read_json(url)
    serie['data'] = pd.to_datetime(serie['data'], dayfirst=True)
    # Return a new indexed frame instead of mutating in place.
    return serie.set_index('data')

# National Broad Consumer Price Index (IPCA), IBGE - SGS series 433
ipca = consulta_bc(codigo_bcb=433)
# IPCA accumulated over 12 months, IBGE - SGS series 13522
ipca_12 = consulta_bc(codigo_bcb=13522)

In [3]:
# Summary statistics of the `valor` column of the 12-month IPCA series
ipca_12["valor"].describe()

count     502.000000
mean      318.094582
std       874.412525
min         1.650000
25%         5.462500
50%         8.055000
75%       165.885000
max      6821.310000
Name: valor, dtype: float64

In [4]:
# Summary statistics of the `valor` column of the IPCA series
ipca["valor"].describe()

count    513.000000
mean       5.969454
std       10.877966
min       -0.680000
25%        0.380000
50%        0.780000
75%        6.840000
max       82.390000
Name: valor, dtype: float64

In [5]:
# Inner-join both series on their date index and give the two `valor`
# columns meaningful names (left = ipca, right = ipca_12).
ipcas = (
    pd.merge(ipca, ipca_12, left_index=True, right_index=True)
    .set_axis(["ipca", "ipca_12"], axis=1)
)
ipcas

Unnamed: 0_level_0,ipca,ipca_12
data,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-12-01,6.61,99.25
1981-01-01,6.84,99.67
1981-02-01,6.40,103.07
1981-03-01,4.97,101.03
1981-04-01,6.46,103.27
...,...,...
2022-05-01,0.47,11.73
2022-06-01,0.67,11.89
2022-07-01,-0.68,10.07
2022-08-01,-0.36,8.73


In [8]:
# Outlier treatment: in each column, every value above the third quartile
# is replaced by that column's median.
#
# The statistics are requested by name (quantile / median) instead of by
# integer position inside describe(): positional access on a label-indexed
# Series is deprecated in pandas, and describe() was being recomputed for
# every lookup.
#
# NOTE: this rewrites `ipcas` in place, so running the cell again applies
# the rule on top of already-treated values.
for coluna in ("ipca", "ipca_12"):
    q3 = ipcas[coluna].quantile(0.75)
    mediana = ipcas[coluna].median()
    ipcas[coluna] = ipcas[coluna].mask(ipcas[coluna] > q3, mediana)

In [14]:
# Scatter of `ipca` (y) against `ipca_12` (x) for the 1994-2013 window.
# Rows and column are selected in a single .loc call rather than through
# chained indexing.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=ipcas.loc["1994":"2013", "ipca_12"],
        y=ipcas.loc["1994":"2013", "ipca"],
        mode='markers',
    )
)

fig.update_layout(
    height=600,
    width=800,
    title_text="Relação entre IPCA e IPCA12 - 1994:2013",  # title previously misspelled the series as "IPAC12"
    font_color="blue",
    title_font_color="black",
    xaxis_title="IPCA_12",
    yaxis_title="IPCA",
    font=dict(size=15, color="Black"),
)
fig.show()

In [13]:
# Scatter of `ipca` (y) against `ipca_12` (x) for the 2014-2022 window.
# Rows and column are selected in a single .loc call rather than through
# chained indexing.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=ipcas.loc["2014":"2022", "ipca_12"],
        y=ipcas.loc["2014":"2022", "ipca"],
        mode='markers',
    )
)

fig.update_layout(
    height=600,
    width=800,
    title_text="Relação entre IPCA e IPCA12 - 2014:2022",  # title previously misspelled the series as "IPAC12"
    font_color="blue",
    title_font_color="black",
    xaxis_title="IPCA_12",
    yaxis_title="IPCA",
    font=dict(size=15, color="Black"),
)
fig.show()

# A regressão linear

In [15]:
# Splitting the data into training and test sets

x = ipcas.drop(columns="ipca")  # features: what is left is ipca_12
y = ipcas["ipca"]               # target: ipca

# Split by period rather than at random: 1994-2013 is used for training and
# 2014-2022 for testing (the random train_test_split alternative stays
# disabled).
x_train, x_test = x.loc["1994":"2013"], x.loc["2014":"2022"]
y_train, y_test = y.loc["1994":"2013"], y.loc["2014":"2022"]

In [16]:
# Training the model

lr = LinearRegression()

# This call is the one that actually fits the regression to the training data.
lr.fit(X=x_train, y=y_train)

LinearRegression()

In [17]:
# Predictions for the training and the test partitions

y_pred_train, y_pred_test = lr.predict(x_train), lr.predict(x_test)

### Métricas de avaliação: MAE, R2 e RMSE

In [19]:
# Evaluating the results
# The train and test evaluations share the same computation and the same
# report layout, so both are expressed once as helpers.

def _avaliar_regressao(y_real, y_previsto):
    """Return ``(MAE, R2, RMSE)`` for observed versus predicted values."""
    mae = metrics.mean_absolute_error(y_real, y_previsto)
    r2 = metrics.r2_score(y_real, y_previsto)  # coefficient of determination
    rmse = sqrt(metrics.mean_squared_error(y_real, y_previsto))
    return mae, r2, rmse


def _imprimir_avaliacao(titulo, mae, r2, rmse):
    """Print one evaluation report with each metric rounded to 2 decimals."""
    print(titulo)
    print("MAE: ", round(mae, 2))
    print('R2: ', round(r2, 2))
    print('RMSE: ', round(rmse, 2))


MAE_train, R2_train, RMSE_train = _avaliar_regressao(y_train, y_pred_train)
_imprimir_avaliacao("----- Avaliação do treinamento -----", MAE_train, R2_train, RMSE_train)

MAE_test, R2_test, RMSE_test = _avaliar_regressao(y_test, y_pred_test)

print("")
_imprimir_avaliacao("----- Avaliação do teste -----", MAE_test, R2_test, RMSE_test)


----- Avaliação do treinamento -----
MAE:  0.34
R2:  0.16
RMSE:  0.61

----- Avaliação do teste -----
MAE:  0.31
R2:  0.12
RMSE:  0.4


### Entendendo os coeficientes da regressão

In [20]:
# Coefficient(s) estimated by the fitted model, one entry per feature column
lr.coef_

array([0.03671909])

In [21]:
# And the intercept (constant term of the fitted equation)

lr.intercept_

0.3570427534320629

In [None]:
# A minha regressão linear: ipca = 0.3570427534320629 + 0.03671909 * ipca_12

In [24]:
# Manual prediction for ipca_12 = 10.5.
# The intercept and slope are read from the fitted model instead of being
# typed in by hand, so the result stays correct whenever the model is
# refitted on refreshed data.
lr.intercept_ + lr.coef_[0] * 10.5

0.7425931984320628

In [22]:
# Predicting a single record
# The model was fitted on a DataFrame, so the new observation is passed as a
# one-row frame with the same column labels instead of a bare nested list
# (which mismatches the feature names seen during fit).

lr.predict(pd.DataFrame([[10.5]], columns=x_train.columns))

array([0.74259323])

In [25]:
# Results: observed vs. predicted values side by side, plus the residual
# (observed minus predicted)

resultados = (
    pd.DataFrame({"Real": y_test, "Previsto": y_pred_test})
    .assign(Residuos=lambda tabela: tabela["Real"] - tabela["Previsto"])
)
resultados

Unnamed: 0_level_0,Real,Previsto,Residuos
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-01,0.55,0.562302,-0.012302
2014-02-01,0.69,0.565607,0.124393
2014-03-01,0.92,0.582865,0.337135
2014-04-01,0.67,0.587639,0.082361
2014-05-01,0.46,0.590943,-0.130943
...,...,...,...
2022-05-01,0.47,0.787758,-0.317758
2022-06-01,0.67,0.793633,-0.123633
2022-07-01,-0.68,0.726804,-1.406804
2022-08-01,-0.36,0.677600,-1.037600


In [26]:
# Observed (y axis) against predicted (x axis) values on the test set
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=resultados["Previsto"],
        y=resultados["Real"],
        mode='markers',
    )
)

fig.update_layout(
    height=600,
    width=800,
    title_text="Avaliação da regressão - Real x Previsto",
    font_color="blue",
    title_font_color="black",
    xaxis_title="Previsto",
    yaxis_title="Real",
    font=dict(size=15, color="Black"),
)
fig.show()

In [27]:
# Predicted values (y axis) against residuals (x axis) on the test set
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=resultados["Residuos"],
        y=resultados["Previsto"],
        mode="markers",
    )
)

fig.update_layout(
    height=600,
    width=800,
    title_text="Avaliação da regressão - Fitted x Residuals",
    font_color="blue",
    title_font_color="black",
    xaxis_title="Resíduos",
    yaxis_title="Previsto",
    font=dict(size=15, color="Black"),
)
fig.show()