# Regressão Linear com os Príncipais pacotes.

## Criandos os Imports

In [23]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import os


## Leituras dos CSVs

In [2]:
current_path = os.getcwd()
current_path

'c:\\Users\\alexsandro.ignacio\\OneDrive - WCA Soluções de Inteligência Comercial\\Documentos\\Docs\\Git\\python_developer\\Treino Regressao'

In [3]:
path = os.path.join(current_path,'data', 'california_housing_train.csv')

In [4]:
_df = pd.read_csv(path,sep=',')
_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


### Statsmodels

X = exógena
Y = endógenas 

Em modelos econômicos e econométricos, uma variável exógena refere-se a uma variável que é determinada fora do modelo e representa as entradas de um modelo. Em outras palavras, variáveis exógenas são fixadas no momento em que são introduzidas no modelo. Em contraste, variáveis endógenas são determinadas dentro do modelo e, portanto, representam as saídas de um modelo. O modelo especificado com as variáveis mostra como a mudança de uma variável exógena coeteris paribus afeta todas as variáveis endógenas.

https://pt.wikipedia.org/wiki/Vari%C3%A1veis_ex%C3%B3genas_e_end%C3%B3genas

In [5]:
y = _df['median_house_value']
x = _df.drop(['median_house_value'], axis=1)

In [6]:
_df.drop(['median_house_value'], axis=1).head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936


In [7]:
modelo = sm.OLS( y, x)
modelo

<statsmodels.regression.linear_model.OLS at 0x18cbacc7a30>

In [8]:
res = modelo.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.901
Model:                            OLS   Adj. R-squared (uncentered):              0.901
Method:                 Least Squares   F-statistic:                          1.927e+04
Date:                Fri, 18 Mar 2022   Prob (F-statistic):                        0.00
Time:                        17:55:14   Log-Likelihood:                     -2.1492e+05
No. Observations:               17000   AIC:                                  4.298e+05
Df Residuals:                   16992   BIC:                                  4.299e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [11]:
# Normalizando os dados
scaler = StandardScaler()
normalized_x = scaler.fit_transform(x)

modelo = sm.OLS( y, normalized_x)
res = modelo.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.153
Model:                            OLS   Adj. R-squared (uncentered):              0.152
Method:                 Least Squares   F-statistic:                              383.4
Date:                Fri, 18 Mar 2022   Prob (F-statistic):                        0.00
Time:                        17:57:18   Log-Likelihood:                     -2.3314e+05
No. Observations:               17000   AIC:                                  4.663e+05
Df Residuals:                   16992   BIC:                                  4.664e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [12]:
# Normalizando os dados
scaler = StandardScaler()
# np.array
normalized_x_np = scaler.fit_transform(x)
# DataFrame
normalized_x_df = pd.DataFrame(normalized_x_np, columns=x.columns)


modelo = sm.OLS( y, normalized_x_df)
res = modelo.fit()
print(res.summary())


                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.153
Model:                            OLS   Adj. R-squared (uncentered):              0.152
Method:                 Least Squares   F-statistic:                              383.4
Date:                Fri, 18 Mar 2022   Prob (F-statistic):                        0.00
Time:                        17:57:22   Log-Likelihood:                     -2.3314e+05
No. Observations:               17000   AIC:                                  4.663e+05
Df Residuals:                   16992   BIC:                                  4.664e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [33]:
# Normalizando os dados
scaler = StandardScaler()
# np.array
normalized_x_np = scaler.fit_transform(x)
# DataFrame
normalized_x_df = pd.DataFrame(normalized_x_np, columns=x.columns)
normalized_x_df['intercept'] = 1

modelo = sm.OLS( y, normalized_x_df)
res = modelo.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.641
Method:                 Least Squares   F-statistic:                     3798.
Date:                Fri, 18 Mar 2022   Prob (F-statistic):               0.00
Time:                        18:32:27   Log-Likelihood:            -2.1365e+05
No. Observations:               17000   AIC:                         4.273e+05
Df Residuals:                   16991   BIC:                         4.274e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
longitude           -8.65e+04   1583

### Scikit learn

In [17]:
_df.shape

(17000, 9)

In [15]:
x.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82


In [16]:
y.head(2)

0    66900.0
1    80100.0
Name: median_house_value, dtype: float64

In [24]:
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.3, random_state=0)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(11900, 8) (5100, 8) (11900,) (5100,)


In [25]:
print(x_train.head(2), x_val.head(2), y_train.head(2), y_val.head(2))

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
322      -116.88     34.25                11.0       1089.0           198.0   
11968    -121.38     38.60                36.0       1249.0           159.0   

       population  households  median_income  
322         230.0        90.0         4.9643  
11968       362.0       143.0         6.8469         longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
3873    -117.95     33.84                18.0       3418.0           815.0   
3625    -117.92     34.08                36.0       1479.0           251.0   

      population  households  median_income  
3873      1961.0       773.0         3.6500  
3625       741.0       245.0         4.2986   322      176000.0
11968    446400.0
Name: median_house_value, dtype: float64 3873    171400.0
3625    189600.0
Name: median_house_value, dtype: float64


In [22]:
modelo = LinearRegression()
modelo.fit(x_train, y_train)
p = modelo.predict(x_val)
p

array([113335.2851803 ,  87644.62646805, 123056.10495727, ...,
       102209.06704475, 310108.3880645 , 182088.4909068 ])

In [26]:
modelo = LinearRegression(normalize=True)
modelo.fit(x_train, y_train)
p = modelo.predict(x_val)
p

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




array([210115.00792512, 218552.25069197, 465480.37566264, ...,
       120891.22403707, 173531.86120393, 216921.07491236])

In [27]:
np.sqrt(mean_squared_error(y_val, p))

69057.71585050465

In [28]:
modelo.coef_

array([-4.29924725e+04, -4.27651880e+04,  1.17410488e+03, -8.43005986e+00,
        1.18929331e+02, -3.58414446e+01,  3.86437859e+01,  4.04939678e+04])

* Usando scaler para normalizar os dados.

In [31]:
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.3, random_state=0)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

# Normalizando os dados
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

modelo = LinearRegression(normalize=False)
modelo.fit(x_train_scaled, y_train)
p = modelo.predict(x_val_scaled)
mse = np.sqrt(mean_squared_error(y_val, p))
cof = modelo.coef_

print(f"\n{p}\n\n{mse}\n\n{cof}\n")

(11900, 8) (5100, 8) (11900,) (5100,)

[210115.00792512 218552.25069197 465480.37566264 ... 120891.22403707
 173531.86120393 216921.07491236]

69057.71585050465

[-86347.90780568 -91686.55531605  14820.25508291 -18738.10696631
  50487.93188416 -42342.52251271  14973.15005562  77778.8185199 ]





In [32]:
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.3, random_state=0)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

# Normalizando os dados
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

modelo = LinearRegression(fit_intercept=False , normalize=False)
modelo.fit(x_train_scaled, y_train)
p = modelo.predict(x_val_scaled)
mse = np.sqrt(mean_squared_error(y_val, p))
cof = modelo.coef_

print(f"\n{p}\n\n{mse}\n\n{cof}\n")

(11900, 8) (5100, 8) (11900,) (5100,)

[  2174.95834529  10612.20111213 257540.32608281 ... -87048.82554276
 -34408.18837591   8981.02533253]

219217.21439528995

[-86347.90780568 -91686.55531605  14820.25508291 -18738.10696632
  50487.93188416 -42342.52251271  14973.15005562  77778.8185199 ]



