In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.stats.api as sms

### Documentacion de stats model: https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLSResults.t_test.html#statsmodels.regression.linear_model.OLSResults.t_test

In [5]:
df_casa = pd.read_excel('./Ejemplo_Casa.xls')
display(df_casa.head())

Unnamed: 0,PRECIO,LOTE,CUARTOS,BANOS,PISOS,ENTRADA,REC,SOTANO,CALEF,AIRE,GARAGE,NBHD
0,42000,5850,3,1,2,1,0,1,0,0,1,0
1,38500,4000,2,1,1,1,0,0,0,0,0,0
2,49500,3060,3,1,1,1,0,0,0,0,0,0
3,60500,6650,3,1,2,1,1,0,0,0,0,0
4,61000,6360,2,1,1,1,0,0,0,0,0,0


In [6]:
y=df_casa['PRECIO']
X=sm.add_constant(df_casa.drop(columns='PRECIO'))

reg_casa=sm.OLS(y,X).fit()
display(reg_casa.summary())

0,1,2,3
Dep. Variable:,PRECIO,R-squared:,0.673
Model:,OLS,Adj. R-squared:,0.666
Method:,Least Squares,F-statistic:,99.97
Date:,"Wed, 30 Jul 2025",Prob (F-statistic):,6.18e-122
Time:,12:23:24,Log-Likelihood:,-6034.1
No. Observations:,546,AIC:,12090.0
Df Residuals:,534,BIC:,12140.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4038.3504,3409.471,-1.184,0.237,-1.07e+04,2659.271
LOTE,3.5463,0.350,10.124,0.000,2.858,4.234
CUARTOS,1832.0035,1047.000,1.750,0.081,-224.741,3888.748
BANOS,1.434e+04,1489.921,9.622,0.000,1.14e+04,1.73e+04
PISOS,6556.9457,925.290,7.086,0.000,4739.291,8374.600
ENTRADA,6687.7789,2045.246,3.270,0.001,2670.065,1.07e+04
REC,4511.2838,1899.958,2.374,0.018,778.976,8243.592
SOTANO,5452.3855,1588.024,3.433,0.001,2332.845,8571.926
CALEF,1.283e+04,3217.597,3.988,0.000,6510.706,1.92e+04

0,1,2,3
Omnibus:,93.454,Durbin-Watson:,1.604
Prob(Omnibus):,0.0,Jarque-Bera (JB):,247.62
Skew:,0.853,Prob(JB):,1.7e-54
Kurtosis:,5.824,Cond. No.,30700.0


### Test de hipótesis
Tomo la regresora $\hat\beta$
$$
 H_0: \hat\beta=0
 \\
 H_1: \hat\beta <>0
$$
Donde reg=0 implica que no es significativo y <> que que si lo es, por lo tanto explica a X

In [7]:
#Hacemos un t-test para los cuartos

display(reg_casa.t_test("CUARTOS=0"))

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0          1832.0035   1047.000      1.750      0.081    -224.741    3888.748

Podemos reconstruir el IC de la siguiente forma

$$
\hat{\beta} \pm t_{\alpha/2, \, gl} \cdot \text{SE}(\hat{\beta})
$$

para este estimador la cuenta seria

$$
    1832 \pm 1.96 *1047
$$

El valor t que aprece en la regresión es el t observado, el t que usamos para el intervalo de confianza es el valor crítico para la distribución t-student, que como es muy grande (534) se asemeja al valor de la normal.

En función del p-value podemos ver que es significativo al 10% pero no al 5% porque es 0.081 (8%)

### Intervalo de confianza

In [8]:
display(reg_casa.conf_int(alpha=0.05).loc["LOTE"])

0    2.858168
1    4.234438
Name: LOTE, dtype: float64

### Test F
Estos test sirven para detectar multicolinealidad. Si el p-value es significativo==> rechazo H0. 
En este caso la H0 es que los coeficiente en su conjunto son 0, es decir existe multicolinealidad. 
En caso de NRH0 entonces HAY MULTICOLINEADAD si el p-value no es significativo.

In [9]:
print(reg_casa.f_test("CALEF=0, AIRE=0"))

<F test: F=36.888750867416185, p=9.856033740435273e-16, df_denom=534, df_num=2>


In [10]:
df_casa_filtrado=df_casa[df_casa['BANOS']!=4].copy()
y1=df_casa_filtrado['PRECIO']
X1=sm.add_constant(df_casa_filtrado.drop(columns='PRECIO'))


reg_casa_2=sm.OLS(y1,X1).fit()
print(reg_casa_2.summary())

                            OLS Regression Results                            
Dep. Variable:                 PRECIO   R-squared:                       0.665
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                     96.14
Date:                Wed, 30 Jul 2025   Prob (F-statistic):          7.67e-119
Time:                        12:23:24   Log-Likelihood:                -6022.2
No. Observations:                 545   AIC:                         1.207e+04
Df Residuals:                     533   BIC:                         1.212e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3509.7393   3419.390     -1.026      0.3

## Variables Dummy

In [11]:
df_casa_filtrado['DB2']=(df_casa_filtrado['BANOS']==2).astype(int)
df_casa_filtrado['DB3']=(df_casa_filtrado['BANOS']==3).astype(int)

X2=sm.add_constant(df_casa_filtrado.drop(columns=['PRECIO','BANOS'])) # Elimino la columna BANOS porque ya tengo las dummies

reg_casa_3=sm.OLS(y1,X2).fit()
print(reg_casa_3.summary())


                            OLS Regression Results                            
Dep. Variable:                 PRECIO   R-squared:                       0.665
Model:                            OLS   Adj. R-squared:                  0.657
Method:                 Least Squares   F-statistic:                     88.01
Date:                Wed, 30 Jul 2025   Prob (F-statistic):          7.04e-118
Time:                        12:23:24   Log-Likelihood:                -6022.1
No. Observations:                 545   AIC:                         1.207e+04
Df Residuals:                     532   BIC:                         1.213e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.031e+04   3464.573      2.976      0.0

### Esto es para ver cuanto se valora un baño mas

In [12]:
print(reg_casa_3.f_test("DB3= 2*DB2"))

<F test: F=0.1815129172859376, p=0.6702485908495301, df_denom=532, df_num=1>


### Formula

In [13]:
reg_casa_4=smf.ols("PRECIO~LOTE+CUARTOS+C(BANOS)+PISOS+ENTRADA+REC+SOTANO+CALEF+AIRE+GARAGE+NBHD",
                   data=df_casa_filtrado).fit()

print(reg_casa_4.summary())

                            OLS Regression Results                            
Dep. Variable:                 PRECIO   R-squared:                       0.665
Model:                            OLS   Adj. R-squared:                  0.657
Method:                 Least Squares   F-statistic:                     88.01
Date:                Wed, 30 Jul 2025   Prob (F-statistic):          7.04e-118
Time:                        12:23:24   Log-Likelihood:                -6022.1
No. Observations:                 545   AIC:                         1.207e+04
Df Residuals:                     532   BIC:                         1.213e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      1.031e+04   3464.573      2.976

In [14]:
print(reg_casa_4.f_test('C(BANOS)[T.2]=0'))

<F test: F=60.98075593837472, p=3.087247239773227e-14, df_denom=532, df_num=1>


In [15]:
## Interacciones entre variables
## La formula incorpora por defecto la constante


reg_casa_5=smf.ols('PRECIO ~ LOTE+CUARTOS+LOTE:NBHD',data=df_casa)



In [16]:
#Incluye lote, nbhd y lote*nbhd
reg_casa_6=smf.ols('PRECIO ~CUARTOS+LOTE*NBHD',data=df_casa).fit()

display(reg_casa_6.summary())

0,1,2,3
Dep. Variable:,PRECIO,R-squared:,0.409
Model:,OLS,Adj. R-squared:,0.405
Method:,Least Squares,F-statistic:,93.7
Date:,"Wed, 30 Jul 2025",Prob (F-statistic):,1.62e-60
Time:,12:23:24,Log-Likelihood:,-6195.6
No. Observations:,546,AIC:,12400.0
Df Residuals:,541,BIC:,12420.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7294.7262,4163.014,1.752,0.080,-882.925,1.55e+04
CUARTOS,1.027e+04,1212.646,8.470,0.000,7888.478,1.27e+04
LOTE,5.3083,0.504,10.539,0.000,4.319,6.298
NBHD,9343.4224,5666.371,1.649,0.100,-1787.362,2.05e+04
LOTE:NBHD,0.5926,0.917,0.646,0.519,-1.210,2.395

0,1,2,3
Omnibus:,84.951,Durbin-Watson:,1.188
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.614
Skew:,0.925,Prob(JB):,1.2e-33
Kurtosis:,4.801,Cond. No.,39400.0


In [17]:

print(reg_casa_6.f_test('NBHD=0,LOTE:NBHD=0'))

<F test: F=17.85790812392496, p=3.087994730282173e-08, df_denom=541, df_num=2>


### Test White

In [19]:
white_test=sms.het_white(reg_casa_6.resid, reg_casa_6.model.exog)

print(f"White test statistic: {white_test[0]}, p-value: {white_test[1]}")

White test statistic: 66.0416350889167, p-value: 2.560619906808647e-10


### H0= Homocedasticidad
### H1= Heterocedasticidad
### Test de White
Como el p-value es muy significativo hay heterocedasticidad.

In [21]:
reg_casa_6_robust = reg_casa_6.get_robustcov_results(cov_type='HC1')
print(reg_casa_6_robust.summary())

                            OLS Regression Results                            
Dep. Variable:                 PRECIO   R-squared:                       0.409
Model:                            OLS   Adj. R-squared:                  0.405
Method:                 Least Squares   F-statistic:                     72.62
Date:                Wed, 30 Jul 2025   Prob (F-statistic):           3.08e-49
Time:                        14:32:22   Log-Likelihood:                -6195.6
No. Observations:                 546   AIC:                         1.240e+04
Df Residuals:                     541   BIC:                         1.242e+04
Df Model:                           4                                         
Covariance Type:                  HC1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   7294.7262   4017.535      1.816      0.0

### Aca lo que cambia es el std err, t y p-value, los coeficientes no cambian.

## Ejemplo multicolinealidad

In [30]:
dfmulti = pd.read_excel('./CEO_ejemplo_multicolinealidad.xlsx',sheet_name='datos')

print(dfmulti.head(20))

X_mult=sm.add_constant(dfmulti.drop(columns='Comp'))
y=dfmulti['Comp']
display(X_mult)
display(y)
regre_multico=sm.OLS(y,X_mult).fit()

print(regre_multico.summary())


       Gan  Gan_10  Comp
0    357.0      35   0.7
1     48.0       4   0.7
2    932.0      93   0.8
3    366.0      36   0.7
4     83.0       8   0.8
5     22.0       2   0.0
6     67.0       6   0.0
7    413.0      41   0.6
8    496.0      49   0.3
9    458.0      45   0.5
10   152.0      15   0.4
11   115.0      11   1.0
12   964.0      96   0.9
13   459.0      45   1.2
14   421.0      42   1.0
15   723.0      72   1.3
16   256.0      25   1.1
17   294.0      29   0.5
18  1310.0     131   0.7
19   627.0      62   0.6


Unnamed: 0,const,Gan,Gan_10
0,1.0,357.0,35
1,1.0,48.0,4
2,1.0,932.0,93
3,1.0,366.0,36
4,1.0,83.0,8
...,...,...,...
65,1.0,327.0,32
66,1.0,409.0,40
67,1.0,117.0,11
68,1.0,179.0,17


0     0.7
1     0.7
2     0.8
3     0.7
4     0.8
     ... 
65    0.6
66    1.7
67    2.4
68    0.4
69    1.2
Name: Comp, Length: 70, dtype: float64

                            OLS Regression Results                            
Dep. Variable:                   Comp   R-squared:                       0.436
Model:                            OLS   Adj. R-squared:                  0.419
Method:                 Least Squares   F-statistic:                     25.92
Date:                Wed, 30 Jul 2025   Prob (F-statistic):           4.59e-09
Time:                        15:42:04   Log-Likelihood:                -73.546
No. Observations:                  70   AIC:                             153.1
Df Residuals:                      67   BIC:                             159.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6558      0.166      3.941      0.0

### Aca vemos varios p-values altos y un f-statistics bajo, lo que indica que es posible que exista un problema de multicolinealidas.

In [33]:
X=sm.add_constant(dfmulti["Gan"])

reg_normal=sm.OLS(y,X).fit()

print(reg_normal.summary())

                            OLS Regression Results                            
Dep. Variable:                   Comp   R-squared:                       0.434
Model:                            OLS   Adj. R-squared:                  0.426
Method:                 Least Squares   F-statistic:                     52.24
Date:                Wed, 30 Jul 2025   Prob (F-statistic):           5.50e-10
Time:                        15:45:59   Log-Likelihood:                -73.655
No. Observations:                  70   AIC:                             151.3
Df Residuals:                      68   BIC:                             155.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6000      0.112      5.342      0.0