# Sesión 09: Análisis de regresión: selección de modelos

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn import linear_model

In [2]:
# Load the burrito ratings workbook (path is relative to the notebook's working directory)
df = pd.read_excel('data/b05_burritos.xlsx')
# List the available column names
df.columns

Index(['Location', 'Burrito', 'Cost', 'Hunger', 'Tortilla', 'Temp', 'Meat',
       'Fillings', 'Meat:filling', 'Uniformity', 'Salsa', 'Synergy', 'Wrap',
       'overall', 'Reviewer'],
      dtype='object')

In [3]:
# Show dtypes and non-null counts per column to see where values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      239 non-null    object 
 1   Burrito       239 non-null    object 
 2   Cost          237 non-null    float64
 3   Hunger        238 non-null    float64
 4   Tortilla      239 non-null    float64
 5   Temp          226 non-null    float64
 6   Meat          231 non-null    float64
 7   Fillings      238 non-null    float64
 8   Meat:filling  233 non-null    float64
 9   Uniformity    237 non-null    float64
 10  Salsa         223 non-null    float64
 11  Synergy       237 non-null    float64
 12  Wrap          237 non-null    float64
 13  overall       237 non-null    float64
 14  Reviewer      238 non-null    object 
dtypes: float64(12), object(3)
memory usage: 28.1+ KB


In [4]:
# Preview the first three rows
df.head(3)

Unnamed: 0,Location,Burrito,Cost,Hunger,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,overall,Reviewer
0,Donato's taco shop,California,6.49,3.0,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,3.8,Scott
1,Oscar's Mexican food,California,5.45,3.5,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,3.0,Scott
2,Oscar's Mexican food,Carnitas,4.85,1.5,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,3.0,Emily


In [6]:
# Remove missing values: drop every row that has a NaN in any column.
# Rebinding `df` (instead of inplace=True) follows the recommended pandas idiom;
# later cells keep reading the cleaned frame through the same name.
df = df.dropna()
# Confirm that every remaining column is fully populated
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199 entries, 0 to 238
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      199 non-null    object 
 1   Burrito       199 non-null    object 
 2   Cost          199 non-null    float64
 3   Hunger        199 non-null    float64
 4   Tortilla      199 non-null    float64
 5   Temp          199 non-null    float64
 6   Meat          199 non-null    float64
 7   Fillings      199 non-null    float64
 8   Meat:filling  199 non-null    float64
 9   Uniformity    199 non-null    float64
 10  Salsa         199 non-null    float64
 11  Synergy       199 non-null    float64
 12  Wrap          199 non-null    float64
 13  overall       199 non-null    float64
 14  Reviewer      199 non-null    object 
dtypes: float64(12), object(3)
memory usage: 24.9+ KB


## Con StatsModels

In [7]:
# Candidate predictors: every float64 column of df except the response `overall`
var_ind = [
    'Cost', 'Hunger', 'Tortilla', 'Temp', 'Meat', 'Fillings',
    'Meat:filling', 'Uniformity', 'Salsa', 'Synergy', 'Wrap',
]
# Response variable
y = df['overall']
# Design matrix with an explicit intercept column (shown as 'const' in the summary)
X = sm.add_constant(df[var_ind])

In [8]:
# Ordinary least squares of `overall` on all candidate predictors
ols_full = sm.OLS(y, X)
results = ols_full.fit()
# Render the regression summary table
results.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.809
Model:,OLS,Adj. R-squared:,0.798
Method:,Least Squares,F-statistic:,71.97
Date:,"Fri, 08 Mar 2024",Prob (F-statistic):,3.87e-61
Time:,01:06:43,Log-Likelihood:,-63.761
No. Observations:,199,AIC:,151.5
Df Residuals:,187,BIC:,191.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.6135,0.239,-2.564,0.011,-1.086,-0.141
Cost,0.0330,0.022,1.494,0.137,-0.011,0.077
Hunger,0.0094,0.030,0.315,0.753,-0.049,0.068
Tortilla,0.0243,0.038,0.645,0.519,-0.050,0.099
Temp,0.0746,0.025,2.931,0.004,0.024,0.125
Meat,0.2052,0.038,5.342,0.000,0.129,0.281
Fillings,0.2497,0.043,5.810,0.000,0.165,0.334
Meat:filling,0.0701,0.031,2.274,0.024,0.009,0.131
Uniformity,0.0851,0.025,3.368,0.001,0.035,0.135

0,1,2,3
Omnibus:,28.381,Durbin-Watson:,1.626
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50.384
Skew:,-0.738,Prob(JB):,1.15e-11
Kurtosis:,4.975,Cond. No.,131.0


In [9]:
# Manual backward elimination: drop 'Hunger' (p = 0.753 in the previous fit) and refit.
# The list is rebuilt with a filter instead of list.remove() so that re-running
# this cell is a no-op rather than a ValueError once 'Hunger' is already gone.
var_ind = [name for name in var_ind if name != 'Hunger']
# Rebuild the design matrix (with intercept) from the remaining predictors
X = sm.add_constant(df[var_ind])
results = sm.OLS(y, X).fit()
results.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.809
Model:,OLS,Adj. R-squared:,0.799
Method:,Least Squares,F-statistic:,79.54
Date:,"Fri, 08 Mar 2024",Prob (F-statistic):,4.44e-62
Time:,01:06:48,Log-Likelihood:,-63.814
No. Observations:,199,AIC:,149.6
Df Residuals:,188,BIC:,185.9
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5856,0.222,-2.641,0.009,-1.023,-0.148
Cost,0.0322,0.022,1.471,0.143,-0.011,0.075
Tortilla,0.0253,0.037,0.674,0.501,-0.049,0.099
Temp,0.0756,0.025,3.002,0.003,0.026,0.125
Meat,0.2044,0.038,5.345,0.000,0.129,0.280
Fillings,0.2513,0.043,5.907,0.000,0.167,0.335
Meat:filling,0.0701,0.031,2.280,0.024,0.009,0.131
Uniformity,0.0848,0.025,3.366,0.001,0.035,0.135
Salsa,0.0255,0.030,0.856,0.393,-0.033,0.084

0,1,2,3
Omnibus:,28.11,Durbin-Watson:,1.633
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50.102
Skew:,-0.73,Prob(JB):,1.32e-11
Kurtosis:,4.978,Cond. No.,118.0


In [10]:
# Manual backward elimination: drop 'Tortilla' (p = 0.501 in the previous fit) and refit.
# The list is rebuilt with a filter instead of list.remove() so that re-running
# this cell is a no-op rather than a ValueError once 'Tortilla' is already gone.
var_ind = [name for name in var_ind if name != 'Tortilla']
# Rebuild the design matrix (with intercept) from the remaining predictors
X = sm.add_constant(df[var_ind])
results = sm.OLS(y, X).fit()
results.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.799
Method:,Least Squares,F-statistic:,88.58
Date:,"Fri, 08 Mar 2024",Prob (F-statistic):,5.73e-63
Time:,01:07:43,Log-Likelihood:,-64.054
No. Observations:,199,AIC:,148.1
Df Residuals:,189,BIC:,181.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5511,0.215,-2.558,0.011,-0.976,-0.126
Cost,0.0316,0.022,1.445,0.150,-0.012,0.075
Temp,0.0787,0.025,3.185,0.002,0.030,0.127
Meat,0.2048,0.038,5.365,0.000,0.130,0.280
Fillings,0.2562,0.042,6.121,0.000,0.174,0.339
Meat:filling,0.0692,0.031,2.256,0.025,0.009,0.130
Uniformity,0.0874,0.025,3.517,0.001,0.038,0.136
Salsa,0.0287,0.029,0.976,0.330,-0.029,0.087
Synergy,0.3084,0.041,7.540,0.000,0.228,0.389

0,1,2,3
Omnibus:,27.365,Durbin-Watson:,1.642
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.793
Skew:,-0.72,Prob(JB):,4.19e-11
Kurtosis:,4.921,Cond. No.,110.0


In [11]:
# Manual backward elimination: drop 'Salsa' (p = 0.330 in the previous fit) and refit.
# The list is rebuilt with a filter instead of list.remove() so that re-running
# this cell is a no-op rather than a ValueError once 'Salsa' is already gone.
var_ind = [name for name in var_ind if name != 'Salsa']
# Rebuild the design matrix (with intercept) from the remaining predictors
X = sm.add_constant(df[var_ind])
results = sm.OLS(y, X).fit()
results.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.807
Model:,OLS,Adj. R-squared:,0.799
Method:,Least Squares,F-statistic:,99.56
Date:,"Fri, 08 Mar 2024",Prob (F-statistic):,8.890000000000001e-64
Time:,01:08:04,Log-Likelihood:,-64.555
No. Observations:,199,AIC:,147.1
Df Residuals:,190,BIC:,176.7
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5373,0.215,-2.500,0.013,-0.961,-0.113
Cost,0.0342,0.022,1.577,0.116,-0.009,0.077
Temp,0.0783,0.025,3.169,0.002,0.030,0.127
Meat,0.2061,0.038,5.404,0.000,0.131,0.281
Fillings,0.2580,0.042,6.171,0.000,0.176,0.340
Meat:filling,0.0674,0.031,2.202,0.029,0.007,0.128
Uniformity,0.0904,0.025,3.662,0.000,0.042,0.139
Synergy,0.3185,0.040,8.052,0.000,0.240,0.397
Wrap,0.0591,0.022,2.742,0.007,0.017,0.102

0,1,2,3
Omnibus:,25.418,Durbin-Watson:,1.629
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43.275
Skew:,-0.682,Prob(JB):,4.01e-10
Kurtosis:,4.832,Cond. No.,106.0


Para utilizar las funciones de la regresión por pasos se requiere hacer la siguiente instalación:
`%pip install stepwise-regression`

In [13]:
from stepwise_regression import step_reg

In [14]:
# Full model with all predictors.
# var_ind is redefined here because the manual elimination cells shortened it.
var_ind = [
    'Cost', 'Hunger', 'Tortilla', 'Temp', 'Meat', 'Fillings',
    'Meat:filling', 'Uniformity', 'Salsa', 'Synergy', 'Wrap',
]
# Response variable
y = df['overall']
# Design matrix with an explicit intercept column
X = sm.add_constant(df[var_ind])

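El paquete *stepwise-regression* tiene dos funciones: *backward_regression* y *forward_regression*. Ambas reciben cuatro parámetros:
- X: variables independientes
- y: variable dependiente
- umbral del p-valor (nivel de significancia): `threshold_in` en *forward_regression* y `threshold_out` en *backward_regression*
- verbose: si es `True`, imprime cada variable que se agrega o se elimina (por defecto `False`)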

In [15]:
# Automated backward elimination over the full design matrix X (which already
# carries the 'const' column); 0.05 is the p-value cutoff, passed positionally.
backselect = step_reg.backward_regression(X, y, 0.05,verbose=False)
# Display the names of the columns that were kept
backselect

['const',
 'Temp',
 'Meat',
 'Fillings',
 'Meat:filling',
 'Uniformity',
 'Synergy',
 'Wrap']

In [16]:
# Drop the intercept label from the selected names; sm.add_constant re-adds the column below.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# execution no longer raises ValueError because 'const' is already gone.
backselect = [name for name in backselect if name != 'const']
# Design matrix restricted to the selected predictors, plus a constant
X_backselect = sm.add_constant(df[backselect])
# Define the model and fit it
backres = sm.OLS(y, X_backselect).fit()
backres.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.805
Model:,OLS,Adj. R-squared:,0.798
Method:,Least Squares,F-statistic:,112.6
Date:,"Fri, 08 Mar 2024",Prob (F-statistic):,2.7500000000000003e-64
Time:,01:11:15,Log-Likelihood:,-65.85
No. Observations:,199,AIC:,147.7
Df Residuals:,191,BIC:,174.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.3017,0.155,-1.944,0.053,-0.608,0.004
Temp,0.0768,0.025,3.097,0.002,0.028,0.126
Meat,0.2169,0.038,5.755,0.000,0.143,0.291
Fillings,0.2643,0.042,6.325,0.000,0.182,0.347
Meat:filling,0.0665,0.031,2.162,0.032,0.006,0.127
Uniformity,0.0841,0.024,3.440,0.001,0.036,0.132
Synergy,0.3146,0.040,7.939,0.000,0.236,0.393
Wrap,0.0560,0.022,2.598,0.010,0.013,0.098

0,1,2,3
Omnibus:,23.839,Durbin-Watson:,1.633
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40.112
Skew:,-0.647,Prob(JB):,1.95e-09
Kurtosis:,4.778,Cond. No.,61.8


## Para reportar los resultados de una regresión:
-   Indicar procedimiento (OLS y otras metodologías)
-   Indicar el ajuste del modelo (R cuadrada)
-   Indicar si el modelo es significativo (Estadístico F y su p-valor)
- Indicar si las variables son significativas (B, estadístico t y p-valor)

Se realizó una regresión de mínimos cuadrados ordinarios. Se aplicó un procedimiento de selección de variables con eliminación hacia atrás (backward elimination). El modelo final mostró un buen ajuste (R cuadrada = 0.805) y resultó significativo (F = 112.6, p < 0.001). Las variables independientes en el modelo final fueron "Temp" (B = 0.0768, p = 0.002), "Meat" (B = 0.2169, p < 0.001), "Fillings" (B = 0.2643, p < 0.001), "Meat:filling" (B = 0.0665, p = 0.032), "Uniformity" (B = 0.0841, p = 0.001), "Synergy" (B = 0.3146, p < 0.001) y "Wrap" (B = 0.0560, p = 0.010).