# How to Develop LASSO Regression Models in Python

January 19, 2025

Objetivo:

* Lasso Regression es una extensión de la regresión lineal que agrega una penalización de regularización a la función de pérdida durante el entrenamiento.

* Cómo evaluar un modelo de regresión de Lasso y utilizar un modelo final para hacer predicciones de nuevos datos.

* Cómo configurar el modelo Lasso Regression para un nuevo conjunto de datos.

## Setup
#### Load Python libraries, etc.

In [None]:
#
# Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from statsmodels.formula.api import ols
from scipy import stats
import plotly.express as px

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides pandas / scikit-learn diagnostics (for example
# convergence or deprecation notices) — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

## Lasso Regression

El objetivo es encontrar los valores de $\hat{\boldsymbol{\beta}}^{Lasso}$ que resultan de:
$$\hat{\boldsymbol{\beta}}^{Lasso} = \arg\min_{\boldsymbol{\beta}} \left[ \sum_{i = 1}^n (y_i - \mathbf{X}_i \boldsymbol{\beta})^2 + \lambda \sum_{k = 2}^K |\beta_k| \right]$$

Donde $\lambda \geq 0$ y suponemos que $\beta_1$ es el término constante de la regresión, el cual no se penaliza (por eso la suma de la penalización inicia en $k = 2$). Cuando las variables explicativas están centradas, el término constante se estima con la media de $Y$: $\overline{Y} = \frac{\sum_{i = 1}^{n} y_i}{n}$.

## Lectura de Data

Los datos utilizados corresponden a precios de venta de inmuebles en la CDMX recolectados de la web de Inmuebles 24 en mayo de 2020. 

In [None]:
# Read the listings from 'Ventas.csv' into a DataFrame and preview its last rows.

Venta = pd.read_csv('Ventas.csv')

Venta.tail()

## Limpieza de datos

In [None]:
# Print the DataFrame summary produced by info().

Venta.info()

In [None]:
# List the column labels of the raw data.

Venta.columns

In [None]:
# Draw one 50-bin histogram per variable of interest, using the raw data.

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

In [None]:
# Step 1 of the cleaning: keep rows with 1000 < Monto1 < 70000000
# (both bounds exclusive), then redraw the histograms.

monto = Venta['Monto1']
Venta_sel = Venta.loc[monto.gt(1000) & monto.lt(70000000)]

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta_sel.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

In [None]:
# Step 2 of the cleaning: keep rows with 10 < terrain (m2) < 1500
# (both bounds exclusive), then redraw the histograms.

terrain_m2 = Venta_sel['terrain (m2)']
Venta_sel = Venta_sel.loc[terrain_m2.gt(10) & terrain_m2.lt(1500)]

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta_sel.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

In [None]:
# Step 3 of the cleaning: keep rows with 10 < construction (m2) < 1200
# (both bounds exclusive), then redraw the histograms.

construction_m2 = Venta_sel['construction (m2)']
Venta_sel = Venta_sel.loc[construction_m2.gt(10) & construction_m2.lt(1200)]

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta_sel.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

In [None]:
# Step 4 of the cleaning: keep rows with 0 < rooms < 8
# (both bounds exclusive), then redraw the histograms.

room_count = Venta_sel['rooms']
Venta_sel = Venta_sel.loc[room_count.gt(0) & room_count.lt(8)]

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta_sel.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

In [None]:
# Step 5 of the cleaning: keep rows with 0 < bathrooms < 8
# (both bounds exclusive), then redraw the histograms.

bathroom_count = Venta_sel['bathrooms']
Venta_sel = Venta_sel.loc[bathroom_count.gt(0) & bathroom_count.lt(8)]

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta_sel.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

In [None]:
# Step 6 of the cleaning: keep rows inside the bounding box
# 19.1 < lat < 19.8 and -99.4 < lng < -98.8 (all bounds exclusive),
# then redraw the histograms.

latitude = Venta_sel['lat']
longitude = Venta_sel['lng']
inside_box = ( latitude.gt(19.1) & latitude.lt(19.8)
               & longitude.gt(-99.4) & longitude.lt(-98.8) )
Venta_sel = Venta_sel.loc[inside_box]

hist_columns = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)',
                'Monto1', 'lng', 'lat']
Venta_sel.loc[:, hist_columns].hist(figsize = (20, 15), bins = 50)

plt.show()

### En suma:

In [None]:
# Apply all the cleaning rules in a single step.
#
# Every condition is evaluated on the raw `Venta` frame, so the boolean mask
# shares Venta's index and this cell does not depend on the step-by-step
# filtering cells having been run first.
# .copy() makes Venta_sel an independent DataFrame, so columns added to it
# afterwards do not trigger pandas' SettingWithCopyWarning.

Venta_sel = Venta[ ( (Venta['Monto1'] > 1000) & (Venta['Monto1'] < 70000000) ) & \
                   ( (Venta['terrain (m2)'] > 10) & (Venta['terrain (m2)'] < 1500) ) & \
                   ( (Venta['construction (m2)'] > 10) & (Venta['construction (m2)'] < 1200) ) & \
                   ( (Venta['rooms'] > 0) & (Venta['rooms'] < 8) ) & \
                   ( (Venta['bathrooms'] > 0) & (Venta['bathrooms'] < 8) ) & \
                   ( (Venta['lat'] > 19.1) & (Venta['lat'] < 19.8) ) & \
                   ( (Venta['lng'] > - 99.4) & (Venta['lng'] < - 98.8) ) ].copy()

In [None]:
# Compare (rows, columns) of the raw data against the cleaned selection.

Venta.shape, Venta_sel.shape

In [None]:
# Add Price_m2 = Monto1 divided by construction (m2).
# assign() returns a new DataFrame instead of writing a column into a
# filtered slice of Venta, which avoids pandas' SettingWithCopyWarning.

Venta_sel = Venta_sel.assign(Price_m2 = Venta_sel['Monto1'] / Venta_sel['construction (m2)'])

## Visualización

In [None]:
# Plot the listing coordinates: lng on the x-axis, lat on the y-axis.

Venta_sel.plot.scatter(x = 'lng', y = 'lat')

plt.show()

In [None]:
# Same coordinate plot with mostly transparent markers (alpha = 0.1), so
# overlapping points show up as darker areas.

Venta_sel.plot.scatter(x = 'lng', y = 'lat', alpha = 0.1)

plt.show()

In [None]:
# Coordinate plot where marker size and colour both encode Monto1 / 1000000.

monto_scaled = Venta_sel['Monto1'] / 1000000

Venta_sel.plot.scatter(
    x = 'lng', y = 'lat',
    s = monto_scaled, c = monto_scaled,
    alpha = 0.4, colorbar = True, figsize = (10, 7),
    label = "Precios de propiedades",
)

plt.legend()

plt.show()

In [None]:
# Coordinate plot where marker size and colour both encode Price_m2 / 1000.

price_m2_scaled = Venta_sel['Price_m2'] / 1000

Venta_sel.plot.scatter(
    x = 'lng', y = 'lat',
    s = price_m2_scaled, c = price_m2_scaled,
    alpha = 0.4, colorbar = True, figsize = (10, 7),
    label = "Precios por m2 de propiedades",
)

plt.legend()

plt.show()

In [None]:
# Coordinate plot where marker size and colour both encode construction (m2).

built_area = Venta_sel['construction (m2)']

Venta_sel.plot.scatter(
    x = 'lng', y = 'lat',
    s = built_area, c = built_area,
    alpha = 0.4, colorbar = True, figsize = (10, 7),
    label = "construction (m2) de propiedades",
)

plt.legend()

plt.show()

In [None]:
# Pairwise scatter plots for the size-related columns and Monto1.
attributes = ['rooms', 'bathrooms', 'construction (m2)', 'terrain (m2)', 'Monto1']

pair_data = Venta_sel.loc[:, attributes]
scatter_matrix(pair_data, figsize = (15, 10))

plt.show()

In [None]:
# Save the cleaned selection to 'Venta_sel.csv' without the index column.
# The 'utf-8-sig' codec writes a leading byte-order mark; presumably chosen so
# spreadsheet software detects the encoding — confirm with the file's consumers.

Venta_sel.to_csv('Venta_sel.csv', sep =',', encoding = 'utf-8-sig', index = False)

## El Modelo

In [None]:
# Fit an ordinary least squares benchmark with the statsmodels formula API.

# The formula below refers to columns by bare name, so the two "(m2)" columns
# get alias columns without spaces or parentheses. assign() builds a new
# DataFrame instead of writing columns into a filtered slice of Venta, which
# avoids pandas' SettingWithCopyWarning.
Venta_sel = Venta_sel.assign(
    construction = Venta_sel['construction (m2)'],
    terrain = Venta_sel['terrain (m2)'],
)

# Regression specification: Monto1 explained by size and location.

formula = 'Monto1 ~ rooms + bathrooms + construction + terrain + lat + lng'

results = ols(formula, data = Venta_sel).fit()

print(results.summary())

In [None]:
# Build the feature matrix X and the target y as an (n, 1) column vector.

feature_columns = ["rooms", "bathrooms", "construction (m2)", "terrain (m2)", 'lat', 'lng']
X = Venta_sel.loc[:, feature_columns]

monto_values = Venta_sel["Monto1"].values
y = monto_values.reshape(-1, 1)

X.shape, y.shape

In [None]:
# Split into training and test sets: 25% of the rows go to the test set and
# the fixed random_state keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25,
    random_state = 42,
)

In [None]:
# Check the split sizes and the share of rows that ended up in training.
# The share is computed from the data instead of hard-coded row counts, so it
# stays correct when the input file or the cleaning rules change.

X_train.shape, X_test.shape, len(X_train) / (len(X_train) + len(X_test))

In [None]:
# Create one StandardScaler for the features and one for the target, and fit
# each of them on the training split only.

X_scaler = StandardScaler()
y_scaler = StandardScaler()

for scaler, train_part in ((X_scaler, X_train), (y_scaler, y_train)):
    scaler.fit(train_part)

In [None]:
# Standardise both splits with the scalers fitted on the training data.

X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))
y_train_scaled, y_test_scaled = (y_scaler.transform(part) for part in (y_train, y_test))

In [None]:
# Fit an ordinary linear regression on the scaled training data.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator as the cell output.

model = LinearRegression().fit(X_train_scaled, y_train_scaled)
model

In [None]:
# Extract the intercept and coefficients from the fitted regression.
# The model was fitted on scaled data, so the values are in standardised units.
# Coefficients follow the column order of X:
# Venta_sel[["rooms", "bathrooms", "construction (m2)", "terrain (m2)", 'lat', 'lng']]

model.intercept_, model.coef_

In [None]:
# Predict the (scaled) target for the test split with the fitted model.
# model.predict() accepts any scaled feature matrix, training or testing.
# The trailing bare name displays the predictions as the cell output.

predictions = model.predict(X_test_scaled)
predictions

In [None]:
# Residual plot: (predicted - actual) against the predicted value, in scaled
# units, for the training and the test split.

# Predict once per split and reuse the arrays for both axes.
train_pred = model.predict(X_train_scaled)
test_pred = model.predict(X_test_scaled)

plt.scatter( train_pred, train_pred - y_train_scaled,
             c = "darkblue", label = "Training Data" )

plt.scatter( test_pred, test_pred - y_test_scaled,
             c = "orange", label = "Testing Data" )

plt.legend()

# The x-axis shows predictions, so the zero reference line spans the range of
# the predicted values (not the range of the actual test targets).
line_start = min(train_pred.min(), test_pred.min())
line_end = max(train_pred.max(), test_pred.max())
plt.hlines( y = 0, xmin = line_start, xmax = line_end )

plt.title("Residual Plot")

plt.show()

In [None]:
# Actual scaled target (y-axis) against the linear model's prediction (x-axis),
# one colour per split.

for split_X, split_y, split_colour, split_label in (
    (X_train_scaled, y_train_scaled, "darkblue", "Training Data"),
    (X_test_scaled, y_test_scaled, "orange", "Testing Data"),
):
    plt.scatter( model.predict(split_X), split_y, c = split_colour, label = split_label )

plt.legend()

plt.title("Predicted Plot")

plt.show()

In [None]:
# Same actual-vs-predicted plot, with both axes mapped back to the original
# units of the target through y_scaler.inverse_transform.

train_pred_units = y_scaler.inverse_transform(model.predict(X_train_scaled))
train_true_units = y_scaler.inverse_transform(y_train_scaled)
plt.scatter( train_pred_units, train_true_units,
             c = "darkblue", label = "Training Data" )

test_pred_units = y_scaler.inverse_transform(model.predict(X_test_scaled))
test_true_units = y_scaler.inverse_transform(y_test_scaled)
plt.scatter( test_pred_units, test_true_units,
             c = "orange", label = "Testing Data" )

plt.legend()

plt.title("Predicted Plot")

plt.show()

In [None]:
# Compare the linear model's score() (R2) on the training split against the
# test split.

model.score(X_train_scaled, y_train_scaled), model.score(X_test_scaled, y_test_scaled)

In [None]:
# Lasso regression with a fixed penalty of alpha = 0.01.
# With alpha = 0 the objective reduces to ordinary least squares; that case
# should be solved with LinearRegression, because running the Lasso object
# with alpha = 0 is not advised for numerical reasons.
# fit() returns the estimator itself; the bare name on the last line keeps the
# estimator as the cell output.

lasso = Lasso( alpha = 0.01 ).fit(X_train_scaled, y_train_scaled)
lasso

In [None]:
# Show the Lasso intercept and coefficients next to the linear regression ones.
# Both models were fitted on scaled data; coefficients follow the column order of X:
# Venta_sel[["rooms", "bathrooms", "construction (m2)", "terrain (m2)", 'lat', 'lng']]

lasso.intercept_, lasso.coef_, model.intercept_, model.coef_

In [None]:
# Predict the (scaled) target for the test split with the fitted Lasso model.
# This rebinds `predictions`; the trailing bare name displays the array.

predictions = lasso.predict(X_test_scaled)

predictions

In [None]:
# Actual scaled target (y-axis) against the Lasso prediction (x-axis),
# one colour per split.

lasso_train_pred = lasso.predict(X_train_scaled)
plt.scatter( lasso_train_pred, y_train_scaled,
             c = "darkblue", label = "Training Data" )

lasso_test_pred = lasso.predict(X_test_scaled)
plt.scatter( lasso_test_pred, y_test_scaled,
             c = "orange", label = "Testing Data" )

plt.legend()

plt.title("Predicted Plot")

plt.show()

In [None]:
# Compare the Lasso model's score() (R2) on the training split against the
# test split.

lasso.score(X_train_scaled, y_train_scaled), lasso.score(X_test_scaled, y_test_scaled)

In [None]:
# Same (train, test) scores for `model`, shown again for a side-by-side
# comparison with the Lasso scores.

model.score(X_train_scaled, y_train_scaled), model.score(X_test_scaled, y_test_scaled)

In [None]:
# Choose the Lasso penalty automatically with cross-validation.

# Evaluation scheme: 10-fold cross-validation repeated 3 times, fixed seed.
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

# Candidate penalties 0.01, 0.02, ..., 0.99. The grid starts at 0.01 rather
# than 0: alpha = 0 is plain least squares, which is not advised with the
# Lasso solver for numerical reasons.
# NOTE: this rebinds `model`, replacing the LinearRegression fitted earlier.
model = LassoCV(alphas=arange(0.01, 1, 0.01), cv=cv, n_jobs=-1)

# LassoCV expects a 1-D target, so the (n, 1) scaled column vector is flattened.
model.fit(X_train_scaled, y_train_scaled.ravel())

# Report the penalty selected by cross-validation.
print('alpha: %f' % model.alpha_)