In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=6243b121e268efca7920a34ee55f1d53a789e1f6607d60e3e2d0a82ea54ecc0d
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


# **Reg Lineal Simple**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar característica y variable objetivo
X = df[['MedInc']]  # Mediana de ingresos
y = df['MedHouseVal']  # Valor medio de la vivienda

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el modelo
model = LinearRegression()
model.fit(X_train, y_train)

# Predecir
y_pred = model.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=X_test['MedInc'], y=y_test, title='Regresión Lineal: Mediana de ingresos vs Valor de la vivienda')
fig.add_trace(go.Scatter(x=X_test['MedInc'], y=y_pred, mode='lines', name='Línea de regresión'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(model, X, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.7091157771765549
RMSE: 0.8420901241414455
R^2: 0.45885918903846656


RMSE scores: [0.75840798 0.86976297 0.87930369 0.9080471  0.82847446]
RMSE mean: 0.8487992416380019
RMSE std: 0.05189216464198965
R^2 scores: [0.46458389 0.35374043 0.46252448 0.29716971 0.52896687]
R^2 mean: 0.42139707826944833
R^2 std: 0.08383040717288297


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('LinearRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar característica y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_assembled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
lr = LinearRegression(featuresCol='features', labelCol='MedHouseVal')
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['features'].apply(lambda x: x[0]), y=pandas_pred['MedHouseVal'], title='Regresión Lineal: Mediana de ingresos vs Valor de la vivienda')
fig.add_trace(go.Scatter(x=pandas_pred['features'].apply(lambda x: x[0]), y=pandas_pred['prediction'], mode='lines', name='Línea de regresión'))
fig.show()

# Validación cruzada (opcional)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder().build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_assembled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel
print(f'Best RMSE: {best_model.summary.rootMeanSquaredError}')
print(f'Best R^2: {best_model.summary.r2}')

RMSE: 0.8246711778272693
R^2: 0.47558201045454385


Best RMSE: 0.8373357452616914
Best R^2: 0.47344749180719903


# **Reg Lineal Multiple**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el modelo
model = LinearRegression()
model.fit(X_train, y_train)

# Predecir
y_pred = model.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(model, X, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.7006855912225248
RMSE: 0.8370696453835397
R^2: 0.4652924370503557


RMSE scores: [0.78351439 0.85809362 0.87390128 0.89847424 0.84266595]
RMSE mean: 0.851329894569233
RMSE std: 0.038605208072769674
R^2 scores: [0.42854821 0.37096545 0.46910866 0.31191043 0.51269138]
R^2 mean: 0.418644826440033
R^2 std: 0.0709295230568389


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('LinearRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_assembled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
lr = LinearRegression(featuresCol='features', labelCol='MedHouseVal')
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada (opcional)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder().build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_assembled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel
print(f'Best RMSE: {best_model.summary.rootMeanSquaredError}')
print(f'Best R^2: {best_model.summary.r2}')

RMSE: 0.8091707725556978
R^2: 0.48861735305602993


Best RMSE: 0.8314519981569677
Best R^2: 0.4808213965354955


# **Regresión Ridge**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Predecir
y_pred = ridge.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(ridge, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(ridge, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.7006831704606405
RMSE: 0.8370681994082922
R^2: 0.465294284383459


RMSE scores: [0.78349978 0.85808696 0.87390137 0.89847511 0.84266018]
RMSE mean: 0.8513246786917659
RMSE std: 0.038610592470167494
R^2 scores: [0.42856953 0.37097521 0.46910854 0.31190909 0.51269806]
R^2 mean: 0.41865208776248347
R^2 std: 0.07093096197431942


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('RidgeRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
lr = LinearRegression(featuresCol='scaledFeatures', labelCol='MedHouseVal', regParam=1.0, elasticNetParam=0.0)
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 1.0, 10.0]).build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel
print(f'Best RMSE: {best_model.summary.rootMeanSquaredError}')
print(f'Best R^2: {best_model.summary.r2}')

RMSE: 0.8919701692541109
R^2: 0.3786071209397184


Best RMSE: 0.8344927771065066
Best R^2: 0.47701698163400486


# **Reg Lasso**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)

# Predecir
y_pred = lasso.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(lasso, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(lasso, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 1.3106960720039365
RMSE: 1.1448563543099792
R^2: -0.00021908714592466794


RMSE scores: [1.14300564 1.09495832 1.25408734 1.12189797 1.23064605]
RMSE mean: 1.1689190658050883
RMSE std: 0.06231618587595042
R^2 scores: [-0.21613668 -0.02423671 -0.09329273 -0.07285357 -0.0393445 ]
R^2 mean: -0.0891728361025553
R^2 std: 0.06797451537659266


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('LassoRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
lr = LinearRegression(featuresCol='scaledFeatures', labelCol='MedHouseVal', regParam=1.0, elasticNetParam=1.0)
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 1.0, 10.0]).build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel
print(f'Best RMSE: {best_model.summary.rootMeanSquaredError}')
print(f'Best R^2: {best_model.summary.r2}')

RMSE: 1.1315329257172853
R^2: -7.091503606559968e-09


Best RMSE: 0.8432859172576733
Best R^2: 0.4659374578628944


# **Elastic Net**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X_train, y_train)

# Predecir
y_pred = elastic_net.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(elastic_net, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(elastic_net, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 1.0429810992343884
RMSE: 1.0212644609670838
R^2: 0.20407970599034497


RMSE scores: [1.01142492 0.97428816 1.12503506 1.02019441 1.10220208]
RMSE mean: 1.0466289272570544
RMSE std: 0.057283837023184084
R^2 scores: [0.0477457  0.18907627 0.12014115 0.11284481 0.16628877]
R^2 mean: 0.12721933967519958
R^2 std: 0.0488374620571402


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('ElasticNetRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
lr = LinearRegression(featuresCol='scaledFeatures', labelCol='MedHouseVal', regParam=1.0, elasticNetParam=0.5)
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 1.0, 10.0]).addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]).build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel
print(f'Best RMSE: {best_model.summary.rootMeanSquaredError}')
print(f'Best R^2: {best_model.summary.r2}')

RMSE: 1.0002351757651935
R^2: 0.2186063840146124


Best RMSE: 0.8344927771065066
Best R^2: 0.47701698163400486


# **Arbol de decision**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train, y_train)

# Predecir
y_pred = tree.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(tree, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(tree, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.5569140784037155
RMSE: 0.7462667608862903
R^2: 0.57500743077072


RMSE scores: [0.69971995 0.73973701 0.77186277 0.81724576 0.76164878]
RMSE mean: 0.7580428531273563
RMSE std: 0.038589225494558135
R^2 scores: [0.54424215 0.53252342 0.58584675 0.43070277 0.60189036]
R^2 mean: 0.5390410901409873
R^2 std: 0.05991247626677806


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('DecisionTreeRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
tree = DecisionTreeRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', maxDepth=5)
tree_model = tree.fit(train_data)

# Predecir
predictions = tree_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(tree.maxDepth, [3, 5, 7]).build()
crossval = CrossValidator(estimator=tree, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.7200351601535404
R^2: 0.595076464862895


Best RMSE: 0.6831227476156003
Best R^2: 0.6355288925989562


# **Random Forest**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Predecir
y_pred = rf.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = rf.feature_importances_
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(rf, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(rf, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.5085522273942719
RMSE: 0.7131284788832036
R^2: 0.6119133523665602


RMSE scores: [0.65352792 0.70245372 0.73078355 0.79845373 0.73224031]
RMSE mean: 0.7234918463708988
RMSE std: 0.04708830721026349
R^2 scores: [0.6024297  0.57845826 0.62875687 0.45658298 0.63204012]
R^2 mean: 0.5796535882197379
R^2 std: 0.06453221815132257


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('RandomForestRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
rf = RandomForestRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', numTrees=100, maxDepth=10, seed=42)
rf_model = rf.fit(train_data)

# Predecir
predictions = rf_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = rf_model.featureImportances
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances.toArray()})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [5, 10, 15]).addGrid(rf.numTrees, [5, 10, 20]).build()
crossval = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.6904780281887918
R^2: 0.6276380136155003


Best RMSE: 0.6586822249304639
Best R^2: 0.6611421903483674


# **Gradient Boosting**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr.fit(X_train, y_train)

# Predecir
y_pred = gbr.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = gbr.feature_importances_
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(gbr, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(gbr, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.5006534150328602
RMSE: 0.7075686645357185
R^2: 0.6179410982783862


RMSE scores: [0.64774459 0.70252416 0.72478273 0.78127576 0.72472619]
RMSE mean: 0.7162106841901297
RMSE std: 0.04302181903073073
R^2 scores: [0.60943509 0.57837371 0.63482876 0.47971367 0.63955326]
R^2 mean: 0.5883808957494445
R^2 std: 0.05852863194034778


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('GradientBoostingRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
gbr = GBTRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', maxIter=100, maxDepth=3, stepSize=0.1)
gbr_model = gbr.fit(train_data)

# Predecir
predictions = gbr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = gbr_model.featureImportances
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances.toArray()})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(gbr.maxDepth, [3, 5, 7]).addGrid(gbr.maxIter, [5, 10, 20]).build()
crossval = CrossValidator(estimator=gbr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.6887406134478061
R^2: 0.6295095670540674


Best RMSE: 0.6799207374974731
Best R^2: 0.6389376651407965


# **XGBoost**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
xgbr = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgbr.fit(X_train, y_train)

# Predecir
y_pred = xgbr.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = xgbr.feature_importances_
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(xgbr, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(xgbr, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.4967952243510625
RMSE: 0.7048370197081468
R^2: 0.6208853628140101


RMSE scores: [0.64617547 0.69819037 0.72408697 0.7810309  0.7233086 ]
RMSE mean: 0.7145584626712184
RMSE std: 0.0436579684409588
R^2 scores: [0.61132503 0.58355959 0.63552952 0.48003973 0.64096197]
R^2 mean: 0.5902831681730525
R^2 std: 0.05874888044194399


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('XGBoostRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
gbr = GBTRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', maxIter=100, maxDepth=3, stepSize=0.1)
gbr_model = gbr.fit(train_data)

# Predecir
predictions = gbr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = gbr_model.featureImportances
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances.toArray()})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(gbr.maxDepth, [3, 5, 7]).addGrid(gbr.maxIter, [5, 10, 20]).build()
crossval = CrossValidator(estimator=gbr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.6887406134478061
R^2: 0.6295095670540674


Best RMSE: 0.6799207374974731
Best R^2: 0.6389376651407965


# **Adaboost**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
base_estimator = DecisionTreeRegressor(max_depth=3)
ada = AdaBoostRegressor(base_estimator=base_estimator, n_estimators=100, learning_rate=0.1, random_state=42)
ada.fit(X_train, y_train)

# Predecir
y_pred = ada.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = ada.feature_importances_
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(ada, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(ada, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')



MSE: 0.6296668245411705
RMSE: 0.7935154847519804
R^2: 0.5194883162457885



`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.


`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.



RMSE scores: [0.87490466 0.69811712 0.79709972 0.91031985 0.79277189]
RMSE mean: 0.8146426488601495
RMSE std: 0.07368842605669232
R^2 scores: [0.28746352 0.58364697 0.55832155 0.29364708 0.56868985]
R^2 mean: 0.45835379280929656
R^2 std: 0.1372572500892341


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('AdaBoostRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
gbr = GBTRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', maxIter=100, maxDepth=3, stepSize=0.1)
gbr_model = gbr.fit(train_data)

# Predecir
predictions = gbr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = gbr_model.featureImportances
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances.toArray()})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(gbr.maxDepth, [3, 5, 7]).addGrid(gbr.maxIter, [5, 10, 20]).build()
crossval = CrossValidator(estimator=gbr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.685810142878437
R^2: 0.6326556035536768


Best RMSE: 0.6798625445766247
Best R^2: 0.6389994675585219


# **Catboost**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
catboost_model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
catboost_model.fit(X_train, y_train)

# Predecir
y_pred = catboost_model.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = catboost_model.get_feature_importance()
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(catboost_model, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(catboost_model, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.5043661727812356
RMSE: 0.7101874208835549
R^2: 0.6151078166006618


RMSE scores: [0.6554769  0.70273284 0.73522268 0.79670673 0.73374072]
RMSE mean: 0.7247759716544581
RMSE std: 0.04615581331479079
R^2 scores: [0.60005487 0.57812319 0.62423296 0.45895836 0.63053063]
R^2 mean: 0.5783800020802049
R^2 std: 0.06253571070658494


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('CatBoostRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Convertir de Spark DataFrame a pandas DataFrame
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)
train_pd = train_data.toPandas()
test_pd = test_data.toPandas()

X_train = np.array(train_pd['scaledFeatures'].tolist())
y_train = train_pd['MedHouseVal']
X_test = np.array(test_pd['scaledFeatures'].tolist())
y_test = test_pd['MedHouseVal']

# Crear el modelo
catboost_model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
catboost_model.fit(X_train, y_train)

# Predecir
y_pred = catboost_model.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = catboost_model.get_feature_importance()
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(catboost_model, X_train, y_train, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(catboost_model, X_train, y_train, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.481862843706194
RMSE: 0.694163412825967
R^2: 0.6236524865911965


RMSE scores: [0.60706809 0.70932952 0.74805194 0.77165208 0.7811797 ]
RMSE mean: 0.723456267820979
RMSE std: 0.06325433171136412
R^2 scores: [0.32088727 0.38966756 0.73641521 0.25452152 0.49185789]
R^2 mean: 0.4386698919452878
R^2 std: 0.16832458875121534


# **LightGBM**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
lgbm = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
lgbm.fit(X_train, y_train)

# Predecir
y_pred = lgbm.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = lgbm.feature_importances_
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(lgbm, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(lgbm, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 3
[LightGBM] [Info] Start training from score 2.071947
MSE: 0.49610902225337705
RMSE: 0.7043500708123603
R^2: 0.6214090177256291


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 3
[LightGBM] [Info] Start training from score 2.164930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000945 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 3
[LightGBM] [Info] Start training from score 2.034871
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 3
[LightGBM] [Info] Start training 

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('LightGBMRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
gbr = GBTRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', maxIter=100, maxDepth=3, stepSize=0.1)
gbr_model = gbr.fit(train_data)

# Predecir
predictions = gbr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características
importances = gbr_model.featureImportances
feature_names = ['MedInc', 'AveRooms', 'AveOccup']
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances.toArray()})
fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características')
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(gbr.maxDepth, [3, 5, 7]).addGrid(gbr.maxIter, [5, 10, 20]).build()
crossval = CrossValidator(estimator=gbr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.6887406134478061
R^2: 0.6295095670540674


KeyboardInterrupt: 

# **SVM**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Mediana de ingresos, promedio de habitaciones, promedio de ocupantes
y = df['MedHouseVal']  # Valor medio de la vivienda

# Normalizar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Crear el modelo
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svr.fit(X_train, y_train)

# Predecir
y_pred = svr.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=y_test, y=y_pred, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(svr, X_scaled, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(svr, X_scaled, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.5475260564291637
RMSE: 0.7399500364410855
R^2: 0.5821716231186345


RMSE scores: [0.60631657 0.76149795 0.78366867 0.78768198 0.75672547]
RMSE mean: 0.7391781268659441
RMSE std: 0.06751342106089031
R^2 scores: [0.65779645 0.50461525 0.57308062 0.47114631 0.6070205 ]
R^2 mean: 0.5627318264092077
R^2 std: 0.06763329181109078


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('SVRRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
# PySpark no tiene una implementación directa de SVR, así que se usa LinearRegression con polynomial expansion como aproximación
lr = LinearRegression(featuresCol='scaledFeatures', labelCol='MedHouseVal')
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [50, 100]).build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_scaled)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.809170772555698
R^2: 0.4886173530560296


Best RMSE: 0.8074665930696612
Best R^2: 0.4907691117053492


# **Reg Polinomica**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc']]  # Mediana de ingresos
y = df['MedHouseVal']  # Valor medio de la vivienda

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el modelo
poly_features = PolynomialFeatures(degree=2)
lin_reg = LinearRegression()
model = make_pipeline(poly_features, StandardScaler(), lin_reg)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predecir
y_pred = model.predict(X_test)

# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Resultados
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
fig = px.scatter(x=X_test['MedInc'], y=y_test, title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=X_test['MedInc'], y=y_pred, mode='lines', name='Línea de ajuste'))
fig.show()

# Validación cruzada
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

mse_scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)
r2_scores = cross_val_score(model, X, y, cv=5, scoring=r2_scorer)

rmse_scores = np.sqrt(-mse_scores)
print(f'RMSE scores: {rmse_scores}')
print(f'RMSE mean: {rmse_scores.mean()}')
print(f'RMSE std: {rmse_scores.std()}')

print(f'R^2 scores: {r2_scores}')
print(f'R^2 mean: {r2_scores.mean()}')
print(f'R^2 std: {r2_scores.std()}')

MSE: 0.7032732680932144
RMSE: 0.8386138969115731
R^2: 0.46331772769346224


RMSE scores: [0.75606566 0.86898311 0.87140892 0.9130188  0.81969794]
RMSE mean: 0.845834885895604
RMSE std: 0.0537497983814084
R^2 scores: [0.46788603 0.35489883 0.47213254 0.28945242 0.53889388]
R^2 mean: 0.42465273930399033
R^2 std: 0.08979891074413518


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, PolynomialExpansion
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('PolynomialRegression').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Expansión polinómica
poly_expansion = PolynomialExpansion(degree=2, inputCol='scaledFeatures', outputCol='polyFeatures')
data_poly = poly_expansion.transform(data_scaled).select('polyFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_poly.randomSplit([0.8, 0.2], seed=42)

# Crear el modelo
lr = LinearRegression(featuresCol='polyFeatures', labelCol='MedHouseVal')
lr_model = lr.fit(train_data)

# Predecir
predictions = lr_model.transform(test_data)

# Evaluar el modelo
evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: 'r2'})

# Resultados
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

# Gráficos
pandas_pred = predictions.toPandas()
fig = px.scatter(x=pandas_pred['polyFeatures'].apply(lambda x: x[1]), y=pandas_pred['MedHouseVal'], title='Predicciones vs Observaciones')
fig.add_trace(go.Scatter(x=pandas_pred['polyFeatures'].apply(lambda x: x[1]), y=pandas_pred['prediction'], mode='lines', name='Línea de ajuste'))
fig.show()

# Validación cruzada
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [50, 100]).build()
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cv_model = crossval.fit(data_poly)

# Resultados de la validación cruzada
best_model = cv_model.bestModel

# Evaluar el mejor modelo de la validación cruzada
predictions_best = best_model.transform(test_data)
rmse_best = evaluator.evaluate(predictions_best)
r2_best = evaluator.evaluate(predictions_best, {evaluator.metricName: 'r2'})

print(f'Best RMSE: {rmse_best}')
print(f'Best R^2: {r2_best}')

RMSE: 0.8217951770063591
R^2: 0.4792333970322046


Best RMSE: 0.8217700307969854
Best R^2: 0.47926526654433355


# **Resumen**

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import fetch_california_housing

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Seleccionar características y variable objetivo
X = df[['MedInc', 'AveRooms', 'AveOccup']]  # Características seleccionadas
y = df['MedHouseVal']  # Variable objetivo

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir los modelos a evaluar
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf')
}

# Definir el pipeline para estandarización y el modelo
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

# Definir las métricas de evaluación
scoring = {'R2': 'r2', 'MAE': 'neg_mean_absolute_error'}

# Realizar validación cruzada y evaluar modelos
results = {}
for name, pipeline in pipelines.items():
    grid = GridSearchCV(estimator=pipeline, param_grid={}, scoring=scoring, refit='R2', cv=5)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    results[name] = {
        'R2': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'Model': grid.best_estimator_
    }
    print(f"Model: {name}, R2: {results[name]['R2']:.4f}, MAE: {results[name]['MAE']:.4f}")

# Seleccionar el mejor modelo basado en R2
best_model_name = max(results, key=lambda name: results[name]['R2'])
best_model = results[best_model_name]['Model']

# Mostrar resultados del mejor modelo
print(f"\nBest Model: {best_model_name}")
y_pred_best = best_model.predict(X_test)
print(f"Best Model R2: {r2_score(y_test, y_pred_best):.4f}")
print(f"Best Model MAE: {mean_absolute_error(y_test, y_pred_best):.4f}")

# Gráficos representativos
# Gráfico de predicciones vs observaciones para el mejor modelo
fig = px.scatter(x=y_test, y=y_pred_best, title=f'Predicciones vs Observaciones - {best_model_name}')
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características para el Random Forest
if 'Random Forest' in results:
    importances = results['Random Forest']['Model'].named_steps['model'].feature_importances_
    feature_names = ['MedInc', 'AveRooms', 'AveOccup']
    feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características - Random Forest')
    fig.show()

Model: Linear Regression, R2: 0.4653, MAE: 0.6240
Model: Ridge, R2: 0.4653, MAE: 0.6240
Model: Lasso, R2: -0.0002, MAE: 0.9061
Model: ElasticNet, R2: 0.2031, MAE: 0.8060
Model: Random Forest, R2: 0.5869, MAE: 0.5270
Model: Gradient Boosting, R2: 0.6179, MAE: 0.5090
Model: AdaBoost, R2: 0.4483, MAE: 0.7051
Model: SVR, R2: 0.5811, MAE: 0.5164

Best Model: Gradient Boosting
Best Model R2: 0.6179
Best Model MAE: 0.5090


In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, PolynomialExpansion
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pandas as pd
import numpy as np
import plotly.express as px

# Crear la sesión de Spark
spark = SparkSession.builder.appName('ModelComparison').getOrCreate()

# Cargar datos
data = fetch_california_housing(as_frame=True)
df = data.frame

# Convertir DataFrame de pandas a Spark DataFrame
spark_df = spark.createDataFrame(df)

# Seleccionar características y variable objetivo
assembler = VectorAssembler(inputCols=['MedInc', 'AveRooms', 'AveOccup'], outputCol='features')
data_assembled = assembler.transform(spark_df).select('features', 'MedHouseVal')

# Normalizar datos
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(data_assembled)
data_scaled = scaler_model.transform(data_assembled).select('scaledFeatures', 'MedHouseVal')

# Dividir en conjuntos de entrenamiento y prueba
train_data, test_data = data_scaled.randomSplit([0.8, 0.2], seed=42)

# Definir los modelos a evaluar
models = {
    'Linear Regression': LinearRegression(featuresCol='scaledFeatures', labelCol='MedHouseVal'),
    'Random Forest': RandomForestRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', numTrees=100, maxDepth=10, seed=42),
    'Gradient Boosting': GBTRegressor(featuresCol='scaledFeatures', labelCol='MedHouseVal', maxIter=100, maxDepth=3)
}

# Evaluar los modelos utilizando validación cruzada
results = {}
for name, model in models.items():
    paramGrid = ParamGridBuilder().build()
    evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='r2')
    cv = CrossValidator(estimator=model, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
    cv_model = cv.fit(train_data)

    # Evaluar el mejor modelo
    predictions = cv_model.transform(test_data)
    r2 = evaluator.evaluate(predictions)
    mae_evaluator = RegressionEvaluator(labelCol='MedHouseVal', predictionCol='prediction', metricName='mae')
    mae = mae_evaluator.evaluate(predictions)

    results[name] = {
        'R2': r2,
        'MAE': mae,
        'Model': cv_model.bestModel
    }
    print(f"Model: {name}, R2: {r2:.4f}, MAE: {mae:.4f}")

# Seleccionar el mejor modelo basado en R2
best_model_name = max(results, key=lambda name: results[name]['R2'])
best_model = results[best_model_name]['Model']

# Mostrar resultados del mejor modelo
print(f"\nBest Model: {best_model_name}")
predictions_best = best_model.transform(test_data)
r2_best = evaluator.evaluate(predictions_best)
mae_best = mae_evaluator.evaluate(predictions_best)
print(f"Best Model R2: {r2_best:.4f}")
print(f"Best Model MAE: {mae_best:.4f}")

# Convertir predicciones a pandas DataFrame para gráficos
pandas_pred = predictions_best.toPandas()

# Gráficos representativos
# Gráfico de predicciones vs observaciones para el mejor modelo
fig = px.scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['prediction'], title=f'Predicciones vs Observaciones - {best_model_name}')
fig.add_trace(go.Scatter(x=pandas_pred['MedHouseVal'], y=pandas_pred['MedHouseVal'], mode='lines', name='Línea ideal'))
fig.show()

# Importancia de características para el Random Forest (si aplica)
if best_model_name == 'Random Forest':
    importances = best_model.featureImportances
    feature_names = ['MedInc', 'AveRooms', 'AveOccup']
    feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances.toArray()})
    fig = px.bar(feature_importance_df, x='feature', y='importance', title='Importancia de las características - Random Forest')
    fig.show()

Model: Linear Regression, R2: 0.4886, MAE: 0.6018
Model: Random Forest, R2: 0.6276, MAE: 0.5051
Model: Gradient Boosting, R2: 0.6327, MAE: 0.4949

Best Model: Gradient Boosting
Best Model R2: 0.6327
Best Model MAE: 0.4949
