In [0]:
# Databricks notebook (PySpark)
# Project: automation of Peruvian economic time series

# =======================
# Cell 1: install dependencies
# =======================
# NOTE(review): no versions are pinned on the line below, so every run
# installs whatever releases are current at that moment.
%pip install econdata scikit-learn mlflow

In [0]:
# Restart the notebook's Python process. Presumably needed so that the
# packages installed by the preceding %pip cell become importable —
# TODO confirm against the Databricks documentation.
dbutils.library.restartPython()

In [0]:
import pandas as pd
from econdata import BCRP
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn


In [0]:
# Optional exploration step: look up the quarterly series that match "PBI".
search_filters = dict(
    consulta=['PBI'],
    grupo=['Producto', 'variaciones'],
    frecuencia='Trimestral',
)
BCRP.search(**search_filters)

# Download the GDP series (code PN02526AQ), exposed under the column "PIB".
download_args = dict(
    series={'PN02526AQ': 'PIB'},
    fechaini='2000Q1',
    fechafin='2024Q4',
)
df_pib = BCRP.get_data(**download_args)

# Re-express the index as quarterly periods converted to timestamps.
quarterly_periods = pd.PeriodIndex(df_pib.index, freq="Q")
df_pib.index = quarterly_periods.to_timestamp()

# Rename the value column and discard rows with missing observations.
df_pib = df_pib.rename(columns={"PIB":"PIB_PEN"})
df_pib = df_pib.dropna()

display(df_pib.head())


In [0]:
# Add the previous one, two and three observations of PIB_PEN as extra
# columns (PIB_LAG1..PIB_LAG3) to model temporal dependence, then drop the
# leading rows that the shifts leave with NaN.
lagged_columns = {
    f"PIB_LAG{lag}": df_pib["PIB_PEN"].shift(lag)
    for lag in (1, 2, 3)
}
df_ml = df_pib.assign(**lagged_columns).dropna()

display(df_ml.head())


In [0]:
# DDL for the Delta table that stores the series together with its lags.
# The "default" database is used; the statement leaves an existing table
# untouched because of IF NOT EXISTS.
create_lags_table_sql = """
    CREATE TABLE IF NOT EXISTS default.pib_bcrp_lags (
        date TIMESTAMP,
        PIB_PEN DOUBLE,
        PIB_LAG1 DOUBLE,
        PIB_LAG2 DOUBLE,
        PIB_LAG3 DOUBLE
    )
    USING DELTA
"""
spark.sql(create_lags_table_sql)

print("✅ Tabla Delta 'default.pib_bcrp_lags' creada")


In [0]:
# Column order of default.pib_bcrp_lags as declared in its CREATE TABLE.
# insertInto() matches columns by POSITION and ignores their names, so the
# pandas frame is put into exactly this order before it is handed to Spark.
table_columns = ["date", "PIB_PEN", "PIB_LAG1", "PIB_LAG2", "PIB_LAG3"]

# Name the timestamp index "date" so reset_index() yields the expected column,
# then select the columns explicitly to guard against any reordering upstream.
lags_pdf = df_ml.rename_axis("date").reset_index()[table_columns]

# pandas -> Spark
spark_df = spark.createDataFrame(lags_pdf)

# overwrite=True replaces the table contents on every run, which keeps the
# cell idempotent. Use overwrite=False instead to append and keep history.
spark_df.write.insertInto("default.pib_bcrp_lags", overwrite=True)

print("✅ Datos insertados en la tabla Delta 'default.pib_bcrp_lags'")


In [0]:
# Read the Delta table back as a Spark DataFrame and preview five rows.
lags_table_name = "default.pib_bcrp_lags"
df_delta = spark.read.table(lags_table_name)
df_delta.show(n=5)

In [0]:
# Predictors are the three lagged values; the target is the current value.
lag_features = ["PIB_LAG1", "PIB_LAG2", "PIB_LAG3"]
X = df_ml[lag_features]
y = df_ml["PIB_PEN"]

# Hold out the final 20% of rows as the test set. shuffle=False keeps the
# rows in their existing order, so the test block is the tail of the frame.
split_parts = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = split_parts

# Fit an ordinary linear regression on the training block.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the held-out block.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# RMSE is reported as the square root of the MSE computed above.
print(f"RMSE: {mse**0.5:.2f}")
print(f"R2: {r2:.4f}")


In [0]:
# Select the experiment that receives the run.
mlflow.set_experiment("/Users/2501350@esan.edu.pe/PIB_BCRP")

# Record the number of lags, the evaluation metrics and the fitted model in
# a single run; the context manager closes the run when the block exits.
with mlflow.start_run():
    mlflow.log_param("lags", 3)
    mlflow.log_metrics({"rmse": mse**0.5, "r2": r2})

    # Store the model under the artifact path "PIB_model".
    mlflow.sklearn.log_model(model, "PIB_model")

print("✅ Modelo registrado en MLflow")


In [0]:
# `mlflow` and `mlflow.sklearn` are already imported in the notebook's import
# cell; only the signature helper is new here.
from mlflow.models.signature import infer_signature

mlflow.set_experiment("/Users/2501350@esan.edu.pe/PIB_BCRP")

# Infer the model signature (input and output schema) from the held-out
# features and the predictions made on them.
signature = infer_signature(X_test, y_pred)

# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run():
    mlflow.log_param("lags", 3)
    mlflow.log_metric("rmse", mse**0.5)
    mlflow.log_metric("r2", r2)

    # Log the model together with its signature and a small input sample.
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="PIB_model",
        signature=signature,
        input_example=X_test.iloc[:5],  # optional: example input rows
    )

print("✅ Modelo registrado en MLflow con signature")


### REGISTRANDO EL MODELO

In [0]:
# List every experiment returned by the tracking server together with its id.
# (`mlflow` is already imported in the notebook's import cell.)
experiments = mlflow.search_experiments()

for exp in experiments:
    print(f'Nombre: {exp.name}, ID: {exp.experiment_id}')

In [0]:
# Resolve the experiment by the name the runs were logged under rather than
# taking the first search result: the first result is simply whichever
# experiment the search happens to return first, which in a shared workspace
# need not be this project's experiment.
experiment_name = "/Users/2501350@esan.edu.pe/PIB_BCRP"
experiment = mlflow.get_experiment_by_name(experiment_name)

# get_experiment_by_name returns None for an unknown name; fail here with a
# clear message instead of with an AttributeError on the next line.
if experiment is None:
    raise ValueError(f"MLflow experiment not found: {experiment_name}")

experiment_id = experiment.experiment_id
print(experiment_id)

In [0]:
# Fetch the runs of the selected experiment, most recent first.
# (`mlflow` is already imported in the notebook's import cell.)
runs = mlflow.search_runs(
    experiment_ids=[experiment_id],
    order_by=["start_time DESC"],
)

# Stop here when the experiment has no runs. Merely printing a message would
# leave `latest_run` undefined (or holding a stale value from an earlier
# execution), and the registration cell that follows would then fail or
# register the wrong run.
if runs.empty:
    raise RuntimeError("No hay ejecuciones en este experimiento.")

latest_run = runs.iloc[0]["run_id"]
print(f"Ultimo run_id: {latest_run}")

In [0]:
# Register the model artifact logged by the most recent run under the
# registry name "PIB_model". The call is left as the cell's final expression
# so that its return value is displayed as the cell output.
run_id = latest_run
model_name = 'PIB_model'
logged_model_uri = f"runs:/{run_id}/PIB_model"

mlflow.register_model(model_uri=logged_model_uri, name=model_name)

### PROBANDO EL MODELO

In [0]:
# Load the Delta table into pandas, explicitly sorted by date. A table read
# gives no guarantee about row order, while the cells that follow (the
# "last 10 rows" printout and the line chart) assume chronological order.
df_delta = (
    spark.read.table("default.pib_bcrp_lags")
    .orderBy("date")
    .toPandas()
)

# Split into predictors (the lags) and the observed target.
# NOTE(review): X_test is rebound here to every row stored in the table, not
# just a held-out block — confirm that scoring the full history is intended.
X_test = df_delta[["PIB_LAG1", "PIB_LAG2", "PIB_LAG3"]]
y_true = df_delta["PIB_PEN"]


In [0]:
# `mlflow.sklearn` is already imported in the notebook's import cell; the
# registry client is only needed here.
from mlflow.tracking import MlflowClient

# Load the most recently registered version of the model. Hard-coding
# version 1 would keep loading the oldest model once the registration cell
# has been executed more than once.
registered_model_name = "PIB_model"
registered_versions = MlflowClient().search_model_versions(
    f"name='{registered_model_name}'"
)
if not registered_versions:
    raise RuntimeError(
        f"No registered versions found for model '{registered_model_name}'"
    )

# Version numbers are compared as integers so that version 10 ranks above 9.
latest_version = max(int(mv.version) for mv in registered_versions)

model_uri = f"models:/{registered_model_name}/{latest_version}"   # name/version
model = mlflow.sklearn.load_model(model_uri)


In [0]:
# Score every row loaded from the Delta table with the registered model.
y_pred = model.predict(X_test)

# Side-by-side table of date, observed value and predicted value.
resultados = (
    df_delta[["date", "PIB_PEN"]]
    .rename(columns={"date": "Fecha", "PIB_PEN": "PIB_Real"})
    .assign(PIB_Predicho=y_pred)
)

print("✅ Resultados de prueba del modelo:")
print(resultados.tail(10))  # last 10 rows


In [0]:
import matplotlib.pyplot as plt

# Draw observed and predicted GDP on one set of axes, using the explicit
# figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(12,6))

# (column, legend label, extra line styling) for each curve, in draw order.
curves = [
    ("PIB_Real", "PIB Real", {"color": "blue"}),
    ("PIB_Predicho", "PIB Predicho", {"color": "red", "linestyle": "--"}),
]
for column, label, line_style in curves:
    ax.plot(resultados["Fecha"], resultados[column], label=label, **line_style)

ax.set_title("Comparación: PIB Real vs Predicho")
ax.set_xlabel("Fecha")
ax.set_ylabel("PIB (millones S/)")
ax.legend()
plt.show()
