In [4]:
import logging
import os

from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Imputer, SQLTransformer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, when
import mlflow.spark
# add src to path
import sys

sys.path.append("../src")

from ..logging_config import configure_logger
from ..utils import log_env, setup_env, validate_env

logger = configure_logger("predictive", logging.DEBUG)

setup_env()
validate_env()
log_env()

MONGODB_HOST = os.environ.get("MONGODB_HOST")
MONGODB_PORT = os.environ.get("MONGODB_PORT")
MONGODB_URI = f"mongodb://{MONGODB_HOST}:{MONGODB_PORT}/"
MONGODB_DB = os.environ.get("MONGODB_DB")

SPARK_AVRO_JAR = os.environ.get("SPARK_AVRO_JAR")
SPARK_MONGO_CONNECTOR_JAR = os.environ.get("SPARK_MONGO_CONNECTOR_JAR")
CONF = (
    SparkConf()
    .set("spark.master", "local")
    .set("spark.app.name", "BDM Spark formatted pipeline")
    .set(
        "spark.jars.packages",
        f"{SPARK_AVRO_JAR},{SPARK_MONGO_CONNECTOR_JAR}",
    )
    .set("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties")
    .set(
        "spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties"
    )
    .set("spark.mongodb.read.connection.uri", MONGODB_URI)
)

spark = SparkSession.builder.config(conf=CONF).getOrCreate()
logger.info(f"Python version = {spark.sparkContext.pythonVer}")
logger.info(f"Spark version = {spark.version}")


ImportError: attempted relative import with no known parent package

In [None]:


logger.info("Loading data from MongoDB")
# Load the data from MongoDB
idealista_df = (
    spark.read.format("mongodb")
    .option("database", MONGODB_DB)
    .option("collection", "idealista")
    .load()
)

# Cast all integer features to double
integer_columns = [field.name for field in idealista_df.schema.fields if 'IntegerType' in str(field.dataType)]
cast_stages = []
for col_name in integer_columns:
    cast_stages.append(SQLTransformer(statement=f"SELECT *, CAST({col_name} AS DOUBLE) AS {col_name}_cast FROM __THIS__"))

# Impute numeric (integer, double) features with the mean
numeric_columns = [field.name for field in idealista_df.schema.fields if 'IntegerType' in str(field.dataType) or 'DoubleType' in str(field.dataType)]
imputer = Imputer(inputCols=[f"{c}_cast" for c in numeric_columns], outputCols=[f"{c}_imputed" for c in numeric_columns]).setStrategy("mean")

# Cast all boolean features to string (categorical encoding)
boolean_columns = [field.name for field in idealista_df.schema.fields if 'BooleanType' in str(field.dataType)]
for col_name in boolean_columns:
    cast_stages.append(SQLTransformer(statement=f"SELECT *, CAST({col_name} AS STRING) AS {col_name}_cast FROM __THIS__"))

# Impute nulls in categorical features with 'Unknown'
categorical_columns = [field.name for field in idealista_df.schema.fields if 'StringType' in str(field.dataType)]
impute_expr = [when(col(c).isNull(), 'Unknown').otherwise(col(c)).alias(c) for c in categorical_columns]
impute_stage = SQLTransformer(statement=f"SELECT *, " + ", ".join([str(expr) for expr in impute_expr]) + " FROM __THIS__")

# Remove columns that only have one class from categorical features
distinct_counts = idealista_df.select([countDistinct(col(c)).alias(c) for c in categorical_columns]).collect()[0].asDict()
columns_to_keep = [k for k, v in distinct_counts.items() if v > 1]
idealista_df = idealista_df.select(*columns_to_keep, *numeric_columns)

# Prepare the features for the model
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_indexed") for c in categorical_columns]
encoders = [OneHotEncoder(inputCol=f"{c}_indexed", outputCol=f"{c}_encoded") for c in categorical_columns]

# Assemble all the features into a vector
feature_cols = [f"{c}_encoded" for c in categorical_columns] + [f"{c}_imputed" for c in numeric_columns]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Define the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="price")

# Create a pipeline
pipeline = Pipeline(stages=cast_stages + [impute_stage] + indexers + encoders + [imputer, assembler, lr])

# Train-test split
train_df, test_df = idealista_df.randomSplit([0.8, 0.2], seed=42)

# Train the model
model = pipeline.fit(train_df)

# Log the model with mlflow
mlflow.spark.log_model(model, "linear_regression_model")

logger.info("Model training complete and logged with MLflow")

# Make predictions on the test set
predictions = model.transform(test_df)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
logger.info(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
logger.info(f"R-squared on test data = {r2}")

# Output the evaluation metrics
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R-squared on test data = {r2}")
