<a href="https://colab.research.google.com/github/aaabhijith13/linkedIN_posts/blob/main/SparkMLib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SparkMLib

In [48]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col, count, when, isnull, skewness, log1p, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator


In [9]:
TARGET = "median_house_value"
spark = SparkSession.builder.getOrCreate()

In [10]:
df = spark.read.csv("sample_data/california_housing_train.csv", header=True, inferSchema=True) #local files, we have header = True because first row is the header in the csv


In [16]:
df.printSchema() #all columns copied correctly.
df.describe().show()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)

+-------+-------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+
|summary|          longitude|          latitude|housing_median_age|      total_rooms|   total_bedrooms|        population|       households|     median_income|median_house_value|
+-------+-------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+
|  count|              17000|             17000|          

# Feature Engineering
1. Look For null values
2. Look for outliers - Skewness of skew > 1 - right skew, Skew < 1 left skew, around 0.5 1 is moderate to good.



In [21]:
df.select([count(when(col(c).isNull(), c)).alias(c)for c in df.columns]).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|        0|       0|                 0|          0|             0|         0|         0|            0|                 0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+



In [100]:

def engineer_features(_df):
    # --- Ratios with guards ---
    _df = _df.withColumn(
        "rooms_per_household",
        when((col("households") > 0) & col("total_rooms").isNotNull(),
             col("total_rooms") / col("households")).otherwise(None)
    ).withColumn(
        "bedrooms_per_room",
        when((col("total_rooms") > 0) & col("total_bedrooms").isNotNull(),
             col("total_bedrooms") / col("total_rooms")).otherwise(None)
    ).withColumn(
        "population_per_household",
        when((col("households") > 0) & col("population").isNotNull(),
             col("population") / col("households")).otherwise(None)
    )

    # --- 99th percentile of population_per_household (ignore nulls) ---
    p99 = _df.filter(col("population_per_household").isNotNull()) \
             .approxQuantile("population_per_household", [0.99], 0.01)[0]

    # --- Create a NEW capped column (do NOT overwrite the original) ---
    _df = _df.withColumn(
        "population_per_household_capped",
        when(col("population_per_household") > p99, p99)
        .otherwise(col("population_per_household"))
    )

    # --- Log transforms with guards ---
    for c in ["population", "total_rooms", "total_bedrooms", "households",
              "rooms_per_household", "bedrooms_per_room", "median_income"]:
        _df = _df.withColumn(
            f"log_{c}",
            when(col(c).isNotNull() & (col(c) >= 0), log1p(col(c))).otherwise(None)
        )

    # IMPORTANT: log the CAPPED ratio (this was missing/renamed before)
    _df = _df.withColumn(
        "log_population_per_household_capped",
        when(
            col("population_per_household_capped").isNotNull() &
            (col("population_per_household_capped") >= 0),
            log1p(col("population_per_household_capped"))
        ).otherwise(None)
    )

    return _df


1. There were no null values
2. We computed statistical skewness for continuous features and applied log1p transformations to variables with skewness > 1 to reduce long-tail effects and stabilize tree splits.

In [108]:
housing_df = engineer_features(df).cache()
housing_df

DataFrame[longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double, rooms_per_household: double, bedrooms_per_room: double, population_per_household: double, population_per_household_capped: double, log_population: double, log_total_rooms: double, log_total_bedrooms: double, log_households: double, log_rooms_per_household: double, log_bedrooms_per_room: double, log_median_income: double, log_population_per_household_capped: double]

In [102]:
housing_df.show(3)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+-------------------+-------------------+------------------------+-------------------------------+-----------------+-----------------+------------------+-----------------+-----------------------+---------------------+------------------+-----------------------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|rooms_per_household|  bedrooms_per_room|population_per_household|population_per_household_capped|   log_population|  log_total_rooms|log_total_bedrooms|   log_households|log_rooms_per_household|log_bedrooms_per_room| log_median_income|log_population_per_household_capped|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+-------------------+-------------------+------------------------+----------------------------

From data validation we can see that there is clear change in skewness. Heavily skewed data has come closer to 1. Normalization was intentionally omitted because the models used (Decision Tree and XGBoost) are invariant to feature scaling. Instead, log transformations were applied to reduce skew and improve split stability.

In [96]:
feature_cols = [
    "housing_median_age", "latitude", "longitude",
    "log_population", "log_total_rooms", "log_total_bedrooms",
    "log_households", "log_rooms_per_household",
    "log_bedrooms_per_room",
    "log_population_per_household_capped",
    "log_median_income",
]

In [79]:
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
) #Spark models do not accept raw columns.

Now that data is prepared, split the data for train and validation for training and testing. We will use unseen test data for final testing.

In [80]:
train_df, val_df = housing_df.randomSplit([0.8, 0.2], seed=42)

In [81]:
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="median_house_value",
    maxDepth=5
)
xgb = SparkXGBRegressor(
    features_col="features",
    label_col="median_house_value",
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    num_round=200
) #base decision tree and XGB

In [82]:
pipeline_dt = Pipeline(stages=[assembler, dt])
pipeline_xgb = Pipeline(stages=[assembler, xgb]) #creating pipelines with assembler and the models

In [109]:
evaluator = RegressionEvaluator(
    labelCol="median_house_value",
    predictionCol="prediction",
    metricName="rmse"
) #only using 1 evalutor for now as this is just to show the capabilities of SparkML

In [85]:
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [3, 5, 8]) \
    .addGrid(dt.minInstancesPerNode, [1, 5, 10]) \
    .build()
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb.max_depth, [4, 6, 8]) \
    .addGrid(xgb.learning_rate, [0.05, 0.1]) \
    .build()
#Param builder for CV can add more if need be.

For simple baselines, I train on a fixed training set and evaluate on a validation set. For tuned models, I use cross-validation on the combined train+validation data to select hyperparameters, keeping the test set completely untouched.

In [86]:
cv_dt = CrossValidator(
    estimator=pipeline_dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4
)

dt_model = cv_dt.fit(train_df)


In [87]:
cv_xgb = CrossValidator(
    estimator=pipeline_xgb,
    estimatorParamMaps=xgb_param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4
)

xgb_model = cv_xgb.fit(train_df)


INFO:XGBoost-PySpark:Running xgboost-3.1.2 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 4, 'objective': 'reg:squarederror', 'subsample': 0.8, 'num_round': 200, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Running xgboost-3.1.2 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'reg:squarederror', 'subsample': 0.8, 'num_round': 200, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Running xgboost-3.1.2 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.05, 'max_depth': 4, 'objective': 'reg:squarederror', 'subsample': 0.8, 'num_round': 200, 'nthread': 1}
	train_call_kwargs_params: {'

Final evaluation of all data and predictions

In [88]:
dt_val_pred = dt_model.transform(val_df)
xgb_val_pred = xgb_model.transform(val_df)
dt_val_pred.select("prediction", "median_house_value").show()

+------------------+------------------+
|        prediction|median_house_value|
+------------------+------------------+
|143371.42857142858|          103600.0|
|          107400.0|          106700.0|
|105343.76731301939|           73200.0|
|          107400.0|           90100.0|
| 79801.45772594752|           67000.0|
|209829.72972972973|          116100.0|
| 79801.45772594752|           62500.0|
|105343.76731301939|           85400.0|
|169372.72727272726|           90000.0|
|          107400.0|           86400.0|
| 79801.45772594752|           74100.0|
| 79801.45772594752|           57500.0|
| 79801.45772594752|           75100.0|
|203695.45454545456|          130600.0|
|105343.76731301939|           92100.0|
|105343.76731301939|           90200.0|
|105343.76731301939|           92600.0|
|          233987.5|          165600.0|
| 79801.45772594752|           36700.0|
|143371.42857142858|          116700.0|
+------------------+------------------+
only showing top 20 rows


In [89]:
xgb_val_pred.select("prediction", "median_house_value").show()

+--------------+------------------+
|    prediction|median_house_value|
+--------------+------------------+
|101881.0546875|          103600.0|
| 110946.859375|          106700.0|
| 91760.3671875|           73200.0|
|118546.9921875|           90100.0|
| 71620.6015625|           67000.0|
| 200916.484375|          116100.0|
|  76567.421875|           62500.0|
|   113046.8125|           85400.0|
| 104838.640625|           90000.0|
| 96730.0703125|           86400.0|
|     67494.125|           74100.0|
|65470.79296875|           57500.0|
|    75066.9375|           75100.0|
|  107287.90625|          130600.0|
| 91725.2734375|           92100.0|
| 98372.3828125|           90200.0|
| 98463.1796875|           92600.0|
| 214627.828125|          165600.0|
|   64703.53125|           36700.0|
|  108787.96875|          116700.0|
+--------------+------------------+
only showing top 20 rows


In [90]:
dt_rmse = evaluator.evaluate(dt_model.transform(val_df))
xgb_rmse = evaluator.evaluate(xgb_model.transform(val_df))
dt_rmse, xgb_rmse

(65224.15412693988, 45743.508103246895)

As expected XGB performed better than DT. Ensemble boosting with regularization better captures non‑linearities and interactions in tabular data, typically delivering lower RMSE/MAE and higher R² than a single decision tree, provided you tune and keep features consistent.

In [97]:
test_set = spark.read.csv("sample_data/california_housing_test.csv", header=True, inferSchema=True)

In [98]:
test_set.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
|  -119.56|   36.51|    

In [103]:
test_df_modified = engineer_features(test_set)  # ensure same transforms!
test_df_modified.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+-------------------+-------------------+------------------------+-------------------------------+------------------+------------------+------------------+------------------+-----------------------+---------------------+------------------+-----------------------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|rooms_per_household|  bedrooms_per_room|population_per_household|population_per_household_capped|    log_population|   log_total_rooms|log_total_bedrooms|    log_households|log_rooms_per_household|log_bedrooms_per_room| log_median_income|log_population_per_household_capped|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+-------------------+-------------------+------------------------+----------------------

In [105]:
best_model = xgb_model.bestModel
test_pred = best_model.transform(test_df_modified).select("prediction", TARGET, *feature_cols)

test_pred.select("prediction", "median_house_value").show()

+-------------+------------------+
|   prediction|median_house_value|
+-------------+------------------+
|  415811.6875|          344700.0|
|  195997.6875|          176500.0|
| 293338.21875|          270500.0|
|     313270.5|          330000.0|
|68359.5546875|           81700.0|
|55204.4765625|           67000.0|
| 61398.453125|           67000.0|
| 187763.53125|          166900.0|
|205372.359375|          194400.0|
|   157149.625|          164200.0|
|105220.609375|          125000.0|
|  60963.21875|           58300.0|
| 280366.84375|          252600.0|
|  204732.9375|          231200.0|
|240129.890625|          222500.0|
|157764.671875|          153100.0|
|  146514.3125|          181300.0|
|116351.296875|          137500.0|
|   334633.625|          300000.0|
| 334721.15625|          414300.0|
+-------------+------------------+
only showing top 20 rows


In [107]:
xgb_rmse_test = evaluator.evaluate(xgb_model.transform(test_df_modified))
xgb_rmse_test

46782.28093857735