<a href="https://colab.research.google.com/github/briann-kt/-Developing-an-End-to-End-Big-Data-Pipeline-Using-Hadoop-Spark-Hive-and-MLlib/blob/main/Boneage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# PySpark MLlib Pipeline for Bone Age Prediction using Random Forest

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [8]:
# Start Spark Session: reuse the active session if one exists, otherwise
# create a new one registered under the application name "BoneAgeMLModel".
spark = (
    SparkSession.builder
    .appName("BoneAgeMLModel")
    .getOrCreate()
)

In [9]:
# Load Data: read the bone-age training CSV from the path below.
# NOTE(review): this is a local filesystem path (not a gs:// URI), so the file
# must already be present on the runtime before this cell is run.
data_path = "/content/boneage-training-dataset.csv"

# The first row supplies the column names; column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv(data_path)

In [10]:
# Clean and Prepare Data
# 1. Drop rows where either the target ("boneage") or the predictor ("male") is null.
# 2. Rename the target to "label", the column name handed to the regressor and
#    evaluators as labelCol further down.
df = (
    df
    .dropna(subset=["boneage", "male"])
    .withColumnRenamed("boneage", "label")
)

In [11]:
# Feature Engineering: pack the predictor column(s) into the single vector
# column "features" that the regressor consumes.
# NOTE(review): "male" is the only input, so the fitted model can only tell
# rows apart by that one flag (the recorded output shows just two distinct
# predicted values).
assembler = VectorAssembler(
    inputCols=["male"],
    outputCol="features",
)

In [14]:
# One seed shared by the estimator and the train/test split, so a fresh
# Restart & Run All reproduces the same model and metrics.
SEED = 42

# Define Model: 50-tree random forest regressor reading the assembled
# "features" vector and the "label" target. Training a random forest is
# stochastic, so the seed is set explicitly; without it the fitted model is
# not guaranteed to be identical across kernel restarts even though the
# split itself is seeded.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
    numTrees=50,
    seed=SEED,
)

# Create ML Pipeline: assemble the feature vector, then fit the forest
pipeline = Pipeline(stages=[assembler, rf])

# Train/Test Split: roughly 80% of rows for training, 20% held out for testing
train_data, test_data = df.randomSplit([0.8, 0.2], seed=SEED)

# Train Model on the training split only
model = pipeline.fit(train_data)

In [15]:
# Make Predictions: run the fitted pipeline over the held-out test split.
# The result is later read through its "male", "label" and "prediction" columns.
predictions = model.transform(dataset=test_data)

In [16]:
# Evaluate Model
# Both evaluators compare the same pair of columns; only the metric differs.
shared_eval_args = {"labelCol": "label", "predictionCol": "prediction"}
evaluator_rmse = RegressionEvaluator(metricName="rmse", **shared_eval_args)
evaluator_r2 = RegressionEvaluator(metricName="r2", **shared_eval_args)

# Score the test-set predictions with each metric
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

# Show some prediction samples: input flag, true value, and model output
sample_view = predictions.select("male", "label", "prediction")
sample_view.show(10)

Root Mean Squared Error (RMSE): 40.12248341904621
R² Score: 0.05070195587395243
+-----+-----+------------------+
| male|label|        prediction|
+-----+-----+------------------+
|false|   94| 118.0234050657813|
| true|  150|135.22307012449446|
| true|   36|135.22307012449446|
| true|  180|135.22307012449446|
|false|   36| 118.0234050657813|
|false|  159| 118.0234050657813|
|false|   33| 118.0234050657813|
|false|   24| 118.0234050657813|
| true|   54|135.22307012449446|
|false|  144| 118.0234050657813|
+-----+-----+------------------+
only showing top 10 rows

