<a href="https://colab.research.google.com/github/affanmohd65/affanmohd65/blob/main/Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install Java (required for Spark)
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Download latest Spark 3.5.x (Hadoop 3)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz

# Extract Spark
!tar -xzf spark-3.5.1-bin-hadoop3.tgz

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

# 5️⃣ Install findspark
!pip install -q findspark


In [4]:
# Locate the Spark installation so that `pyspark` can be imported below
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .appName("ColabSparkPipeline")
    .getOrCreate()
)

# Show the session object as the cell output
spark


In [5]:
# Toy dataset: the i-th height is paired with the i-th weight to form one row
height_values = [65, 70, 72, 60, 80, 85, 75, 68]
weight_values = [120, 150, 160, 110, 200, 220, 180, 140]
data = list(zip(height_values, weight_values))

# Column names, in the same order as the values inside each row tuple
columns = ["height", "weight"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+------+
|height|weight|
+------+------+
|    65|   120|
|    70|   150|
|    72|   160|
|    60|   110|
|    80|   200|
|    85|   220|
|    75|   180|
|    68|   140|
+------+------+



In [11]:
# Build the three pipeline stages:
#   VectorAssembler  -> packs the input column(s) into a single vector column
#   StandardScaler   -> standardizes that feature vector
#   LinearRegression -> trains a simple regression model on the scaled features

from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.regression import LinearRegression

assembler = VectorAssembler(
    inputCols=["height"],
    outputCol="features_unscaled",
)
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
)
lr = LinearRegression(
    featuresCol="features",
    labelCol="weight",
)

# Each stage consumes the column produced by the one before it
pl = Pipeline(stages=[assembler, scaler, lr])

In [12]:
# Hold out roughly 20% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit all pipeline stages on the training rows, then score the held-out rows
model = pl.fit(train_df)
predictions = model.transform(test_df)
predictions.show()

+------+------+-----------------+-------------------+-----------------+
|height|weight|features_unscaled|           features|       prediction|
+------+------+-----------------+-------------------+-----------------+
|    70|   150|           [70.0]|[7.540990499768686]|151.7485493230173|
|    68|   140|           [68.0]|[7.325533628346724]|142.3945841392648|
+------+------+-----------------+-------------------+-----------------+



In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the `prediction` column against the true `weight` column using RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="weight",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Root Mean Squared Error (RMSE): 2.10


In [14]:
# The regression is the last stage of the fitted pipeline.
# Its coefficient applies to the scaled `features` column, not to raw height.
fitted_stages = model.stages
lr_model = fitted_stages[-1]
print("Intercept:", lr_model.intercept)
print("Coefficient:", lr_model.coefficients)


Intercept: -175.64023210832153
Coefficient: [43.41455959152597]
