In [70]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, row_number, window
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np

# 1.2 Create (or reuse) the SparkSession that every later cell relies on.
# The master "local[*]" runs Spark locally, using all available cores.
spark = (
    SparkSession.builder
    .appName("SimModeExample")
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [73]:
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset: `data` is the feature matrix and
# `target` holds one regression target per row.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Append the target as the last column in one vectorised step, instead of
# allocating a zero column and filling it row by row in a Python loop.
X = np.column_stack((X, y))

# Build a fresh list of names. Appending to `diabetes.feature_names` directly
# would mutate the list owned by the loaded dataset object.
column_names = list(diabetes.feature_names) + ['label']

# Hand Spark plain Python rows: a raw NumPy array is only accepted by newer
# PySpark releases (3.4+ per the PySpark docs), whereas a list of rows works
# on every version. Each value is a Python float, so every column is a double.
df = spark.createDataFrame(X.tolist(), column_names)
df.show(5)

151.0
75.0
141.0
206.0
135.0
97.0
138.0
63.0
110.0
310.0
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|                 age|                 sex|                 bmi|                  bp|                  s1|                  s2|                  s3|                  s4|                  s5|                  s6|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|0.038075906433423026| 0.05068011873981862|0.061696206518683294|  0.0218723855140367|-0.04422349842444599|-0.03482076283769895|-0.04340084565202491|-0.00259226199818...|0.019907486170462722|-0.01764612515980379|151.0|
|-0.00188201652779...|-0.04464163650698...|-0.05147406123880...|-0.0263

In [74]:
# Spark SQL type names, as reported by `DataFrame.dtypes`, that are treated as
# numeric features. The original check only matched "double" and "int", which
# would silently drop e.g. bigint or float columns from the feature vector.
NUMERIC_DTYPES = {"tinyint", "smallint", "int", "bigint", "float", "double"}

# String-typed columns are treated as categorical and get a StringIndexer each.
# The loop variable is called `name` so it does not shadow the imported
# `pyspark.sql.functions.col`.
categorical_cols = [name for name, dtype in df.dtypes if dtype == "string"]

# One indexer per categorical column, writing to "<column>_index".
# handleInvalid='keep' assigns unseen/invalid labels to an extra index rather
# than raising at transform time.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid='keep')
    for name in categorical_cols
]

# Numeric feature columns: everything numeric except the regression target.
numeric_cols = [
    name
    for name, dtype in df.dtypes
    if dtype in NUMERIC_DTYPES and name != "label"
]

# Final model inputs: raw numeric columns followed by the indexed categoricals.
feature_cols = numeric_cols + [name + "_index" for name in categorical_cols]

In [75]:
# Step 4: Pack all feature columns into one vector column
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_assembled",
)

# Step 5: Standardize the assembled vector (centered, unit standard deviation);
# optional, but often helpful
scaler = StandardScaler(
    inputCol="features_assembled",
    outputCol="features",
    withMean=True,
    withStd=True,
)

# Step 6: Linear regression on the standardized features, predicting "label"
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
)

# Step 7: Chain the stages: indexers first, then assemble -> scale -> regress
pipeline = Pipeline(stages=[*indexers, assembler, scaler, lr])

In [76]:
# Step 8: Seeded random split with 0.8 / 0.2 weights for train / test
train_df, test_df = df.randomSplit(
    [0.8, 0.2],
    seed=42,
)

# Step 9: Fit every pipeline stage on the training rows only
model = pipeline.fit(train_df)

# Step 10: Score the held-out rows and preview actual vs. predicted values
predictions = model.transform(test_df)
predictions.select("label", "prediction").show(10)

# Step 11: Build one evaluator per metric, both comparing "label" against
# "prediction"
evaluator_rmse = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
evaluator_r2 = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2",
)

# Compute both metrics on the test-set predictions
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"RMSE on test data: {rmse}")
print(f"R2 on test data: {r2}")

+-----+------------------+
|label|        prediction|
+-----+------------------+
| 55.0| 75.33250271834473|
|200.0| 80.07849393946042|
| 97.0| 105.9013622061565|
| 68.0|117.81688255328869|
|103.0|144.76522375207668|
|310.0|215.06695024268214|
|101.0|  95.2968361705268|
|128.0|234.51396387075894|
| 68.0|124.60475420943374|
|200.0| 159.4760011427768|
+-----+------------------+
only showing top 10 rows

RMSE on test data: 60.02384524392721
R2 on test data: 0.35270812341282765


In [81]:
# Preview the first five scored rows with every column the pipeline produced
predictions.show(n=5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+------------------+
|                 age|                 sex|                 bmi|                  bp|                  s1|                  s2|                  s3|                  s4|                  s5|                  s6|label|  features_assembled|            features|        prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+------------------+
|-0.09996055470531495|-0.04464163650698...|-0.06764124234701265|-0.10895595156823522|-0.07449446130487065|-0.07271172671423268| 0.01550535921336615|-0.039493382874

In [None]:
# Stop the Spark session once the analysis is finished; cells that use `spark`
# cannot be re-run after this without re-creating the session first.
spark.stop()