<a href="https://colab.research.google.com/github/ankesh86/PySparkNotebooks/blob/main/SupervisedML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark==3.4.0

Collecting pyspark==3.4.0
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317122 sha256=1e8aa2441d0f8b840e85a909aa2e290f123375f50082a310499838caf764a7e1
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


# Data Ingestion

## Step 1: Create Spark session object

In [2]:
#Linear_regression_dataset.csv

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('supervised_ml').getOrCreate()

## Step 2: Reading the Dataset

In [7]:
df = spark.read.csv('sample_data/Linear_regression_dataset.csv', inferSchema=True, header=True)
print((df.count(), len(df.columns)))

(1232, 6)


In [8]:
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)



In [9]:
df.show(10)

+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



## Step 3: Feature Engineering

In [10]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'label']

In [29]:
vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'label'], outputCol='features')
features_df = vec_assembler.transform(df)


In [30]:
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [31]:
df.show(10)

+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



In [16]:
features_df.select('features','label').show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
|[583.0,794.0,55.0...|0.371|
|[676.0,746.0,72.0...|  0.4|
|[767.0,699.0,89.0...|0.433|
|[637.0,597.0,86.0...|0.374|
|[609.0,724.0,69.0...|0.382|
|[776.0,733.0,83.0...|0.437|
|[701.0,832.0,66.0...| 0.39|
|[650.0,709.0,74.0...|0.386|
|[804.0,668.0,95.0...|0.453|
|[713.0,614.0,94.0...|0.404|
+--------------------+-----+
only showing top 20 rows



## Step 4 : Splitting the Dataset

In [27]:
train, test = df.randomSplit([0.75,0.25])
print(f"Size of train Dataset : {train.count()}")
print(f"Size of test Dataset : {test.count()}")

Size of train Dataset : 936
Size of test Dataset : 296


## Step 5: Build and Train Linear Regression Model

In [28]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression()

lr_model = lr.fit(train)

prediction_df = lr_model.transform(test)
prediction_df.show()

IllegalArgumentException: features does not exist. Available: var_1, var_2, var_3, var_4, var_5, label

## Step 6: Evaluate Linear Regression Model

In [23]:
model_predictions = lr_model.evaluate(test)
model_predictions.r2

1.0

In [24]:
print(model_predictions.meanSquaredError)

9.542724230651031e-29


In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

# Fit the model
lr_model = lr.fit(train)

# Make predictions
prediction_df = lr_model.transform(test)

# Show predictions
prediction_df.select("features", "label", "prediction").show()

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

# Calculate RMSE (Root Mean Squared Error)
rmse = evaluator.evaluate(prediction_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# You can change the metricName to "r2", "mae", etc., to evaluate those as well
r2 = evaluator.evaluate(prediction_df, {evaluator.metricName: "r2"})
print("R-squared (R2) on test data = %g" % r2)


+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[464.0,640.0,66.0...|0.301| 0.3010000000000124|
|[468.0,746.0,52.0...|0.329|0.32899999999999796|
|[486.0,610.0,61.0...|0.332|0.33199999999999885|
|[495.0,752.0,50.0...|0.327| 0.3270000000000184|
|[498.0,615.0,67.0...|0.318| 0.3180000000000089|
|[498.0,672.0,61.0...|0.325|0.32500000000001156|
|[510.0,588.0,72.0...|0.317|0.31700000000000605|
|[514.0,549.0,81.0...|0.339|0.33899999999998726|
|[516.0,504.0,86.0...|0.327| 0.3270000000000057|
|[519.0,595.0,73.0...|0.332|0.33199999999999746|
|[524.0,665.0,65.0...|0.336|0.33600000000000996|
|[528.0,652.0,71.0...|0.319|0.31900000000000617|
|[532.0,690.0,69.0...|0.351|0.35099999999998815|
|[550.0,631.0,76.0...|0.318| 0.3180000000000083|
|[554.0,536.0,77.0...|0.339| 0.3390000000000032|
|[556.0,674.0,62.0...|0.348|  0.348000000000005|
|[559.0,613.0,75.0...|0.359|0.35900000000000304|
|[568.0,708.0,57.0..

# **Generalized Linear Model Regression**

## Step 1: Build and Train Generalized Linear Regression Model

In [26]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()
glr_model = glr.fit(train)

glr_model.coefficients

DenseVector([0.0, -0.0, -0.0, -0.0, 0.0, 1.0])