<a href="https://colab.research.google.com/github/ankesh86/PySparkNotebooks/blob/main/SupervisedML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark==3.4.0



# Data Ingestion

## Step 1: Create Spark session object

In [2]:
# Input data for this notebook: Linear_regression_dataset.csv

from pyspark.sql import SparkSession

# Reuse an existing session if one is already running; otherwise create one
# under the application name 'supervised_ml'.
spark = (
    SparkSession.builder
    .appName('supervised_ml')
    .getOrCreate()
)

## Step 2: Reading the Dataset

In [3]:
# Read the CSV: the first line is the header, and column types are inferred
# from the data instead of being read as strings.
df = spark.read.csv(
    'sample_data/Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

# Report the dataset shape as a (rows, columns) tuple.
print((df.count(), len(df.columns)))

(1232, 6)


In [4]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)



In [5]:
# Preview the first 10 rows of the loaded data.
df.show(10)

+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



## Step 3: Feature Engineering

In [6]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

# List the column names; as the last expression it is shown as the cell output.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'label']

In [7]:
# Combine the five predictor columns into the single vector column
# ('features') that the Spark ML estimators below are trained on.
vec_assembler = VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'], outputCol='features')

# `df` is reassigned in place, so re-running this cell would hand the assembler
# a frame that already has 'features', and VectorAssembler refuses to write to
# an existing output column. Dropping it first keeps the cell idempotent;
# drop() is a no-op when the column is absent (i.e. on the first run).
df = vec_assembler.transform(df.drop('features'))


In [8]:
# Confirm that the assembled 'features' vector column is part of the schema.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [9]:
# Preview the first 10 rows, now including the assembled 'features' column.
df.show(10)

+-----+-----+-----+-----+-----+-----+--------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|
+-----+-----+-----+-----+-----+-----+--------------------+
|  734|  688|   81|0.328|0.259|0.418|[734.0,688.0,81.0...|
|  700|  600|   94| 0.32|0.247|0.389|[700.0,600.0,94.0...|
|  712|  705|   93|0.311|0.247|0.417|[712.0,705.0,93.0...|
|  734|  806|   69|0.315| 0.26|0.415|[734.0,806.0,69.0...|
|  613|  759|   61|0.302| 0.24|0.378|[613.0,759.0,61.0...|
|  748|  676|   85|0.318|0.255|0.422|[748.0,676.0,85.0...|
|  669|  588|   97|0.315|0.251|0.411|[669.0,588.0,97.0...|
|  667|  845|   68|0.324|0.251|0.381|[667.0,845.0,68.0...|
|  758|  890|   64| 0.33|0.274|0.436|[758.0,890.0,64.0...|
|  726|  670|   88|0.335|0.268|0.422|[726.0,670.0,88.0...|
+-----+-----+-----+-----+-----+-----+--------------------+
only showing top 10 rows



In [10]:
# Show only the model input vector and the target (show() prints 20 rows by default).
df.select('features','label').show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
|[583.0,794.0,55.0...|0.371|
|[676.0,746.0,72.0...|  0.4|
|[767.0,699.0,89.0...|0.433|
|[637.0,597.0,86.0...|0.374|
|[609.0,724.0,69.0...|0.382|
|[776.0,733.0,83.0...|0.437|
|[701.0,832.0,66.0...| 0.39|
|[650.0,709.0,74.0...|0.386|
|[804.0,668.0,95.0...|0.453|
|[713.0,614.0,94.0...|0.404|
+--------------------+-----+
only showing top 20 rows



## Step 4: Splitting the Dataset

In [11]:
# 75/25 train/test split. The seed is fixed so the split, and every metric
# computed from it further down, is the same on each Restart & Run All.
train, test = df.randomSplit([0.75, 0.25], seed=42)

# Row counts of each split (approximately, not exactly, 75% / 25%).
print(f"Size of train Dataset : {train.count()}")
print(f"Size of test Dataset : {test.count()}")

Size of train Dataset : 925
Size of test Dataset : 307


## Step 5: Build and Train Linear Regression Model

In [12]:
from pyspark.ml.regression import LinearRegression
# Linear regression estimator with default settings.
lr = LinearRegression()

# Fit on the training split only.
lr_model = lr.fit(train)

# Score the held-out rows; the result is the test data plus a 'prediction' column.
prediction_df = lr_model.transform(test)
prediction_df.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.31442967424014645|
|  495|  752|   50|0.277|0.221|0.327|[495.0,752.0,50.0...| 0.3315164501121478|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...|0.33510165720987023|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...|0.33155867135522354|
|  536|  531|   83|0.292|0.214|0.318|[536.0,531.0,83.0...|0.32836134694316266|
|  541|  830|   60|0.302|0.229| 0.33|[541.0,830.0,60.0...|0.34093370751380425|
|  550|  637|   76|0.288|0.223|0.326|[550.0,637.0,76.0...| 0.3436973657414395|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...| 0.3432457140166861|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...| 0.3507577871333169|
|  569|  544|   82|0.304| 0.24|0.343|[569.0,544.0,82

## Step 6: Evaluate Linear Regression Model

In [13]:
# evaluate() scores the fitted model on the test split and returns a summary
# object (not a DataFrame) that exposes the metrics as attributes.
model_predictions = lr_model.evaluate(test)
# R-squared on the test split; shown as the cell output.
model_predictions.r2

0.8429815213039082

In [14]:
# Mean squared error taken from the summary held in model_predictions.
print(model_predictions.meanSquaredError)

0.00016146328999830253


In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

# Train on the training split, then score the test split.
lr_model = lr.fit(train)
prediction_df = lr_model.transform(test)

# Display only the input vector, the true label and the predicted value.
prediction_df.select("features", "label", "prediction").show()

# A single evaluator serves both metrics: it is configured for RMSE, and the
# metric is overridden per call through a param map when another is needed.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)

# Root mean squared error on the test predictions.
rmse = evaluator.evaluate(prediction_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Same evaluator, metricName overridden to "r2" (other values such as "mae"
# can be requested the same way).
r2 = evaluator.evaluate(prediction_df, {evaluator.metricName: "r2"})
print("R-squared (R2) on test data = %g" % r2)


+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[464.0,640.0,66.0...|0.301|0.31442967424014645|
|[495.0,752.0,50.0...|0.327| 0.3315164501121478|
|[524.0,665.0,65.0...|0.336|0.33510165720987023|
|[528.0,652.0,71.0...|0.319|0.33155867135522354|
|[536.0,531.0,83.0...|0.318|0.32836134694316266|
|[541.0,830.0,60.0...| 0.33|0.34093370751380425|
|[550.0,637.0,76.0...|0.326| 0.3436973657414395|
|[562.0,546.0,79.0...| 0.35| 0.3432457140166861|
|[564.0,648.0,74.0...|0.337| 0.3507577871333169|
|[569.0,544.0,82.0...|0.343|0.34434878314339856|
|[572.0,646.0,71.0...|0.329|0.34146360381622354|
|[574.0,586.0,81.0...| 0.36| 0.3510259862227791|
|[575.0,680.0,68.0...|0.344| 0.3533351942669095|
|[575.0,864.0,55.0...|0.379|0.36315635802290336|
|[578.0,733.0,62.0...|0.348| 0.3517975994145861|
|[579.0,497.0,91.0...|0.352| 0.3403212908710359|
|[582.0,791.0,52.0...|0.359|0.35114870641330037|
|[583.0,472.0,97.0..

# **Generalized Linear Model Regression**

## Step 1: Build and Train Generalized Linear Regression Model

In [16]:
from pyspark.ml.regression import GeneralizedLinearRegression
# No family is passed, so the estimator's default family is used
# (Spark's documented default is gaussian).
glr = GeneralizedLinearRegression()
glr_model = glr.fit(train)

# Fitted weights, one per entry of the 'features' vector.
glr_model.coefficients

DenseVector([0.0003, 0.0001, 0.0002, -0.6359, 0.4614])

In [17]:
# Training summary: coefficient table with standard errors, deviances and AIC.
glr_model.summary

Coefficients:
    Feature Estimate Std Error T Value P Value
(Intercept)   0.1895    0.0165 11.5012  0.0000
      var_1   0.0003    0.0000 23.1154  0.0000
      var_2   0.0001    0.0000  4.5082  0.0000
      var_3   0.0002    0.0001  2.1354  0.0330
      var_4  -0.6359    0.0681 -9.3311  0.0000
      var_5   0.4614    0.0616  7.4916  0.0000

(Dispersion parameter for gaussian family taken to be 0.0001)
    Null deviance: 1.0456 on 919 degrees of freedom
Residual deviance: 0.1291 on 919 degrees of freedom
AIC: -5572.3463

## Step 2: Evaluate the Model Performance on Test Data

In [18]:
# Evaluate the fitted GLM on the test split; the returned summary carries the
# scored rows in its `predictions` DataFrame.
model_predictions = glr_model.evaluate(test)
model_predictions.predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.31442967424014645|
|  495|  752|   50|0.277|0.221|0.327|[495.0,752.0,50.0...| 0.3315164501121478|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...|0.33510165720987023|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...|0.33155867135522354|
|  536|  531|   83|0.292|0.214|0.318|[536.0,531.0,83.0...|0.32836134694316266|
|  541|  830|   60|0.302|0.229| 0.33|[541.0,830.0,60.0...|0.34093370751380425|
|  550|  637|   76|0.288|0.223|0.326|[550.0,637.0,76.0...| 0.3436973657414395|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...| 0.3432457140166861|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...| 0.3507577871333169|
|  569|  544|   82|0.304| 0.24|0.343|[569.0,544.0,82

In [19]:
# Akaike information criterion from the summary computed on the test split.
model_predictions.aic

-1795.2601941413911

In [20]:
# Refit the GLM with the binomial family and report its AIC on the test
# split, for comparison with the other families tried in this section.
glr = GeneralizedLinearRegression(family='Binomial')
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

323.4966699963205

In [21]:
# Refit the GLM with the Poisson family and report its AIC on the test
# split, for comparison with the other families tried in this section.
glr = GeneralizedLinearRegression(family='Poisson')
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

255.80829061307116

In [22]:
# Refit the GLM with the Gamma family and report its AIC on the test
# split, for comparison with the other families tried in this section.
glr = GeneralizedLinearRegression(family='Gamma')
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

-1759.4673967706617

In [23]:
# Refit the GLM with the Tweedie family and report its AIC on the test split.
# NOTE(review): no variancePower is set. With Spark's default of 0.0 the
# Tweedie family reduces to Gaussian, which would explain an AIC identical to
# the default-family model -- confirm whether a specific power was intended.
glr = GeneralizedLinearRegression(family='Tweedie')
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

-1795.2601941413911

# **Decision Tree Regression**

## Step 1: Build and Train Decision Tree Regressor Model

In [24]:
from pyspark.ml.regression import DecisionTreeRegressor
# Decision tree regressor with default hyperparameters.
dec_tree = DecisionTreeRegressor()

# Fit on the training split only.
dec_tree_model = dec_tree.fit(train)
# Per-feature importance, indexed by position in the 'features' vector.
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9749, 1: 0.013, 2: 0.0008, 3: 0.0012, 4: 0.01})

## Step 2: Evaluate the Model Performance on Test Data

In [25]:
# Here model_predictions is a DataFrame (test rows plus a 'prediction' column),
# unlike the earlier cells where the same name held an evaluation summary.
model_predictions = dec_tree_model.transform(test)
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.32416666666666666|
|  495|  752|   50|0.277|0.221|0.327|[495.0,752.0,50.0...|0.32416666666666666|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...|0.32416666666666666|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...|0.32416666666666666|
|  536|  531|   83|0.292|0.214|0.318|[536.0,531.0,83.0...|             0.3184|
|  541|  830|   60|0.302|0.229| 0.33|[541.0,830.0,60.0...|0.32416666666666666|
|  550|  637|   76|0.288|0.223|0.326|[550.0,637.0,76.0...|0.34888709677419355|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...|0.34888709677419355|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...|0.34888709677419355|
|  569|  544|   82|0.304| 0.24|0.343|[569.0,544.0,82

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the decision tree's test predictions on two metrics. Each evaluator
# relies on the default column names ('label' and 'prediction').

# Coefficient of determination.
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)

# Root mean squared error.
dt_evaluator = RegressionEvaluator(metricName='rmse')
dt_rmse = dt_evaluator.evaluate(model_predictions)

print(f'The r-square of DecisionTreeRegressor is {dt_r2}')
print(f'The rmse value of DecisionTreeRegressor is {dt_rmse}')


The r-square of DecisionTreeRegressor is 0.7986831505803058
The rmse value of DecisionTreeRegressor is 0.01438803808288434
