In [28]:
import os
import sys

# Set JAVA_HOME to the relative JDK 8 directory and make both PySpark
# interpreter variables point at the Python running this notebook.
os.environ.update(
    {
        "JAVA_HOME": "../../.JDK 8",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Multiple Linear Regression

In [30]:
# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("MultipleLinearRegression")
    .getOrCreate()
)

In [31]:
# Location of the fuel-consumption CSV, relative to this notebook
path = "../../Week 11 - Spark Machine Learning (MLlib)/FuelConsumption.csv"

# First row is the header; let Spark infer each column's type
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(path)
df.show(5)


+---------+-----+----------+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+
|MODELYEAR| MAKE|     MODEL|VEHICLECLASS|ENGINESIZE|CYLINDERS|TRANSMISSION|FUELTYPE|FUELCONSUMPTION_CITY|FUELCONSUMPTION_HWY|FUELCONSUMPTION_COMB|FUELCONSUMPTION_COMB_MPG|CO2EMISSIONS|
+---------+-----+----------+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+
|     2014|ACURA|       ILX|     COMPACT|       2.0|        4|         AS5|       Z|                 9.9|                6.7|                 8.5|                      33|         196|
|     2014|ACURA|       ILX|     COMPACT|       2.4|        4|          M6|       Z|                11.2|                7.7|                 9.6|                      29|         221|
|     2014|ACURA|ILX HYBRID|     COMPACT|       1.5|        4|         AV7|

**Step 2:** Data Preparation

+ **Select Relevant Columns:** We will filter only the relevant columns for the regression model.
+ **Handle Missing Values:** We will drop rows with missing values, or fill them in, for the predictor (input) columns.
+ **Assemble Features:** We'll combine predictors into a single features vector using VectorAssembler.

In [32]:
# Keep the model name, the three predictors and the CO2EMISSIONS label
selected_columns = [
    "MODEL",
    "ENGINESIZE",
    "FUELCONSUMPTION_COMB",
    "CYLINDERS",
    "CO2EMISSIONS",
]
df_MLR = df.select(*selected_columns)
df_MLR.show(5)

+----------+----------+--------------------+---------+------------+
|     MODEL|ENGINESIZE|FUELCONSUMPTION_COMB|CYLINDERS|CO2EMISSIONS|
+----------+----------+--------------------+---------+------------+
|       ILX|       2.0|                 8.5|        4|         196|
|       ILX|       2.4|                 9.6|        4|         221|
|ILX HYBRID|       1.5|                 5.9|        4|         136|
|   MDX 4WD|       3.5|                11.1|        6|         255|
|   RDX AWD|       3.5|                10.6|        6|         244|
+----------+----------+--------------------+---------+------------+
only showing top 5 rows



In [33]:
# Count the nulls in every column of the raw frame to see whether any
# missing-value handling is needed (this cell only counts; it changes nothing).
null_sums = []
for column_name in df.columns:
    null_flag = F.col(column_name).isNull().cast("int")  # 1 where null, else 0
    null_sums.append(F.sum(null_flag).alias(column_name))

null_count = df.select(null_sums)
null_count.show()

+---------+----+-----+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+
|MODELYEAR|MAKE|MODEL|VEHICLECLASS|ENGINESIZE|CYLINDERS|TRANSMISSION|FUELTYPE|FUELCONSUMPTION_CITY|FUELCONSUMPTION_HWY|FUELCONSUMPTION_COMB|FUELCONSUMPTION_COMB_MPG|CO2EMISSIONS|
+---------+----+-----+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+
|        0|   0|    0|           0|         0|        0|           0|       0|                   0|                  0|                   0|                       0|           0|
+---------+----+-----+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+



In [34]:
# Pack the three predictor columns into a single vector column, "features"
feature_columns = ["ENGINESIZE", "FUELCONSUMPTION_COMB", "CYLINDERS"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

data = assembler.transform(df_MLR)
data.show()

+------------+----------+--------------------+---------+------------+---------------+
|       MODEL|ENGINESIZE|FUELCONSUMPTION_COMB|CYLINDERS|CO2EMISSIONS|       features|
+------------+----------+--------------------+---------+------------+---------------+
|         ILX|       2.0|                 8.5|        4|         196|  [2.0,8.5,4.0]|
|         ILX|       2.4|                 9.6|        4|         221|  [2.4,9.6,4.0]|
|  ILX HYBRID|       1.5|                 5.9|        4|         136|  [1.5,5.9,4.0]|
|     MDX 4WD|       3.5|                11.1|        6|         255| [3.5,11.1,6.0]|
|     RDX AWD|       3.5|                10.6|        6|         244| [3.5,10.6,6.0]|
|         RLX|       3.5|                10.0|        6|         230| [3.5,10.0,6.0]|
|          TL|       3.5|                10.1|        6|         232| [3.5,10.1,6.0]|
|      TL AWD|       3.7|                11.1|        6|         255| [3.7,11.1,6.0]|
|      TL AWD|       3.7|                11.6|        

**Step 3:** Train-Test Split

In [35]:
# Random split with weights 0.8 (train) / 0.2 (test) and a fixed seed
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

**Step 4:** Train the Multiple Linear Regression Model

In [36]:
# Linear regression estimator: reads the "features" vector, predicts CO2EMISSIONS
MLR = LinearRegression(labelCol="CO2EMISSIONS", featuresCol="features")

# Fit on the training split only
LR_MODEL = MLR.fit(train_data)

25/03/22 14:20:56 WARN Instrumentation: [f8e0579b] regParam is zero, which might cause numerical instability and overfitting.


**EXERCISE:** Use the intercept and the coefficients to write the equation of the multiple regression line for the output feature.

In [37]:
# Read the fitted parameters off the trained model.
# `coef` is a vector with one weight per assembled feature, so the printed
# line is the vector form of the equation: y = coef . x + intercept
intercept = LR_MODEL.intercept
coef = LR_MODEL.coefficients

equation = f"y = {coef}x + {intercept}"
print(equation)

y = [11.65067715620233,9.416590025632905,6.9178330618928054]x + 68.03186580328334


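# y = 11.7x<sub>1</sub> + 9.4x<sub>2</sub> + 6.9x<sub>3</sub> + 68.0

where x<sub>1</sub> = ENGINESIZE, x<sub>2</sub> = FUELCONSUMPTION_COMB, x<sub>3</sub> = CYLINDERS and y = CO2EMISSIONS.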

**Step 5:** Make Predictions

In [40]:
# Apply the fitted model to the test split; the output has a "prediction" column
MLR_prediction = LR_MODEL.transform(test_data)
MLR_prediction.show(n=20)  # 20 rows is show()'s default

+--------------------+----------+--------------------+---------+------------+--------------+------------------+
|               MODEL|ENGINESIZE|FUELCONSUMPTION_COMB|CYLINDERS|CO2EMISSIONS|      features|        prediction|
+--------------------+----------+--------------------+---------+------------+--------------+------------------+
|      1500 4X4 (MDS)|       5.7|                14.1|        8|         324|[5.7,14.1,8.0]|322.55730945020304|
|        1500 4X4 FFV|       3.6|                17.8|        6|         285|[3.6,17.8,6.0]|319.09660439323426|
|            1500 FFV|       3.6|                11.9|        6|         274|[3.6,11.9,6.0]| 263.5387232420001|
| 200 CONVERTIBLE FFV|       3.6|                10.5|        6|         242|[3.6,10.5,6.0]|250.35549720611402|
|                 300|       3.6|                10.3|        6|         237|[3.6,10.3,6.0]|248.47217920098745|
|         300 AWD FFV|       3.6|                10.9|        6|         251|[3.6,10.9,6.0]|254.12213321