## PySpark MLlib:
PySpark MLlib is the machine learning library provided by Apache Spark for scalable and distributed machine learning tasks. It is built on top of Spark's core functionalities and enables users to leverage distributed computing to train and deploy machine learning models.

PySpark MLlib provides a wide range of algorithms and utilities for various machine learning tasks, including classification, regression, clustering, collaborative filtering, dimensionality reduction, and more. It also offers tools for feature extraction, transformation, and selection to prepare data for machine learning tasks.

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session for this notebook (getOrCreate reuses an active session
# if one already exists instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('Dataframe_Use')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the session object as the cell output.
spark

In [4]:
## Reading the dataset

In [9]:
# NOTE(review): absolute, machine-specific Windows path to Advertising.csv;
# change it to the local copy of the file before running. The raw-string
# prefix keeps the backslashes literal.
path = r"C:\Users\Azam\Desktop\Extra Work\Udemy ML\UNZIP_FOR_NOTEBOOKS_FINAL\08-Linear-Regression-Models\Advertising.csv"

In [10]:
## Read the dataset
# The first line of the file supplies the column names. Without
# inferSchema=True, PySpark would load every column as a string.
df = spark.read.csv(path, header=True, inferSchema=True)

In [11]:
# Print a preview of the loaded data (the output is limited to the top 20 rows).
df.show()

+-----+-----+---------+-----+
|   TV|radio|newspaper|sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
|  8.7| 48.9|     75.0|  7.2|
| 57.5| 32.8|     23.5| 11.8|
|120.2| 19.6|     11.6| 13.2|
|  8.6|  2.1|      1.0|  4.8|
|199.8|  2.6|     21.2| 10.6|
| 66.1|  5.8|     24.2|  8.6|
|214.7| 24.0|      4.0| 17.4|
| 23.8| 35.1|     65.9|  9.2|
| 97.5|  7.6|      7.2|  9.7|
|204.1| 32.9|     46.0| 19.0|
|195.4| 47.7|     52.9| 22.4|
| 67.8| 36.6|    114.0| 12.5|
|281.4| 39.6|     55.8| 24.4|
| 69.2| 20.5|     18.3| 11.3|
|147.3| 23.9|     19.1| 14.6|
+-----+-----+---------+-----+
only showing top 20 rows



In [14]:
# Shape of the dataset as (row count, column count).
df.count(), len(df.columns)

(200, 4)

In [21]:
from pyspark.ml.feature import VectorAssembler

## VectorAssembler combines the columns we give it into a single vector column

In [17]:
# Configure the assembler: the TV, radio and newspaper columns are packed
# into one vector column named 'Independent Features'.
feature_assembler = VectorAssembler(
    inputCols=['TV', 'radio', 'newspaper'],
    outputCol='Independent Features',
)

In [19]:
# Apply the assembler to df; the result keeps the original columns and adds
# the assembled vector column.
output = feature_assembler.transform(df)

In [20]:
# Preview the assembled DataFrame, including the new vector column.
output.show()

+-----+-----+---------+-----+--------------------+
|   TV|radio|newspaper|sales|Independent Features|
+-----+-----+---------+-----+--------------------+
|230.1| 37.8|     69.2| 22.1|   [230.1,37.8,69.2]|
| 44.5| 39.3|     45.1| 10.4|    [44.5,39.3,45.1]|
| 17.2| 45.9|     69.3|  9.3|    [17.2,45.9,69.3]|
|151.5| 41.3|     58.5| 18.5|   [151.5,41.3,58.5]|
|180.8| 10.8|     58.4| 12.9|   [180.8,10.8,58.4]|
|  8.7| 48.9|     75.0|  7.2|     [8.7,48.9,75.0]|
| 57.5| 32.8|     23.5| 11.8|    [57.5,32.8,23.5]|
|120.2| 19.6|     11.6| 13.2|   [120.2,19.6,11.6]|
|  8.6|  2.1|      1.0|  4.8|       [8.6,2.1,1.0]|
|199.8|  2.6|     21.2| 10.6|    [199.8,2.6,21.2]|
| 66.1|  5.8|     24.2|  8.6|     [66.1,5.8,24.2]|
|214.7| 24.0|      4.0| 17.4|    [214.7,24.0,4.0]|
| 23.8| 35.1|     65.9|  9.2|    [23.8,35.1,65.9]|
| 97.5|  7.6|      7.2|  9.7|      [97.5,7.6,7.2]|
|204.1| 32.9|     46.0| 19.0|   [204.1,32.9,46.0]|
|195.4| 47.7|     52.9| 22.4|   [195.4,47.7,52.9]|
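| 67.8| 36.6|    114.0| 12.5|   [67.8,36.6,114.0]|
|281.4| 39.6|     55.8| 24.4|   [281.4,39.6,55.8]|
| 69.2| 20.5|     18.3| 11.3|    [69.2,20.5,18.3]|
|147.3| 23.9|     19.1| 14.6|   [147.3,23.9,19.1]|
+-----+-----+---------+-----+--------------------+
only showing top 20 rows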

In [22]:
# List the column names to confirm the vector column was added.
output.columns

['TV', 'radio', 'newspaper', 'sales', 'Independent Features']

In [23]:
# Keep only the assembled feature vector and the 'sales' column; the raw
# input columns are no longer needed once they are inside the vector.
finalized_data = output.select('Independent Features', 'sales')

In [24]:
# Preview the two-column (features, sales) DataFrame.
finalized_data.show()

+--------------------+-----+
|Independent Features|sales|
+--------------------+-----+
|   [230.1,37.8,69.2]| 22.1|
|    [44.5,39.3,45.1]| 10.4|
|    [17.2,45.9,69.3]|  9.3|
|   [151.5,41.3,58.5]| 18.5|
|   [180.8,10.8,58.4]| 12.9|
|     [8.7,48.9,75.0]|  7.2|
|    [57.5,32.8,23.5]| 11.8|
|   [120.2,19.6,11.6]| 13.2|
|       [8.6,2.1,1.0]|  4.8|
|    [199.8,2.6,21.2]| 10.6|
|     [66.1,5.8,24.2]|  8.6|
|    [214.7,24.0,4.0]| 17.4|
|    [23.8,35.1,65.9]|  9.2|
|      [97.5,7.6,7.2]|  9.7|
|   [204.1,32.9,46.0]| 19.0|
|   [195.4,47.7,52.9]| 22.4|
|   [67.8,36.6,114.0]| 12.5|
|   [281.4,39.6,55.8]| 24.4|
|    [69.2,20.5,18.3]| 11.3|
|   [147.3,23.9,19.1]| 14.6|
+--------------------+-----+
only showing top 20 rows



In [25]:
from pyspark.ml.regression import LinearRegression

In [26]:
# Split the rows roughly 80/20 into training and test sets (the weights are
# proportions, so the resulting sizes are approximate). A fixed seed makes the
# split repeatable; without it every rerun trains and evaluates on different
# rows, so the coefficients and error metrics change from run to run.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)


In [27]:
# Configure (but do not yet train) a linear regression that reads its inputs
# from the assembled vector column and its target from 'sales'.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='sales',
)

In [28]:
# Train on the training split. NOTE: this rebinds `regressor` to the object
# returned by fit(), so the unfitted estimator is no longer reachable under
# this name; the cells below read .coefficients and call .evaluate() on the
# fitted result.
regressor = regressor.fit(train_data)

In [29]:
# Learned weights of the fitted model, one per entry of the feature vector.
regressor.coefficients

DenseVector([0.047, 0.1861, -0.0019])

In [30]:
# Evaluate the fitted model on the held-out test split; the result exposes
# the predictions and the error metrics read in the following cells.
pred_results = regressor.evaluate(test_data)

In [32]:
# Show the test rows with the model's prediction next to the actual sales value.
pred_results.predictions.show()

+--------------------+-----+------------------+
|Independent Features|sales|        prediction|
+--------------------+-----+------------------+
|     [28.6,1.5,33.0]|  7.3| 4.386616805444243|
|     [31.5,24.6,2.2]|  9.5| 8.878738267615864|
|     [39.5,41.1,5.8]| 10.8|12.318710685222952|
|    [43.0,25.9,20.5]|  9.6| 9.627495101561959|
|    [44.7,25.8,20.6]| 10.1| 9.688641747429655|
|    [68.4,44.5,35.6]| 13.6|14.255146437514735|
|    [69.2,20.5,18.3]| 11.3|  9.85872866646402|
|     [75.5,10.8,6.0]|  9.9| 8.372732999823885|
|    [93.9,43.5,50.5]| 15.3|15.240541991740491|
|   [112.9,17.4,38.6]| 11.9|11.299163388249477|
|    [116.0,7.7,23.1]| 11.0| 9.668624637802612|
|    [117.2,14.7,5.4]| 11.9| 11.06049296244289|
|   [120.2,19.6,11.6]| 13.2|12.101900459139385|
|   [125.7,36.9,79.2]| 15.9|15.454505127565305|
|    [129.4,5.7,31.3]| 11.0| 9.911364968375935|
|   [131.1,42.8,28.9]| 18.0|16.899629160029914|
|   [137.9,46.4,59.0]| 19.2|  17.8334945228118|
|   [142.9,29.3,12.6]| 15.0|14.972547984

In [33]:
# Error metrics from the evaluation, as (mean absolute error, mean squared error).
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(0.969684018088048, 1.6290024553048352)