# Spark ML

## Install

In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 38.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=832d845884aa1aec9dce583c3db0b6a8d46ba4a070d2896a7e5d79711a085a0d
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [3]:
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


## Import

In [4]:
from pyspark.sql import SparkSession
import pyspark
import findspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql import DataFrameReader

## Create the Spark Session

In [5]:
#Initialisation de findspark
findspark.init()

In [6]:
#Création du SparkSession
spark = SparkSession.builder.appName("Exemple Spark").getOrCreate()
sc = SparkContext.getOrCreate()

## Load the data and change the units

In [7]:
df = spark.read.format("csv").option("header","true").load("/content/houses.csv")
df.show()

+------------------+--------+------+-----------+------------------+
|              size|nb_rooms|garden|orientation|             price|
+------------------+--------+------+-----------+------------------+
|  116.300633073418|       1|     1|        Sud| 284413.3068710591|
| 194.3028748040095|       2|     0|       Nord|237354.82009708052|
| 92.69496171573587|       2|     0|      Ouest|225301.99878400558|
|116.47994613889072|       2|     0|       Nord|214482.04415369357|
|138.25642489718405|       2|     1|        Est| 289134.2657668252|
| 254.9083895514043|       1|     1|       Nord| 279516.8787203982|
|116.24576857935243|       3|     1|        Sud|  324392.254300987|
|110.88938812358977|       2|     0|        Est|229287.98497631942|
|157.73240249812508|       1|     0|       Nord|208024.89015503184|
| 144.3021720928561|       2|     1|        Sud|   314046.69837688|
|106.71646333256254|       1|     1|       Nord|240577.60959136163|
| 274.2308111454345|       3|     1|      Ouest|

In [12]:
df = df.selectExpr("cast(size as float) size",
    "cast(nb_rooms as int) nb_rooms",
    "cast(garden as int) garden",
    "cast(orientation as string) orientation",
    "cast(price as float) price")               
df.printSchema()
df.show()

root
 |-- size: float (nullable = true)
 |-- nb_rooms: integer (nullable = true)
 |-- garden: integer (nullable = true)
 |-- orientation: string (nullable = true)
 |-- price: float (nullable = true)

+----------+--------+------+-----------+---------+
|      size|nb_rooms|garden|orientation|    price|
+----------+--------+------+-----------+---------+
| 116.30064|       1|     1|        Sud| 284413.3|
| 194.30287|       2|     0|       Nord|237354.81|
|  92.69496|       2|     0|      Ouest| 225302.0|
| 116.47995|       2|     0|       Nord|214482.05|
| 138.25642|       2|     1|        Est|289134.28|
| 254.90839|       1|     1|       Nord|279516.88|
|116.245766|       3|     1|        Sud|324392.25|
| 110.88939|       2|     0|        Est|229287.98|
|  157.7324|       1|     0|       Nord|208024.89|
| 144.30217|       2|     1|        Sud| 314046.7|
| 106.71646|       1|     1|       Nord|240577.61|
|  274.2308|       3|     1|      Ouest|343792.44|
|  34.31801|       1|     1|      O

## Vector Assembler with just the size

In [13]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler  = VectorAssembler(inputCols =  ['size'], outputCol="features")

In [14]:
df2 = vectorAssembler.transform(df)
df2 = df2.select(['features', 'price'])
df2.show(3)

+--------------------+---------+
|            features|    price|
+--------------------+---------+
| [116.3006362915039]| 284413.3|
|[194.30287170410156]|237354.81|
| [92.69496154785156]| 225302.0|
+--------------------+---------+
only showing top 3 rows



## Split

In [15]:
splits = df2.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

## Train the linear regression

In [16]:
from pyspark.ml.regression import LinearRegression

model = LinearRegression(featuresCol = 'features', labelCol='price', maxIter=10, regParam=0.3, elasticNetParam=0.8)
trained_model = model.fit(train_df)

print("Coefficients: " + str(trained_model.coefficients))
print("Intercept: " + str(trained_model.intercept))

Coefficients: [305.5592196891665]
Intercept: 215019.28382399757


In [17]:
trainingSummary = trained_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 32429.540676
r2: 0.228891


## Prediction and Evaluation

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="price", predictionCol="prediction") 

In [20]:
# View the predictions
test_predictions = trained_model.transform(test_df)
test_predictions.show()

RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

+--------------------+---------+------------------+
|            features|    price|        prediction|
+--------------------+---------+------------------+
|[24.959659576416016]|206544.47|222645.93792787447|
| [34.31800842285156]|211397.69|225505.46769897034|
| [92.69496154785156]| 225302.0|243343.08394367638|
|[116.24576568603516]|324392.25|250539.24927919215|
| [116.3006362915039]| 284413.3|250556.01549858306|
| [121.8032455444336]|253476.25|252237.38848816266|
|[122.16044616699219]|266606.97|252346.53443166413|
|[130.79840087890625]|220657.31|254985.94113314696|
|[131.08433532714844]| 269754.8|255073.31104003408|
|[131.53688049316406]|306908.75|255211.59038783592|
|[182.44317626953125]|324535.72| 270766.4784025286|
|[194.56739807128906]|277415.72| 274471.1461556121|
|[206.96255493164062]|281181.12|278258.60061378597|
|[234.57737731933594]|315791.78|286696.56419442507|
+--------------------+---------+------------------+

33177.785794024065


## String Indexer to change the categorical column

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol="orientation", outputCol="orientationIndex")]


pipeline = Pipeline(stages=indexers)
df3 = pipeline.fit(df).transform(df)

df3.show()

+----------+--------+------+-----------+---------+----------------+
|      size|nb_rooms|garden|orientation|    price|orientationIndex|
+----------+--------+------+-----------+---------+----------------+
| 116.30064|       1|     1|        Sud| 284413.3|             1.0|
| 194.30287|       2|     0|       Nord|237354.81|             0.0|
|  92.69496|       2|     0|      Ouest| 225302.0|             2.0|
| 116.47995|       2|     0|       Nord|214482.05|             0.0|
| 138.25642|       2|     1|        Est|289134.28|             3.0|
| 254.90839|       1|     1|       Nord|279516.88|             0.0|
|116.245766|       3|     1|        Sud|324392.25|             1.0|
| 110.88939|       2|     0|        Est|229287.98|             3.0|
|  157.7324|       1|     0|       Nord|208024.89|             0.0|
| 144.30217|       2|     1|        Sud| 314046.7|             1.0|
| 106.71646|       1|     1|       Nord|240577.61|             0.0|
|  274.2308|       3|     1|      Ouest|343792.4

In [25]:
df3 = df3.drop("orientation")
df3.show()

+----------+--------+------+---------+----------------+
|      size|nb_rooms|garden|    price|orientationIndex|
+----------+--------+------+---------+----------------+
| 116.30064|       1|     1| 284413.3|             1.0|
| 194.30287|       2|     0|237354.81|             0.0|
|  92.69496|       2|     0| 225302.0|             2.0|
| 116.47995|       2|     0|214482.05|             0.0|
| 138.25642|       2|     1|289134.28|             3.0|
| 254.90839|       1|     1|279516.88|             0.0|
|116.245766|       3|     1|324392.25|             1.0|
| 110.88939|       2|     0|229287.98|             3.0|
|  157.7324|       1|     0|208024.89|             0.0|
| 144.30217|       2|     1| 314046.7|             1.0|
| 106.71646|       1|     1|240577.61|             0.0|
|  274.2308|       3|     1|343792.44|             2.0|
|  34.31801|       1|     1|211397.69|             2.0|
| 172.84895|       2|     0| 232118.4|             0.0|
|  88.77562|       1|     1|253379.11|          

## One Hot Encoder for the categorical column

In [36]:
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel


indexers = [StringIndexer(inputCol="orientation", outputCol="orientationIndex")]

encoder = OneHotEncoder(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=[
        "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="features"
)

pipeline = Pipeline(stages=indexers + [encoder, assembler])
df4 = pipeline.fit(df).transform(df)

df4.show()

+----------+--------+------+-----------+---------+----------------+------------------------+-------------+
|      size|nb_rooms|garden|orientation|    price|orientationIndex|orientationIndex_encoded|     features|
+----------+--------+------+-----------+---------+----------------+------------------------+-------------+
| 116.30064|       1|     1|        Sud| 284413.3|             1.0|           (3,[1],[1.0])|[0.0,1.0,0.0]|
| 194.30287|       2|     0|       Nord|237354.81|             0.0|           (3,[0],[1.0])|[1.0,0.0,0.0]|
|  92.69496|       2|     0|      Ouest| 225302.0|             2.0|           (3,[2],[1.0])|[0.0,0.0,1.0]|
| 116.47995|       2|     0|       Nord|214482.05|             0.0|           (3,[0],[1.0])|[1.0,0.0,0.0]|
| 138.25642|       2|     1|        Est|289134.28|             3.0|               (3,[],[])|    (3,[],[])|
| 254.90839|       1|     1|       Nord|279516.88|             0.0|           (3,[0],[1.0])|[1.0,0.0,0.0]|
|116.245766|       3|     1|        S

## Train and Evaluate Model on all features

In [41]:
df5 = df4.select("features","price")
df5.show()

+-------------+---------+
|     features|    price|
+-------------+---------+
|[0.0,1.0,0.0]| 284413.3|
|[1.0,0.0,0.0]|237354.81|
|[0.0,0.0,1.0]| 225302.0|
|[1.0,0.0,0.0]|214482.05|
|    (3,[],[])|289134.28|
|[1.0,0.0,0.0]|279516.88|
|[0.0,1.0,0.0]|324392.25|
|    (3,[],[])|229287.98|
|[1.0,0.0,0.0]|208024.89|
|[0.0,1.0,0.0]| 314046.7|
|[1.0,0.0,0.0]|240577.61|
|[0.0,0.0,1.0]|343792.44|
|[0.0,0.0,1.0]|211397.69|
|[1.0,0.0,0.0]| 232118.4|
|[0.0,0.0,1.0]|253379.11|
|[0.0,1.0,0.0]|279459.62|
|    (3,[],[])|306908.75|
|[0.0,0.0,1.0]| 269197.5|
|[0.0,1.0,0.0]|282646.06|
|[0.0,1.0,0.0]|206544.47|
+-------------+---------+
only showing top 20 rows



In [42]:
splits = df5.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [43]:
from pyspark.ml.regression import LinearRegression

model = LinearRegression(featuresCol = 'features', labelCol='price', maxIter=10, regParam=0.3, elasticNetParam=0.8)
trained_model = model.fit(train_df)

print("Coefficients: " + str(trained_model.coefficients))
print("Intercept: " + str(trained_model.intercept))

Coefficients: [461.21582926095795,28285.30550128099,13878.259467412461]
Intercept: 249114.58576965457


In [44]:
trainingSummary = trained_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 32778.966223
r2: 0.109176


In [45]:
from pyspark.ml.evaluation import RegressionEvaluator
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="price", predictionCol="prediction") 

In [46]:
# View the predictions
test_predictions = trained_model.transform(test_df)
test_predictions.show()

RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

+-------------+---------+------------------+
|     features|    price|        prediction|
+-------------+---------+------------------+
|    (3,[],[])|229287.98|249114.58576965457|
|    (3,[],[])|253476.25|249114.58576965457|
|    (3,[],[])|289134.28|249114.58576965457|
|    (3,[],[])|306908.75|249114.58576965457|
|[0.0,0.0,1.0]|211397.69|  262992.845237067|
|[0.0,0.0,1.0]|253379.11|  262992.845237067|
|[0.0,1.0,0.0]|279459.62|277399.89127093554|
|[0.0,1.0,0.0]| 314046.7|277399.89127093554|
|[0.0,1.0,0.0]|350864.44|277399.89127093554|
|[1.0,0.0,0.0]|208024.89|249575.80159891554|
|[1.0,0.0,0.0]|233645.66|249575.80159891554|
|[1.0,0.0,0.0]|237354.81|249575.80159891554|
|[1.0,0.0,0.0]| 269754.8|249575.80159891554|
|[1.0,0.0,0.0]|315791.78|249575.80159891554|
|[1.0,0.0,0.0]|320174.25|249575.80159891554|
+-------------+---------+------------------+

42235.92728054293
