# Examples of PySpark ML

In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
# Relative path to the demo salary CSV (columns: Name, age, exp, salary).
SALARY_DATASET_PATH = "../datasets/demo/salary.csv"

# Legacy alias: later cells still read the path through this misleadingly named
# variable, so it is kept pointing at the same file.
pizza_dataset_path = SALARY_DATASET_PATH

In [3]:
# Obtain the Spark session used by every cell below (named 'Learning Spark'),
# then echo it as the cell output.
spark = (
    SparkSession.builder
    .appName('Learning Spark')
    .getOrCreate()
)
spark

In [4]:
# Read the CSV, treating its first line as the header and letting Spark infer
# the column types instead of loading everything as strings.
data = spark.read.csv(
    pizza_dataset_path,
    header=True,
    inferSchema=True,
)

In [5]:
# Print the loaded rows as a text table to check the parsed columns.
data.show()

+------+---+---+------+
|  Name|age|exp|salary|
+------+---+---+------+
|  Amin| 25|  3|100000|
|Sajjad| 26|  4|100000|
|  Zari| 24|  1| 80000|
|Danial| 24|  4| 85000|
| Kimia| 25|  2| 90000|
+------+---+---+------+



In [7]:
# List the column names of the loaded DataFrame.
data.columns

['Name', 'age', 'exp', 'salary']

In [9]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the two predictor columns into one vector column,
# which is the column the regressor is later pointed at via `featuresCol`.
new_feature = VectorAssembler(
    inputCols=['age', 'exp'],
    outputCol="independentFeature",
)

In [10]:
# Apply the assembler to `data`; the result carries an extra `independentFeature` column.
output = new_feature.transform(data)

In [11]:
# Print the transformed rows, including the assembled vector column.
output.show()

+------+---+---+------+------------------+
|  Name|age|exp|salary|independentFeature|
+------+---+---+------+------------------+
|  Amin| 25|  3|100000|        [25.0,3.0]|
|Sajjad| 26|  4|100000|        [26.0,4.0]|
|  Zari| 24|  1| 80000|        [24.0,1.0]|
|Danial| 24|  4| 85000|        [24.0,4.0]|
| Kimia| 25|  2| 90000|        [25.0,2.0]|
+------+---+---+------+------------------+



In [14]:
# Keep only what the model needs: the label column and the assembled feature vector.
finalized_data = output.select(['salary', 'independentFeature'])
finalized_data.show()

+------+------------------+
|salary|independentFeature|
+------+------------------+
|100000|        [25.0,3.0]|
|100000|        [26.0,4.0]|
| 80000|        [24.0,1.0]|
| 85000|        [24.0,4.0]|
| 90000|        [25.0,2.0]|
+------+------------------+



## Apply a machine learning framework

In [15]:
from pyspark.ml.regression import LinearRegression

In [16]:
# Divide the dataset into train (~75%) and test (~25%) subsets.
# randomSplit assigns rows at random, so the seed is fixed to keep the split, and
# every coefficient and metric computed from it, the same across kernel restarts.
# NOTE(review): with only five rows the subset sizes depend heavily on the seed;
# confirm the chosen seed leaves both subsets non-empty.
SPLIT_SEED = 42

train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [17]:
# Configure the estimator: learn `salary` from the assembled `independentFeature` vector.
linear_regression = LinearRegression(featuresCol='independentFeature', labelCol='salary')

# `fit` returns the fitted model. It gets its own binding so the unfitted estimator
# is not overwritten by an object of a different kind; later cells read the model
# through `regressor`, so that name is kept for the fitted result.
regressor = linear_regression.fit(train_data)

In [18]:
# Coefficients of the fitted linear model, one per assembled input (age, exp).
regressor.coefficients

DenseVector([7500.0, -7500.0])

In [19]:
# Intercept (constant term) of the fitted linear model.
regressor.intercept

-65000.00000005057

In [20]:
# Evaluate the fitted model on the held-out test split; the returned summary exposes
# the per-row predictions and the error metric read in the following cells.
pre_results = regressor.evaluate(test_data)

In [22]:
# Show each test row's actual salary next to the model's prediction.
pre_results.predictions.show()

+------+------------------+------------------+
|salary|independentFeature|        prediction|
+------+------------------+------------------+
| 80000|        [24.0,1.0]|107499.99999999722|
| 90000|        [25.0,2.0]|107499.99999999949|
+------+------------------+------------------+



In [23]:
# Mean absolute error of the predictions on the test split (same units as `salary`).
pre_results.meanAbsoluteError


22499.999999998356