### PySpark MLlib

In [1]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session used by the rest of the notebook, then display it.
spark = (
    SparkSession.builder
    .appName('Tutorial_6')
    .getOrCreate()
)
spark

In [5]:
# Read the CSV (first row as header, column types inferred), drop rows
# with missing values, and preview the result.
training = (
    spark.read
    .csv('DataSet2.csv', header=True, inferSchema=True)
    .dropna()
)
training.show()


+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
| Aayush| 24|         3| 25000|
| Tripti| 22|         1| 20000|
| Aditya| 25|         3| 30000|
| Shivam| 25|         1| 30000|
| Ashish| 25|         3| 28000|
| Naincy| 25|         2| 32000|
|Vaibhav| 26|         2| 35000|
| Preeti| 26|         5| 40000|
| Chirag| 26|         4| 90000|
+-------+---+----------+------+



In [6]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [None]:
# [Age, Experience]   ---> new feature ---> independent feature

In [11]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the Age and Experience columns into a single
# vector column named Independent_Features.
featureAssembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent_Features',
)

In [12]:
# Apply the assembler to the training DataFrame; the result is kept in `output`.
output = featureAssembler.transform(training)

In [13]:
# Preview the transformed DataFrame.
output.show()

+-------+---+----------+------+--------------------+
|   Name|Age|Experience|Salary|Independent_Features|
+-------+---+----------+------+--------------------+
| Aayush| 24|         3| 25000|          [24.0,3.0]|
| Tripti| 22|         1| 20000|          [22.0,1.0]|
| Aditya| 25|         3| 30000|          [25.0,3.0]|
| Shivam| 25|         1| 30000|          [25.0,1.0]|
| Ashish| 25|         3| 28000|          [25.0,3.0]|
| Naincy| 25|         2| 32000|          [25.0,2.0]|
|Vaibhav| 26|         2| 35000|          [26.0,2.0]|
| Preeti| 26|         5| 40000|          [26.0,5.0]|
| Chirag| 26|         4| 90000|          [26.0,4.0]|
+-------+---+----------+------+--------------------+



In [14]:
# List the column names of the transformed DataFrame.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent_Features']

In [17]:
# Keep only the two columns the regression uses: the assembled feature
# vector and the Salary label.
model_columns = ['Independent_Features', 'Salary']
finalized_dataset = output.select(*model_columns)
finalized_dataset.show()

+--------------------+------+
|Independent_Features|Salary|
+--------------------+------+
|          [24.0,3.0]| 25000|
|          [22.0,1.0]| 20000|
|          [25.0,3.0]| 30000|
|          [25.0,1.0]| 30000|
|          [25.0,3.0]| 28000|
|          [25.0,2.0]| 32000|
|          [26.0,2.0]| 35000|
|          [26.0,5.0]| 40000|
|          [26.0,4.0]| 90000|
+--------------------+------+



In [None]:
from pyspark.ml.regression import LinearRegression

# Train/test split (75% / 25%). A fixed seed makes the split repeatable for
# the same input data, so the fitted coefficients and the evaluation metrics
# do not change from one run of the notebook to the next.
train_data, test_data = finalized_dataset.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only
# ever refers to the fitted model used by the following cells.
lr_estimator = LinearRegression(featuresCol='Independent_Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

In [20]:
# Coefficients of the fitted linear regression model.
regressor.coefficients

DenseVector([5746.3235, 9772.0588])

In [21]:
# Intercept of the fitted linear regression model.
regressor.intercept

-129180.14705881322

In [24]:
# Evaluate the fitted model on the test split and display its predictions.
pred_results = regressor.evaluate(test_data)
predictions_df = pred_results.predictions
predictions_df.show()

+--------------------+------+-----------------+
|Independent_Features|Salary|       prediction|
+--------------------+------+-----------------+
|          [26.0,5.0]| 40000|69084.55882352954|
+--------------------+------+-----------------+



In [25]:
# Error metrics from the evaluation, shown as (mean absolute error, mean squared error).
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(29084.558823529544, 845911561.9593502)