<a href="https://colab.research.google.com/github/augustine-uba1/PySparkLinearRegression/blob/main/Linear_Regression_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### A simple use case: predicting salary with linear regression in PySpark

In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Sample').getOrCreate()
spark

In [2]:
## Load the sample salary dataset with PySpark
# Treat the first CSV row as the header and let Spark infer each column's type.
training = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('salary.csv')
)
training.show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|   Sam| 31|        12| 50000|
|  Juga| 36|         6| 51000|
|Philip| 45|        15| 60000|
|Samson| 50|        20| 65000|
|  Paul| 25|         3| 30000|
|  Raul| 21|         2| 35000|
|Robert| 29|         9| 29000|
| Sambo| 23|         4| 28000|
+------+---+----------+------+



In [3]:
# Print each column's name, inferred type and nullability for the loaded data.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
from pyspark.ml.feature import VectorAssembler

# Combine the "age" and "Experience" columns into a single vector column
# named "Independent Features", appended to the original columns.
featureassembler = VectorAssembler(
    inputCols=["age", "Experience"],
    outputCol="Independent Features",
)
output = featureassembler.transform(training)
output.show()

+------+---+----------+------+--------------------+
|  Name|age|Experience|Salary|Independent Features|
+------+---+----------+------+--------------------+
|   Sam| 31|        12| 50000|         [31.0,12.0]|
|  Juga| 36|         6| 51000|          [36.0,6.0]|
|Philip| 45|        15| 60000|         [45.0,15.0]|
|Samson| 50|        20| 65000|         [50.0,20.0]|
|  Paul| 25|         3| 30000|          [25.0,3.0]|
|  Raul| 21|         2| 35000|          [21.0,2.0]|
|Robert| 29|         9| 29000|          [29.0,9.0]|
| Sambo| 23|         4| 28000|          [23.0,4.0]|
+------+---+----------+------+--------------------+



In [9]:
# Keep only the assembled feature vector and the target column.
model_columns = ["Independent Features", "Salary"]
final_data = output.select(*model_columns)
final_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,12.0]| 50000|
|          [36.0,6.0]| 51000|
|         [45.0,15.0]| 60000|
|         [50.0,20.0]| 65000|
|          [25.0,3.0]| 30000|
|          [21.0,2.0]| 35000|
|          [29.0,9.0]| 29000|
|          [23.0,4.0]| 28000|
+--------------------+------+



In [10]:
## Train/test split and model fitting
from pyspark.ml.regression import LinearRegression

# Pass a fixed seed so the random (roughly 75/25) split is repeatable between
# runs; without it each run trains and evaluates on different rows.
training_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor`
# only ever refers to the fitted model.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(training_data)

In [13]:
## Coefficients of the fitted regression model
regressor.coefficients

DenseVector([140.0862, 1705.8029])

In [12]:
## Intercept of the fitted regression model
regressor.intercept

25440.68450849188

In [16]:
## Evaluate the fitted model on the test split
predicted_result = regressor.evaluate(test_data)

# Show the test rows together with the model's prediction for each one.
predicted_result.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [23.0,4.0]| 28000| 35485.87879567676|
|          [29.0,9.0]| 29000|44855.410447761155|
|          [36.0,6.0]| 51000| 40718.60524961404|
+--------------------+------+------------------+



In [17]:
# Test-split error metrics as a tuple: (mean absolute error, mean squared error)
(predicted_result.meanAbsoluteError, predicted_result.meanSquaredError)

(11207.561331274626, 137713166.60793352)