In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [4]:
spart = SparkSession.builder.appName('missing').getOrCreate() 

In [6]:
## Read the dataset
training = spart.read.csv('emp_ml.csv',header=True, inferSchema = True)

In [7]:
training.show()

+------+---+---+------+
|  Name|age|exp|Salary|
+------+---+---+------+
|Sachin| 31| 11| 50000|
| Rahul| 34| 16| 60000|
|   Raj| 21|  2| 15000|
|  Kimi| 26|  1| 20000|
|  Paul| 23|  5| 24000|
|Harsha| 39| 13| 70000|
|  Deep| 22|  4| 26000|
|  Anna| 20|  2| 15000|
+------+---+---+------+



In [8]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [9]:
training.columns

['Name', 'age', 'exp', 'Salary']

In [14]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols=['age','exp'],outputCol='Independent_feature')

In [15]:
output=featureassembler.transform(training)

In [16]:
output.show()

+------+---+---+------+-------------------+
|  Name|age|exp|Salary|Independent_feature|
+------+---+---+------+-------------------+
|Sachin| 31| 11| 50000|        [31.0,11.0]|
| Rahul| 34| 16| 60000|        [34.0,16.0]|
|   Raj| 21|  2| 15000|         [21.0,2.0]|
|  Kimi| 26|  1| 20000|         [26.0,1.0]|
|  Paul| 23|  5| 24000|         [23.0,5.0]|
|Harsha| 39| 13| 70000|        [39.0,13.0]|
|  Deep| 22|  4| 26000|         [22.0,4.0]|
|  Anna| 20|  2| 15000|         [20.0,2.0]|
+------+---+---+------+-------------------+



In [18]:
output.columns

['Name', 'age', 'exp', 'Salary', 'Independent_feature']

In [20]:
finalized_data=output.select('Independent_feature','Salary')

In [21]:
finalized_data.show()

+-------------------+------+
|Independent_feature|Salary|
+-------------------+------+
|        [31.0,11.0]| 50000|
|        [34.0,16.0]| 60000|
|         [21.0,2.0]| 15000|
|         [26.0,1.0]| 20000|
|         [23.0,5.0]| 24000|
|        [39.0,13.0]| 70000|
|         [22.0,4.0]| 26000|
|         [20.0,2.0]| 15000|
+-------------------+------+



In [43]:
from pyspark.ml.regression import LinearRegression
## Train test split
train_data,test_data=finalized_data.randomSplit([0.7,0.3])
regressor=LinearRegression(featuresCol='Independent_feature', labelCol='Salary')
regressor=regressor.fit(train_data)

In [44]:
### Coefficeints
regressor.coefficients

DenseVector([1758.4098, 1710.5866])

In [45]:
### Intercepts
regressor.intercept

-23834.343638754068

In [46]:
### prediction
pred_results=regressor.evaluate(test_data)

In [47]:
pred_results.predictions.show()

+-------------------+------+-----------------+
|Independent_feature|Salary|       prediction|
+-------------------+------+-----------------+
|         [21.0,2.0]| 15000|16513.43445586092|
+-------------------+------+-----------------+



In [48]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError


(1513.4344558609191, 2290483.8521870365)