In [10]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Missing")
    .getOrCreate()
)

In [4]:
# Load the CSV, using its first row as column names and letting Spark
# infer each column's type, then preview the rows.
training = spark.read.csv(
    "Datasets/testfile3.csv",
    header=True,
    inferSchema=True,
)
training.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|     Yash| 23|         1|  8000|
|    Mohan| 25|         4| 25000|
|Sudhanshu| 30|         8| 35000|
|   Mahesh| 28|         6| 32000|
|    Krish| 21|         0|  6000|
|    Harsh| 42|        16| 65000|
|  Shubham| 56|        23| 77000|
+---------+---+----------+------+



In [5]:
# Print the column names and the types Spark inferred while reading the CSV.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [7]:
# Combine the two numeric predictor columns into a single vector column,
# which is the input format the regression estimator is given later.
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [8]:
# Apply the assembler: the result keeps the original columns and adds the
# "Independent Features" vector column. Preview it.
output = featureassembler.transform(training)
output.show()

+---------+---+----------+------+--------------------+
|     Name|Age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|     Yash| 23|         1|  8000|          [23.0,1.0]|
|    Mohan| 25|         4| 25000|          [25.0,4.0]|
|Sudhanshu| 30|         8| 35000|          [30.0,8.0]|
|   Mahesh| 28|         6| 32000|          [28.0,6.0]|
|    Krish| 21|         0|  6000|          [21.0,0.0]|
|    Harsh| 42|        16| 65000|         [42.0,16.0]|
|  Shubham| 56|        23| 77000|         [56.0,23.0]|
+---------+---+----------+------+--------------------+



In [9]:
# Keep only what the model needs: the feature vector and the label column.
finalized_data = output.select(
    "Independent Features",
    "Salary",
)
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [23.0,1.0]|  8000|
|          [25.0,4.0]| 25000|
|          [30.0,8.0]| 35000|
|          [28.0,6.0]| 32000|
|          [21.0,0.0]|  6000|
|         [42.0,16.0]| 65000|
|         [56.0,23.0]| 77000|
+--------------------+------+



In [11]:
# Hold out roughly a quarter of the rows for evaluation. randomSplit samples
# rows at random, so a fixed seed keeps the split -- and every coefficient and
# metric derived from it -- the same from one run to the next.
# NOTE: the weights are sampling probabilities, not exact proportions; on a
# dataset this small a split can come out tiny or even empty. If that happens,
# pick a different seed.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Fit a linear regression of Salary on the assembled feature vector.
# `regressor` is bound directly to the fitted model (never to the unfitted
# estimator), which is what the later cells read the coefficients, intercept
# and evaluation results from.
regressor = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Salary",
).fit(train_data)

In [12]:
# Fitted weights, one per element of the "Independent Features" vector
# (Age, then Experience, following the assembler's inputCols order).
regressor.coefficients

DenseVector([-2130.6991, 6296.3526])

In [13]:
# Fitted intercept (constant term) of the regression.
regressor.intercept

51411.854103354555

In [14]:
# Evaluate the fitted model on the held-out split; the returned summary
# carries the per-row predictions and the error metrics inspected below.
pred_results = regressor.evaluate(test_data)

In [15]:
# Show each held-out row next to the model's predicted Salary.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent Features|Salary|       prediction|
+--------------------+------+-----------------+
|         [42.0,16.0]| 65000|62664.13373860309|
+--------------------+------+-----------------+



In [16]:
# Error metrics on the held-out split, displayed as (MAE, MSE).
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(2335.8662613969136, 5456271.191132395)

In [17]:
# Report how many rows landed in the held-out split. The source table has
# only seven rows, so the metrics above rest on very few test examples and
# should be read as indicative only.
print(f"Test data count: {test_data.count()}")

Test data count: 1
