In [1]:
# Let's first import the pyspark library
import pyspark

In [2]:
## then import SparkSession, which is used to start the PySpark session
from pyspark.sql import SparkSession

In [3]:
## Build the Spark session through the SparkSession builder
## (getOrCreate returns the existing session if one is already running)
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/06 12:09:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
## Load the CSV: the first row supplies the column names and the column types are inferred
csv_reader = spark.read.options(header=True, inferSchema=True)
training = csv_reader.csv('test2.csv')
training.show()

                                                                                

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 24|         3| 30000|
|   Ravi| 29|         4| 32000|
|Praveen| 22|         2| 25000|
|   Ayan| 19|         1| 23000|
|Sarvesh| 32|         6| 35000|
|   Gyan| 26|         2| 31000|
|Sreyash| 27|         3| 29500|
|Parvati| 24|         1| 23000|
|Parmesh| 28|         4| 24000|
+-------+---+----------+------+



In [5]:
# Print the schema to check the type that was inferred for each column
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# List the column names of the loaded DataFrame
training.columns

['Name', 'Age', 'Experience', 'Salary']

In machine learning we need to select the independent features, so we do the same here by grouping them into a single feature vector:

[Age, Experience] ---> new feature ---> independent features

In [7]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the Age and Experience columns into one vector column
featuresassmbler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [8]:
# Apply the assembler: the result keeps the original columns and adds 'Independent Features'
output = featuresassmbler.transform(training)

In [9]:
# Display the DataFrame together with the new vector column
output.show()

+-------+---+----------+------+--------------------+
|   Name|Age|Experience|Salary|Independent Features|
+-------+---+----------+------+--------------------+
|  Sunny| 24|         3| 30000|          [24.0,3.0]|
|   Ravi| 29|         4| 32000|          [29.0,4.0]|
|Praveen| 22|         2| 25000|          [22.0,2.0]|
|   Ayan| 19|         1| 23000|          [19.0,1.0]|
|Sarvesh| 32|         6| 35000|          [32.0,6.0]|
|   Gyan| 26|         2| 31000|          [26.0,2.0]|
|Sreyash| 27|         3| 29500|          [27.0,3.0]|
|Parvati| 24|         1| 23000|          [24.0,1.0]|
|Parmesh| 28|         4| 24000|          [28.0,4.0]|
+-------+---+----------+------+--------------------+



In [10]:
# Column names after the assembler has added the feature vector
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [11]:
# Keep only the feature vector and the Salary column (Salary is the label when the model is fitted)
finalized_data = output.select('Independent Features', 'Salary')

In [12]:
# Preview the two-column DataFrame used for modelling
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [24.0,3.0]| 30000|
|          [29.0,4.0]| 32000|
|          [22.0,2.0]| 25000|
|          [19.0,1.0]| 23000|
|          [32.0,6.0]| 35000|
|          [26.0,2.0]| 31000|
|          [27.0,3.0]| 29500|
|          [24.0,1.0]| 23000|
|          [28.0,4.0]| 24000|
+--------------------+------+



In [14]:
from pyspark.ml.regression import LinearRegression

# Train/test split. randomSplit assigns each row at random, so the 75/25 weights
# are only approximate (on a dataset this small the test split can be a single row).
# A fixed seed makes the split, and therefore the fitted model and its metrics,
# reproducible across re-runs on the same input data.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Fit a linear regression of Salary on the assembled feature vector.
# The estimator is configured and fitted in one expression so that `regressor`
# only ever refers to the fitted model used by the cells below.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
).fit(train_data)

24/02/06 12:13:51 WARN Instrumentation: [1a1d43a7] regParam is zero, which might cause numerical instability and overfitting.
24/02/06 12:13:51 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/02/06 12:13:51 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [15]:
# Fitted coefficients: one per element of the 'Independent Features' vector ([Age, Experience])
regressor.coefficients

DenseVector([471.9052, 943.3714])

In [17]:
### Intercept of the fitted model
regressor.intercept

13233.977172956998

In [22]:
### Prediction: evaluate the fitted model on the test split (the result also carries the predictions)
pred_results = regressor.evaluate(test_data)

In [23]:
# Show the test rows next to the model's predicted Salary
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [19.0,1.0]| 23000|23143.546971026903|
+--------------------+------+------------------+



In [25]:
# Evaluate the fitted model on the test split to obtain its metrics summary
summary = regressor.evaluate(test_data)

# Read both error metrics from the summary:
# mean absolute error (MAE) and mean squared error (MSE)
mae, mse = summary.meanAbsoluteError, summary.meanSquaredError

# Report the two metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)

Mean Absolute Error (MAE): 143.5469710269026
Mean Squared Error (MSE): 20605.732890998417
