<a href="https://colab.research.google.com/github/banno-0720/big-data/blob/main/PySpark_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession

# Reuse an already-running Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [2]:
### Read the dataset
# The first CSV line supplies the column names, and Spark infers each column's type.
training = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Print the loaded rows as a text table to sanity-check the read.
training.show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
| abc| 21|         3| 30000|
| def| 18|         0| 25000|
| xyz| 24|         4| 10000|
| ghi| 30|         6| 15000|
| jkl| 36|        11| 40000|
| pqr| 34|        10| 38000|
| mln| 36|        16| 19000|
+----+---+----------+------+



In [4]:
# Print each column's name, inferred type, and nullability.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# List the column names of the loaded DataFrame.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [9]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric predictor columns into a single vector column,
# which is the input format the regression below is configured to read.
featureassembler = VectorAssembler(
    inputCols=["Age", "Experience"],
    outputCol="Independent Features",
)

In [10]:
# Apply the assembler: keeps the original columns and appends the
# "Independent Features" vector column.
output = featureassembler.transform(training)

In [11]:
# Inspect the assembled rows, including the new feature-vector column.
output.show()

+----+---+----------+------+--------------------+
|Name|Age|Experience|Salary|Independent Features|
+----+---+----------+------+--------------------+
| abc| 21|         3| 30000|          [21.0,3.0]|
| def| 18|         0| 25000|          [18.0,0.0]|
| xyz| 24|         4| 10000|          [24.0,4.0]|
| ghi| 30|         6| 15000|          [30.0,6.0]|
| jkl| 36|        11| 40000|         [36.0,11.0]|
| pqr| 34|        10| 38000|         [34.0,10.0]|
| mln| 36|        16| 19000|         [36.0,16.0]|
+----+---+----------+------+--------------------+



In [12]:
# Confirm the feature-vector column was added alongside the original columns.
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [13]:
# Keep only what the model needs: the feature vector and the target column.
feature_col, label_col = "Independent Features", "Salary"
finalized_data = output.select(feature_col, label_col)

In [14]:
# Inspect the modelling dataset (feature vector + Salary label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [21.0,3.0]| 30000|
|          [18.0,0.0]| 25000|
|          [24.0,4.0]| 10000|
|          [30.0,6.0]| 15000|
|         [36.0,11.0]| 40000|
|         [34.0,10.0]| 38000|
|         [36.0,16.0]| 19000|
+--------------------+------+



In [15]:
from pyspark.ml.regression import LinearRegression

## train test split
# Weights request roughly 75% of rows for training and 25% for testing.
# A fixed seed keeps the split -- and therefore every coefficient and metric
# computed downstream -- reproducible across Restart & Run All.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [16]:
# Configure the estimator first, then fit it on the training split.
# `regressor` ends up bound to the fitted model, which later cells query.
linreg = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
)
regressor = linreg.fit(train_data)

In [17]:
### Coefficients
# Fitted weights of the linear model, one per entry of the feature vector
# (assembled from Age and Experience).
regressor.coefficients

DenseVector([620.6897, -791.2395])

In [18]:
# Fitted intercept (bias) term of the linear model.
regressor.intercept

13993.47623485593

In [19]:
### Prediction
# Evaluate the fitted model on the held-out split. The returned summary
# exposes the per-row predictions and the error metrics used below.
pred_results = regressor.evaluate(test_data)

In [20]:
# Show the test rows with the model's prediction next to the actual Salary.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,4.0]| 10000|25725.069897483696|
|         [34.0,10.0]| 38000|27184.529356943105|
+--------------------+------+------------------+



In [21]:
# Test-set error metrics, displayed as a (MAE, MSE) tuple.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(13270.270270270295, 182126114.2557867)