### Example of MLlib

In [1]:
from pyspark.sql import SparkSession
## Get the existing Spark session for this app name, or create one if none exists
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [29]:
## Display the active SparkSession object
spark

In [2]:
## Load the CSV: the first row supplies the column names and
## column types are inferred from the data
training = spark.read.csv(
    'test7.csv',
    header=True,
    inferSchema=True,
)

In [3]:
## Print the loaded rows as a text table
training.show()

+--------+---+---+------+
|    Name|Age|Exp|Salary|
+--------+---+---+------+
|  Avnish| 22|  3| 30000|
|     Ram| 21|  2| 20000|
|    Sham| 24|  3| 20000|
|    Dham| 26|  3| 30000|
|   Manan| 25|  4| 40000|
|  Shivam| 27|  6| 60000|
| Kashyap| 21|  5| 50000|
|Kashyapi| 22|  2| 20000|
|  Shilpa| 23|  1| 10000|
+--------+---+---+------+



In [4]:
## Print each column's name, inferred type and nullability
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Exp: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
## List the column names of the loaded DataFrame
training.columns

['Name', 'Age', 'Exp', 'Salary']

In [8]:
from pyspark.ml.feature import VectorAssembler

In [10]:
## Combine the predictor columns (Age, Exp) into a single vector column
featureassembler = VectorAssembler(
    inputCols=["Age", "Exp"],
    outputCol="Independent feature",
)

In [11]:
## Apply the assembler: adds the "Independent feature" vector column to the original columns
output=featureassembler.transform(training)

In [12]:
## Print the rows together with the newly assembled feature vector
output.show()

+--------+---+---+------+-------------------+
|    Name|Age|Exp|Salary|Independent feature|
+--------+---+---+------+-------------------+
|  Avnish| 22|  3| 30000|         [22.0,3.0]|
|     Ram| 21|  2| 20000|         [21.0,2.0]|
|    Sham| 24|  3| 20000|         [24.0,3.0]|
|    Dham| 26|  3| 30000|         [26.0,3.0]|
|   Manan| 25|  4| 40000|         [25.0,4.0]|
|  Shivam| 27|  6| 60000|         [27.0,6.0]|
| Kashyap| 21|  5| 50000|         [21.0,5.0]|
|Kashyapi| 22|  2| 20000|         [22.0,2.0]|
|  Shilpa| 23|  1| 10000|         [23.0,1.0]|
+--------+---+---+------+-------------------+



In [13]:
## List the column names after assembling the feature vector
output.columns

['Name', 'Age', 'Exp', 'Salary', 'Independent feature']

In [14]:
## Keep only what the model needs: the assembled feature vector and the label.
## (The original call was missing its first argument, which is a syntax error.)
final_data = output.select("Independent feature", "Salary")

In [15]:
## Print the feature-vector / label pairs that will be used for modelling
final_data.show()

+-------------------+------+
|Independent feature|Salary|
+-------------------+------+
|         [22.0,3.0]| 30000|
|         [21.0,2.0]| 20000|
|         [24.0,3.0]| 20000|
|         [26.0,3.0]| 30000|
|         [25.0,4.0]| 40000|
|         [27.0,6.0]| 60000|
|         [21.0,5.0]| 50000|
|         [22.0,2.0]| 20000|
|         [23.0,1.0]| 10000|
+-------------------+------+



In [23]:
from pyspark.ml.regression import LinearRegression

## Train/test split (75% / 25%). The seed is fixed so that the split, and
## therefore the fitted model and the metrics below, are reproducible across runs.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

## Configure the estimator: features come from the assembled vector column,
## the label to predict is Salary.
linear_regression = LinearRegression(
    featuresCol="Independent feature",
    labelCol="Salary",
)

## `regressor` holds the fitted model used by the following cells; the unfitted
## estimator has its own name so one variable never means two different things.
regressor = linear_regression.fit(train_data)

In [24]:
## Fitted coefficients, one per input feature in assembler order (Age, Exp)
regressor.coefficients

DenseVector([0.0, 10000.0])

In [25]:
## Fitted intercept of the linear model
regressor.intercept

-5.766865592832038e-10

In [26]:
## Evaluate the fitted model on the held-out test split; the result exposes
## the per-row predictions and the error metrics read in the cells below
pred = regressor.evaluate(test_data)

In [27]:
## Print the test rows alongside the model's predicted Salary
pred.predictions.show()

+-------------------+------+-----------------+
|Independent feature|Salary|       prediction|
+-------------------+------+-----------------+
|         [21.0,5.0]| 50000|49999.99999999985|
|         [24.0,3.0]| 20000|30000.00000000002|
|         [26.0,3.0]| 30000|30000.00000000008|
+-------------------+------+-----------------+



In [28]:
## Mean absolute error and mean squared error on the test split, as a tuple
pred.meanAbsoluteError,pred.meanSquaredError

(3333.333333333417, 33333333.33333345)

### Great