**1. Import the required class from PySpark to create a SparkSession.**

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

**2. Write the statement to create a SparkSession with an appropriate application name.**

In [25]:
# Build (or reuse) the notebook's SparkSession; "Tutorial 6" is the application name.
spark = (
    SparkSession.builder
    .appName("Tutorial 6")
    .getOrCreate()
)

**3. Display the SparkSession object in the Jupyter Notebook.**

In [26]:
# Bare expression as the last line of the cell, so Jupyter renders the SparkSession object.
spark

**4. Write the command used to read a CSV file into a PySpark DataFrame with the header option enabled.**

In [27]:
# Load test1.csv, treating the first row as column names and letting Spark
# infer each column's type from the data instead of reading everything as strings.
df = spark.read.options(header=True, inferSchema=True).csv("test1.csv")

**5. Display the contents of the DataFrame after loading the dataset.**

In [28]:
# Print the loaded rows as a text table (show() prints at most 20 rows by default).
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



**6. Write the statement used to check the schema of the DataFrame.**

In [29]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



**7. Import the required PySpark ML class to convert categorical string columns into numeric form.**

In [30]:
from pyspark.ml.feature import StringIndexer

**8. Write the command used to apply StringIndexer on a categorical column.**

In [31]:
# StringIndexer maps each distinct string in "Name" to a numeric index,
# which is written to the new "NameIndex" column.
ir = StringIndexer(
    inputCol="Name",
    outputCol="NameIndex",
)
# fit() learns the label-to-index mapping from df; transform() then appends the index column.
df_name_indexed = ir.fit(df).transform(df)

**9. Write the statement used to display the transformed DataFrame after applying StringIndexer.**

In [32]:
# Show the original columns together with the newly added NameIndex column.
df_name_indexed.show()

+---------+---+----------+------+---------+
|     Name|age|Experience|Salary|NameIndex|
+---------+---+----------+------+---------+
|    Krish| 31|        10| 30000|      1.0|
|Sudhanshu| 30|         8| 25000|      4.0|
|    Sunny| 29|         4| 20000|      5.0|
|     Paul| 24|         3| 20000|      2.0|
|   Harsha| 21|         1| 15000|      0.0|
|  Shubham| 23|         2| 18000|      3.0|
+---------+---+----------+------+---------+



**10. Import the required class used to assemble multiple input columns into a single feature vector.**

In [33]:
from pyspark.ml.feature import VectorAssembler

**11. Write the command used to create a VectorAssembler with the specified input columns and output column.**

In [38]:
# Pack the numeric predictor columns into a single vector column named "Input".
va = VectorAssembler(
    inputCols=["age", "Experience"],
    outputCol="Input",
)

**12. Write the statement used to transform the DataFrame using VectorAssembler.**

In [40]:
# Append the assembled "Input" vector column to the indexed DataFrame.
df_transformed = va.transform(df_name_indexed)

# Keep only what the regression needs: the feature vector and the label.
df_modified = df_transformed.select(["Input", "Salary"])

df_modified.show()

+-----------+------+
|      Input|Salary|
+-----------+------+
|[31.0,10.0]| 30000|
| [30.0,8.0]| 25000|
| [29.0,4.0]| 20000|
| [24.0,3.0]| 20000|
| [21.0,1.0]| 15000|
| [23.0,2.0]| 18000|
+-----------+------+



**13. Import the required PySpark ML class to build a regression model.**

In [41]:
from pyspark.ml.regression import LinearRegression

**14. Split the prepared DataFrame into training and test sets, then write the statement used to create a Linear Regression model with the specified features and label columns.**

In [46]:
# Hold out roughly 25% of the rows for testing. randomSplit assigns rows at random
# and treats the weights as approximate proportions, so a fixed seed is passed to
# make the split (and everything fitted on it) repeatable on Restart & Run All.
# NOTE: with a dataset this small, either side of the split can end up with very
# few rows (possibly none); check the output before trusting the model.
train_data, test_data = df_modified.randomSplit(weights=[0.75, 0.25], seed=42)
train_data.show()

+-----------+------+
|      Input|Salary|
+-----------+------+
| [23.0,2.0]| 18000|
| [24.0,3.0]| 20000|
| [29.0,4.0]| 20000|
| [30.0,8.0]| 25000|
|[31.0,10.0]| 30000|
+-----------+------+



In [55]:
# Configure the estimator: "Input" holds the feature vectors, "Salary" is the target.
lr = LinearRegression(
    featuresCol="Input",
    labelCol="Salary",
)

**15. Write the command used to fit (train) the Linear Regression model on the training split of the prepared DataFrame.**

In [56]:
# Fit the regression on the training split only; the result is the trained model.
model = lr.fit(train_data)

**16. Write the statement used to display the model coefficients and intercept.**

In [58]:
# One coefficient per entry of the "Input" vector (age, Experience), then the intercept
# on its own line.
print(model.coefficients, model.intercept, sep="\n")

[-323.28668819526484,1696.8066020811573]
22295.299605312008


In [61]:
# Inspect the held-out rows that were not used for fitting.
test_data.show()

+----------+------+
|     Input|Salary|
+----------+------+
|[21.0,1.0]| 15000|
+----------+------+



In [67]:
# Score the held-out rows; transform() appends a "prediction" column next to the true Salary.
pred = model.transform(test_data)
pred.show()

+----------+------+------------------+
|     Input|Salary|        prediction|
+----------+------+------------------+
|[21.0,1.0]| 15000|17203.085755292603|
+----------+------+------------------+

