1. Import the required class from PySpark to create a Spark session.

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *

2. Write the statement to create a SparkSession with an appropriate application name.

In [2]:
spark = SparkSession.builder.appName("Amith").getOrCreate()

3. Display the SparkSession object in the Jupyter Notebook.

In [3]:
spark

4. Write the command used to read a CSV file into a PySpark DataFrame with headers enabled.

In [4]:
df = spark.read.csv("test2.csv",header=True,inferSchema=True)

5. Display the contents of the DataFrame after loading the dataset.

In [5]:
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



6. Write the statement used to check the schema of the PySpark DataFrame.

In [6]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



7. How do you drop all rows containing null values from the DataFrame?

In [7]:
df_2 = df.na.drop(how="any")
df_2.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



8. Write the statement used to drop rows that contain null values in all columns only.

In [8]:
df_3 = df.na.drop(how="all")
df_3.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



9. How do you drop rows based on null values in a specific column?

In [9]:
df_4 = df.na.drop(how="any",subset="age")
df_4.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



10. Write the command used to drop rows only when the number of non-null values is below a specified threshold.

In [10]:
df_5 = df.na.drop(how="any",thresh=2)
df_5.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



11. How do you fill all missing values in the DataFrame with a constant value?

In [11]:
df_6 = df.na.fill("Missing Values")
df_6.show()

+--------------+----+----------+------+
|          Name| age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|NULL|      NULL| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      NULL|  NULL|
+--------------+----+----------+------+



12. Write the statement used to fill missing values in a specific column only.

In [12]:
df_7 = df.na.fill("Missing Values",subset=["Experience","age"])
df_7.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



13. How do you replace null values with the mean value of a column?

In [13]:
df_7 = df.na.fill(df.agg(median(col("Experience"))).first()[0],subset=["Experience"])
df_7.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|         4| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|         4|  NULL|
+---------+----+----------+------+



14. Write the command used to calculate the mean of a column required for filling missing values.

In [14]:
from pyspark.ml.feature import *

imp = Imputer(
    inputCols = ["age","Experience","Salary"],
    outputCols = ["age_imputed", "Experience_imputed","Salary_imputed"]
).setStrategy("median")

df_6 = imp.fit(df).transform(df)

15. Write the statement used to display the DataFrame after handling missing values.

In [15]:
df_6.show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         20000|
+---------+----+----------+-