In [0]:
# Load the sample CSV, using its first row as the column names.
# No schema is supplied or inferred, so the printed schema reports every column as string.
df = spark.read.csv('/FileStore/tables/Null_Sample-3.csv', header=True)
df.show()
df.printSchema()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|  null|
|  3|  null|  3000|
|  4|   Raj|  6000|
|  5| Kunal|  null|
+---+------+------+

root
 |-- ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)



In [0]:
# 1st method of casting columns: withColumn() replaces the existing ID column
# with a copy of itself cast to integer.
df = df.withColumn("ID", df["ID"].cast("integer"))
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)



In [0]:
# withColumn() calls can be chained to cast several columns in one statement.
id_as_int = df["ID"].cast("integer")
salary_as_int = df["Salary"].cast("int")
df = df.withColumn("ID", id_as_int).withColumn("Salary", salary_as_int)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [0]:
# 2nd method of casting columns: build the cast expressions with col() and
# project them with select(). Name is passed through unchanged.
from pyspark.sql.functions import col

casted_columns = [
    col("ID").cast("integer"),
    col("Name"),
    col("Salary").cast("int"),
]
df = df.select(*casted_columns)
df.show()
df.printSchema()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|  null|
|  3|  null|  3000|
|  4|   Raj|  6000|
|  5| Kunal|  null|
+---+------+------+

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [0]:
# 3rd method of casting columns: write the casts as SQL expressions and
# evaluate them with selectExpr().
cast_expressions = ['cast(ID as integer)', "Name", 'cast(Salary as int)']
df = df.selectExpr(*cast_expressions)
df.show()
df.printSchema()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|  null|
|  3|  null|  3000|
|  4|   Raj|  6000|
|  5| Kunal|  null|
+---+------+------+

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [0]:
# Re-read the CSV so this section starts from the all-string frame. The casting
# cells above rebind `df` with integer ID/Salary columns, and the cells that
# follow demonstrate filling a Salary column that is still a string; without a
# fresh read a top-to-bottom run would not reproduce the outputs shown here.
df_raw = spark.read.option("header", True).csv('/FileStore/tables/Null_Sample-3.csv')

# Nulls in Name are replaced by the text "None": a string fill value is
# applied because Name is a string column.
df1 = df_raw.na.fill("None", "Name")
df1.show()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|  null|
|  3|  None|  3000|
|  4|   Raj|  6000|
|  5| Kunal|  null|
+---+------+------+



In [0]:
# Print df1's schema (column names, types and nullability) after the Name fill.
df1.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Name: string (nullable = false)
 |-- Salary: string (nullable = true)



In [0]:
# A numeric fill value is not applied to a string column, so while Salary is
# still a string its nulls are left as they are (see the output below).
df1 = df1.fillna(0, subset="Salary")
df1.show()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|  null|
|  3|  None|  3000|
|  4|   Raj|  6000|
|  5| Kunal|  null|
+---+------+------+



In [0]:
# Cast Salary to integer first; the numeric fill value 0 then applies to it.
salary_as_int = df1["Salary"].cast("integer")
df1 = df1.withColumn("Salary", salary_as_int)
df1 = df1.fillna(0, subset="Salary")
df1.show()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|     0|
|  3|  None|  3000|
|  4|   Raj|  6000|
|  5| Kunal|     0|
+---+------+------+



In [0]:
# Print df1's schema again to confirm Salary is now an integer column.
df1.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Name: string (nullable = false)
 |-- Salary: integer (nullable = true)



In [0]:
# Nulls can be replaced in several columns at once by passing a list of column
# names; both columns are cast to integer first so the numeric fill applies.
columns_to_fill = ["ID", "Salary"]
id_as_int = df1["ID"].cast("integer")
salary_as_int = df1["Salary"].cast("int")
df1 = df1.withColumn("ID", id_as_int).withColumn("Salary", salary_as_int)
df1 = df1.na.fill(0, columns_to_fill)
df1.show()

+---+------+------+
| ID|  Name|Salary|
+---+------+------+
|  1|Shivam|  2800|
|  2| Sagar|     0|
|  3|  None|  3000|
|  4|   Raj|  6000|
|  5| Kunal|     0|
+---+------+------+

