In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession for this notebook (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName('handling-data-types')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/30 20:19:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Column names, in the same order as the fields of each row tuple below.
columns = ["id", "name", "city", "date", "amount", "is_active"]

# Sample rows. `date`, `amount` and `is_active` are deliberately given as strings and
# include messy values ("InvalidDate", "None", "NaN", "yes", "1", "No") to practise on.
data = [
    (1, "John Doe", "Banglore", "2023-01-15", "152.75", "True"),
    (2, "Jane Smith", "Delhi", "2023-05-20", "89.75", "True"),
    (3, "Robert Brown", "Mumbai", "InvalidDate", "200.00", "True"),
    (4, "Linda White", "Kolkata", "2023-01-16", "None", "yes"),
    (5, "Mike Green", "Chennai", "2023-08-10", "NaN", "1"),
    (6, "Sarah Blue", "Hyderabad", "InvalidDate", "300.25", "No"),
]

# Only column names are supplied, so Spark infers each column's type from the values.
df = spark.createDataFrame(data, schema=columns)

df.show()

                                                                                

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe| Banglore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.75|     True|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-01-16|  None|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.25|       No|
+---+------------+---------+-----------+------+---------+



In [5]:
# Inspect the inferred schema: `id` is long; every other column is a string.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



In [6]:
# Handling the integer column (`id`): selecting, filtering, arithmetic and casting

In [7]:
# Indexing a DataFrame by column name yields a Column expression object (no rows are shown).
df['id']

Column<'id'>

In [8]:
# Keep only the rows whose id is greater than 3 (`where` is an alias of `filter`).
df.where(df['id'] > 3).show()

+---+-----------+---------+-----------+------+---------+
| id|       name|     city|       date|amount|is_active|
+---+-----------+---------+-----------+------+---------+
|  4|Linda White|  Kolkata| 2023-01-16|  None|      yes|
|  5| Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6| Sarah Blue|Hyderabad|InvalidDate|300.25|       No|
+---+-----------+---------+-----------+------+---------+



In [10]:
# Preview a derived column holding id * 2; the result is not assigned, so `df` is unchanged.
df.withColumn('double_id', df['id'] * 2).show()

+---+------------+---------+-----------+------+---------+---------+
| id|        name|     city|       date|amount|is_active|double_id|
+---+------------+---------+-----------+------+---------+---------+
|  1|    John Doe| Banglore| 2023-01-15|152.75|     True|        2|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.75|     True|        4|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|        6|
|  4| Linda White|  Kolkata| 2023-01-16|  None|      yes|        8|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|       10|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.25|       No|       12|
+---+------------+---------+-----------+------+---------+---------+



In [13]:
# type cast long to int

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

# Replace the long `id` column with a 32-bit integer version of itself.
df = df.withColumn('id', df['id'].cast(IntegerType()))

In [14]:
# The displayed values are the same as before the cast; the type change shows up in the schema.
df.show()

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe| Banglore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.75|     True|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-01-16|  None|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.25|       No|
+---+------------+---------+-----------+------+---------+



In [15]:
# Verify the cast: `id` should now be reported as integer instead of long.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



In [19]:
# handling strings
from pyspark.sql.functions import upper
from pyspark.sql.functions import col

# Preview an upper-cased copy of `name`; the result is not assigned, so `df` is unchanged.
df.withColumn('upper_name', upper(col('name'))).show()

+---+------------+---------+-----------+------+---------+------------+
| id|        name|     city|       date|amount|is_active|  upper_name|
+---+------------+---------+-----------+------+---------+------------+
|  1|    John Doe| Banglore| 2023-01-15|152.75|     True|    JOHN DOE|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.75|     True|  JANE SMITH|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|ROBERT BROWN|
|  4| Linda White|  Kolkata| 2023-01-16|  None|      yes| LINDA WHITE|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.25|       No|  SARAH BLUE|
+---+------------+---------+-----------+------+---------+------------+



In [22]:
# NOTE(review): the imported `startswith` function is not used in this cell — the filter
# below calls the `Column.startswith` method. The function form appears to exist only in
# PySpark 3.5+, so this import may fail on older versions — confirm and consider dropping it.
from pyspark.sql.functions import startswith
# Keep only the rows whose name begins with 'J'.
df.filter(df.name.startswith('J')).show()

+---+----------+--------+----------+------+---------+
| id|      name|    city|      date|amount|is_active|
+---+----------+--------+----------+------+---------+
|  1|  John Doe|Banglore|2023-01-15|152.75|     True|
|  2|Jane Smith|   Delhi|2023-05-20| 89.75|     True|
+---+----------+--------+----------+------+---------+



In [23]:
# Handling the float column: cast `amount` from string to float and fill missing values

In [24]:
# Schema before the float cast: `amount` is still a string here.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



In [25]:
from pyspark.sql.types import FloatType
# Cast `amount` from string to float. In this run the "None" entry becomes NULL and the
# "NaN" entry becomes a float NaN (visible in the next df.show()).
df = df.withColumn('amount', df['amount'].cast(FloatType()))

In [26]:
# Verify the cast: `amount` should now be reported as float.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- is_active: string (nullable = true)



In [27]:
# After the cast, the "None" string shows as NULL while "NaN" stays NaN.
df.show()

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe| Banglore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.75|     True|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|
|  4| Linda White|  Kolkata| 2023-01-16|  NULL|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.25|       No|
+---+------------+---------+-----------+------+---------+



In [28]:
# Replace missing `amount` values with 0.0 in a new DataFrame (`df` itself is left as is).
# For this float column both the NULL row and the NaN row are filled, as the output shows.
df_filled = df.na.fill(0.0, subset=['amount'])
df_filled.show()

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe| Banglore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.75|     True|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|
|  4| Linda White|  Kolkata| 2023-01-16|   0.0|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   0.0|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.25|       No|
+---+------------+---------+-----------+------+---------+

