
# Filter

## Using filter method and SQL expressions

In [0]:
# Sample route records as (id, source, destination, distance) tuples.
# The data deliberately contains one exact duplicate row (id 102), two rows that
# differ only by id (103 and 104), and two rows that share id 105 but differ in
# distance.
data_list = [
    (101, "Mumbai", "Goa", 587),
    (102, "Mumbai", "Bangalore", 985),
    (102, "Mumbai", "Bangalore", 985),
    (103, "Delhi", "Chennai", 2208),
    (105, "Bangalore", "Kolkata", 1868),
    (104, "Delhi", "Chennai", 2208),
    (105, "Bangalore", "Kolkata", 1865),
]

# Column names are supplied up front; column types are inferred from the tuples.
test_df = spark.createDataFrame(
    data_list,
    schema=["id", "source", "destination", "distance"],
)

display(test_df)

id,source,destination,distance
101,Mumbai,Goa,587
102,Mumbai,Bangalore,985
102,Mumbai,Bangalore,985
103,Delhi,Chennai,2208
105,Bangalore,Kolkata,1868
104,Delhi,Chennai,2208
105,Bangalore,Kolkata,1865



The `filter` method returns only the rows of a DataFrame that satisfy a given condition.

`where` is an alias for the `filter` method.

In [0]:
# Both conditions are written as a single SQL-style predicate string.
route_predicate = "source like 'Mum%' and destination not like 'Ban%'"
test_df.filter(route_predicate).show()

+---+------+-----------+--------+
| id|source|destination|distance|
+---+------+-----------+--------+
|101|Mumbai|        Goa|     587|
+---+------+-----------+--------+




## Using Functions and column API

In [0]:
from pyspark.sql.functions import col

# Column-API filter: keep rows whose source starts with "Mum" and whose
# destination does NOT start with "Ban". `~` negates a Column condition and
# `&` combines two Column conditions with a logical AND.
test_df.filter(
    (col("source").like("Mum%")) & (~col("destination").like("Ban%"))
).show()

+---+------+-----------+--------+
| id|source|destination|distance|
+---+------+-----------+--------+
|101|Mumbai|        Goa|     587|
+---+------+-----------+--------+



In [0]:
# Mix both styles: a Column-API condition in filter(), followed by a
# SQL-expression string in where(). Chained calls apply both conditions.
(
    test_df
    .filter(col("source").like("Mum%"))
    .where("destination not like 'Ban%'")
    .show()
)

+---+------+-----------+--------+
| id|source|destination|distance|
+---+------+-----------+--------+
|101|Mumbai|        Goa|     587|
+---+------+-----------+--------+



In [0]:
# Chain filter() and where() using Column-API conditions for both steps:
# source starts with "Mum", destination starts with "Ban".
(
    test_df
    .filter(col("source").like("Mum%"))
    .where(col("destination").like("Ban%"))
    .show()
)

+---+------+-----------+--------+
| id|source|destination|distance|
+---+------+-----------+--------+
|102|Mumbai|  Bangalore|     985|
|102|Mumbai|  Bangalore|     985|
+---+------+-----------+--------+



In [0]:
# AND condition: build each Column predicate separately, then combine them
# with `&` inside a single filter() call.
source_matches = col("source").like("Mum%")
destination_matches = col("destination").like("Ban%")

test_df.filter(source_matches & destination_matches).show()

+---+------+-----------+--------+
| id|source|destination|distance|
+---+------+-----------+--------+
|102|Mumbai|  Bangalore|     985|
|102|Mumbai|  Bangalore|     985|
+---+------+-----------+--------+




# Remove Duplicates

In [0]:
# Show the unmodified DataFrame again as a baseline before removing duplicates.
display(test_df)

id,source,destination,distance
101,Mumbai,Goa,587
102,Mumbai,Bangalore,985
102,Mumbai,Bangalore,985
103,Delhi,Chennai,2208
105,Bangalore,Kolkata,1868
104,Delhi,Chennai,2208
105,Bangalore,Kolkata,1865


In [0]:
# distinct() removes rows that are identical across every column.
distinct_rows_df = test_df.distinct()
display(distinct_rows_df)

id,source,destination,distance
101,Mumbai,Goa,587
102,Mumbai,Bangalore,985
103,Delhi,Chennai,2208
105,Bangalore,Kolkata,1868
104,Delhi,Chennai,2208
105,Bangalore,Kolkata,1865


In [0]:
# dropDuplicates() called without a column list gives the same result as distinct().
deduplicated_df = test_df.dropDuplicates()
display(deduplicated_df)

id,source,destination,distance
101,Mumbai,Goa,587
102,Mumbai,Bangalore,985
103,Delhi,Chennai,2208
105,Bangalore,Kolkata,1868
104,Delhi,Chennai,2208
105,Bangalore,Kolkata,1865


In [0]:
# Two-stage de-duplication on column subsets: first keep one row per id, then
# keep one row per (source, destination, distance) combination among the rest.
# NOTE(review): which row survives within each duplicate group is presumably
# not guaranteed (id 105 has two different distances) — confirm before relying
# on a specific row being kept.
(
    test_df
    .dropDuplicates(["id"])
    .dropDuplicates(["source", "destination", "distance"])
    .show()
)

+---+---------+-----------+--------+
| id|   source|destination|distance|
+---+---------+-----------+--------+
|102|   Mumbai|  Bangalore|     985|
|105|Bangalore|    Kolkata|    1868|
|101|   Mumbai|        Goa|     587|
|103|    Delhi|    Chennai|    2208|
+---+---------+-----------+--------+

