# PySpark: Filter Operations

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain a SparkSession for this notebook (application name: 'session-04').
# getOrCreate() returns the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('session-04')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [3]:
# Load the sample CSV into a DataFrame.
#   header=True      -> take column names from the first row
#   inferSchema=True -> let Spark infer column types from the data
df = spark.read.csv(
    '../data/test1.csv',
    header=True,
    inferSchema=True,
)
# Print the loaded rows as a text table.
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [5]:
# Rows with Salary < 20000, with the predicate given as a SQL-style string.
(
    df
    .filter('Salary < 20000')
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [6]:
# Filter on Salary first, then project only the Name and age columns.
(
    df
    .filter('Salary < 20000')
    .select('Name', 'age')
    .show()
)

+-------+---+
|   Name|age|
+-------+---+
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [8]:
# Column-expression form of the predicate (instead of a SQL string):
# comparing a Column with a literal yields a boolean Column.
salary_below_20k = df['Salary'] < 20000
df.filter(salary_below_20k).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [11]:
# Combine two Column predicates with & (logical AND).
# Each comparison is built on its own because & binds tighter than < and >.
salary_below_20k = df['Salary'] < 20000
experience_above_1 = df['Experience'] > 1
df.filter(salary_below_20k & experience_above_1).show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [12]:
# Combine two Column predicates with | (logical OR).
# Each comparison is built on its own because | binds tighter than < and >.
salary_below_20k = df['Salary'] < 20000
experience_above_1 = df['Experience'] > 1
df.filter(salary_below_20k | experience_above_1).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [15]:
# SQL-string predicate with a lowercase `and` joining both conditions.
(
    df
    .filter('Salary < 20000 and Experience > 1')
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [21]:
# Same SQL-string predicate, with the keyword written in uppercase (`AND`).
(
    df
    .filter('Salary < 20000 AND Experience > 1')
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [18]:
# SQL-string predicate with a lowercase `or`; each condition is parenthesised.
(
    df
    .filter('(Salary < 20000) or (Experience > 1)')
    .show()
)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [22]:
# Same SQL-string predicate, with the keyword written in uppercase (`OR`).
(
    df
    .filter('(Salary < 20000) OR (Experience > 1)')
    .show()
)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [20]:
# ~ negates a Column predicate: keep the rows where Salary is NOT below 20000.
salary_below_20k = df['Salary'] < 20000
df.filter(~salary_below_20k).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
+---------+---+----------+------+



In [None]:
# SQL-string counterpart of the `~` Column negation: keep the rows where
# Salary is NOT below 20000. The original cell was truncated ('NOT Sala'),
# which names a column that does not exist in df and never displayed a result.
# The parentheses make the scope of NOT explicit.
df.filter('NOT (Salary < 20000)').show()