In [0]:
from pyspark.sql import Row
import datetime

In [0]:
# Sample user records used throughout this notebook; each dict becomes one row.
# The values deliberately mix scalars, a struct (Row), an array, a date and a
# timestamp, and include empty-string / None entries so that the filtering
# examples further down can show how empty, null and NaN values behave.
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "gender": "male",
        "current_city": "Dallas",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "gender": "male",
        "current_city": "Houston",
        # NOTE(review): this home number lacks the leading "+" the others have — confirm intent.
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "gender": "female",
        "current_city": "",  # empty string, not None: stays an empty value rather than null
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "gender": "male",
        "current_city": "San Fransisco",
        "phone_numbers": Row(mobile=None, home=None),  # struct with both fields null
        "courses": [],
        "is_customer": False,
        "amount_paid": None,  # displayed as NaN once loaded through pandas
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "gender": "female",
        "current_city": None,  # displayed as null in the Spark DataFrame
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,  # displayed as NaN once loaded through pandas
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]

In [0]:
import pandas as pd

In [0]:
# Turn off the Arrow-based pandas <-> Spark conversion path for this session.
# NOTE(review): presumably needed because the pandas frame holds Row/list
# objects that the Arrow path cannot convert — confirm.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)

In [0]:
# Build the Spark DataFrame from the sample records via a pandas DataFrame.
users_df = spark.createDataFrame(
    pd.DataFrame(users)
)

In [0]:
# Preview the full DataFrame (long values are truncated in the printed table).
users_df.show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|      Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home

In [0]:
# Print the docstring of DataFrame.filter to see the accepted condition types.
help(users_df.filter)

Help on method filter in module pyspark.sql.dataframe:

filter(condition) method of pyspark.sql.dataframe.DataFrame instance
    Filters rows using the given condition.
    
    :func:`where` is an alias for :func:`filter`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    condition : :class:`Column` or str
        a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.
    
    Examples
    --------
    >>> df.filter(df.age > 3).collect()
    [Row(age=5, name='Bob')]
    >>> df.where(df.age == 2).collect()
    [Row(age=2, name='Alice')]
    
    >>> df.filter("age > 3").collect()
    [Row(age=5, name='Bob')]
    >>> df.where("age = 2").collect()
    [Row(age=2, name='Alice')]



* `where` and `filter` are synonyms
* We can pass conditions either by using SQL Style or Non SQL Style.
* For Non SQL Style we can pass columns using `col` function on column name as string or using the notation of `df['column_name']`

In [0]:
from pyspark.sql.functions import col

In [0]:
# Non-SQL style: reference the column through col().
(
    users_df
    .filter(col('id') == 1)
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# Non-SQL style: reference the column through df['column_name'].
(
    users_df
    .filter(users_df['id'] == 1)
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# Same predicate through where(), the alias of filter().
(
    users_df
    .where(users_df['id'] == 1)
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# SQL style: the condition is passed as a SQL expression string.
(
    users_df
    .filter('id =1')
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# SQL style through where().
(
    users_df
    .where('id = 1')
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# Register the DataFrame as the temporary view `users` so it can be queried with spark.sql.
users_df.createOrReplaceTempView('users')

In [0]:
# Same id filter expressed as a full SQL query against the `users` view.
spark.sql("""
      select * from users where id = 1
""").show()

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



* Equal -> `=` or `==`
* Not Equal -> `!=`
* Greater Than -> `>`
* Less Than -> `<`
* Greater Than or Equal To -> `>=`
* Less Than or Equal To -> `<=`
* IN Operator -> `isin` function or SQL `IN` (note: the `contains` function is a substring match on a column, not a membership test)
* Between Operator -> `between` function or `BETWEEN` with `AND`

In [0]:
# Customers only: rows whose is_customer flag is true (boolean literal).
(
    users_df
    .filter(col('is_customer') == True)
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|     Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|            |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
+---+----------+------------+--------------------+--

In [0]:
# Same filter, comparing the boolean column with the string 'true'.
(
    users_df
    .filter(col('is_customer') == 'true')
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|     Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|            |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
+---+----------+------------+--------------------+--

In [0]:
# SQL-expression form of the customer filter.
(
    users_df
    .filter("is_customer = 'true'")
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|     Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|            |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
+---+----------+------------+--------------------+--

In [0]:
# Customer filter as a full SQL query against the `users` view.
spark.sql('''
    select * from users where is_customer == "true"
    ''').show()

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|     Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|            |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
+---+----------+------------+--------------------+--

In [0]:
# Non-customers: rows whose is_customer flag is false (boolean literal).
(
    users_df
    .filter(col('is_customer') == False)
    .show()
)

+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby| Maddocks|  amaddocks3@home.pl|  male|San Fransisco|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|     Rome|krome4@shutterfly...|female|         null|{+1 817 934 7142,...|     []|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# Same filter, comparing the boolean column with the string 'false'.
(
    users_df
    .filter(col('is_customer') == 'false')
    .show()
)

+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby| Maddocks|  amaddocks3@home.pl|  male|San Fransisco|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|     Rome|krome4@shutterfly...|female|         null|{+1 817 934 7142,...|     []|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
from pyspark.sql.functions import lit

In [0]:
# Users from Dallas.
# The literal is wrapped in lit() here; a plain Python string works as well.
(
    users_df
    .filter(col('current_city') == lit('Dallas'))
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# Users from Dallas, SQL-expression form.
(
    users_df
    .filter("current_city == 'Dallas'")
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# Customers who paid exactly 900.0.
# amount_paid is a double column, so compare it with a numeric literal rather
# than the string '900.0', which only matched through an implicit cast.
users_df.filter(col('amount_paid') == 900.0).show()

+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  2|  Nikolaus|  Brewitt|nbrewitt1@dailyma...|  male|     Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# SQL-expression form of the 900.0 filter. The quoted literal is a string;
# the printed result shows it still matches the double-typed amount_paid value.
(
    users_df
    .filter("amount_paid == '900.0'")
    .show()
)

+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  2|  Nikolaus|  Brewitt|nbrewitt1@dailyma...|  male|     Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# get the customers whose paid amount is NaN (not a number)

In [0]:
# Show the data again to see which rows have NaN in amount_paid.
users_df.show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|      Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home

In [0]:
from pyspark.sql.functions import isnan

In [0]:
# Show amount_paid next to the boolean result of isnan() for every row.
(
    users_df
    .select('amount_paid', isnan('amount_paid'))
    .show()
)

+-----------+------------------+
|amount_paid|isnan(amount_paid)|
+-----------+------------------+
|    1000.55|             false|
|      900.0|             false|
|     850.55|             false|
|        NaN|              true|
|        NaN|              true|
+-----------+------------------+



In [0]:
# Keep only the rows whose amount_paid is NaN.
(
    users_df
    .filter(isnan('amount_paid') == True)
    .show()
)

+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby| Maddocks|  amaddocks3@home.pl|  male|San Fransisco|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|     Rome|krome4@shutterfly...|female|         null|{+1 817 934 7142,...|     []|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# NaN filter, SQL-expression form.
(
    users_df
    .filter('isnan(amount_paid) = True')
    .show()
)

+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  4|     Ashby| Maddocks|  amaddocks3@home.pl|  male|San Fransisco|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|     Rome|krome4@shutterfly...|female|         null|{+1 817 934 7142,...|     []|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+---------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# all the users who are not living in Dallas

In [0]:
# Reference view of id and current_city before filtering on the city.
(
    users_df
    .select('id', 'current_city')
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  1|       Dallas|
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
|  5|         null|
+---+-------------+



In [0]:
# Plain inequality: the row with a null current_city does not appear in the result.
(
    users_df
    .select('id', 'current_city')
    .filter(col('current_city') != 'Dallas')
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
+---+-------------+



In [0]:
# Not Dallas, with an explicit isNull() branch so the null-city row is kept too.
(
    users_df
    .select('id', 'current_city')
    .filter(
        (col('current_city') != 'Dallas')
        | (col('current_city').isNull())
    )
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
|  5|         null|
+---+-------------+



In [0]:
# Not Dallas or null city, SQL-expression form.
(
    users_df
    .select('id', 'current_city')
    .filter('current_city != "Dallas" or current_city is null')
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
|  5|         null|
+---+-------------+



In [0]:
# All users whose city name is not an empty string. Nulls can be ignored.
(
    users_df
    .filter(col('current_city') != '')
    .select('id', 'current_city')
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  1|       Dallas|
|  2|      Houston|
|  4|San Fransisco|
+---+-------------+



In [0]:
# All users whose city is neither an empty string nor Dallas. Nulls can be ignored.
(
    users_df
    .select('id', 'current_city')
    .filter("current_city !='' and current_city!='Dallas'")
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  4|San Fransisco|
+---+-------------+



In [0]:
# Same "not empty and not Dallas" filter as a SQL query using NOT on each equality.
spark.sql('''
    select id,current_city from users 
    where not current_city ="" and not current_city ="Dallas"
  ''').show()

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  4|San Fransisco|
+---+-------------+



In [0]:
# Goal: user id and email whose last updated timestamp is between
# 2021 Feb 15th and 2021 March 15th. First look at the raw timestamps.
(
    users_df
    .select('id', 'email', 'last_updated_ts')
    .show()
)

+---+--------------------+-------------------+
| id|               email|    last_updated_ts|
+---+--------------------+-------------------+
|  1|cvandenoord0@etsy...|2021-02-10 01:15:00|
|  2|nbrewitt1@dailyma...|2021-02-18 03:33:00|
|  3|openney2@vistapri...|2021-03-15 15:16:55|
|  4|  amaddocks3@home.pl|2021-04-10 17:45:30|
|  5|krome4@shutterfly...|2021-04-02 00:55:18|
+---+--------------------+-------------------+



In [0]:
# Column handle used to look up the documentation of Column.between.
# The name matches the DataFrame's actual `last_updated_ts` column
# (the previous spelling 'last_update_ts' referred to a column that does not exist).
c = col('last_updated_ts')

In [0]:
# Print the docstring of Column.between (both bounds are inclusive).
help(c.between)

Help on method between in module pyspark.sql.column:

between(lowerBound, upperBound) method of pyspark.sql.column.Column instance
    True if the current column is between the lower bound and upper bound, inclusive.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.select(df.name, df.age.between(2, 4)).show()
    +-----+---------------------------+
    | name|((age >= 2) AND (age <= 4))|
    +-----+---------------------------+
    |Alice|                       true|
    |  Bob|                      false|
    +-----+---------------------------+



In [0]:
# Timestamp range via Column.between; the upper bound runs to the end of
# March 15th so that the whole day is included.
(
    users_df
    .select('id', 'email', 'last_updated_ts')
    .filter(
        col('last_updated_ts').between(
            '2021-02-15 00:00:00',
            '2021-03-15 23:59:59',
        )
    )
    .show()
)

+---+--------------------+-------------------+
| id|               email|    last_updated_ts|
+---+--------------------+-------------------+
|  2|nbrewitt1@dailyma...|2021-02-18 03:33:00|
|  3|openney2@vistapri...|2021-03-15 15:16:55|
+---+--------------------+-------------------+



In [0]:
# Timestamp range, SQL BETWEEN ... AND form.
(
    users_df
    .select('id', 'email', 'last_updated_ts')
    .filter("last_updated_ts between '2021-02-15 00:00:00' and '2021-03-15 23:59:59'")
    .show()
)

+---+--------------------+-------------------+
| id|               email|    last_updated_ts|
+---+--------------------+-------------------+
|  2|nbrewitt1@dailyma...|2021-02-18 03:33:00|
|  3|openney2@vistapri...|2021-03-15 15:16:55|
+---+--------------------+-------------------+



In [0]:
# all the users whose payment is in the range of 850 and 900

In [0]:
# Reference view of id and amount_paid before applying the range filter.
(
    users_df
    .select('id', 'amount_paid')
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1000.55|
|  2|      900.0|
|  3|     850.55|
|  4|        NaN|
|  5|        NaN|
+---+-----------+



In [0]:
# Payments within the inclusive range 850..900 via Column.between.
(
    users_df
    .select('id', 'amount_paid')
    .filter(col('amount_paid').between(850, 900))
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  2|      900.0|
|  3|     850.55|
+---+-----------+



In [0]:
# List each column with its Spark data type (amount_paid is a double).
users_df.dtypes

Out[49]: [('id', 'bigint'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('email', 'string'),
 ('gender', 'string'),
 ('current_city', 'string'),
 ('phone_numbers', 'struct<mobile:string,home:string>'),
 ('courses', 'array<bigint>'),
 ('is_customer', 'boolean'),
 ('amount_paid', 'double'),
 ('customer_from', 'date'),
 ('last_updated_ts', 'timestamp')]

In [0]:
# Payment range, SQL BETWEEN ... AND form.
(
    users_df
    .select('id', 'amount_paid')
    .filter('amount_paid between 850 and 900')
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  2|      900.0|
|  3|     850.55|
+---+-----------+



In [0]:
# Reference view of id and current_city before the IN / isin examples.
(
    users_df
    .select('id', 'current_city')
    .show()
)

+---+-------------+
| id| current_city|
+---+-------------+
|  1|       Dallas|
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
|  5|         null|
+---+-------------+



In [0]:
# Chained equality checks joined with | — works, but `isin` is the preferred form.
(
    users_df
    .select('id', 'current_city')
    .filter(
        (col('current_city') == 'Houston')
        | (col('current_city') == 'Dallas')
    )
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+



In [0]:
# Membership test with Column.isin.
(
    users_df
    .select('id', 'current_city')
    .filter(col('current_city').isin('Houston', 'Dallas'))
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+



In [0]:
# SQL-expression form with OR — works, but SQL `IN` is the preferred form.
(
    users_df
    .select('id', 'current_city')
    .filter("current_city = 'Houston' or current_city = 'Dallas'")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+



In [0]:
# Membership test with SQL IN.
(
    users_df
    .select('id', 'current_city')
    .filter("current_city in ('Houston','Dallas')")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+



* Get list of users whose city is either Houston or Dallas or empty string or null.

In [0]:
# Listing NULL inside IN has no effect: the row with a null city is still
# missing from the result, so nulls need a separate `is null` check.
(
    users_df
    .select('id', 'current_city')
    .filter("current_city IN ('Houston','Dallas','',NULL)")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
|  3|            |
+---+------------+



In [0]:
# Boolean OR including null check

users_df.\
  select('id','current_city').\
  filter((col('current_city').isin('Houston','Dallas',''))|(col('current_city').isNull())).show()

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
|  3|            |
|  5|        null|
+---+------------+



In [0]:
# SQL-expression form: IN for the concrete values plus `is null`.
(
    users_df
    .select('id', 'current_city')
    .filter("current_city in ('Houston','Dallas','') or current_city is null")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
|  3|            |
|  5|        null|
+---+------------+



* `>`
* `<`
* `>=` (equivalent to boolean with `col1 > val1 or col1 = val1`)
* `<=` (equivalent to boolean with `col1 < val1 or col1 = val1`)

In [0]:
# Reference view of id and amount_paid before the comparison examples.
(
    users_df
    .select('id', 'amount_paid')
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1000.55|
|  2|      900.0|
|  3|     850.55|
|  4|        NaN|
|  5|        NaN|
+---+-----------+



In [0]:
# Customers who paid more than 900.
# Spark orders NaN above every other numeric value, so the NaN rows are
# excluded explicitly with the isnan() check.
(
    users_df
    .filter(
        (col('amount_paid') > 900)
        & (isnan(col('amount_paid')) == False)
    )
    .show()
)

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+



In [0]:
# SQL-string form of the "paid more than 900" filter, with NaN amounts excluded.
(
    users_df
    .filter('amount_paid >900 and isnan(amount_paid) = false')
    .select('id', 'amount_paid')
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1000.55|
+---+-----------+



In [0]:
# Customers who paid less than 900; rows whose amount is NaN are excluded explicitly.
(
    users_df
    .filter((col('amount_paid') < 900) & ~isnan(col('amount_paid')))
    .select('id', 'amount_paid')
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  3|     850.55|
+---+-----------+



In [0]:
# Customers who paid 900 or more (SQL-string form), with NaN amounts excluded.
(
    users_df
    .filter('amount_paid>=900 and isnan(amount_paid)=false')
    .select('id', 'amount_paid')
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1000.55|
|  2|      900.0|
+---+-----------+



In [0]:
# Users who became customers after 2021-01-21 (SQL-string form).
# The date is written as a single-quoted SQL string literal: double quotes
# denote identifiers in standard SQL (and in Spark when ANSI double-quoted
# identifiers are enabled), so single quotes keep the comparison portable.
(
    users_df
    .filter("customer_from > '2021-01-21'")
    .select('id', 'customer_from')
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
+---+-------------+



In [0]:
# Column-API form: users whose customer_from date is after 2021-01-21.
(
    users_df
    .filter(col('customer_from') > '2021-01-21')
    .select('id', 'customer_from')
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
+---+-------------+



In [0]:
# Display id, gender and is_customer before combining conditions on them.
(
    users_df
    .select('id', 'gender', 'is_customer')
    .show()
)

+---+------+-----------+
| id|gender|is_customer|
+---+------+-----------+
|  1|  male|       true|
|  2|  male|       true|
|  3|female|       true|
|  4|  male|      false|
|  5|female|      false|
+---+------+-----------+



In [0]:
# Male customers: gender equals 'male' AND the is_customer flag is true.
(
    users_df
    .filter((col('gender') == 'male') & col('is_customer'))
    .select('id', 'gender', 'is_customer')
    .show()
)

+---+------+-----------+
| id|gender|is_customer|
+---+------+-----------+
|  1|  male|       true|
|  2|  male|       true|
+---+------+-----------+



In [0]:
# SQL-string form of the male-customers filter (both conditions must hold).
(
    users_df
    .filter("gender='male' and is_customer = true")
    .select('id', 'gender', 'is_customer')
    .show()
)

+---+------+-----------+
| id|gender|is_customer|
+---+------+-----------+
|  1|  male|       true|
|  2|  male|       true|
+---+------+-----------+



In [0]:
# Display id and customer_from before filtering on a date range.
(
    users_df
    .select('id', 'customer_from')
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  1|   2021-01-15|
|  2|   2021-02-14|
|  3|   2021-01-21|
|  4|         null|
|  5|         null|
+---+-------------+



In [0]:
# Users who became customers between 2021-01-20 and 2021-02-15,
# expressed with the SQL BETWEEN operator.
(
    users_df
    .filter("customer_from between '2021-01-20' and '2021-02-15'")
    .select('id', 'customer_from')
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
|  3|   2021-01-21|
+---+-------------+



In [0]:
# Display the columns used by the OR-based filters that follow.
(
    users_df
    .select('id', 'email', 'current_city', 'is_customer')
    .show()
)

+---+--------------------+-------------+-----------+
| id|               email| current_city|is_customer|
+---+--------------------+-------------+-----------+
|  1|cvandenoord0@etsy...|       Dallas|       true|
|  2|nbrewitt1@dailyma...|      Houston|       true|
|  3|openney2@vistapri...|             |       true|
|  4|  amaddocks3@home.pl|San Fransisco|      false|
|  5|krome4@shutterfly...|         null|      false|
+---+--------------------+-------------+-----------+



In [0]:
# id and email of users who are not customers, or whose current_city is an empty string.
(
    users_df
    .filter(~col('is_customer') | (col('current_city') == ''))
    .select('id', 'email')
    .show()
)

+---+--------------------+
| id|               email|
+---+--------------------+
|  3|openney2@vistapri...|
|  4|  amaddocks3@home.pl|
|  5|krome4@shutterfly...|
+---+--------------------+



In [0]:
# SQL-string form: empty current_city OR not a customer.
(
    users_df
    .filter("current_city = '' or is_customer=false")
    .select('id', 'email')
    .show()
)

+---+--------------------+
| id|               email|
+---+--------------------+
|  3|openney2@vistapri...|
|  4|  amaddocks3@home.pl|
|  5|krome4@shutterfly...|
+---+--------------------+



In [0]:
# id and email of users who are not customers, or whose last_updated_ts
# is earlier than 2021-03-01 00:00:00 (SQL-string form).
(
    users_df
    .filter("is_customer = false or last_updated_ts <'2021-03-01 00:00:00'")
    .select('id', 'email')
    .show()
)

+---+--------------------+
| id|               email|
+---+--------------------+
|  1|cvandenoord0@etsy...|
|  2|nbrewitt1@dailyma...|
|  4|  amaddocks3@home.pl|
|  5|krome4@shutterfly...|
+---+--------------------+



In [0]:
# Column-API form: not a customer, OR last_updated_ts earlier than the
# date-only string '2021-03-01'.
(
    users_df
    .filter(~col('is_customer') | (col('last_updated_ts') < '2021-03-01'))
    .select('id', 'email')
    .show()
)

+---+--------------------+
| id|               email|
+---+--------------------+
|  1|cvandenoord0@etsy...|
|  2|nbrewitt1@dailyma...|
|  4|  amaddocks3@home.pl|
|  5|krome4@shutterfly...|
+---+--------------------+

