# Expression language: when PySpark has no built-in function for an operation, a SQL expression can be embedded inside a PySpark statement with expr().
# Syntax: expr("SQL EXPRESSION")

In [0]:
# Create a DataFrame from a static list
# Syntax: spark.createDataFrame(data=<list>, schema=<schemalist>)

from pyspark.sql.types import *

# Column names, in the same positional order as the tuples in staticlist.
columns = ["firstname", "lastname", "country", "state", "salary"]

# Sample rows. Some names carry leading/trailing spaces or mixed casing,
# and the first row has an empty state value.
staticlist = [
    ("    James", "Smith", "USA", "", 3000),
    ("    Michael", "Rose", "USA", "NY", 2500),
    ("Robert", "Williams", "USA", "CA", 6000),
    ("mARIA", "Jones           ", "USA", "FL", 20000),
    ("james", "Anderson", "UK", "LND", 8000),
    ("MICHEAL", "Bevon", "UK", "LND", 3500),
    ("Robert", "Patrick", "UK", "MCR", 2800),
    ("Maria", "Gonzales", "UK", "MCR", 7000),
]

# Build the DataFrame from the rows and column names, then print it.
df = spark.createDataFrame(staticlist, columns)
df.show()

+-----------+----------------+-------+-----+------+
|  firstname|        lastname|country|state|salary|
+-----------+----------------+-------+-----+------+
|      James|           Smith|    USA|     |  3000|
|    Michael|            Rose|    USA|   NY|  2500|
|     Robert|        Williams|    USA|   CA|  6000|
|      mARIA|Jones           |    USA|   FL| 20000|
|      james|        Anderson|     UK|  LND|  8000|
|    MICHEAL|           Bevon|     UK|  LND|  3500|
|     Robert|         Patrick|     UK|  MCR|  2800|
|      Maria|        Gonzales|     UK|  MCR|  7000|
+-----------+----------------+-------+-----+------+



In [0]:
# Replace USA with United States
from pyspark.sql.functions import *
# SQL CASE expression: map 'USA' to 'United States', keep every other country as-is.
country_case_sql = "case when country = 'USA' then 'United States' else country end"
df2 = df.withColumn("New_country", expr(country_case_sql))
df2.show()

+-----------+----------------+-------+-----+------+-------------+
|  firstname|        lastname|country|state|salary|  New_country|
+-----------+----------------+-------+-----+------+-------------+
|      James|           Smith|    USA|     |  3000|United States|
|    Michael|            Rose|    USA|   NY|  2500|United States|
|     Robert|        Williams|    USA|   CA|  6000|United States|
|      mARIA|Jones           |    USA|   FL| 20000|United States|
|      james|        Anderson|     UK|  LND|  8000|           UK|
|    MICHEAL|           Bevon|     UK|  LND|  3500|           UK|
|     Robert|         Patrick|     UK|  MCR|  2800|           UK|
|      Maria|        Gonzales|     UK|  MCR|  7000|           UK|
+-----------+----------------+-------+-----+------+-------------+



In [0]:
# Build New_country with the SQL replace() function, which substitutes
# 'United States' for each occurrence of 'USA' inside the country value.
usa_replace_sql = "replace(country,'USA','United States' )"
df2 = df.withColumn("New_country", expr(usa_replace_sql))
df2.show()

+-----------+----------------+-------+-----+------+-------------+
|  firstname|        lastname|country|state|salary|  New_country|
+-----------+----------------+-------+-----+------+-------------+
|      James|           Smith|    USA|     |  3000|United States|
|    Michael|            Rose|    USA|   NY|  2500|United States|
|     Robert|        Williams|    USA|   CA|  6000|United States|
|      mARIA|Jones           |    USA|   FL| 20000|United States|
|      james|        Anderson|     UK|  LND|  8000|           UK|
|    MICHEAL|           Bevon|     UK|  LND|  3500|           UK|
|     Robert|         Patrick|     UK|  MCR|  2800|           UK|
|      Maria|        Gonzales|     UK|  MCR|  7000|           UK|
+-----------+----------------+-------+-----+------+-------------+



Date Functions


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *
# Sample rows: (S_no, Start_Date as a yyyy-MM-dd string, End_Date as an MM/dd/yyyy string).
data = [
  (1, '2024-02-07', '12/31/2024'),
  (2, '2023-02-08', '11/30/2024'),
  (3, '2025-05-09', '08/29/2025'),
  (4, '2025-02-10', '10/30/2025'),
  (5, '2023-12-11', '12/31/2023'),
  (6, '2022-01-01', '12/31/2022'),
  (7, '2024-01-31', '03/31/2024')
]
# Named date_columns rather than `col`, so this list does not shadow the col()
# function that `from pyspark.sql.functions import *` brings into scope.
date_columns = ['S_no','Start_Date','End_Date']
df = spark.createDataFrame(data=data,schema=date_columns)
df.show()
# Both date columns are plain strings at this point; printSchema() confirms it.
df.printSchema()

+----+----------+----------+
|S_no|Start_Date|  End_Date|
+----+----------+----------+
|   1|2024-02-07|12/31/2024|
|   2|2023-02-08|11/30/2024|
|   3|2025-05-09|08/29/2025|
|   4|2025-02-10|10/30/2025|
|   5|2023-12-11|12/31/2023|
|   6|2022-01-01|12/31/2022|
|   7|2024-01-31|03/31/2024|
+----+----------+----------+

root
 |-- S_no: long (nullable = true)
 |-- Start_Date: string (nullable = true)
 |-- End_Date: string (nullable = true)



In [0]:
# Cast a string column to DateType.
# A plain cast only parses ISO-style strings such as yyyy-MM-dd. For any other
# layout it yields NULL (with ANSI mode enabled Spark raises an error instead).
# Use to_date() with an explicit format to handle other layouts.

df_cast = (
    df.withColumn("Start_Date", df.Start_Date.cast(DateType()))
      # End_Date is MM/dd/yyyy, so the bare cast cannot parse it.
      # No to_date() wrapper here: applied to an already-cast date column it
      # changes nothing and hides what the cast alone does.
      .withColumn("End_Date", df.End_Date.cast(DateType()))
)
df_cast.show()
df_cast.printSchema()


+----+----------+--------+
|S_no|Start_Date|End_Date|
+----+----------+--------+
|   1|2024-02-07|    NULL|
|   2|2024-02-08|    NULL|
|   3|2025-05-09|    NULL|
|   4|2025-02-10|    NULL|
|   5|2023-12-11|    NULL|
|   6|2022-01-01|    NULL|
|   7|2024-01-31|    NULL|
+----+----------+--------+

root
 |-- S_no: long (nullable = true)
 |-- Start_Date: date (nullable = true)
 |-- End_Date: date (nullable = true)



In [0]:
# to_date() converts a string column into a date.
# Syntax: to_date("DATECOL", "dateformat")
# The format argument can be omitted when the string is already yyyy-MM-dd;
# for any other layout the format is required for the value to parse.

start_as_date = df["Start_Date"].cast(DateType())
end_as_date = to_date("End_Date", "MM/dd/yyyy")   # End_Date strings look like 12/31/2024
df_cast = df.withColumn("start_date", start_as_date).withColumn("end_date", end_as_date)
df_cast.show()
df_cast.printSchema()



+----+----------+----------+
|S_no|start_date|  end_date|
+----+----------+----------+
|   1|2024-02-07|2024-12-31|
|   2|2024-02-08|2024-11-30|
|   3|2025-05-09|2025-08-29|
|   4|2025-02-10|2025-10-30|
|   5|2023-12-11|2023-12-31|
|   6|2022-01-01|2022-12-31|
|   7|2024-01-31|2024-03-31|
+----+----------+----------+

root
 |-- S_no: long (nullable = true)
 |-- start_date: date (nullable = true)
 |-- end_date: date (nullable = true)



In [0]:
# Extract calendar parts (year, month, quarter, day, ...) from start_date.
# Each entry maps an output column name to the pyspark function that derives it.

date_part_functions = {
    "YEAR": year,
    "MONTH": month,
    "QUARTER": quarter,
    "DAY": dayofmonth,
    "DAY_OF_WEEK": dayofweek,     # 1 = Sunday ... 7 = Saturday
    "DAY_OF_YEAR": dayofyear,
    "WEEK_OF_YEAR": weekofyear,   # ISO-8601 week number
}

# Add the columns one by one, in the order listed above.
df_dates = df
for part_name, part_function in date_part_functions.items():
    df_dates = df_dates.withColumn(part_name, part_function("start_date"))
df_dates.show()
df_dates.printSchema()

+----+----------+----------+----+-----+-------+---+-----------+-----------+------------+
|S_no|Start_Date|  End_Date|YEAR|MONTH|QUARTER|DAY|DAY_OF_WEEK|DAY_OF_YEAR|WEEK_OF_YEAR|
+----+----------+----------+----+-----+-------+---+-----------+-----------+------------+
|   1|2024-02-07|12/31/2024|2024|    2|      1|  7|          4|         38|           6|
|   2|2024-02-08|11/30/2024|2024|    2|      1|  8|          5|         39|           6|
|   3|2025-05-09|08/29/2025|2025|    5|      2|  9|          6|        129|          19|
|   4|2025-02-10|10/30/2025|2025|    2|      1| 10|          2|         41|           7|
|   5|2023-12-11|12/31/2023|2023|   12|      4| 11|          2|        345|          50|
|   6|2022-01-01|12/31/2022|2022|    1|      1|  1|          7|          1|          52|
|   7|2024-01-31|03/31/2024|2024|    1|      1| 31|          4|         31|           5|
+----+----------+----------+----+-----+-------+---+-----------+-----------+------------+

root
 |-- S_no: long

In [0]:
# Add days to an existing date: date_add(<date column>, <number of days>)
days_to_add = 2
df_add1 = df.withColumn("Add2_days", date_add("start_date", days_to_add))
df_add1.show()


+----+----------+----------+----------+
|S_no|Start_Date|  End_Date| Add2_days|
+----+----------+----------+----------+
|   1|2024-02-07|12/31/2024|2024-02-09|
|   2|2024-02-08|11/30/2024|2024-02-10|
|   3|2025-05-09|08/29/2025|2025-05-11|
|   4|2025-02-10|10/30/2025|2025-02-12|
|   5|2023-12-11|12/31/2023|2023-12-13|
|   6|2022-01-01|12/31/2022|2022-01-03|
|   7|2024-01-31|03/31/2024|2024-02-02|
+----+----------+----------+----------+



In [0]:
# Remove days from an existing date: date_sub(<date column>, <number of days>)
days_to_remove = 2
df_rem = df.withColumn("rem2_days", date_sub("start_date", days_to_remove))
df_rem.show()

+----+----------+----------+----------+
|S_no|Start_Date|  End_Date| rem2_days|
+----+----------+----------+----------+
|   1|2024-02-07|12/31/2024|2024-02-05|
|   2|2024-02-08|11/30/2024|2024-02-06|
|   3|2025-05-09|08/29/2025|2025-05-07|
|   4|2025-02-10|10/30/2025|2025-02-08|
|   5|2023-12-11|12/31/2023|2023-12-09|
|   6|2022-01-01|12/31/2022|2021-12-30|
|   7|2024-01-31|03/31/2024|2024-01-29|
+----+----------+----------+----------+



In [0]:
# Add or remove months with add_months(); a negative count moves backwards.
# For years, quarters or half-years, pass the equivalent number of months:
# 2 years = 24 months, 2 quarters = 6 months.

months_forward = add_months("start_date", 2)
months_back = add_months("start_date", -2)
df_add2 = (
    df.withColumn("Add2_months", months_forward)
      .withColumn("rem2_months", months_back)
)
df_add2.show()

+----+----------+----------+-----------+-----------+
|S_no|Start_Date|  End_Date|Add2_months|rem2_months|
+----+----------+----------+-----------+-----------+
|   1|2024-02-07|12/31/2024| 2024-04-07| 2023-12-07|
|   2|2024-02-08|11/30/2024| 2024-04-08| 2023-12-08|
|   3|2025-05-09|08/29/2025| 2025-07-09| 2025-03-09|
|   4|2025-02-10|10/30/2025| 2025-04-10| 2024-12-10|
|   5|2023-12-11|12/31/2023| 2024-02-11| 2023-10-11|
|   6|2022-01-01|12/31/2022| 2022-03-01| 2021-11-01|
|   7|2024-01-31|03/31/2024| 2024-03-31| 2023-11-30|
+----+----------+----------+-----------+-----------+



In [0]:
# Alternative for year / quarter / week arithmetic: use the SQL form
# date_add(unit, value, date) through expr().
# NOTE(review): availability of the unit-first date_add form depends on the
# Spark / Databricks runtime version — confirm before porting.

interval_expressions = {
    "Add_2_years": "date_add(year,2,start_date)",
    "Add_3_quarters": "date_add(quarter,3,start_date)",
    "Add_5_weeks": "date_add(week,5,start_date)",
}

# One new column per SQL expression, added in the order listed above.
df_add3 = df
for new_column, sql_text in interval_expressions.items():
    df_add3 = df_add3.withColumn(new_column, expr(sql_text))
df_add3.show()

+----+----------+----------+-------------------+-------------------+-------------------+
|S_no|Start_Date|  End_Date|        Add_2_years|     Add_3_quarters|        Add_5_weeks|
+----+----------+----------+-------------------+-------------------+-------------------+
|   1|2024-02-07|12/31/2024|2026-02-07 00:00:00|2024-11-07 00:00:00|2024-03-13 00:00:00|
|   2|2024-02-08|11/30/2024|2026-02-08 00:00:00|2024-11-08 00:00:00|2024-03-14 00:00:00|
|   3|2025-05-09|08/29/2025|2027-05-09 00:00:00|2026-02-09 00:00:00|2025-06-13 00:00:00|
|   4|2025-02-10|10/30/2025|2027-02-10 00:00:00|2025-11-10 00:00:00|2025-03-17 00:00:00|
|   5|2023-12-11|12/31/2023|2025-12-11 00:00:00|2024-09-11 00:00:00|2024-01-15 00:00:00|
|   6|2022-01-01|12/31/2022|2024-01-01 00:00:00|2022-10-01 00:00:00|2022-02-05 00:00:00|
|   7|2024-01-31|03/31/2024|2026-01-31 00:00:00|2024-10-31 00:00:00|2024-03-06 00:00:00|
+----+----------+----------+-------------------+-------------------+-------------------+



In [0]:
# last_day() gives the final calendar day of the month that start_date falls in.
month_end = last_day("start_date")
df_lst = df.withColumn("Last_Day", month_end)
df_lst.show()

+----+----------+----------+----------+
|S_no|Start_Date|  End_Date|  Last_Day|
+----+----------+----------+----------+
|   1|2024-02-07|12/31/2024|2024-02-29|
|   2|2023-02-08|11/30/2024|2023-02-28|
|   3|2025-05-09|08/29/2025|2025-05-31|
|   4|2025-02-10|10/30/2025|2025-02-28|
|   5|2023-12-11|12/31/2023|2023-12-31|
|   6|2022-01-01|12/31/2022|2022-01-31|
|   7|2024-01-31|03/31/2024|2024-01-31|
+----+----------+----------+----------+



In [0]:
# current_date() yields the date only.
# current_timestamp() yields both date and time.
# NOTE(review): the earlier note said the timestamp is in UTC — presumably that
# is this cluster's session time zone; confirm.

today = current_date()
now = current_timestamp()
df_curnt = df.withColumn("CurrentDate", today).withColumn("CurrentTimestamp", now)
df_curnt.show()

+----+----------+----------+-----------+--------------------+
|S_no|Start_Date|  End_Date|CurrentDate|    CurrentTimestamp|
+----+----------+----------+-----------+--------------------+
|   1|2024-02-07|12/31/2024| 2025-08-29|2025-08-29 12:17:...|
|   2|2023-02-08|11/30/2024| 2025-08-29|2025-08-29 12:17:...|
|   3|2025-05-09|08/29/2025| 2025-08-29|2025-08-29 12:17:...|
|   4|2025-02-10|10/30/2025| 2025-08-29|2025-08-29 12:17:...|
|   5|2023-12-11|12/31/2023| 2025-08-29|2025-08-29 12:17:...|
|   6|2022-01-01|12/31/2022| 2025-08-29|2025-08-29 12:17:...|
|   7|2024-01-31|03/31/2024| 2025-08-29|2025-08-29 12:17:...|
+----+----------+----------+-----------+--------------------+



In [0]:
# date_trunc() snaps a date back to the start of the given unit (year, month, week, ...).
# Syntax: date_trunc('unit', 'DateCol')
# Accepted units in date_trunc:
# 'year','yyyy','yy','month','mon','mm','day','dd','hour','minute','second','week','quarter'

# Output column name -> truncation unit, in the order the columns are added.
truncation_units = {
    "start_of_year": 'year',
    "start_of_month": 'month',
    "start_of_week": 'week',
    "start_of_quarter": 'quarter',
    "start_of_day": 'day',
}

df_trunc = df
for column_name, unit in truncation_units.items():
    df_trunc = df_trunc.withColumn(column_name, date_trunc(unit, 'start_date'))
df_trunc.show()

+----+----------+----------+-------------------+-------------------+-------------------+-------------------+-------------------+
|S_no|Start_Date|  End_Date|      start_of_year|     start_of_month|      start_of_week|   start_of_quarter|       start_of_day|
+----+----------+----------+-------------------+-------------------+-------------------+-------------------+-------------------+
|   1|2024-02-07|12/31/2024|2024-01-01 00:00:00|2024-02-01 00:00:00|2024-02-05 00:00:00|2024-01-01 00:00:00|2024-02-07 00:00:00|
|   2|2023-02-08|11/30/2024|2023-01-01 00:00:00|2023-02-01 00:00:00|2023-02-06 00:00:00|2023-01-01 00:00:00|2023-02-08 00:00:00|
|   3|2025-05-09|08/29/2025|2025-01-01 00:00:00|2025-05-01 00:00:00|2025-05-05 00:00:00|2025-04-01 00:00:00|2025-05-09 00:00:00|
|   4|2025-02-10|10/30/2025|2025-01-01 00:00:00|2025-02-01 00:00:00|2025-02-10 00:00:00|2025-01-01 00:00:00|2025-02-10 00:00:00|
|   5|2023-12-11|12/31/2023|2023-01-01 00:00:00|2023-12-01 00:00:00|2023-12-11 00:00:00|2023-10-0

In [0]:
# date_format() renders a date/timestamp column as a string in the requested pattern.
# Syntax: date_format('DateCol', 'dateformat')
# Common pattern letters:
# yyyy, yy, MMMM, MMM, MM, dd, hh, mm, ss

# Drop the time portion of df_trunc's start_of_day column by formatting it as yyyy-MM-dd.
day_only = date_format("start_of_day", "yyyy-MM-dd")
df_trunc = df_trunc.withColumn("start_of_day", day_only)
df_trunc.show()

+----+----------+----------+-------------------+-------------------+-------------------+-------------------+------------+
|S_no|start_date|  End_Date|      start_of_year|     start_of_month|      start_of_week|   start_of_quarter|start_of_day|
+----+----------+----------+-------------------+-------------------+-------------------+-------------------+------------+
|   1|2024-02-07|12/31/2024|2024-01-01 00:00:00|2024-02-01 00:00:00|2024-02-05 00:00:00|2024-01-01 00:00:00|  2024-02-07|
|   2|2023-02-08|11/30/2024|2023-01-01 00:00:00|2023-02-01 00:00:00|2023-02-06 00:00:00|2023-01-01 00:00:00|  2023-02-08|
|   3|2025-05-09|08/29/2025|2025-01-01 00:00:00|2025-05-01 00:00:00|2025-05-05 00:00:00|2025-04-01 00:00:00|  2025-05-09|
|   4|2025-02-10|10/30/2025|2025-01-01 00:00:00|2025-02-01 00:00:00|2025-02-10 00:00:00|2025-01-01 00:00:00|  2025-02-10|
|   5|2023-12-11|12/31/2023|2023-01-01 00:00:00|2023-12-01 00:00:00|2023-12-11 00:00:00|2023-10-01 00:00:00|  2023-12-11|
|   6|2022-01-01|12/31/2

In [0]:
# Abbreviated month name plus two-digit year, e.g. "Aug-25"
short_month_year = date_format("start_date", "MMM-yy")
df_fmt2 = df.withColumn("MM-YY format", short_month_year)
df_fmt2.show()
# Full month name plus two-digit year, e.g. "August-25"
full_month_year = date_format("start_date", "MMMM-yy")
df_fmt3 = df.withColumn("Month-YY format", full_month_year)
df_fmt3.show()

+----+----------+----------+------------+
|S_no|Start_Date|  End_Date|MM-YY format|
+----+----------+----------+------------+
|   1|2024-02-07|12/31/2024|      Feb-24|
|   2|2023-02-08|11/30/2024|      Feb-23|
|   3|2025-05-09|08/29/2025|      May-25|
|   4|2025-02-10|10/30/2025|      Feb-25|
|   5|2023-12-11|12/31/2023|      Dec-23|
|   6|2022-01-01|12/31/2022|      Jan-22|
|   7|2024-01-31|03/31/2024|      Jan-24|
+----+----------+----------+------------+

+----+----------+----------+---------------+
|S_no|Start_Date|  End_Date|Month-YY format|
+----+----------+----------+---------------+
|   1|2024-02-07|12/31/2024|    February-24|
|   2|2023-02-08|11/30/2024|    February-23|
|   3|2025-05-09|08/29/2025|         May-25|
|   4|2025-02-10|10/30/2025|    February-25|
|   5|2023-12-11|12/31/2023|    December-23|
|   6|2022-01-01|12/31/2022|     January-22|
|   7|2024-01-31|03/31/2024|     January-24|
+----+----------+----------+---------------+



In [0]:
# Pull the hour, minute and second out of a timestamp column.

stamp = "Current_timestamp"
df2 = df.withColumn(stamp, current_timestamp())
df3 = (
    df2.withColumn("hour", hour(stamp))
       .withColumn("minute", minute(stamp))
       .withColumn("second", second(stamp))
)
df3.show()

+----+----------+----------+--------------------+----+------+------+
|S_no|Start_Date|  End_Date|   Current_timestamp|hour|minute|second|
+----+----------+----------+--------------------+----+------+------+
|   1|2024-02-07|12/31/2024|2025-08-29 12:31:...|  12|    31|    47|
|   2|2023-02-08|11/30/2024|2025-08-29 12:31:...|  12|    31|    47|
|   3|2025-05-09|08/29/2025|2025-08-29 12:31:...|  12|    31|    47|
|   4|2025-02-10|10/30/2025|2025-08-29 12:31:...|  12|    31|    47|
|   5|2023-12-11|12/31/2023|2025-08-29 12:31:...|  12|    31|    47|
|   6|2022-01-01|12/31/2022|2025-08-29 12:31:...|  12|    31|    47|
|   7|2024-01-31|03/31/2024|2025-08-29 12:31:...|  12|    31|    47|
+----+----------+----------+--------------------+----+------+------+



In [0]:
# Hour / minute / second again, this time produced as strings through date_format().
df2 = df.withColumn("Current_timestamp", current_timestamp())

# Output column name -> date_format pattern.
clock_patterns = {"hour": "HH", "minute": "mm", "second": "ss"}

df4 = df2
for field_name, pattern in clock_patterns.items():
    df4 = df4.withColumn(field_name, date_format("Current_timestamp", pattern))
display(df4)

S_no,Start_Date,End_Date,Current_timestamp,hour,minute,second
1,2024-02-07,12/31/2024,2025-08-29T12:33:50.904001Z,12,33,50
2,2023-02-08,11/30/2024,2025-08-29T12:33:50.904001Z,12,33,50
3,2025-05-09,08/29/2025,2025-08-29T12:33:50.904001Z,12,33,50
4,2025-02-10,10/30/2025,2025-08-29T12:33:50.904001Z,12,33,50
5,2023-12-11,12/31/2023,2025-08-29T12:33:50.904001Z,12,33,50
6,2022-01-01,12/31/2022,2025-08-29T12:33:50.904001Z,12,33,50
7,2024-01-31,03/31/2024,2025-08-29T12:33:50.904001Z,12,33,50


In [0]:
# datediff(end, start) returns the difference between the two as a number of days.

df2 = df.withColumn("Current_Time_Stamp", current_timestamp())
days_elapsed = datediff("Current_Time_Stamp", "start_date")
df_diff = df2.withColumn("Diff_days", days_elapsed)
df_diff.show()

+----+----------+----------+--------------------+---------+
|S_no|Start_Date|  End_Date|  Current_Time_Stamp|Diff_days|
+----+----------+----------+--------------------+---------+
|   1|2024-02-07|12/31/2024|2025-08-29 12:37:...|      569|
|   2|2023-02-08|11/30/2024|2025-08-29 12:37:...|      933|
|   3|2025-05-09|08/29/2025|2025-08-29 12:37:...|      112|
|   4|2025-02-10|10/30/2025|2025-08-29 12:37:...|      200|
|   5|2023-12-11|12/31/2023|2025-08-29 12:37:...|      627|
|   6|2022-01-01|12/31/2022|2025-08-29 12:37:...|     1336|
|   7|2024-01-31|03/31/2024|2025-08-29 12:37:...|      576|
+----+----------+----------+--------------------+---------+



In [0]:
# Time elapsed between the start of today's date and the current time.

start_of_today = date_trunc("day", current_timestamp())
df2 = (
    df.withColumn("StartOfToday", start_of_today)
      .withColumn("Current_Time_Stamp", current_timestamp())
)
df2.show()

# Subtract the two unix_timestamp() values to get the elapsed time as one number.
elapsed = unix_timestamp("Current_Time_Stamp") - unix_timestamp("StartOfToday")
df_diff2 = df2.withColumn("Diff_time", elapsed)
df_diff2.show()


+----+----------+----------+-------------------+--------------------+
|S_no|Start_Date|  End_Date|       StartOfToday|  Current_Time_Stamp|
+----+----------+----------+-------------------+--------------------+
|   1|2024-02-07|12/31/2024|2025-08-29 00:00:00|2025-08-29 12:40:...|
|   2|2023-02-08|11/30/2024|2025-08-29 00:00:00|2025-08-29 12:40:...|
|   3|2025-05-09|08/29/2025|2025-08-29 00:00:00|2025-08-29 12:40:...|
|   4|2025-02-10|10/30/2025|2025-08-29 00:00:00|2025-08-29 12:40:...|
|   5|2023-12-11|12/31/2023|2025-08-29 00:00:00|2025-08-29 12:40:...|
|   6|2022-01-01|12/31/2022|2025-08-29 00:00:00|2025-08-29 12:40:...|
|   7|2024-01-31|03/31/2024|2025-08-29 00:00:00|2025-08-29 12:40:...|
+----+----------+----------+-------------------+--------------------+

+----+----------+----------+-------------------+--------------------+---------+
|S_no|Start_Date|  End_Date|       StartOfToday|  Current_Time_Stamp|Diff_time|
+----+----------+----------+-------------------+--------------------+

In [0]:
from datetime import datetime

# Build a list of rows, each holding a start date and an end date.
# NOTE(review): start and end are identical in every row, so any difference
# computed between them will be 0.
may_28 = datetime(2025, 5, 28, 12, 32)
aug_28 = datetime(2025, 8, 28, 12, 32)

date_list = [
    {"Start Date": moment, "End Date": moment}
    for moment in (may_28, aug_28)
]
print(date_list)

[{'Start Date': datetime.datetime(2025, 5, 28, 12, 32), 'End Date': datetime.datetime(2025, 5, 28, 12, 32)}, {'Start Date': datetime.datetime(2025, 8, 28, 12, 32), 'End Date': datetime.datetime(2025, 8, 28, 12, 32)}]


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp

# Get the Spark session (created if none exists yet)
spark = SparkSession.builder.appName("Datediffexample").getOrCreate()

# Seconds-based values for each end of the interval
end_seconds = unix_timestamp(col("End Date"))
start_seconds = unix_timestamp(col("Start Date"))

# Create the DataFrame from date_list and add the difference in seconds
# between end date and start date as the "Diff" column
date_df = spark.createDataFrame(date_list).withColumn("Diff", end_seconds - start_seconds)

date_df.show()

+-------------------+-------------------+----+
|           End Date|         Start Date|Diff|
+-------------------+-------------------+----+
|2025-05-28 12:32:00|2025-05-28 12:32:00|   0|
|2025-08-28 12:32:00|2025-08-28 12:32:00|   0|
+-------------------+-------------------+----+

