In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.window import Window


In [0]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("user_info")
    .getOrCreate()
)

# Sample event log: one (user_id, timestamp, action) tuple per event,
# where action is either "start" or "end".
schema = ["user_id", "timestamp", "action"]
data = [
    (1, "2022-01-01 10:00", "start"),
    (1, "2022-01-01 10:15", "end"),
    (2, "2022-01-01 11:00", "start"),
    (1, "2022-01-01 11:30", "start"),
    (2, "2022-01-01 11:45", "end"),
    (1, "2022-01-01 12:00", "end"),
]

# Column types are inferred from the tuples; "timestamp" stays a string here.
df = spark.createDataFrame(data, schema)
df.show()


+-------+----------------+------+
|user_id|       timestamp|action|
+-------+----------------+------+
|      1|2022-01-01 10:00| start|
|      1|2022-01-01 10:15|   end|
|      2|2022-01-01 11:00| start|
|      1|2022-01-01 11:30| start|
|      2|2022-01-01 11:45|   end|
|      1|2022-01-01 12:00|   end|
+-------+----------------+------+



In [0]:
# Parse the string "timestamp" column into epoch seconds so that durations
# can be computed with plain integer subtraction.
ts_pattern = "yyyy-MM-dd HH:mm"
df = df.withColumn(
    "unix_timestamp",
    unix_timestamp("timestamp", ts_pattern),
)
df.show()

+-------+----------------+------+--------------+
|user_id|       timestamp|action|unix_timestamp|
+-------+----------------+------+--------------+
|      1|2022-01-01 10:00| start|    1641031200|
|      1|2022-01-01 10:15|   end|    1641032100|
|      2|2022-01-01 11:00| start|    1641034800|
|      1|2022-01-01 11:30| start|    1641036600|
|      2|2022-01-01 11:45|   end|    1641037500|
|      1|2022-01-01 12:00|   end|    1641038400|
+-------+----------------+------+--------------+



<h3>Calculate session duration</h3>

In [0]:
# Session duration: for every "end" event, the number of seconds elapsed since
# the same user's immediately preceding event, provided that event was a
# "start". All other rows (starts, or an "end" with no matching "start"
# directly before it) get null.
#
# Column functions are referenced through the `F` namespace so the cell does
# not depend on names (`when`, `col`, `lag`) that were never imported.
win = Window.partitionBy("user_id").orderBy("unix_timestamp")

# Values taken from the previous row within the same user's ordered events.
prev_ts = F.lag("unix_timestamp").over(win)
prev_action = F.lag("action").over(win)

df = df.withColumn(
    "session_duration",
    # No .otherwise(): rows that do not satisfy the condition become null.
    F.when(
        (F.col("action") == "end") & (prev_action == "start"),
        F.col("unix_timestamp") - prev_ts,
    ),
)
df.show()

+-------+----------------+------+--------------+----------------+
|user_id|       timestamp|action|unix_timestamp|session_duration|
+-------+----------------+------+--------------+----------------+
|      1|2022-01-01 10:00| start|    1641031200|            null|
|      1|2022-01-01 10:15|   end|    1641032100|             900|
|      1|2022-01-01 11:30| start|    1641036600|            null|
|      1|2022-01-01 12:00|   end|    1641038400|            1800|
|      2|2022-01-01 11:00| start|    1641034800|            null|
|      2|2022-01-01 11:45|   end|    1641037500|            2700|
+-------+----------------+------+--------------+----------------+



<h3>Calculate total session duration per user</h3>

In [0]:
# `F.sum` is used explicitly: the bare name `sum` is Python's builtin unless it
# has been shadowed, and the builtin cannot aggregate a Spark column.

# Per-user total: with no ordering the window frame spans the whole partition,
# so every row of a user carries that user's overall total (nulls are ignored
# by the aggregate).
user_window = Window.partitionBy("user_id")
df.withColumn(
    "total_session_duration",
    F.sum("session_duration").over(user_window),
).show()

# Same aggregate over an *ordered* window, shown for comparison: once an
# ordering is defined the default frame grows from the partition start to the
# current row, so this yields a running (cumulative) total per user rather
# than a single constant total.
running_window = Window.partitionBy("user_id").orderBy("timestamp")
df.withColumn(
    "total_session_duration",
    F.sum("session_duration").over(running_window),
).show()

+-------+----------------+------+--------------+----------------+----------------------+
|user_id|       timestamp|action|unix_timestamp|session_duration|total_session_duration|
+-------+----------------+------+--------------+----------------+----------------------+
|      1|2022-01-01 10:00| start|    1641031200|            null|                  2700|
|      1|2022-01-01 10:15|   end|    1641032100|             900|                  2700|
|      1|2022-01-01 11:30| start|    1641036600|            null|                  2700|
|      1|2022-01-01 12:00|   end|    1641038400|            1800|                  2700|
|      2|2022-01-01 11:00| start|    1641034800|            null|                  2700|
|      2|2022-01-01 11:45|   end|    1641037500|            2700|                  2700|
+-------+----------------+------+--------------+----------------+----------------------+

+-------+----------------+------+--------------+----------------+----------------------+
|user_id|       time