In [0]:
from pyspark.sql.functions import unix_timestamp,lag,when,col,expr,row_number,lit,concat,lit,sum
from pyspark.sql.window import Window
from pyspark.sql.types import StructType,StructField,StringType,TimestampType
from datetime import datetime

In [0]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("user_info").getOrCreate()


data = [
    ("user1", "2023-08-26 10:10:00"),
    ("user1", "2023-08-26 10:10:25"),
    ("user2", "2023-08-26 12:00:00"),
    ("user2", "2023-08-26 12:10:00"),
    ("user1", "2023-08-26 14:30:00"),
    ("user1", "2023-08-26 16:00:00"),
    ("user2", "2023-08-26 16:30:00"),
    ("user1", "2023-08-26 18:00:00"),
]
schema = StructType([
    StructField("userid", StringType(), nullable=False),
    StructField("timestamp", TimestampType(), nullable=False),])

rows=[(row[0],datetime.strptime(row[1],"%Y-%m-%d %H:%M:%S")) for row in data]

df=spark.createDataFrame(rows,schema=schema)
df.show()


+------+-------------------+
|userid|          timestamp|
+------+-------------------+
| user1|2023-08-26 10:10:00|
| user1|2023-08-26 10:10:25|
| user2|2023-08-26 12:00:00|
| user2|2023-08-26 12:10:00|
| user1|2023-08-26 14:30:00|
| user1|2023-08-26 16:00:00|
| user2|2023-08-26 16:30:00|
| user1|2023-08-26 18:00:00|
+------+-------------------+



In [0]:
win=Window().partitionBy("userid").orderBy("timestamp")
df=df.withColumn("session_gap",unix_timestamp("timestamp")-unix_timestamp(lag("timestamp").over(win)))

#df.show()
df=df.withColumn("sessionid",when(col("session_gap")>30,None).\
    otherwise(concat(col("userid"),lit("_session"),row_number().\
        over(Window.partitionBy("userid").orderBy("timestamp")).cast("string"))))
df.show()

+------+-------------------+-----------+--------------+
|userid|          timestamp|session_gap|     sessionid|
+------+-------------------+-----------+--------------+
| user1|2023-08-26 10:10:00|       null|user1_session1|
| user1|2023-08-26 10:10:25|         25|user1_session2|
| user1|2023-08-26 14:30:00|      15575|          null|
| user1|2023-08-26 16:00:00|       5400|          null|
| user1|2023-08-26 18:00:00|       7200|          null|
| user2|2023-08-26 12:00:00|       null|user2_session1|
| user2|2023-08-26 12:10:00|        600|          null|
| user2|2023-08-26 16:30:00|      15600|          null|
+------+-------------------+-----------+--------------+



<h3>𝐂𝐚𝐥𝐜𝐮𝐥𝐚𝐭𝐞 𝐭𝐨𝐭𝐚𝐥 𝐭𝐢𝐦𝐞 𝐬𝐩𝐞𝐧𝐭 𝐩𝐞𝐫 𝐬𝐞𝐬𝐬𝐢𝐨𝐧</h3>

In [0]:
df=df.withColumn("session_start",when(col("session_gap")>30,None).otherwise(lag("timestamp")\
    .over(Window.partitionBy("userid").orderBy("timestamp"))))
df=df.withColumn("session_duration",when(col("session_start").isNotNull(),col("timestamp").cast("long")-col("session_start").cast("long")).otherwise(0))
df.show()

+------+-------------------+-----------+--------------+-------------------+----------------+
|userid|          timestamp|session_gap|     sessionid|      session_start|session_duration|
+------+-------------------+-----------+--------------+-------------------+----------------+
| user1|2023-08-26 10:10:00|       null|user1_session1|               null|               0|
| user1|2023-08-26 10:10:25|         25|user1_session2|2023-08-26 10:10:00|              25|
| user1|2023-08-26 14:30:00|      15575|          null|               null|               0|
| user1|2023-08-26 16:00:00|       5400|          null|               null|               0|
| user1|2023-08-26 18:00:00|       7200|          null|               null|               0|
| user2|2023-08-26 12:00:00|       null|user2_session1|               null|               0|
| user2|2023-08-26 12:10:00|        600|          null|               null|               0|
| user2|2023-08-26 16:30:00|      15600|          null|               

<h3>𝐂𝐚𝐥𝐜𝐮𝐥𝐚𝐭𝐞 𝐭𝐨𝐭𝐚𝐥 𝐬𝐞𝐬𝐬𝐢𝐨𝐧 𝐭𝐢𝐦𝐞 𝐩𝐞𝐫 𝐮𝐬𝐞𝐫.</h3>

In [0]:
df.withColumn("total_user_session_time",sum("session_duration").over(Window.partitionBy("userid"))).show()
df.withColumn("total_user_session_time",sum("session_duration").over(Window.partitionBy("userid").orderBy("timestamp"))).show()

+------+-------------------+-----------+--------------+-------------------+----------------+-----------------------+
|userid|          timestamp|session_gap|     sessionid|      session_start|session_duration|total_user_session_time|
+------+-------------------+-----------+--------------+-------------------+----------------+-----------------------+
| user1|2023-08-26 10:10:00|       null|user1_session1|               null|               0|                     25|
| user1|2023-08-26 10:10:25|         25|user1_session2|2023-08-26 10:10:00|              25|                     25|
| user1|2023-08-26 14:30:00|      15575|          null|               null|               0|                     25|
| user1|2023-08-26 16:00:00|       5400|          null|               null|               0|                     25|
| user1|2023-08-26 18:00:00|       7200|          null|               null|               0|                     25|
| user2|2023-08-26 12:00:00|       null|user2_session1|         