In [0]:
from pyspark.sql.functions import sum,desc,avg,count,max,col,lag
from pyspark.sql.window import Window

In [0]:
# Sample activity log: one (user_id, action, timestamp) tuple per event.
# The timestamp is supplied as a plain string here.
columns = ["user_id", "action", "timestamp"]
data = [
    (1, "login", "2023-08-20 10:23:45"),
    (2, "view", "2023-08-20 11:15:30"),
    (1, "purchase", "2023-08-20 12:45:18"),
    (3, "view", "2023-08-20 13:30:22"),
]

# NOTE(review): `spark` is not defined in this notebook; presumably the session
# object injected by the notebook runtime -- confirm.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+--------+-------------------+
|user_id|  action|          timestamp|
+-------+--------+-------------------+
|      1|   login|2023-08-20 10:23:45|
|      2|    view|2023-08-20 11:15:30|
|      1|purchase|2023-08-20 12:45:18|
|      3|    view|2023-08-20 13:30:22|
+-------+--------+-------------------+



<h3>Count the number of occurrences of each action.</h3>

In [0]:
# One output row per distinct action, with the number of events recorded for it.
(
    df.groupBy("action")
    .count()
    .show()
)

+--------+-----+
|  action|count|
+--------+-----+
|   login|    1|
|    view|    2|
|purchase|    1|
+--------+-----+



<h3>What are the unique actions (those recorded exactly once) in the dataset?</h3>

In [0]:
# Tally events per action, then keep only the actions whose tally is exactly 1.
# Note: this lists actions that occur once, not every distinct action value.
(
    df.groupBy("action")
    .agg(count("*").alias("cnt"))
    .where(col("cnt") == 1)
    .select("action")
    .show()
)

+--------+
|  action|
+--------+
|   login|
|purchase|
+--------+



<h3>Calculate the time duration (in seconds) between consecutive activities for each user</h3>

In [0]:
# Convert the string column to a real timestamp so it can be ordered and subtracted.
df = df.withColumn("timestamp", col("timestamp").cast("timestamp"))

# Within each user's events, sorted chronologically, lag() returns the previous
# event's timestamp (null for that user's first event).
per_user_chronological = Window.partitionBy("user_id").orderBy("timestamp")
df1 = df.withColumn("lag_time", lag("timestamp").over(per_user_chronological))

# Casting a timestamp to long yields epoch seconds, so the subtraction gives the
# gap between consecutive events in seconds.
df1.withColumn(
    "time_diff",
    col("timestamp").cast("long") - col("lag_time").cast("long"),
).show()

+-------+--------+-------------------+-------------------+---------+
|user_id|  action|          timestamp|           lag_time|time_diff|
+-------+--------+-------------------+-------------------+---------+
|      1|   login|2023-08-20 10:23:45|               null|     null|
|      1|purchase|2023-08-20 12:45:18|2023-08-20 10:23:45|     8493|
|      2|    view|2023-08-20 11:15:30|               null|     null|
|      3|    view|2023-08-20 13:30:22|               null|     null|
+-------+--------+-------------------+-------------------+---------+

