In [1]:
from os import environ
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.appName("user").getOrCreate()

In [143]:
file_path = "file:///"+environ['DATA_LAKE']
user_df = spark.createDataFrame([
  ("blue", "view", 1610494094750, 11),
  ("green", "add to bag", 1510593114350, 21),
  ("red", "close", 1610493115350, 41),
  ("blue", "view", 1610494094350, 11),
  ("blue", "close", 1510593114312, 21),
  ("red", "view", 1610493114350, 41),
  ("red", "view", 1610593114350, 41),
  ("green", "purchase", 1610494094350, 31)
], ["item", "event", "timestamp", "userId"])

windowSpec = Window.partitionBy("userId").orderBy("time_stamp")

user_df = user_df \
            .withColumn("time_stamp", f.from_unixtime(user_df.timestamp).alias("time_stamp")) \
            .drop(user_df.timestamp) \
            .withColumn("time_stamp_logout", f.lead(f.col("time_stamp")).over(windowSpec)) \
            .orderBy(f.col("userId").asc(), f.col("time_stamp").asc()) \
            .filter(f.col("time_stamp_logout").isNotNull() ) \
            .withColumn("time_diff", f.datediff(f.col("time_stamp_logout"), f.col("time_stamp")))

In [144]:
user_df.printSchema()

root
 |-- item: string (nullable = true)
 |-- event: string (nullable = true)
 |-- userId: long (nullable = true)
 |-- time_stamp: string (nullable = true)
 |-- time_stamp_logout: string (nullable = true)
 |-- time_diff: integer (nullable = true)



In [145]:
user_df.show()

+----+-----+------+--------------------+--------------------+---------+
|item|event|userId|          time_stamp|   time_stamp_logout|time_diff|
+----+-----+------+--------------------+--------------------+---------+
|blue| view|    11|53004-07-11 22:39:10|53004-07-11 22:45:50|     null|
|blue|close|    21|49838-10-16 14:25:12|49838-10-16 14:25:50|     null|
| red| view|    41|53004-06-30 14:25:50|53004-06-30 14:42:30|     null|
| red|close|    41|53004-06-30 14:42:30|53007-09-01 00:12:30|     null|
+----+-----+------+--------------------+--------------------+---------+

