In [1]:
from pyspark.sql import *
# Obtain the notebook's Spark entry point: reuses an already-running session
# when there is one, otherwise creates a new session with default settings.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [2]:
## Reference (LinkedIn post this notebook relates to): https://www.linkedin.com/feed/update/urn:li:activity:7124450661517131776?updateEntityUrn=urn%3Ali%3Afs_feedUpdate%3A%28V2%2Curn%3Ali%3Aactivity%3A7124450661517131776%29

In [3]:
# Echo the session object as the cell's output to confirm it was created.
spark

In [4]:
from pyspark.sql.functions import *

In [5]:
# Load the raw sales file as CSV, treating its first line as the column names.
# No schema is supplied here; the columns are cast to their intended types in
# the next step.
sales_path = "/user/itv007180/sales.txt"
sales_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(sales_path)
)

In [6]:
# Target type for each column we keep, in output order. Only these four
# columns are selected; anything else in the file is dropped.
column_types = {
    "customer_id": "int",
    "order_id": "int",
    "order_date": "string",
    "total_order_value": "int",
}
updated_sales_df = sales_df.select(
    *[col(name).cast(dtype) for name, dtype in column_types.items()]
)

In [7]:
# Verify that the casts above produced the expected column types.
updated_sales_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- total_order_value: integer (nullable = true)



In [8]:
# Total order value per customer.
# The DataFrame is assigned first and displayed in a separate statement:
# `.show()` only prints and returns None, so chaining it onto the assignment
# would bind `customer_lifetime_value` to None instead of the result.
customer_lifetime_value = (
    updated_sales_df
    .groupBy("customer_id")
    .sum("total_order_value")
    .withColumnRenamed("sum(total_order_value)", "total_value")
)
customer_lifetime_value.show()

+-----------+-----------+
|customer_id|total_value|
+-----------+-----------+
|          1|        630|
|          3|        590|
|          4|        330|
|          2|        765|
+-----------+-----------+



In [9]:
# Average order value per customer.
# Keep the aggregated DataFrame in the variable and print it separately;
# `.show()` returns None, so it must not be the last call in the assignment.
customer_avg_value = (
    updated_sales_df
    .groupBy("customer_id")
    .agg(avg("total_order_value"))
)
customer_avg_value.show()

+-----------+----------------------+
|customer_id|avg(total_order_value)|
+-----------+----------------------+
|          1|                  90.0|
|          3|    196.66666666666666|
|          4|                 110.0|
|          2|    109.28571428571429|
+-----------+----------------------+



In [10]:
# Preview the first two typed rows before adding the window-based columns.
updated_sales_df.show(2)

+-----------+--------+----------+-----------------+
|customer_id|order_id|order_date|total_order_value|
+-----------+--------+----------+-----------------+
|          1|     101|2023-01-05|              100|
|          2|     102|2023-02-10|              150|
+-----------+--------+----------+-----------------+
only showing top 2 rows



In [11]:
from pyspark.sql.window import Window

In [12]:
# One window per customer, rows sorted by order date.
# NOTE(review): order_date is a string column, so the sort is lexicographic;
# that matches chronological order only for zero-padded yyyy-MM-dd values
# like the ones shown in the preview -- confirm for the full file.
mywindow = (
    Window
    .partitionBy("customer_id")
    .orderBy("order_date")
)

In [13]:
# For every order, look up the same customer's next order date (the
# following row inside the window). A customer's last order has no
# successor, so its lead_date is null.
next_order_date = lead("order_date").over(mywindow)
windowed_df = updated_sales_df.withColumn("lead_date", next_order_date)

In [14]:
# Days from each order to the customer's next order (lead_date - order_date).
# Rows whose lead_date is null produce a null date_diff.
days_to_next_order = datediff("lead_date", "order_date")
dated_df = windowed_df.withColumn("date_diff", days_to_next_order)

In [15]:
# Average number of days between consecutive orders, per customer.
# Null date_diff values (each customer's last order) are skipped by avg().
# The DataFrame is assigned before being displayed because `.show()` returns
# None; chaining it onto the assignment would leave `purchase_freq` as None.
purchase_freq = (
    dated_df
    .groupBy("customer_id")
    .agg(avg("date_diff"))
    .withColumnRenamed("avg(date_diff)", "diff")
)
purchase_freq.show()

+-----------+-----------------+
|customer_id|             diff|
+-----------+-----------------+
|          1|90.33333333333333|
|          3|            183.0|
|          4|            183.0|
|          2|91.16666666666667|
+-----------+-----------------+

