Customer retention refers to the ability of a company or product to retain its
customers over a specified period. High customer retention means that customers of
the product or business tend to return and keep buying, rather than defecting
to another product or business or stopping use entirely.
Examples of company programs to retain customers: Zomato Pro, cashbacks, reward programs, etc.


In [1]:
# Import and create SparkSession
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#from pyspark.sql.types import StructField,IntegerType, StringType , DateType ,StructType, TimestampType
from pyspark.sql.functions import *
#from pyspark.sql.functions import count,sum,col, to_date
from pyspark.sql import Window

In [2]:
# Set configuration
my_conf = SparkConf()
my_conf.set("spark.app.name", "My application 1")
my_conf.set("spark.master", "local[*]")
# spark.logConf only controls whether the effective SparkConf is written to
# the log (at INFO level) when the SparkContext starts; it is not a log level.
my_conf.set("spark.logConf", "false")

# Create spark session
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()

# "spark.driver.log.level" is not among Spark's documented configuration
# properties, so setting it on the conf has no effect. Lower the log verbosity
# through the context API instead.
spark.sparkContext.setLogLevel("ERROR")


In [3]:
# Load the orders file as CSV, treating the first line as the header.
# No schema is supplied and inference is not enabled, so every column
# arrives as a string.
df = (
    spark.read
    .option("header", True)
    .csv(r"C:\Users\ajith\Practice Python\Pyspark_coding\Dataset\1.txt")
)

In [4]:
# Preview the raw orders (show()'s defaults spelled out: first 20 rows, long values truncated).
df.show(n=20, truncate=True)

+--------+-------+----------+------+
|order_id|cust_id|order_date|amount|
+--------+-------+----------+------+
|       1|      1|15-01-2020|   150|
|       2|      1|10-02-2020|   150|
|       3|      2|16-01-2020|   150|
|       4|      2|25-02-2020|   150|
|       5|      3|10-01-2020|   150|
|       6|      3|20-02-2020|   150|
|       7|      4|20-01-2020|   150|
|       8|      5|20-02-2020|   150|
+--------+-------+----------+------+



In [5]:
# Replace the string order_date (day-month-year text) with a real date column.
df1 = df.withColumn(
    "order_date",
    to_date(col("order_date"), "dd-MM-yyyy"),
)

In [6]:
# Check the converted rows (show()'s defaults spelled out), then confirm
# that order_date is now typed as a date.
df1.show(n=20, truncate=True)
df1.printSchema()

+--------+-------+----------+------+
|order_id|cust_id|order_date|amount|
+--------+-------+----------+------+
|       1|      1|2020-01-15|   150|
|       2|      1|2020-02-10|   150|
|       3|      2|2020-01-16|   150|
|       4|      2|2020-02-25|   150|
|       5|      3|2020-01-10|   150|
|       6|      3|2020-02-20|   150|
|       7|      4|2020-01-20|   150|
|       8|      5|2020-02-20|   150|
+--------+-------+----------+------+

root
 |-- order_id: string (nullable = true)
 |-- cust_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- amount: string (nullable = true)



In [7]:
# Running order count per customer, oldest order first: within a customer,
# the n-th order (by order_date) gets the value n.
windowSpec = (
    Window.partitionBy("cust_id")
    .orderBy("order_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# No global sort here: the monthly aggregation below does not depend on row
# order, and order_id is a string column, so sorting on it would be
# lexicographic anyway.
df2 = df1.withColumn("retention_map", count("order_id").over(windowSpec))

# 1 when the customer already placed an earlier order (a retained customer's
# repeat order), 0 for the customer's first order.
df3 = df2.withColumn(
    "retention_count_flag",
    when(col("retention_map") > 1, 1).otherwise(0),
)

# Number of repeat orders per calendar month. `sum` is the Spark aggregate
# brought in by the pyspark.sql.functions star import, not the Python builtin.
# NOTE: the grouping key is the month number only, so the same month of
# different years would be merged; the sample data covers a single year.
df4 = df3.groupBy(month(col("order_date"))).agg(
    sum("retention_count_flag").alias("retention_count")
)

df4.show()

+-----------------+---------------+
|month(order_date)|retention_count|
+-----------------+---------------+
|                1|              0|
|                2|              3|
+-----------------+---------------+



In [8]:
# Count of each customer's orders from the current one onwards (by order_date):
# a value of 1 means the current order is that customer's last one.
windowSpec_churn = (
    Window.partitionBy("cust_id")
    .orderBy("order_date")
    .rowsBetween(start=Window.currentRow, end=Window.unboundedFollowing)
)

# Attach the remaining-orders count to every order row.
df4 = df1.withColumn(
    "churn_map",
    count("order_id").over(windowSpec_churn),
)

# Flag is 0 while a later order exists, 1 on the customer's final order.
df5 = df4.withColumn(
    "churn_count_flag",
    when(col("churn_map") > 1, 0).otherwise(1),
)

# Total the final-order flags per month number of the order date.
# `sum` is the Spark aggregate from the pyspark.sql.functions star import.
df6 = df5.groupBy(month(col("order_date"))).agg(
    sum("churn_count_flag").alias("churn_count_flag")
)

df6.show()

+-----------------+----------------+
|month(order_date)|churn_count_flag|
+-----------------+----------------+
|                1|               1|
|                2|               4|
+-----------------+----------------+

