<h3> Customer Retention and Churn Analysis (Part 2/2) | SQL Interview Question Product </h3>

In [0]:
%sql
-- Sample orders used by the retention/churn queries below.
-- "if not exists" lets this cell be re-run when the table is already present;
-- the delete that follows empties it so the sample rows are not duplicated.
create table if not exists transactions(
order_id int,
cust_id int,
order_date date,
amount int
);
delete from transactions;
insert into transactions values 
(1,1,'2020-01-15',150)
,(2,1,'2020-02-10',150)
,(3,2,'2020-01-16',150)
,(4,2,'2020-02-25',150)
,(5,3,'2020-01-10',150)
,(6,3,'2020-02-20',150)
,(7,4,'2020-01-20',150)
,(8,5,'2020-02-20',150)
;

num_affected_rows,num_inserted_rows
8,8


In [0]:
%sql
-- Display every loaded row, columns listed in table-definition order.
select order_id, cust_id, order_date, amount
from transactions

order_id,cust_id,order_date,amount
1,1,2020-01-15,150
2,1,2020-02-10,150
3,2,2020-01-16,150
4,2,2020-02-25,150
5,3,2020-01-10,150
6,3,2020-02-20,150
7,4,2020-01-20,150
8,5,2020-02-20,150


<h3> Solution in sql(hive) </h3>

In [0]:
%sql 
-- Lost customers per month: customers with an order in a month (t1) and no
-- order in the following calendar month (t2 finds no match in the left join).
-- year*12+month increases by exactly 1 per calendar month, so December -> January
-- counts as consecutive and the same month number in another year does not match.
-- The last month in the data reports all of its customers, since no later orders exist.
select month(t1.order_date) month_no,count(distinct t1.cust_id) as no_of_lost_cust
from transactions t1
left join transactions t2 on t1.cust_id=t2.cust_id
  and year(t2.order_date)*12+month(t2.order_date) = year(t1.order_date)*12+month(t1.order_date)+1
where t2.cust_id is null
-- Group by year as well so the same month number in different years is not merged.
group by year(t1.order_date),month(t1.order_date)
order by year(t1.order_date),month(t1.order_date)

month_no,no_of_lost_cust
1,1
2,4


<h3> Solution in PySpark </h3>

In [0]:
from pyspark.sql.functions import col,count,countDistinct,isnull,month,sum,when,year


In [0]:
# Load the transactions table into a DataFrame and preview its rows.
df_t = spark.table("transactions")
df_t.show()

+--------+-------+----------+------+
|order_id|cust_id|order_date|amount|
+--------+-------+----------+------+
|       1|      1|2020-01-15|   150|
|       2|      1|2020-02-10|   150|
|       3|      2|2020-01-16|   150|
|       4|      2|2020-02-25|   150|
|       5|      3|2020-01-10|   150|
|       6|      3|2020-02-20|   150|
|       7|      4|2020-01-20|   150|
|       8|      5|2020-02-20|   150|
+--------+-------+----------+------+



In [0]:
# Lost customers per month: customers who ordered in a month but have no order
# in the following calendar month.
# month_idx = year * 12 + month increases by exactly 1 per calendar month, so
# "next month" is still month_idx + 1 when moving from December to January.
df_t = (
    df_t
    .withColumn("month_no", month(col("order_date")))
    .withColumn("month_idx", year(col("order_date")) * 12 + month(col("order_date")))
)
# Renamed (customer, month index) copy used as the "next month" side of the self-join.
df_t2 = df_t.select(df_t["cust_id"].alias("ci"), df_t["month_idx"].alias("mn"))
df_final = (
    df_t.join(df_t2, (df_t2.ci == df_t.cust_id) & ((df_t2.mn - df_t.month_idx) == 1), "left")
    .groupBy("month_idx", "month_no")
    .agg(
        # cnt: distinct customers that have an order in the next month. when()
        # without otherwise() yields null for unmatched rows, and countDistinct
        # ignores nulls, so repeated orders are not counted more than once.
        countDistinct(when(col("ci").isNotNull(), col("cust_id"))).alias("cnt"),
        # as_tot: distinct customers that ordered in the month.
        countDistinct(col("cust_id")).alias("as_tot"),
    )
    .orderBy("month_idx")
)
df_final.select("month_no", (col("as_tot") - col("cnt")).alias("no_of_lost_cust")).show()

+--------+---------------+
|month_no|no_of_lost_cust|
+--------+---------------+
|       1|              1|
|       2|              4|
+--------+---------------+

