<h3> Data Science SQL Interview Question | Recommendation System | Complex SQL 13 </h3>

In [0]:
%sql
drop table if exists orders;
create table orders
(
order_id int,
customer_id int,
product_id int
);

insert into orders VALUES 
(1, 1, 1),
(1, 1, 2),
(1, 1, 3),
(2, 2, 1),
(2, 2, 2),
(2, 2, 4),
(3, 1, 5);

drop table if exists products;
create table products (
id int,
name varchar(10)
);
insert into products VALUES 
(1, 'A'),
(2, 'B'),
(3, 'C'),
(4, 'D'),
(5, 'E');




num_affected_rows,num_inserted_rows
5,5


In [0]:
%sql
select * from orders

order_id,customer_id,product_id
1,1,1
1,1,2
1,1,3
2,2,1
2,2,2
2,2,4
3,1,5


In [0]:
%sql
select * from products

id,name
1,A
2,B
3,C
4,D
5,E


<h3> Solution in sql(hive) </h3>

In [0]:
%sql 
select p1.name||' '||p2.name as pair,count(*) as purchase_freq from orders o1
inner join orders o2 on o1.order_id=o2.order_id
inner join products p1 on p1.id=o1.product_id
inner join  products p2 on p2.id=o2.product_id
where o1.product_id<o2.product_id
group by p1.name,p2.name
order by p1.name||' '||p2.name  asc

pair,purchase_freq
A B,2
A C,1
A D,1
B C,1
B D,1


<h3> Solution in pyspark </h3>

In [0]:
from pyspark.sql.functions import col,concat


In [0]:
df_o=spark.sql("select * from orders")
df_p=spark.sql("select * from products")
df_o.show()
df_p.show()

+--------+-----------+----------+
|order_id|customer_id|product_id|
+--------+-----------+----------+
|       1|          1|         1|
|       1|          1|         2|
|       1|          1|         3|
|       2|          2|         1|
|       2|          2|         2|
|       2|          2|         4|
|       3|          1|         5|
+--------+-----------+----------+

+---+----+
| id|name|
+---+----+
|  1|   A|
|  2|   B|
|  3|   C|
|  4|   D|
|  5|   E|
+---+----+



In [0]:
df1=df_o.join(df_p,df_p["id"]==df_o["product_id"],"inner").\
    select("order_id",col("product_id").alias("p1"),col("name").alias("n1"))
df2=df_o.join(df_p,df_p["id"]==df_o["product_id"],"inner").\
    select("order_id",col("product_id").alias("p2"),col("name").alias("n2"))
df3=df1.join(df2,"order_id","inner").filter(col("p1")<col("p2")).withColumn("pair",concat(col("n1"),col("n2"))).\
    groupBy("pair").count()
df3=df3.withColumnRenamed("count","purchase_freq").orderBy("pair")
df3.show()



+----+-------------+
|pair|purchase_freq|
+----+-------------+
|  AB|            2|
|  AC|            1|
|  AD|            1|
|  BC|            1|
|  BD|            1|
+----+-------------+

