<h3>Complex SQL 6 | Scenario based on join, group by and having clauses | SQL Interview Question </h3>

In [0]:
%sql
-- Recreate the person table on every run so the cell can be re-executed.
DROP TABLE IF EXISTS person;
CREATE TABLE person (
    PersonID INT,
    Name     STRING,
    Email    STRING,
    Score    INT
);
INSERT INTO TABLE person
VALUES
    (1, 'Alice', 'alice2018@hotmail.com', 88),
    (2, 'Bob', 'bob2018@hotmail.com', 11),
    (3, 'Davis', 'davis2018@hotmail.com', 27),
    (4, 'Tara', 'tara2018@hotmail.com', 45),
    (5, 'John', 'john2018@hotmail.com', 63);

-- Recreate the friend table. Each row pairs a PersonID with one FriendID,
-- and both ids refer to rows of person.
DROP TABLE IF EXISTS friend;
CREATE TABLE friend (
    PersonID INT,
    FriendID INT
);
INSERT INTO TABLE friend
VALUES
    (1, 2),
    (1, 3),
    (2, 1),
    (2, 3),
    (3, 5),
    (4, 2),
    (4, 3),
    (4, 5);



num_affected_rows,num_inserted_rows
8,8


In [0]:
%sql
-- Preview the person rows loaded by the setup cell.
SELECT *
FROM person;

PersonID,Name,Email,Score
1,Alice,alice2018@hotmail.com,88
2,Bob,bob2018@hotmail.com,11
3,Davis,davis2018@hotmail.com,27
4,Tara,tara2018@hotmail.com,45
5,John,john2018@hotmail.com,63


In [0]:
%sql
-- Preview the (PersonID, FriendID) pairs loaded by the setup cell.
SELECT *
FROM friend;

PersonID,FriendID
1,2
1,3
2,1
2,3
3,5
4,2
4,3
4,5


<h3>Solution in Hive SQL (people whose friends' total score is greater than 100)</h3>

In [0]:
%sql
-- For every person, add up the scores of their friends and count those
-- friends, then report only the people whose friends' combined score is
-- above 100, together with the person's own name.
WITH friend_totals AS (
    SELECT
        f.PersonID,
        SUM(p.Score) AS total_score,
        COUNT(1)     AS total_friend
    FROM friend f
    INNER JOIN person p
        ON p.PersonID = f.FriendID   -- p is the friend, so p.Score is the friend's score
    GROUP BY f.PersonID
    HAVING SUM(p.Score) > 100
)
SELECT
    p1.personid,
    ft.total_score  AS total_friend_score,
    ft.total_friend AS no_of_friend,
    p1.name         AS person_name
FROM person p1
INNER JOIN friend_totals ft
    ON p1.PersonID = ft.PersonID
ORDER BY p1.personid

personid,total_friend_score,no_of_friend,person_name
2,115,2,Bob
4,101,3,Tara


<h3>Solution in PySpark</h3>

In [0]:
from pyspark.sql.functions import sum,count,col

In [0]:
# Load the person table into a DataFrame and print it without truncating
# long cell values (the e-mail column would otherwise be cut off).
df_person = spark.sql("select * from person")
df_person.show(truncate=False)

# Load the friend table and print it with the default formatting.
df_friend = spark.sql("select * from friend")
df_friend.show()

+--------+-----+---------------------+-----+
|PersonID|Name |Email                |Score|
+--------+-----+---------------------+-----+
|1       |Alice|alice2018@hotmail.com|88   |
|2       |Bob  |bob2018@hotmail.com  |11   |
|3       |Davis|davis2018@hotmail.com|27   |
|4       |Tara |tara2018@hotmail.com |45   |
|5       |John |john2018@hotmail.com |63   |
+--------+-----+---------------------+-----+

+--------+--------+
|PersonID|FriendID|
+--------+--------+
|       1|       2|
|       1|       3|
|       2|       1|
|       2|       3|
|       3|       5|
|       4|       2|
|       4|       3|
|       4|       5|
+--------+--------+



In [0]:
# Step 1: match every friendship row to the friend's person record, then
# aggregate per person: total of the friends' scores and number of friends.
# Only people whose friends' combined score exceeds 100 are kept.
# Note: `sum` and `count` here are the pyspark.sql.functions versions imported
# earlier in the notebook, not the Python built-ins.
df1 = (
    df_friend.alias("friend")
    .join(
        df_person.alias("person"),
        col("person.PersonID") == col("friend.FriendID"),
        "inner",
    )
    .groupBy(col("friend.PersonID"))
    .agg(
        sum(col("person.score")).alias("total_friend_score"),
        count(col("person.PersonID")).alias("no_of_friend"),
    )
    .filter(col("total_friend_score") > 100)
)

# Step 2: join back to person on PersonID to pick up each qualifying person's
# name, and expose it under the same column name the SQL solution uses.
df_2 = (
    df1.join(df_person, on="PersonID", how="inner")
    .select("PersonID", "total_friend_score", "no_of_friend", "Name")
    .withColumnRenamed("Name", "person_name")
)

df_2.orderBy("personid").show()


+--------+------------------+------------+-----------+
|PersonID|total_friend_score|no_of_friend|person_name|
+--------+------------------+------------+-----------+
|       2|               115|           2|        Bob|
|       4|               101|           3|       Tara|
+--------+------------------+------------+-----------+

