<h3> Solving 4 Tricky SQL Problems </h3>

In [0]:
%sql
-- Rebuild the demo table from scratch so this cell can be re-run.
DROP TABLE IF EXISTS students;

CREATE TABLE students (
  studentid   INT,
  studentname STRING,
  subject     STRING,
  marks       INT,
  testid      INT,
  testdate    DATE
);

-- Sample test results, one INSERT statement per row.
INSERT INTO students VALUES (2, 'Max Ruin',   'Subject1', 63, 1, '2022-01-02');
INSERT INTO students VALUES (3, 'Arnold',     'Subject1', 95, 1, '2022-01-02');
INSERT INTO students VALUES (4, 'Krish Star', 'Subject1', 61, 1, '2022-01-02');
INSERT INTO students VALUES (5, 'John Mike',  'Subject1', 91, 1, '2022-01-02');
INSERT INTO students VALUES (4, 'Krish Star', 'Subject2', 71, 1, '2022-01-02');
INSERT INTO students VALUES (3, 'Arnold',     'Subject2', 32, 1, '2022-01-02');
INSERT INTO students VALUES (5, 'John Mike',  'Subject2', 61, 2, '2022-11-02');
INSERT INTO students VALUES (1, 'John Deo',   'Subject2', 60, 1, '2022-01-02');
INSERT INTO students VALUES (2, 'Max Ruin',   'Subject2', 84, 1, '2022-01-02');
INSERT INTO students VALUES (2, 'Max Ruin',   'Subject3', 29, 3, '2022-01-03');
INSERT INTO students VALUES (5, 'John Mike',  'Subject3', 98, 2, '2022-11-02');


num_affected_rows,num_inserted_rows
1,1


In [0]:
%sql
-- Show every row of the demo table (columns listed in their declared order).
SELECT
  studentid,
  studentname,
  subject,
  marks,
  testid,
  testdate
FROM students

studentid,studentname,subject,marks,testid,testdate
4,Krish Star,Subject2,71,1,2022-01-02
4,Krish Star,Subject1,61,1,2022-01-02
5,John Mike,Subject2,61,2,2022-11-02
5,John Mike,Subject3,98,2,2022-11-02
5,John Mike,Subject1,91,1,2022-01-02
2,Max Ruin,Subject2,84,1,2022-01-02
2,Max Ruin,Subject1,63,1,2022-01-02
1,John Deo,Subject2,60,1,2022-01-02
2,Max Ruin,Subject3,29,3,2022-01-03
3,Arnold,Subject2,32,1,2022-01-02


<h3> Write SQL to list the students who scored above the average marks in each subject </h3>

In [0]:
%sql
-- Step 1: average marks per subject.
-- Step 2: keep only the rows whose marks are strictly above their subject's average.
WITH subject_avg AS (
  SELECT subject, AVG(marks) AS avg_mark
  FROM students
  GROUP BY subject
)
SELECT s.*
FROM students AS s
JOIN subject_avg AS a
  ON a.subject = s.subject
WHERE s.marks > a.avg_mark

studentid,studentname,subject,marks,testid,testdate
4,Krish Star,Subject2,71,1,2022-01-02
5,John Mike,Subject3,98,2,2022-11-02
5,John Mike,Subject1,91,1,2022-01-02
2,Max Ruin,Subject2,84,1,2022-01-02
3,Arnold,Subject1,95,1,2022-01-02


<h3> Write a SQL query to get the percentage of students who scored more than 90 in any subject, among all students </h3>

In [0]:
%sql
-- Numerator: distinct students with at least one mark above 90
-- (CASE without ELSE yields NULL, which COUNT(DISTINCT ...) ignores).
-- Denominator: all distinct students. Multiplying by 100.0 avoids integer division.
SELECT
  COUNT(DISTINCT CASE WHEN marks > 90 THEN studentid END) * 100.0
    / COUNT(DISTINCT studentid) AS a
FROM students

a
40.0


<h3> Write a SQL query to get the 2nd highest and 2nd lowest marks in each subject </h3>

In [0]:
%sql
-- Second highest / second lowest distinct marks per subject.
-- DENSE_RANK is used instead of RANK so that ties do not skip rank 2
-- (with RANK, two students tied for 1st would leave no row ranked 2).
-- MAX over a CASE without ELSE picks the single rank-2 value, does not double-count
-- tied rows, and returns NULL when a subject has no second distinct mark.
SELECT
  subject,
  MAX(CASE WHEN rn_desc = 2 THEN marks END) AS second_highest,
  MAX(CASE WHEN rn_asc  = 2 THEN marks END) AS second_lowest
FROM (
  SELECT
    subject,
    marks,
    DENSE_RANK() OVER (PARTITION BY subject ORDER BY marks)      AS rn_asc,
    DENSE_RANK() OVER (PARTITION BY subject ORDER BY marks DESC) AS rn_desc
  FROM students
) ranked
GROUP BY subject

subject,second_hightest,second_lowest
Subject1,91,63
Subject2,71,60
Subject3,29,98


<h3> For each student and test, identify whether their marks increased or decreased from the previous test </h3>

In [0]:
%sql
-- Compare each result with the student's previous one, ordered by test date and
-- then by subject to break ties between tests taken on the same day.
-- progress is NULL for a student's first row (no previous marks) and when the
-- marks are equal, because the CASE has no ELSE branch.
SELECT
  *,
  CASE
    WHEN marks < prev_marks THEN 'desc'
    WHEN marks > prev_marks THEN 'inc'
  END AS progress
FROM (
  SELECT
    *,
    LAG(marks, 1) OVER (PARTITION BY studentid ORDER BY testdate, subject) AS prev_marks
  FROM students
) a1

studentid,studentname,subject,marks,testid,testdate,prev_marks,progress
1,John Deo,Subject2,60,1,2022-01-02,,
2,Max Ruin,Subject1,63,1,2022-01-02,,
2,Max Ruin,Subject2,84,1,2022-01-02,63.0,inc
2,Max Ruin,Subject3,29,3,2022-01-03,84.0,desc
3,Arnold,Subject1,95,1,2022-01-02,,
3,Arnold,Subject2,32,1,2022-01-02,95.0,desc
4,Krish Star,Subject1,61,1,2022-01-02,,
4,Krish Star,Subject2,71,1,2022-01-02,61.0,inc
5,John Mike,Subject1,91,1,2022-01-02,,
5,John Mike,Subject2,61,2,2022-11-02,91.0,desc


<h3> Solution in PySpark </h3>

In [0]:
from pyspark.sql.functions import avg, col, dense_rank, lag, max, rank, sum, when
from pyspark.sql.window import Window

In [0]:
# Load the whole `students` table as a DataFrame and preview it.
df_s = spark.table("students")
df_s.show()

+---------+-----------+--------+-----+------+----------+
|studentid|studentname| subject|marks|testid|  testdate|
+---------+-----------+--------+-----+------+----------+
|        4| Krish Star|Subject2|   71|     1|2022-01-02|
|        4| Krish Star|Subject1|   61|     1|2022-01-02|
|        5|  John Mike|Subject2|   61|     2|2022-11-02|
|        5|  John Mike|Subject3|   98|     2|2022-11-02|
|        5|  John Mike|Subject1|   91|     1|2022-01-02|
|        2|   Max Ruin|Subject2|   84|     1|2022-01-02|
|        2|   Max Ruin|Subject1|   63|     1|2022-01-02|
|        1|   John Deo|Subject2|   60|     1|2022-01-02|
|        2|   Max Ruin|Subject3|   29|     3|2022-01-03|
|        3|     Arnold|Subject2|   32|     1|2022-01-02|
|        3|     Arnold|Subject1|   95|     1|2022-01-02|
+---------+-----------+--------+-----+------+----------+



<h3> List the students who scored above the average marks in each subject (PySpark) </h3>

In [0]:
# Average marks per subject.
df_avg = df_s.groupBy("subject").agg(avg("marks").alias("avg_marks"))

# Attach each subject's average to its rows and keep those strictly above it.
(
    df_s
    .join(df_avg, on="subject", how="inner")
    .where(col("marks") > col("avg_marks"))
    .show()
)

+--------+---------+-----------+-----+------+----------+---------+
| subject|studentid|studentname|marks|testid|  testdate|avg_marks|
+--------+---------+-----------+-----+------+----------+---------+
|Subject2|        4| Krish Star|   71|     1|2022-01-02|     61.6|
|Subject3|        5|  John Mike|   98|     2|2022-11-02|     63.5|
|Subject1|        5|  John Mike|   91|     1|2022-01-02|     77.5|
|Subject2|        2|   Max Ruin|   84|     1|2022-01-02|     61.6|
|Subject1|        3|     Arnold|   95|     1|2022-01-02|     77.5|
+--------+---------+-----------+-----+------+----------+---------+



<h3> Get the percentage of students who scored more than 90 in any subject, among all students (PySpark) </h3>

In [0]:
# Despite the df_ prefix, both names hold plain integer counts, not DataFrames.
# df_t: number of distinct students overall.
df_t = df_s.select("studentid").distinct().count()
# df_c: number of distinct students with at least one mark above 90.
df_c = df_s.where(col("marks") > 90).select("studentid").distinct().count()

print(df_c * 100 / df_t)

40.0


<h3> Get the 2nd highest and 2nd lowest marks in each subject (PySpark) </h3>

In [0]:
# Rank marks inside each subject in both directions.
# dense_rank (not rank) is used so that ties do not skip rank 2: with rank(),
# two students tied for 1st would leave no row ranked 2 for that subject.
subject_window = Window.partitionBy("subject")
df = (
    df_s
    .withColumn("rank_desc", dense_rank().over(subject_window.orderBy(col("marks").desc())))
    .withColumn("rank_asc", dense_rank().over(subject_window.orderBy(col("marks").asc())))
)

# when() without otherwise() yields null for non-matching rows, and max() ignores
# nulls, so each aggregate picks the rank-2 mark (or null if there is none).
(
    df
    .groupBy("subject")
    .agg(
        max(when(col("rank_desc") == 2, col("marks"))).alias("second_highest"),
        max(when(col("rank_asc") == 2, col("marks"))).alias("second_lowest"),
    )
    .show()
)

+--------+--------------+-------------+
| subject|second_highest|second_lowest|
+--------+--------------+-------------+
|Subject1|            91|           63|
|Subject2|            71|           60|
|Subject3|            29|           98|
+--------+--------------+-------------+



<h3> For each student and test, identify whether their marks increased or decreased from the previous test (PySpark) </h3>

In [0]:
# Partition on studentid rather than studentname: names are not guaranteed to be
# unique, and two students sharing a name would otherwise be compared with each other.
# Rows are ordered by test date, then subject to break ties on the same day.
student_timeline = Window.partitionBy("studentid").orderBy("testdate", "subject")

# progress stays null for a student's first row (no previous marks) and when the
# marks are unchanged, because the when() chain has no otherwise().
(
    df_s
    .withColumn("prev_marks", lag("marks", 1).over(student_timeline))
    .withColumn(
        "progress",
        when(col("marks") < col("prev_marks"), "dec")
        .when(col("marks") > col("prev_marks"), "inc"),
    )
    .show()
)

+---------+-----------+--------+-----+------+----------+----------+---------+
|studentid|studentname| subject|marks|testid|  testdate|prev_marks|preogress|
+---------+-----------+--------+-----+------+----------+----------+---------+
|        3|     Arnold|Subject1|   95|     1|2022-01-02|      null|     null|
|        3|     Arnold|Subject2|   32|     1|2022-01-02|        95|      dec|
|        1|   John Deo|Subject2|   60|     1|2022-01-02|      null|     null|
|        5|  John Mike|Subject1|   91|     1|2022-01-02|      null|     null|
|        5|  John Mike|Subject2|   61|     2|2022-11-02|        91|      dec|
|        5|  John Mike|Subject3|   98|     2|2022-11-02|        61|      inc|
|        4| Krish Star|Subject1|   61|     1|2022-01-02|      null|     null|
|        4| Krish Star|Subject2|   71|     1|2022-01-02|        61|      inc|
|        2|   Max Ruin|Subject1|   63|     1|2022-01-02|      null|     null|
|        2|   Max Ruin|Subject2|   84|     1|2022-01-02|        