<h3> Scenario Based SQL Question | Solving Using SCD Type 2 Concept | SQL Interview </h3>

In [0]:
%sql
-- Rate history: one row per employee per rate change.
-- The solution cells treat bill_date as the first day a rate applies.
drop table if exists billings;
create table billings
(
emp_name varchar(20),  -- same width as HoursWorked.emp_name, which it is joined to
bill_date date,
bill_rate int
);
-- No "delete from billings" needed: the table was just dropped and recreated, so it is empty.
insert into billings values
('Sachin','1990-01-01',25)
,('Sehwag' ,'1989-01-01', 15)
,('Dhoni' ,'1989-01-01', 20)
,('Sachin' ,'1991-02-05', 30)
;


-- Work log: hours billed by an employee on a given day.
drop table if exists HoursWorked;
create table HoursWorked
(
emp_name varchar(20),
work_date date,
bill_hrs int
);
insert into HoursWorked values
('Sachin', '1990-07-01' ,3)
,('Sachin', '1990-08-01', 5)
,('Sehwag','1990-07-01', 2)
,('Sachin','1991-07-01', 4)
;

num_affected_rows,num_inserted_rows
4,4


In [0]:
%sql
-- Inspect the rate history; columns listed in table-definition order.
select
  emp_name,
  bill_date,
  bill_rate
from billings

emp_name,bill_date,bill_rate
Sachin,1990-01-01,25
Sehwag,1989-01-01,15
Dhoni,1989-01-01,20
Sachin,1991-02-05,30


In [0]:
%sql
-- Inspect the work log; columns listed in table-definition order.
select
  emp_name,
  work_date,
  bill_hrs
from HoursWorked

emp_name,work_date,bill_hrs
Sachin,1990-07-01,3
Sachin,1990-08-01,5
Sehwag,1990-07-01,2
Sachin,1991-07-01,4


<h3> Solution in SQL (Hive) </h3>

In [0]:
%sql
-- Total charge per employee = sum(hours worked * rate in force on the work date).
--
-- Step 1 (rate_periods): turn the rate history into SCD Type 2 style validity
-- windows. Each rate is valid from its own bill_date up to, but not including,
-- the employee's next bill_date; the latest rate gets an open-ended
-- sentinel end date of 9999-12-31.
--
-- Step 2: join each work-log row to the single window that contains work_date.
-- The window is half-open [start_dt, end_dt). An inclusive BETWEEN would match
-- BOTH the old and the new rate when work_date falls exactly on a rate-change
-- day, double-counting those hours.
with rate_periods as (
  select
    emp_name,
    bill_rate,
    bill_date as start_dt,
    lead(bill_date, 1, cast('9999-12-31' as date))
      over (partition by emp_name order by bill_date asc) as end_dt
  from billings
)
select
  p.emp_name,
  sum(h.bill_hrs * p.bill_rate) as totalCharge
from rate_periods p
inner join HoursWorked h
  on  h.emp_name  = p.emp_name
  and h.work_date >= p.start_dt
  and h.work_date <  p.end_dt
group by p.emp_name

emp_name,totalCharge
Sachin,320
Sehwag,30


<h3> Solution in PySpark </h3>

In [0]:
from pyspark.sql.functions import col,count,lead,sum
from pyspark.sql.window import Window

In [0]:
# Load the rate history (df_b) and the work log (df_h) from the tables
# created in the SQL setup cell, then preview both.
df_b, df_h = (
    spark.sql(query)
    for query in ("select * from billings", "select * from HoursWorked")
)
df_b.show()
df_h.show()

+--------+----------+---------+
|emp_name| bill_date|bill_rate|
+--------+----------+---------+
|  Sachin|1990-01-01|       25|
|  Sehwag|1989-01-01|       15|
|   Dhoni|1989-01-01|       20|
|  Sachin|1991-02-05|       30|
+--------+----------+---------+

+--------+----------+--------+
|emp_name| work_date|bill_hrs|
+--------+----------+--------+
|  Sachin|1990-07-01|       3|
|  Sachin|1990-08-01|       5|
|  Sehwag|1990-07-01|       2|
|  Sachin|1991-07-01|       4|
+--------+----------+--------+



In [0]:
# Step 1: give every rate row a validity window. end_dt is the employee's next
# bill_date, or the sentinel 9999-12-31 for the latest rate.
# withColumn replaces any existing "end_dt", so re-running this cell is safe.
df_b = df_b.withColumn(
    "end_dt",
    lead("bill_date", 1, "9999-12-31").over(
        Window.partitionBy("emp_name").orderBy("bill_date")
    ),
)

# Step 2: keep only the (rate, work-log) pairs where the work date lies inside
# the rate's window. The window is half-open [bill_date, end_dt):
# Column.between() is inclusive on both ends and would match two rates when
# work_date falls exactly on a rate-change day, double-counting those hours.
in_rate_window = (col("work_date") >= col("bill_date")) & (
    col("work_date") < col("end_dt")
)

# NOTE: `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
(
    df_b.join(df_h, "emp_name", "inner")
    .filter(in_rate_window)
    .groupBy("emp_name")
    .agg(sum(col("bill_rate") * col("bill_hrs")).alias("totalCharge"))
    .show()
)


+--------+-----------+
|emp_name|totalCharge|
+--------+-----------+
|  Sachin|        320|
|  Sehwag|         30|
+--------+-----------+

