In [0]:
%python
# Delete the storage directories of the training_data and employee_data tables.
# The second argument (True) makes the delete recursive, like `rm -r`.
# Presumably this gives the CREATE TABLE cells below a clean location on re-run.
# NOTE(review): this removes files only; whether the metastore entries also need
# a DROP TABLE IF EXISTS should be confirmed.
dbutils.fs.rm("dbfs:/user/hive/warehouse/training_data", True)
dbutils.fs.rm("dbfs:/user/hive/warehouse/employee_data", True)
# Equivalent file-system magic: %fs rm -r dbfs:/user/hive/warehouse/training_data

Out[1]: True

In [0]:
%sql
-- One row per participant and training session: who attended, on which date,
-- for how many hours, with which trainer, where, and on what subject.
-- CREATE OR REPLACE keeps this cell re-runnable (a plain CREATE TABLE fails
-- once the table exists) and matches the form used for employee_data.
CREATE OR REPLACE TABLE training_data (
    participant_id INT,
    participant_name VARCHAR(50),
    training_session_date DATE,
    training_session_duration_hours DECIMAL(5, 2),
    trainer_name VARCHAR(50),
    training_location VARCHAR(100),
    training_subject VARCHAR(100)
);

In [0]:
%sql
-- Seed training_data with 15 sample sessions (one per participant).
-- Session dates are written as typed DATE literals so the values do not depend
-- on an implicit string-to-DATE cast at insert time.
INSERT INTO training_data (
    participant_id,
    participant_name,
    training_session_date,
    training_session_duration_hours,
    trainer_name,
    training_location,
    training_subject
)
VALUES
    (1, 'Alice Johnson', DATE '2024-02-05', 2.5, 'Trainer C', 'Room 101', 'Introduction to Programming'),
    (2, 'Bob Williams', DATE '2024-02-06', 3.0, 'Trainer A', 'Conference Room B', 'Web Development Basics'),
    (3, 'Charlie Davis', DATE '2024-02-07', 2.0, 'Trainer B', 'Training Room A', 'Project Management Fundamentals'),
    (4, 'David Smith', DATE '2024-02-08', 1.5, 'Trainer C', 'Room 102', 'Digital Marketing Essentials'),
    (5, 'Eva Brown', DATE '2024-02-09', 2.0, 'Trainer A', 'Conference Room A', 'Data Visualization Techniques'),
    (6, 'Frank Johnson', DATE '2024-02-10', 3.5, 'Trainer B', 'Training Room B', 'Effective Leadership Skills'),
    (7, 'Grace Miller', DATE '2024-02-11', 2.0, 'Trainer C', 'Room 103', 'Introduction to Machine Learning'),
    (8, 'Henry Davis', DATE '2024-02-12', 2.5, 'Trainer A', 'Conference Room B', 'Cybersecurity Basics'),
    (9, 'Ivy Wilson', DATE '2024-02-13', 1.5, 'Trainer B', 'Training Room A', 'Public Speaking Techniques'),
    (10, 'Jack White', DATE '2024-02-14', 2.0, 'Trainer C', 'Room 104', 'Agile Project Management'),
    (11, 'Kelly Johnson', DATE '2024-02-15', 3.0, 'Trainer A', 'Conference Room A', 'Social Media Marketing Strategies'),
    (12, 'Leo Brown', DATE '2024-02-16', 2.5, 'Trainer B', 'Training Room B', 'Python Programming Basics'),
    (13, 'Mia Davis', DATE '2024-02-17', 1.0, 'Trainer C', 'Room 105', 'Effective Time Management'),
    (14, 'Nathan Wilson', DATE '2024-02-18', 2.0, 'Trainer A', 'Conference Room B', 'Customer Service Excellence'),
    (15, 'Olivia Miller', DATE '2024-02-19', 2.5, 'Trainer B', 'Training Room A', 'Financial Literacy');

In [0]:
%sql
-- Preview the training sessions; every column is listed explicitly.
SELECT
    participant_id,
    participant_name,
    training_session_date,
    training_session_duration_hours,
    trainer_name,
    training_location,
    training_subject
FROM training_data


participant_id,participant_name,training_session_date,training_session_duration_hours,trainer_name,training_location,training_subject
1,Alice Johnson,2024-02-05,2.5,Trainer C,Room 101,Introduction to Programming
2,Bob Williams,2024-02-06,3.0,Trainer A,Conference Room B,Web Development Basics
3,Charlie Davis,2024-02-07,2.0,Trainer B,Training Room A,Project Management Fundamentals
4,David Smith,2024-02-08,1.5,Trainer C,Room 102,Digital Marketing Essentials
5,Eva Brown,2024-02-09,2.0,Trainer A,Conference Room A,Data Visualization Techniques
6,Frank Johnson,2024-02-10,3.5,Trainer B,Training Room B,Effective Leadership Skills
7,Grace Miller,2024-02-11,2.0,Trainer C,Room 103,Introduction to Machine Learning
8,Henry Davis,2024-02-12,2.5,Trainer A,Conference Room B,Cybersecurity Basics
9,Ivy Wilson,2024-02-13,1.5,Trainer B,Training Room A,Public Speaking Techniques
10,Jack White,2024-02-14,2.0,Trainer C,Room 104,Agile Project Management


In [0]:
%sql
-- Number of sessions per trainer, listed in ascending order of trainer name.
SELECT
    trainer_name,
    COUNT(*) AS no_of_sessions
FROM training_data
GROUP BY trainer_name
ORDER BY trainer_name ASC

trainer_name,no_of_sessions
Trainer A,5
Trainer B,5
Trainer C,5


In [0]:
%sql
-- Participants whose session lasted longer than the average session duration.
-- The overall average is computed once in a single-row CTE and compared
-- against every session.
WITH session_stats AS (
    SELECT AVG(training_session_duration_hours) AS avg_duration_hours
    FROM training_data
)
SELECT
    td.participant_name,
    td.training_session_duration_hours
FROM training_data AS td
CROSS JOIN session_stats AS ss
WHERE td.training_session_duration_hours > ss.avg_duration_hours

participant_name,training_session_duration_hours
Alice Johnson,2.5
Bob Williams,3.0
Frank Johnson,3.5
Henry Davis,2.5
Kelly Johnson,3.0
Leo Brown,2.5
Olivia Miller,2.5


In [0]:
%sql
-- Locations where sessions were held by more than one trainer: a location
-- qualifies when some other row has the same location but a different trainer.
SELECT DISTINCT loc.training_location
FROM training_data AS loc
WHERE EXISTS (
    SELECT 1
    FROM training_data AS other
    WHERE other.training_location = loc.training_location
      AND other.trainer_name <> loc.trainer_name
)

training_location


In [0]:
%sql
-- Same question answered by aggregation: locations used by more than one
-- distinct trainer, together with how many trainers used each.
SELECT training_location, COUNT(DISTINCT trainer_name) AS distinct_trainers_count
FROM training_data
GROUP BY training_location
HAVING COUNT(DISTINCT trainer_name) > 1;

training_location,distinct_trainers_count


In [0]:
%sql
-- Participants whose session was taught by Trainer A.
SELECT td.participant_name
FROM training_data AS td
WHERE td.trainer_name = 'Trainer A'

participant_name
Bob Williams
Eva Brown
Henry Davis
Kelly Johnson
Nathan Wilson


In [0]:
%sql
-- Employee facts used by the window-function examples: department, position,
-- salary, tenure in months and job experience in years.
CREATE OR REPLACE TABLE employee_data (
    employee_id INT,
    department_id INT,
    position VARCHAR(50),
    salary INT,
    tenure_months INT,
    job_experience_years INT
);

In [0]:
%sql
-- Seed employee_data with 8 sample employees across departments 101-103.
-- The column list is spelled out so the insert does not depend on the table's
-- column order and keeps working if columns are added or reordered.
INSERT INTO employee_data (
    employee_id,
    department_id,
    position,
    salary,
    tenure_months,
    job_experience_years
)
VALUES
    (1, 101, 'Software Engineer', 90000, 24, 2),
    (2, 102, 'Data Analyst', 60000, 18, 1),
    (3, 101, 'Project Manager', 90000, 36, 5),
    (4, 103, 'UX Designer', 75000, 27, 3),
    (5, 102, 'Business Analyst', 70000, 22, 2),
    (6, 101, 'Software Engineer', 80000, 30, 4),
    (7, 103, 'Product Manager', 95000, 42, 7),
    (8, 102, 'Data Scientist', 100000, 48, 8);

num_affected_rows,num_inserted_rows
8,8


In [0]:
%sql
-- Preview the employee data; every column is listed explicitly.
SELECT
    employee_id,
    department_id,
    position,
    salary,
    tenure_months,
    job_experience_years
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years
1,101,Software Engineer,90000,24,2
2,102,Data Analyst,60000,18,1
3,101,Project Manager,90000,36,5
4,103,UX Designer,75000,27,3
5,102,Business Analyst,70000,22,2
6,101,Software Engineer,80000,30,4
7,103,Product Manager,95000,42,7
8,102,Data Scientist,100000,48,8


In [0]:
%sql
-- Window functions
-- ROW_NUMBER(): number the employees of each department from highest to
-- lowest salary. employee_id is added as a tie-breaker because employees 1
-- and 3 in department 101 earn the same salary; without it the numbering of
-- tied rows could differ from run to run.
SELECT
    *,
    ROW_NUMBER() OVER (
        PARTITION BY department_id
        ORDER BY salary DESC, employee_id
    ) AS serial_no
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,serial_no
1,101,Software Engineer,90000,24,2,1
3,101,Project Manager,90000,36,5,2
6,101,Software Engineer,80000,30,4,3
8,102,Data Scientist,100000,48,8,1
5,102,Business Analyst,70000,22,2,2
2,102,Data Analyst,60000,18,1,3
7,103,Product Manager,95000,42,7,1
4,103,UX Designer,75000,27,3,2


In [0]:
%sql
-- RANK(): salary rank within each department; equal salaries share a rank
-- and the next rank is skipped (1, 1, 3).
SELECT
    *,
    RANK() OVER (
        PARTITION BY department_id
        ORDER BY salary DESC
    ) AS rnk
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,rnk
1,101,Software Engineer,90000,24,2,1
3,101,Project Manager,90000,36,5,1
6,101,Software Engineer,80000,30,4,3
8,102,Data Scientist,100000,48,8,1
5,102,Business Analyst,70000,22,2,2
2,102,Data Analyst,60000,18,1,3
7,103,Product Manager,95000,42,7,1
4,103,UX Designer,75000,27,3,2


In [0]:
%sql
-- DENSE_RANK(): salary rank within each department; equal salaries share a
-- rank and no rank is skipped afterwards (1, 1, 2).
SELECT
    *,
    DENSE_RANK() OVER (
        PARTITION BY department_id
        ORDER BY salary DESC
    ) AS rnk
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,rnk
1,101,Software Engineer,90000,24,2,1
3,101,Project Manager,90000,36,5,1
6,101,Software Engineer,80000,30,4,2
8,102,Data Scientist,100000,48,8,1
5,102,Business Analyst,70000,22,2,2
2,102,Data Analyst,60000,18,1,3
7,103,Product Manager,95000,42,7,1
4,103,UX Designer,75000,27,3,2


In [0]:
%sql
-- Running salary total per department, accumulated in order of tenure.
-- The frame is stated explicitly as ROWS: with only ORDER BY, the default
-- RANGE frame treats rows with equal tenure_months as one group and gives them
-- all the same (already summed) total. employee_id fixes the order among such
-- ties so the running total is reproducible.
SELECT
    *,
    SUM(salary) OVER (
        PARTITION BY department_id
        ORDER BY tenure_months, employee_id
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_salary
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,running_salary
1,101,Software Engineer,90000,24,2,90000
6,101,Software Engineer,80000,30,4,170000
3,101,Project Manager,90000,36,5,260000
2,102,Data Analyst,60000,18,1,60000
5,102,Business Analyst,70000,22,2,130000
8,102,Data Scientist,100000,48,8,230000
4,103,UX Designer,75000,27,3,75000
7,103,Product Manager,95000,42,7,170000


In [0]:
%sql
-- Running average salary per department, accumulated in order of tenure and
-- rounded to 2 decimals.
-- The explicit ROWS frame averages row by row; the default RANGE frame would
-- give rows with equal tenure_months the same value. employee_id fixes the
-- order among such ties.
SELECT
    *,
    ROUND(
        AVG(salary) OVER (
            PARTITION BY department_id
            ORDER BY tenure_months, employee_id
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ),
        2
    ) AS avg_salary
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,avg_salary
1,101,Software Engineer,90000,24,2,90000.0
6,101,Software Engineer,80000,30,4,85000.0
3,101,Project Manager,90000,36,5,86666.67
2,102,Data Analyst,60000,18,1,60000.0
5,102,Business Analyst,70000,22,2,65000.0
8,102,Data Scientist,100000,48,8,76666.67
4,103,UX Designer,75000,27,3,75000.0
7,103,Product Manager,95000,42,7,85000.0


In [0]:
%sql
-- FIRST_VALUE / LAST_VALUE: salary of the shortest- and longest-tenured
-- employee of each department, repeated on every row of that department.
-- Both windows span the whole partition. With only ORDER BY the default frame
-- ends at the current row, so LAST_VALUE would just return each row's own
-- salary instead of the salary of the longest-tenured employee.
SELECT
    *,
    FIRST_VALUE(salary) OVER (
        PARTITION BY department_id
        ORDER BY tenure_months
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS first_tenure_salary,
    LAST_VALUE(salary) OVER (
        PARTITION BY department_id
        ORDER BY tenure_months
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_tenure_salary
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,first_tenure_salary,last_tenure_salary
1,101,Software Engineer,90000,24,2,90000,90000
6,101,Software Engineer,80000,30,4,90000,90000
3,101,Project Manager,90000,36,5,90000,90000
2,102,Data Analyst,60000,18,1,60000,100000
5,102,Business Analyst,70000,22,2,60000,100000
8,102,Data Scientist,100000,48,8,60000,100000
4,103,UX Designer,75000,27,3,75000,95000
7,103,Product Manager,95000,42,7,75000,95000


In [0]:
%sql
-- LEAD(): salary of the next employee by tenure within the same department;
-- NULL on the last row of each department, which has no following row.
SELECT
    *,
    LEAD(salary, 1) OVER (
        PARTITION BY department_id
        ORDER BY tenure_months
    ) AS lead_salary
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,lead_salary
1,101,Software Engineer,90000,24,2,80000.0
6,101,Software Engineer,80000,30,4,90000.0
3,101,Project Manager,90000,36,5,
2,102,Data Analyst,60000,18,1,70000.0
5,102,Business Analyst,70000,22,2,100000.0
8,102,Data Scientist,100000,48,8,
4,103,UX Designer,75000,27,3,95000.0
7,103,Product Manager,95000,42,7,


In [0]:
%sql
-- LAG(): salary of the previous employee by tenure within the same department;
-- NULL on the first row of each department, which has no preceding row.
SELECT
    *,
    LAG(salary, 1) OVER (
        PARTITION BY department_id
        ORDER BY tenure_months
    ) AS LAG_salary
FROM employee_data

employee_id,department_id,position,salary,tenure_months,job_experience_years,LAG_salary
1,101,Software Engineer,90000,24,2,
6,101,Software Engineer,80000,30,4,90000.0
3,101,Project Manager,90000,36,5,80000.0
2,102,Data Analyst,60000,18,1,
5,102,Business Analyst,70000,22,2,60000.0
8,102,Data Scientist,100000,48,8,70000.0
4,103,UX Designer,75000,27,3,
7,103,Product Manager,95000,42,7,75000.0


In [0]:
%sql
-- Employees used by the CTE examples: id, name, department and salary.
-- CREATE OR REPLACE keeps this cell re-runnable (a plain CREATE TABLE fails
-- once the table exists) and matches the form used for employee_data.
CREATE OR REPLACE TABLE employees (
    employee_id INT,
    employee_name STRING,
    department STRING,
    salary DOUBLE
);

In [0]:
%sql
-- Seed employees with 15 sample rows across HR, Marketing, IT and Finance.
-- The column list is spelled out so the insert does not depend on the table's
-- column order and keeps working if columns are added or reordered.
INSERT INTO employees (
    employee_id,
    employee_name,
    department,
    salary
)
VALUES
    (6, 'Eva Green', 'HR', 52000),
    (7, 'Sam Johnson', 'Marketing', 59000),
    (8, 'Alex Turner', 'IT', 63000),
    (9, 'Sophie Walker', 'Finance', 72000),
    (10, 'David Clark', 'IT', 60000),
    (11, 'Olivia King', 'Marketing', 56000),
    (12, 'Michael Baker', 'HR', 48000),
    (13, 'Emma White', 'Finance', 68000),
    (14, 'Daniel Smith', 'IT', 65000),
    (15, 'Grace Taylor', 'HR', 50000),
    (16, 'Liam Wilson', 'Marketing', 58000),
    (17, 'Ava Hall', 'IT', 61000),
    (18, 'Mia Adams', 'Finance', 70000),
    (19, 'Noah Moore', 'IT', 64000),
    (20, 'Isabella Davis', 'Marketing', 57000);

num_affected_rows,num_inserted_rows
15,15


In [0]:
%sql
-- Preview the employees table; every column is listed explicitly.
SELECT
    employee_id,
    employee_name,
    department,
    salary
FROM employees

employee_id,employee_name,department,salary
6,Eva Green,HR,52000.0
7,Sam Johnson,Marketing,59000.0
8,Alex Turner,IT,63000.0
9,Sophie Walker,Finance,72000.0
10,David Clark,IT,60000.0
11,Olivia King,Marketing,56000.0
12,Michael Baker,HR,48000.0
13,Emma White,Finance,68000.0
14,Daniel Smith,IT,65000.0
15,Grace Taylor,HR,50000.0


In [0]:
%sql
-- Employees whose salary is greater than 60000, selected through a CTE.
WITH high_earners AS (
    SELECT *
    FROM employees
    WHERE salary > 60000
)
SELECT *
FROM high_earners


employee_id,employee_name,department,salary
8,Alex Turner,IT,63000.0
9,Sophie Walker,Finance,72000.0
13,Emma White,Finance,68000.0
14,Daniel Smith,IT,65000.0
17,Ava Hall,IT,61000.0
18,Mia Adams,Finance,70000.0
19,Noah Moore,IT,64000.0


In [0]:
%sql
-- All columns for employees working in the IT department, selected through a CTE.
WITH it_employees AS (
    SELECT *
    FROM employees
    WHERE department = 'IT'
)
SELECT *
FROM it_employees

employee_id,employee_name,department,salary
8,Alex Turner,IT,63000.0
10,David Clark,IT,60000.0
14,Daniel Smith,IT,65000.0
17,Ava Hall,IT,61000.0
19,Noah Moore,IT,64000.0


In [0]:
%sql
-- Top 3 earners in the company. DENSE_RANK gives equal salaries the same rank,
-- so this keeps every employee on one of the three highest salary levels.
WITH ranked_employees AS (
    SELECT
        *,
        DENSE_RANK() OVER (ORDER BY salary DESC) AS rank
    FROM employees
)
SELECT *
FROM ranked_employees
WHERE rank <= 3

employee_id,employee_name,department,salary,rank
9,Sophie Walker,Finance,72000.0,1
18,Mia Adams,Finance,70000.0,2
13,Emma White,Finance,68000.0,3


In [0]:
%sql
-- Top 3 salary levels within each department: the ranking restarts for every
-- department and equal salaries share a rank.
WITH ranked_by_department AS (
    SELECT
        *,
        DENSE_RANK() OVER (
            PARTITION BY department
            ORDER BY salary DESC
        ) AS rank
    FROM employees
)
SELECT *
FROM ranked_by_department
WHERE rank <= 3

employee_id,employee_name,department,salary,rank
9,Sophie Walker,Finance,72000.0,1
18,Mia Adams,Finance,70000.0,2
13,Emma White,Finance,68000.0,3
6,Eva Green,HR,52000.0,1
15,Grace Taylor,HR,50000.0,2
12,Michael Baker,HR,48000.0,3
14,Daniel Smith,IT,65000.0,1
19,Noah Moore,IT,64000.0,2
8,Alex Turner,IT,63000.0,3
7,Sam Johnson,Marketing,59000.0,1


In [0]:
%sql
-- Average salary of each department, attached to every employee row of that
-- department (a window aggregate, so no rows are collapsed).
WITH avg_cte AS (
    SELECT
        *,
        AVG(salary) OVER (PARTITION BY department) AS avg_sal
    FROM employees
)
SELECT *
FROM avg_cte

employee_id,employee_name,department,salary,avg_sal
9,Sophie Walker,Finance,72000.0,70000.0
13,Emma White,Finance,68000.0,70000.0
18,Mia Adams,Finance,70000.0,70000.0
6,Eva Green,HR,52000.0,50000.0
12,Michael Baker,HR,48000.0,50000.0
15,Grace Taylor,HR,50000.0,50000.0
8,Alex Turner,IT,63000.0,62600.0
10,David Clark,IT,60000.0,62600.0
14,Daniel Smith,IT,65000.0,62600.0
17,Ava Hall,IT,61000.0,62600.0


In [0]:
%sql
-- Bonus and performance band per employee, both derived from salary:
--   salary above 60000           -> bonus = 10% of salary, 'high performer'
--   salary from 50000 to 60000   -> bonus = 5% of salary,  'intermediate performer'
--   anything else                -> NULL bonus and NULL band
WITH bonus_performance_emp AS (
    SELECT
        *,
        CASE
            WHEN salary > 60000 THEN salary * 0.1
            WHEN salary >= 50000 AND salary <= 60000 THEN salary * 0.05
        END AS bonus,
        CASE
            WHEN salary > 60000 THEN 'high performer'
            WHEN salary >= 50000 AND salary <= 60000 THEN 'intermediate performer'
        END AS performance
    FROM employees
)
SELECT *
FROM bonus_performance_emp

employee_id,employee_name,department,salary,bonus,performance
6,Eva Green,HR,52000.0,2600.0,intermediate performer
7,Sam Johnson,Marketing,59000.0,2950.0,intermediate performer
8,Alex Turner,IT,63000.0,6300.0,high performer
9,Sophie Walker,Finance,72000.0,7200.0,high performer
10,David Clark,IT,60000.0,3000.0,intermediate performer
11,Olivia King,Marketing,56000.0,2800.0,intermediate performer
12,Michael Baker,HR,48000.0,,
13,Emma White,Finance,68000.0,6800.0,high performer
14,Daniel Smith,IT,65000.0,6500.0,high performer
15,Grace Taylor,HR,50000.0,2500.0,intermediate performer
