In [1]:
%run helper/setup_notebook.ipynb import display_table

Successfully connected to leetcode50 database.


In [2]:
tables = ['Employee', 'Department']
for table in tables:
    display_table(table)

+----+-------+--------+--------------+
| id |  name | salary | departmentId |
+----+-------+--------+--------------+
| 1  |  Joe  | 85000  |      1       |
| 2  | Henry | 80000  |      2       |
| 3  |  Sam  | 60000  |      2       |
| 4  |  Max  | 90000  |      1       |
| 5  | Janet | 69000  |      1       |
| 6  | Randy | 85000  |      1       |
| 7  |  Will | 70000  |      1       |
+----+-------+--------+--------------+
+----+-------+
| id |  name |
+----+-------+
| 1  |   IT  |
| 2  | Sales |
+----+-------+


#### *A company's executives are interested in seeing who earns the most money in each of the company's departments. A high earner in a department is an employee who has a salary in the top three unique salaries for that department.*

### Write an SQL query to find the employees who are high earners in each of the departments.

```
+------------+----------+--------+
| Department | Employee | Salary |
+------------+----------+--------+
| IT         | Max      | 90000  |
| IT         | Joe      | 85000  |
| IT         | Randy    | 85000  |
| IT         | Will     | 70000  |
| Sales      | Henry    | 80000  |
| Sales      | Sam      | 60000  |
+------------+----------+--------+
Explanation: 
In the IT department:
- Max earns the highest unique salary
- Both Randy and Joe earn the second-highest unique salary
- Will earns the third-highest unique salary

In the Sales department:
- Henry earns the highest salary
- Sam earns the second-highest salary
- There is no third-highest salary as there are only two employees
```

In [3]:
%%sql 

SELECT 
    d.name AS Department,
    e.name AS Employee,
    e.salary AS Salary
FROM Employee e
JOIN Department d ON e.departmentId = d.id 

Department,Employee,Salary
IT,Joe,85000
Sales,Henry,80000
Sales,Sam,60000
IT,Max,90000
IT,Janet,69000
IT,Randy,85000
IT,Will,70000


In [4]:
%%sql 

SELECT 
    salary 
FROM Employee e
JOIN Department d ON e.departmentId = d.id 
WHERE d.name = 'IT'

salary
85000
90000
69000
85000
70000


In [5]:
%%sql 

SELECT 
    DISTINCT salary 
FROM Employee e
JOIN Department d ON e.departmentId = d.id 
WHERE d.name = 'IT'
ORDER BY salary DESC
LIMIT 3

salary
90000
85000
70000


In [6]:
%%sql 

SELECT 
    DISTINCT salary 
FROM Employee e
JOIN Department d ON e.departmentId = d.id 
WHERE d.name = 'Sales'
ORDER BY salary DESC
LIMIT 3

salary
80000
60000


In [7]:
%%sql 

SELECT 
    d.name AS Department, 
    e.name AS Employee, 
    e.salary AS Salary
FROM Employee e
JOIN Department d ON e.departmentId = d.id 
WHERE d.name = 'IT'
    AND e.salary >= (
        SELECT 
            DISTINCT salary
        FROM Employee e1
        JOIN Department d1 ON e1.departmentId = d1.id 
        WHERE d1.name = 'IT'
        ORDER BY salary DESC
        LIMIT 1 OFFSET 2
    )
ORDER BY e.salary DESC;


Department,Employee,Salary
IT,Max,90000
IT,Joe,85000
IT,Randy,85000
IT,Will,70000


In [8]:
%%sql 

SELECT 
    d.name AS Department,
    e.name AS Employee,
    e.salary AS Salary
FROM Employee e
JOIN Department d ON e.departmentId = d.id 
WHERE d.name = 'Sales' AND e.salary >= (
    SELECT 
        DISTINCT salary 
    FROM Employee e
    JOIN Department d ON e.departmentId = d.id 
    WHERE d.name = 'Sales'
    ORDER BY salary DESC
    LIMIT 1 OFFSET 1
)
ORDER BY e.salary DESC

Department,Employee,Salary
Sales,Henry,80000
Sales,Sam,60000


In [9]:
%%sql

-- Solution failing test case
SELECT 
    d.name AS Department,
    e.name AS Employee,
    e.salary AS Salary
FROM Employee e
JOIN Department d ON e.departmentId = d.id 
WHERE (d.name = 'IT' AND e.salary >= (
    SELECT 
        DISTINCT salary 
    FROM Employee e1
    JOIN Department d1 ON e1.departmentId = d1.id 
    WHERE d1.name = 'IT'
    ORDER BY salary DESC
    LIMIT 1 OFFSET 2
))
OR (d.name = 'Sales' AND e.salary >= (
    SELECT 
        DISTINCT salary 
    FROM Employee e2
    JOIN Department d2 ON e2.departmentId = d2.id 
    WHERE d2.name = 'Sales'
    ORDER BY salary DESC
    LIMIT 1 OFFSET 1
))
ORDER BY e.salary DESC;


Department,Employee,Salary
IT,Max,90000
IT,Joe,85000
IT,Randy,85000
Sales,Henry,80000
IT,Will,70000
Sales,Sam,60000


## Solution using CTE

In [10]:
%%sql 

SELECT
d.name AS Department,
e.name AS Employee,
e.Salary AS Salary,
DENSE_RANK() OVER(PARTITION BY e.departmentId ORDER BY e.Salary DESC) AS salary_rank
FROM Employee e
LEFT JOIN Department d ON e.departmentId = d.id

Department,Employee,Salary,salary_rank
IT,Max,90000,1
IT,Joe,85000,2
IT,Randy,85000,2
IT,Will,70000,3
IT,Janet,69000,4
Sales,Henry,80000,1
Sales,Sam,60000,2


In [11]:
%%sql 

WITH salary_rank_cte AS (
    SELECT 
        d.name AS Department,
        e.name AS Employee,
        e.salary AS Salary,
        DENSE_RANK() OVER (PARTITION BY e.departmentId ORDER BY e.Salary DESC) AS salary_rank
    FROM Employee e 
    JOIN Department d ON e.departmentId = d.id 
) 
SELECT 
    Department,
    Employee,
    Salary 
FROM salary_rank_cte
WHERE salary_rank <= 3;

Department,Employee,Salary
IT,Max,90000
IT,Joe,85000
IT,Randy,85000
IT,Will,70000
Sales,Henry,80000
Sales,Sam,60000


In [12]:
%%sql 
-- Without using CTE 
SELECT 
    Department,
    Employee,
    Salary
FROM (
    SELECT
        d.name AS Department,
        e.name AS Employee,
        e.Salary AS Salary,
        DENSE_RANK() OVER (PARTITION BY e.departmentId ORDER BY e.Salary DESC) AS salary_rank
    FROM Employee e
    LEFT JOIN Department d ON e.departmentId = d.id) AS inner_table
WHERE salary_rank <= 3

Department,Employee,Salary
IT,Max,90000
IT,Joe,85000
IT,Randy,85000
IT,Will,70000
Sales,Henry,80000
Sales,Sam,60000


# Using Pandas

In [13]:
employee_query = %sql SELECT * FROM Employee # type: ignore
department_query = %sql SELECT * FROM Department #type: ignore 
employee_df = employee_query.DataFrame()
department_df = department_query.DataFrame()

display(employee_df, department_df)

Unnamed: 0,id,name,salary,departmentId
0,1,Joe,85000,1
1,2,Henry,80000,2
2,3,Sam,60000,2
3,4,Max,90000,1
4,5,Janet,69000,1
5,6,Randy,85000,1
6,7,Will,70000,1


Unnamed: 0,id,name
0,1,IT
1,2,Sales


In [14]:
employee_df.merge(department_df, on='id', how='right')

Unnamed: 0,id,name_x,salary,departmentId,name_y
0,1,Joe,85000,1,IT
1,2,Henry,80000,2,Sales


In [15]:
employee_df.merge(department_df, on='id', how='left')

Unnamed: 0,id,name_x,salary,departmentId,name_y
0,1,Joe,85000,1,IT
1,2,Henry,80000,2,Sales
2,3,Sam,60000,2,
3,4,Max,90000,1,
4,5,Janet,69000,1,
5,6,Randy,85000,1,
6,7,Will,70000,1,


In [16]:
# Perform the left join between employee_df and department_df
employee_df.merge(department_df, left_on='departmentId', right_on='id', how='left')

Unnamed: 0,id_x,name_x,salary,departmentId,id_y,name_y
0,1,Joe,85000,1,1,IT
1,2,Henry,80000,2,2,Sales
2,3,Sam,60000,2,2,Sales
3,4,Max,90000,1,1,IT
4,5,Janet,69000,1,1,IT
5,6,Randy,85000,1,1,IT
6,7,Will,70000,1,1,IT


In [17]:
merged_df = employee_df.merge(department_df, left_on='departmentId', right_on='id', how='left')
merged_df

Unnamed: 0,id_x,name_x,salary,departmentId,id_y,name_y
0,1,Joe,85000,1,1,IT
1,2,Henry,80000,2,2,Sales
2,3,Sam,60000,2,2,Sales
3,4,Max,90000,1,1,IT
4,5,Janet,69000,1,1,IT
5,6,Randy,85000,1,1,IT
6,7,Will,70000,1,1,IT


In [18]:
merged_df.drop(['id_x', 'id_y'], axis=1, inplace=True)
merged_df

Unnamed: 0,name_x,salary,departmentId,name_y
0,Joe,85000,1,IT
1,Henry,80000,2,Sales
2,Sam,60000,2,Sales
3,Max,90000,1,IT
4,Janet,69000,1,IT
5,Randy,85000,1,IT
6,Will,70000,1,IT


In [19]:
merged_df = merged_df.rename(columns={'name_x': 'Department', 'name_y': 'Employee', 'salary': 'Salary'})

merged_df

Unnamed: 0,Department,Salary,departmentId,Employee
0,Joe,85000,1,IT
1,Henry,80000,2,Sales
2,Sam,60000,2,Sales
3,Max,90000,1,IT
4,Janet,69000,1,IT
5,Randy,85000,1,IT
6,Will,70000,1,IT


The `rank()` function assigns ranks or ordinal positions to the values within a specified group.

When applied to a DataFrame column with the `rank()` function, it assigns ranks to the values based on their order. Here's a brief explanation of how it works:

1. By default, the `rank()` function assigns ranks in ascending order.

2. The `method` parameter allows you to specify the method to handle ties (i.e., when multiple values share the same value). Some common methods include:
   - `'average'` (default): Assigns the average rank to tied values. For example, if two values tie for the second rank, both will receive a rank of 2.5.
   - `'min'`: Assigns the minimum rank to tied values. For example, if two values tie for the second rank, both will receive a rank of 2.
   - `'max'`: Assigns the maximum rank to tied values. For example, if two values tie for the second rank, both will receive a rank of 3.
   - `'first'`: Assigns ranks in the order of appearance, without considering tie values. For example, if two values tie for the second rank, the first occurrence will receive a rank of 2, and the second occurrence will receive a rank of 3.

3. The `ascending` parameter determines whether the ranks are assigned in ascending (`True`) or descending (`False`) order.

4. The `na_option` parameter specifies how to handle missing or NaN values:
   - `'keep'` (default): Assigns NaN values the rank of NaN, leaving gaps in the ranking sequence.
   - `'top'`: Assigns NaN values the highest rank.
   - `'bottom'`: Assigns NaN values the lowest rank.


In [20]:
merged_df.groupby('departmentId')['Salary'].rank(method='dense', ascending=False)

0    2.0
1    1.0
2    2.0
3    1.0
4    4.0
5    2.0
6    3.0
Name: Salary, dtype: float64

In [21]:
merged_df['salary_rank'] = merged_df.groupby('departmentId')['Salary'] \
                            .rank(method='dense', ascending=False).astype(int)

merged_df

Unnamed: 0,Department,Salary,departmentId,Employee,salary_rank
0,Joe,85000,1,IT,2
1,Henry,80000,2,Sales,1
2,Sam,60000,2,Sales,2
3,Max,90000,1,IT,1
4,Janet,69000,1,IT,4
5,Randy,85000,1,IT,2
6,Will,70000,1,IT,3


In [22]:
merged_df.query("salary_rank <= 3")

Unnamed: 0,Department,Salary,departmentId,Employee,salary_rank
0,Joe,85000,1,IT,2
1,Henry,80000,2,Sales,1
2,Sam,60000,2,Sales,2
3,Max,90000,1,IT,1
5,Randy,85000,1,IT,2
6,Will,70000,1,IT,3
