In [1]:
import pandas as pd

In [2]:
# Employee table: each row carries the id of the department it belongs to
employee = pd.DataFrame(
    [
        (1, "Joe", 70000, 1),
        (2, "Jim", 90000, 1),
        (3, "Henry", 80000, 2),
        (4, "Sam", 60000, 2),
        (5, "Max", 90000, 1),
    ],
    columns=["id", "name", "salary", "departmentId"],
)

# Department lookup table keyed by id
department = pd.DataFrame(
    [(1, "IT"), (2, "Sales")],
    columns=["id", "name"],
)

In [3]:
# Show both source tables, one after the other, before joining them
print(employee, department, sep="\n")

   id   name  salary  departmentId
0   1    Joe   70000             1
1   2    Jim   90000             1
2   3  Henry   80000             2
3   4    Sam   60000             2
4   5    Max   90000             1
   id   name
0   1     IT
1   2  Sales


In [5]:
# Inner-join every employee to its department row. Both tables have "id" and
# "name" columns, so the suffixes keep the two sides apart in the result.
merge_df = pd.merge(
    employee,
    department,
    how="inner",
    left_on="departmentId",
    right_on="id",
    suffixes=("_emp", "_dept"),
)
merge_df

Unnamed: 0,id_emp,name_emp,salary,departmentId,id_dept,name_dept
0,1,Joe,70000,1,1,IT
1,2,Jim,90000,1,1,IT
2,3,Henry,80000,2,2,Sales
3,4,Sam,60000,2,2,Sales
4,5,Max,90000,1,1,IT


In [7]:
# Highest salary per department name, kept as ordinary columns so the result
# can be merged back onto merge_df
max_salaries = merge_df.groupby("name_dept", as_index=False)["salary"].max()
max_salaries

Unnamed: 0,name_dept,salary
0,IT,90000
1,Sales,80000


In [9]:
# Keep only the employees whose (department, salary) pair matches that
# department's maximum; ties for the top salary are all retained.
results_df = pd.merge(
    merge_df,
    max_salaries,
    how="inner",
    on=["name_dept", "salary"],
)
results_df


Unnamed: 0,id_emp,name_emp,salary,departmentId,id_dept,name_dept
0,2,Jim,90000,1,1,IT
1,3,Henry,80000,2,2,Sales
2,5,Max,90000,1,1,IT


In [15]:
# Switch to presentation-style column headers
results_df = results_df.rename(
    columns=dict(name_emp="Employee", name_dept="Department", salary="Salary")
)

In [16]:
# Report only the three presentation columns, in this order
print(results_df.loc[:, ["Employee", "Department", "Salary"]])

  Employee Department  Salary
0      Jim         IT   90000
1    Henry      Sales   80000
2      Max         IT   90000


***We can handle ranking scores*** in a *Pandas DataFrame* using the ```.rank()``` method. 
- It assigns a rank to each row based on the values in a column.
- It can handle ties in several different ways, chosen with the `method` parameter.
  - *Examples in Ranking Scores*

In [22]:
import pandas as pd

# Five students; Alice and Charlie share the top score, so their ranks tie
df = pd.DataFrame(
    [("Alice", 90), ("Bob", 85), ("Charlie", 90), ("David", 80), ("Eve", 75)],
    columns=["name", "score"],
)
# method="average" is the default: tied rows share the mean of the ranks they span
df["rank"] = df["score"].rank(method="average", ascending=False)
print(df)


      name  score  rank
0    Alice     90   1.5
1      Bob     85   3.0
2  Charlie     90   1.5
3    David     80   4.0
4      Eve     75   5.0


**Ranking Methods in ```.rank()```**

*You can control how ties are handled using the ```method``` parameter:*
* ```average``` (default) → Tied values all get the average of the ranks they span.
* ```min``` → Tied values all get the lowest rank in their group.
* ```max``` → Tied values all get the highest rank in their group.
* ```first``` → Ties are broken by order of appearance, so every row gets a distinct rank.
* ```dense``` → Like ```min```, but the next rank after a tie is not skipped (no gaps in the ranking).


In [23]:
# method="min": tied scores all take the lowest rank of their group, leaving a gap afterwards
df["rank_min"] = df["score"].rank(method="min", ascending=False)
print(df)


      name  score  rank  rank_min
0    Alice     90   1.5       1.0
1      Bob     85   3.0       3.0
2  Charlie     90   1.5       1.0
3    David     80   4.0       4.0
4      Eve     75   5.0       5.0


In [24]:
# method="max": tied scores all take the highest rank of their group (2.0 for the two 90s)
df["rank_max"] = df["score"].rank(method="max", ascending=False)
print(df)

      name  score  rank  rank_min  rank_max
0    Alice     90   1.5       1.0       2.0
1      Bob     85   3.0       3.0       3.0
2  Charlie     90   1.5       1.0       2.0
3    David     80   4.0       4.0       4.0
4      Eve     75   5.0       5.0       5.0


In [25]:
# method="dense": ties share a rank and the following rank is not skipped (85 gets 2.0)
df["rank_dense"] = df["score"].rank(method="dense", ascending=False)
print(df)

      name  score  rank  rank_min  rank_max  rank_dense
0    Alice     90   1.5       1.0       2.0         1.0
1      Bob     85   3.0       3.0       3.0         2.0
2  Charlie     90   1.5       1.0       2.0         1.0
3    David     80   4.0       4.0       4.0         3.0
4      Eve     75   5.0       5.0       5.0         4.0


In [32]:
# Sample scores with two tied pairs (4.00 and 3.65) to exercise dense ranking
scores_df = pd.DataFrame(
    [(1, 3.50), (2, 3.65), (3, 4.00), (4, 3.85), (5, 4.00), (6, 3.65)],
    columns=["id", "score"],
)

In [61]:
def order_scores(scores: pd.DataFrame) -> pd.DataFrame:
    """Return the scores from highest to lowest together with a dense rank.

    Equal scores share a rank, and the next distinct score receives the next
    consecutive rank, so the ranking has no gaps. The work is done on a sorted
    copy, so the frame passed in is left as it was.

    Args:
        scores: Frame that contains a ``score`` column.

    Returns:
        A frame with the columns ``score`` and ``rank`` ordered by descending
        score. ``rank`` holds the float values produced by ``Series.rank``.
        The relative order of rows with equal scores is not specified.
    """
    ordered = scores.sort_values("score", ascending=False)
    dense_rank = ordered["score"].rank(method="dense", ascending=False)
    ordered = ordered.assign(rank=dense_rank)
    return ordered[["score", "rank"]]


In [62]:
# Rank the sample scores; the returned frame is displayed as the cell output
order_scores(scores_df)

Unnamed: 0,score,rank
4,4.0,1.0
2,4.0,1.0
3,3.85,2.0
1,3.65,3.0
5,3.65,3.0
0,3.5,4.0


Table: Person

+-------------+---------+
| Column Name | Type    |
+-------------+---------+
| id          | int     |
| email       | varchar |
+-------------+---------+
id is the primary key (column with unique values) for this table.
Each row of this table contains an email. The emails will not contain uppercase letters.
 

Write a solution to delete all duplicate emails, keeping only one unique email with the smallest id.

For SQL users, please note that you are supposed to write a DELETE statement and not a SELECT one.

For Pandas users, please note that you are supposed to modify Person in place.

After running your script, the answer shown is the Person table. The driver will first compile and run your piece of code and then show the Person table. The final order of the Person table does not matter.

The result format is in the following example.

 

Example 1:

Input: 
Person table:
+----+------------------+
| id | email            |
+----+------------------+
| 1  | john@example.com |
| 2  | bob@example.com  |
| 3  | john@example.com |
+----+------------------+
Output: 
+----+------------------+
| id | email            |
+----+------------------+
| 1  | john@example.com |
| 2  | bob@example.com  |
+----+------------------+
Explanation: john@example.com is repeated two times. We keep the row with the smallest Id = 1.

In [None]:
import pandas as pd

def delete_duplicate_emails(Person: pd.DataFrame) -> None:
    """Remove repeated emails from ``Person`` in place, keeping the smallest id.

    Args:
        Person: Frame with ``id`` and ``email`` columns. It is modified in
            place: rows end up sorted by ``id`` and every email appears once.

    Returns:
        None. The caller's frame itself is changed.
    """
    # Put the rows in ascending id order so that the first occurrence of each
    # email is the one with the smallest id.
    Person.sort_values("id", inplace=True)
    # drop_duplicates keeps the first occurrence by default, i.e. the smallest id.
    Person.drop_duplicates(["email"], inplace=True)


In [64]:
import pandas as pd

# Wide layout: one row per student, one column per subject
df = pd.DataFrame(
    [
        (1, "Alice", 90, 92),
        (2, "Bob", 85, 80),
        (3, "Charlie", 88, 87),
    ],
    columns=["id", "name", "math_score", "science_score"],
)

# Unpivot the subject columns: each (student, subject) pair becomes its own row
long_df = pd.melt(df, id_vars=["id", "name"], var_name="subject", value_name="score")

# Long result first, then the wide original, which melt leaves unchanged
print(long_df)
print(df)


   id     name        subject  score
0   1    Alice     math_score     90
1   2      Bob     math_score     85
2   3  Charlie     math_score     88
3   1    Alice  science_score     92
4   2      Bob  science_score     80
5   3  Charlie  science_score     87
   id     name  math_score  science_score
0   1    Alice          90             92
1   2      Bob          85             80
2   3  Charlie          88             87


In [66]:
# Wide price table: one column per store; product 1 has no price at store2
product = pd.DataFrame({
    "product_id": [0, 1],
    "store1": [95, 70],
    "store2": [100, float("nan")],  # missing price, held as NaN in a float column
    "store3": [105, 80],
})

In [67]:
# Unpivot the store columns into (store, price) rows, one per product and store
df_long = pd.melt(product, id_vars=["product_id"], var_name="store", value_name="price")

In [68]:
# Inspect the melted frame; the store2 price that product 1 lacks shows up as NaN
print(df_long)

   product_id   store  price
0           0  store1   95.0
1           1  store1   70.0
2           0  store2  100.0
3           1  store2    NaN
4           0  store3  105.0
5           1  store3   80.0


In [70]:
# Drop the row with the missing price, then bring each product's rows together.
# Neither call modifies df_long; the cleaned frame is only displayed.
df_long.dropna().sort_values("product_id")

Unnamed: 0,product_id,store,price
0,0,store1,95.0
2,0,store2,100.0
4,0,store3,105.0
1,1,store1,70.0
5,1,store3,80.0


In [None]:
import pandas as pd

def rearrange_products_table(products: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide per-store price table into one row per (product, store).

    Args:
        products: Frame with a ``product_id`` column and one price column per
            store. A missing price means the product is not sold there.

    Returns:
        A new frame with the columns ``product_id``, ``store`` and ``price``,
        sorted by ``product_id``. Rows containing a missing value are dropped.
        The order of rows within one product is not specified.
    """
    # Every column other than product_id is treated as a store column.
    melted = pd.melt(products, id_vars=["product_id"], var_name="store", value_name="price")
    available = melted.dropna()
    return available.sort_values("product_id")

+------------+--------+
| account_id | income |
+------------+--------+
| 3          | 108939 |
| 2          | 12747  |
| 8          | 87709  |
| 6          | 91796  |
+------------+--------+

In [71]:
# Sample accounts taken from the problem's example input
accounts = pd.DataFrame(
    [(3, 108939), (2, 12747), (8, 87709), (6, 91796)],
    columns=["account_id", "income"],
)

In [72]:
# Display the sample accounts frame as plain text
print(accounts)

   account_id  income
0           3  108939
1           2   12747
2           8   87709
3           6   91796


Table: Accounts

+-------------+------+
| Column Name | Type |
+-------------+------+
| account_id  | int  |
| income      | int  |
+-------------+------+
account_id is the primary key (column with unique values) for this table.
Each row contains information about the monthly income for one bank account.
 

Write a solution to calculate the number of bank accounts for each salary category. The salary categories are:

"Low Salary": All the salaries strictly less than $20000.
"Average Salary": All the salaries in the inclusive range [$20000, $50000].
"High Salary": All the salaries strictly greater than $50000.
The result table must contain all three categories. If there are no accounts in a category, return 0.

Return the result table in any order.

The result format is in the following example.

 

Example 1:

Input: 
Accounts table:
+------------+--------+
| account_id | income |
+------------+--------+
| 3          | 108939 |
| 2          | 12747  |
| 8          | 87709  |
| 6          | 91796  |
+------------+--------+
Output: 
+----------------+----------------+
| category       | accounts_count |
+----------------+----------------+
| Low Salary     | 1              |
| Average Salary | 0              |
| High Salary    | 3              |
+----------------+----------------+
Explanation: 
Low Salary: Account 2.
Average Salary: No accounts.
High Salary: Accounts 3, 6, and 8.

In [None]:

def count_salary_categories(
    accounts: pd.DataFrame,
    low_threshold: float = 20000,
    high_threshold: float = 50000,
) -> pd.DataFrame:
    """Count how many accounts fall into each salary category.

    Categories, using the default thresholds:
      * "Low Salary": income strictly below ``low_threshold`` (20000).
      * "Average Salary": income in the inclusive range
        [``low_threshold``, ``high_threshold``] (20000 to 50000).
      * "High Salary": income strictly above ``high_threshold`` (50000).

    Args:
        accounts: Frame with an ``income`` column.
        low_threshold: Lower bound of the average band (inclusive).
        high_threshold: Upper bound of the average band (inclusive).

    Returns:
        A three-row frame with the columns ``category`` and
        ``accounts_count``, in the order Low, Average, High. A category with
        no accounts is reported with a count of 0.

    Raises:
        ValueError: If ``low_threshold`` is greater than ``high_threshold``,
            because the categories would then overlap.
    """
    if low_threshold > high_threshold:
        raise ValueError("low_threshold must not be greater than high_threshold")

    income = accounts["income"]
    # Summing a boolean mask counts its True entries and yields 0 when nothing
    # matches, so every category appears even when it is empty.
    counts = {
        "Low Salary": (income < low_threshold).sum(),
        "Average Salary": ((income >= low_threshold) & (income <= high_threshold)).sum(),
        "High Salary": (income > high_threshold).sum(),
    }
    return pd.DataFrame(
        {"category": list(counts), "accounts_count": list(counts.values())}
    )