In [None]:
import pandas as pd

In [None]:
# Employee table: one row per employee; departmentId points at department.id
employee = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4, 5],
        name=["Joe", "Jim", "Henry", "Sam", "Max"],
        salary=[70000, 90000, 80000, 60000, 90000],
        departmentId=[1, 1, 2, 2, 1],
    )
)

# Department table: department id -> department name
department = pd.DataFrame(dict(id=[1, 2], name=["IT", "Sales"]))

In [None]:
print(employee)
print(department)

In [None]:
# Attach the matching department row to every employee. Both frames have
# `id` and `name` columns, so the suffixes keep them apart
# (id_emp/name_emp vs. id_dept/name_dept).
merge_df = employee.merge(
    department,
    left_on="departmentId",
    right_on="id",
    suffixes=("_emp", "_dept"),
)
merge_df

In [None]:
# Highest salary per department, with the group key turned back into a column
max_salaries = (
    merge_df
    .groupby("name_dept")["salary"]
    .max()
    .reset_index()
)
max_salaries

In [None]:
# Join back on (department, salary): only employees earning their
# department's maximum salary survive the merge.
results_df = merge_df.merge(
    max_salaries,
    on=["name_dept", "salary"],
)
results_df


In [None]:
# Switch to the column labels expected in the final report
results_df = results_df.rename(
    columns={
        "name_emp": "Employee",
        "name_dept": "Department",
        "salary": "Salary",
    }
)

In [None]:
print(results_df[['Employee','Department','Salary']])

***We can handle ranking scores*** in a *Pandas DataFrame* using the ```.rank()``` method. 
- It allows you to assign ranks based on values in a column, 
- It handles ties in different ways.
  - *Examples in Ranking Scores*

In [None]:
import pandas as pd

# Five students; Alice and Charlie share the top score
df = pd.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
        "score": [90, 85, 90, 80, 75],
    }
)
# Highest score gets rank 1; with the default tie rule ("average") tied rows
# share the mean of the positions they occupy.
df["rank"] = df["score"].rank(method="average", ascending=False)
print(df)


**Ranking Methods in ```.rank()```**
*You can control how ties are handled using the method parameter:*
* ```average``` (default) → Assigns the average rank to tied values.
* ```min``` → Tied values get the lowest rank.
* ```max``` → Tied values get the highest rank.
* ```first``` → Ranks in the order they appear.
* ```dense``` → Like min, but without gaps in ranking.


In [None]:
# method="min": every member of a tied group gets the group's lowest rank
df["rank_min"] = df["score"].rank(method="min", ascending=False)
print(df)


In [None]:
# method="max": every member of a tied group gets the group's highest rank
df["rank_max"] = df["score"].rank(method="max", ascending=False)
print(df)

In [None]:
# method="dense": like "min", but the next distinct value gets the next rank (no gaps)
df["rank_dense"] = df["score"].rank(method="dense", ascending=False)
print(df)

In [None]:
# Sample input for order_scores: six ids, with ties at 4.00 and 3.65
scores_df = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4, 5, 6],
        score=[3.50, 3.65, 4.00, 3.85, 4.00, 3.65],
    )
)

In [None]:
def order_scores(scores:pd.DataFrame) -> pd.DataFrame:
    """Return the scores from highest to lowest together with their dense rank.

    Args:
        scores: frame with a ``score`` column. It is not modified.

    Returns:
        A new frame with the columns ``score`` and ``rank``, sorted by
        ``score`` in descending order. Equal scores share a rank and the
        ranks have no gaps (``method='dense'``). ``rank`` holds the float
        values produced by ``Series.rank``.
    """
    ordered = scores.sort_values(by="score", ascending=False)
    ordered = ordered.assign(
        rank=ordered["score"].rank(method="dense", ascending=False)
    )
    return ordered.loc[:, ["score", "rank"]]


In [None]:
order_scores(scores=scores_df)

Table: Person

+-------------+---------+
| Column Name | Type    |
+-------------+---------+
| id          | int     |
| email       | varchar |
+-------------+---------+
id is the primary key (column with unique values) for this table.
Each row of this table contains an email. The emails will not contain uppercase letters.
 

Write a solution to delete all duplicate emails, keeping only one unique email with the smallest id.

For SQL users, please note that you are supposed to write a DELETE statement and not a SELECT one.

For Pandas users, please note that you are supposed to modify Person in place.

After running your script, the answer shown is the Person table. The driver will first compile and run your piece of code and then show the Person table. The final order of the Person table does not matter.

The result format is in the following example.

 

Example 1:

Input: 
Person table:
+----+------------------+
| id | email            |
+----+------------------+
| 1  | john@example.com |
| 2  | bob@example.com  |
| 3  | john@example.com |
+----+------------------+
Output: 
+----+------------------+
| id | email            |
+----+------------------+
| 1  | john@example.com |
| 2  | bob@example.com  |
+----+------------------+
Explanation: john@example.com is repeated two times. We keep the row with the smallest Id = 1.

In [None]:
import pandas as pd

def delete_duplicate_emails(Person: pd.DataFrame) -> None:
    """Remove rows with a repeated email from ``Person``, keeping the smallest id.

    The frame is modified in place and nothing is returned. As a side
    effect the remaining rows are left sorted by ``id``.

    Args:
        Person: frame with ``id`` and ``email`` columns.
    """
    # Sort by id first so that "first occurrence" below means "smallest id".
    Person.sort_values(by=["id"], inplace=True)
    Person.drop_duplicates(subset=["email"], keep="first", inplace=True)


In [None]:
import pandas as pd

# Wide format: one row per student, one column per subject score
df = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        name=["Alice", "Bob", "Charlie"],
        math_score=[90, 85, 88],
        science_score=[92, 80, 87],
    )
)

# Wide -> long: id/name are kept as identifiers, every other column becomes
# one (subject, score) row per student.
long_df = df.melt(
    id_vars=["id", "name"],
    var_name="subject",
    value_name="score",
)

print(long_df)
print(df)


In [None]:
# Wide price table: one column per store; None marks a product a store does not carry
product = pd.DataFrame(
    dict(
        product_id=[0, 1],
        store1=[95, 70],
        store2=[100, None],
        store3=[105, 80],
    )
)

In [None]:
# One row per (product, store) pair; the store column names become values of `store`
df_long = product.melt(
    id_vars=["product_id"],
    var_name="store",
    value_name="price",
)

In [None]:
print(df_long)

In [None]:
# Drop the rows with a missing value, then order by product (returns a new frame)
df_long.dropna().sort_values(by="product_id")

In [None]:
import pandas as pd

def rearrange_products_table(products: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide per-store price table into long format.

    Args:
        products: frame with a ``product_id`` column; every other column is
            melted into a (``store``, ``price``) pair.

    Returns:
        A new frame with the columns ``product_id``, ``store`` and
        ``price``, without the rows that contain a missing value, sorted by
        ``product_id``.
    """
    reshaped = products.melt(
        id_vars=["product_id"],
        var_name="store",
        value_name="price",
    )
    available = reshaped.dropna()
    return available.sort_values(by="product_id")

+------------+--------+
| account_id | income |
+------------+--------+
| 3          | 108939 |
| 2          | 12747  |
| 8          | 87709  |
| 6          | 91796  |
+------------+--------+

In [None]:
# Sample Accounts table from the problem statement
accounts = pd.DataFrame(
    dict(
        account_id=[3, 2, 8, 6],
        income=[108939, 12747, 87709, 91796],
    )
)

In [None]:
print(accounts)

Table: Accounts

+-------------+------+
| Column Name | Type |
+-------------+------+
| account_id  | int  |
| income      | int  |
+-------------+------+
account_id is the primary key (column with unique values) for this table.
Each row contains information about the monthly income for one bank account.
 

Write a solution to calculate the number of bank accounts for each salary category. The salary categories are:

"Low Salary": All the salaries strictly less than $20000.
"Average Salary": All the salaries in the inclusive range [$20000, $50000].
"High Salary": All the salaries strictly greater than $50000.
The result table must contain all three categories. If there are no accounts in a category, return 0.

Return the result table in any order.

The result format is in the following example.

 

Example 1:

Input: 
Accounts table:
+------------+--------+
| account_id | income |
+------------+--------+
| 3          | 108939 |
| 2          | 12747  |
| 8          | 87709  |
| 6          | 91796  |
+------------+--------+
Output: 
+----------------+----------------+
| category       | accounts_count |
+----------------+----------------+
| Low Salary     | 1              |
| Average Salary | 0              |
| High Salary    | 3              |
+----------------+----------------+
Explanation: 
Low Salary: Account 2.
Average Salary: No accounts.
High Salary: Accounts 3, 6, and 8.

In [None]:

def count_salary_categories(accounts: pd.DataFrame) -> pd.DataFrame:
    """Count the accounts that fall into each salary category.

    Categories (based on the ``income`` column):
        * "Low Salary":     income < 20000
        * "Average Salary": 20000 <= income <= 50000
        * "High Salary":    income > 50000

    Args:
        accounts: frame with an ``income`` column.

    Returns:
        A three-row frame with the columns ``category`` and
        ``accounts_count``. Every category is listed, with a count of 0
        when no account falls into it.
    """
    income = accounts["income"]
    rows = [
        ("Low Salary", income.lt(20000).sum()),
        # Series.between includes both bounds by default
        ("Average Salary", income.between(20000, 50000).sum()),
        ("High Salary", income.gt(50000).sum()),
    ]
    return pd.DataFrame(rows, columns=["category", "accounts_count"])

**How to convert dictionary items into pandas DataFrame**

In [None]:
# A plain dict of equal-length lists: each key will become a DataFrame column
customers = dict(
    customer_id=[1, 2, 3, 4],
    name=["Sami", "Dave", "Dagi", "Beti"],
    city=["AA", "DD", "HW", "BD"],
)

In [None]:
print(customers)

In [None]:
# Keys become column labels, each list becomes that column's values
customers_df = pd.DataFrame(data=customers)
print(customers_df)

In [None]:
from collections import Counter
# Counter tallies how many times each item occurs in the list
data = ["pc", "phone", "mac", "pc", "mac", "mac", "pc"]
Counter(data)

In [None]:
# Dictionary mapping each student's name to a score
grades = dict(
    Alice=85,
    Bob=92,
    Charlie=78,
)

In [None]:
grades['David'] = 98

In [None]:
print(grades)
grades['Alice']=89

In [None]:
grades['Bob']=90 # Modifying the values for Bob
grades.pop('David')

In [None]:
print(grades)

In [None]:
# Each (name, score) pair from the dict becomes one row
grades_df = pd.DataFrame(
    grades.items(),
    columns=["name", "score"],
)

In [None]:
print(grades_df)

In [None]:
from collections import Counter
# Count how often each word appears
words = [
    "apple", "banana", "apple", "orange",
    "banana", "apple", "grape",
]
words_count = Counter(words)
print(words_count)

In [None]:
words_count.items()

In [None]:
# One row per (word, occurrences) pair. The first column label was misspelled
# as 'firut_item'; it is now 'fruit_item'.
pd.DataFrame(words_count.items(), columns=['fruit_item', 'count'])

In [None]:
from collections import defaultdict
# defaultdict(int) supplies 0 for any key that has not been stored yet
count = defaultdict(int)
count["banana"] += 1
# Reading a missing key returns the default (0) and also stores it in the dict
print(count["apple"])

In [None]:
# Department name -> count (used below as the number of students)
department = dict(
    Math=15,
    Phy=12,
    Chemy=10,
    Bio=6,
)

In [None]:
print(department.items())

In [None]:
import pandas as pd
# (department, count) pairs -> two-column frame
dept_items = department.items()
dept_df = pd.DataFrame(
    dept_items,
    columns=["Department", "Number_of_students"],
)
print(dept_df)

- pivot, melt, groupby, merge, join, dict, str manipulation, str.contains(),
- rename table column names
- convert dict to pd.DataFrame
dict.items()
drop_duplicates(subset='column_name', keep='first', inplace=True)
dropna()
sort_values()
.max().reset_index()



In [1]:
import pandas as pd

Assume we have employee and department tables,
where departmentId is a foreign key in the employee table.

In [49]:
# Employee table; departmentId refers to department.id
employee = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4, 5],
        name=["dani", "josi", "babe", "sol", "mar"],
        departmentId=[1, 3, 2, 3, 1],
        salary=[2500, 7000, 4000, 7000, 5200],
    )
)

In [14]:
# Department table: id -> department name
department = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        name=["sales", "marketing", "IT"],
    )
)

Now, using these tables, display a table:
-  containing the maximum salary in each department
-  and the corresponding department.

Your display should contain a table with three columns:
column_1 = 'Employee', column_2 = 'Department' and column_3 = 'Salary'

In [24]:
def max_salaries(employee:pd.DataFrame, department:pd.DataFrame) -> pd.DataFrame:
    """List the employee(s) earning the highest salary in each department.

    Args:
        employee: frame with ``id``, ``name``, ``departmentId`` and
            ``salary`` columns.
        department: frame with ``id`` and ``name`` columns.

    Returns:
        A frame with the columns ``Employee``, ``Department`` and
        ``Salary``, sorted by ``Salary`` from highest to lowest. Employees
        tied for a department's maximum are all included.
    """
    # Both inputs have `id`/`name`, so the suffixes give name_emp / name_dept.
    staffed = employee.merge(
        department,
        left_on="departmentId",
        right_on="id",
        suffixes=("_emp", "_dept"),
    )
    dept_peak = staffed.groupby("name_dept")["salary"].max().reset_index()
    # Joining back on (department, salary) keeps only the top earners.
    top_earners = staffed.merge(dept_peak, on=["name_dept", "salary"])
    labelled = top_earners.rename(
        columns={
            "name_emp": "Employee",
            "name_dept": "Department",
            "salary": "Salary",
        }
    )
    report = labelled[["Employee", "Department", "Salary"]]
    return report.sort_values(by="Salary", ascending=False)


In [25]:
max_salaries(employee=employee,department=department)

Unnamed: 0,Employee,Department,Salary
0,josi,IT,7000
2,sol,IT,7000
3,mar,sales,5200
1,babe,marketing,4000


Find the number of *rows* and *columns* of a pandas DataFrame. 
- Use employee table.

In [60]:
from typing import List

In [64]:
# `df_shape` is not defined by any earlier cell in this notebook, so build it
# here from the `employee` frame: one row holding the row and column counts.
df_shape = pd.DataFrame({
    'no_of_rows': [employee.shape[0]],
    'no_of_columns': [employee.shape[1]]
})
print(df_shape)

   no_of_rows  no_of_columns
0           5              4


In [56]:
print(f"{[df_shape.no_of_rows,df_shape.no_of_columns]}")

[0    5
Name: no_of_rows, dtype: int64, 0    4
Name: no_of_columns, dtype: int64]


In [51]:
print(df_shape)

   no_of_rows  no_of_columns
0           5              4


In [72]:
def no_rows_columns(employee:pd.DataFrame) -> List[int]:
    """Return the dimensions of ``employee`` as ``[rows, columns]``.

    Args:
        employee: any DataFrame.

    Returns:
        A two-element list of plain ``int`` values: the number of rows
        followed by the number of columns.
    """
    # DataFrame.shape already is (rows, columns); no intermediate frame needed.
    n_rows, n_cols = employee.shape
    return [int(n_rows), int(n_cols)]

In [73]:
no_rows_columns(employee=employee)

[5, 4]

In [66]:
# Employee table (same rows as the earlier definition in this notebook)
employee = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4, 5],
        name=["dani", "josi", "babe", "sol", "mar"],
        departmentId=[1, 3, 2, 3, 1],
        salary=[2500, 7000, 4000, 7000, 5200],
    )
)

In [31]:
employee

Unnamed: 0,id,name,departmentId,salary
0,1,dani,1,2500
1,2,josi,3,7000
2,3,babe,2,4000
3,4,sol,3,7000
4,5,mar,1,5200


In [78]:

# Row and column counts come straight from DataFrame.shape, which is the
# (rows, columns) tuple; counting index/column labels is unnecessary.
df_shape1 = pd.DataFrame({
    'no_of_rows':[employee.shape[0]],
    'no_of_columns':[employee.shape[1]]
})
print([int(df_shape1.iloc[0,0]), int(df_shape1.iloc[0,1])])

[5, 4]


* You are given a 0-indexed 2D integer matrix grid of size n * n with values in the range [1, n²]. 
* Each integer appears exactly once except a which appears twice and b which is missing. 
* The task is to find the repeating and missing numbers a and b.

*Return a 0-indexed integer array ans of size 2 where ans[0] equals a and ans[1] equals b.*

 
**Example 1:**

- Input: grid = [[1,3],[2,2]]
- Output: [2,4]
- Explanation: Number 2 is repeated and number 4 is missing so the answer is [2,4].

In [81]:
from collections import Counter

def findErrorNums(grid):
    """Find the repeated and the missing number in an n x n grid.

    The grid is expected to hold the values 1..n*n, each exactly once,
    except that one value appears twice and one value is absent.

    Args:
        grid: list of ``n`` rows, each an iterable of integers.

    Returns:
        ``[repeated, missing]``. An entry is ``None`` when no such number
        exists (for example when the grid is empty or already complete).
    """
    n = len(grid)
    # Tally every value without building an intermediate flattened list.
    count = Counter(num for row in grid for num in row)

    # None (not an empty list) marks "not found", so both entries of the
    # result are always either an int or None.
    repeated = None
    missing = None
    for num in range(1, n * n + 1):
        occurrences = count[num]  # Counter yields 0 for values never seen
        if occurrences == 2:
            repeated = num
        elif occurrences == 0:
            missing = num

    return [repeated, missing]

# Example cases
print(findErrorNums([[1,3],[2,2]]))  # Output: [2, 4]
print(findErrorNums([[9,1,7],[8,9,2],[3,4,6]]))  # Output: [9, 5]


[2, 4]
[9, 5]


In [84]:
def findErrorNums(grid):
    """Find the repeated and the missing number in an n x n grid.

    Note: this cell redefines ``findErrorNums`` and so replaces the
    Counter-based version defined earlier in the notebook.

    Args:
        grid: list of ``n`` rows, each an iterable of integers.

    Returns:
        ``[repeated, missing]`` where ``repeated`` is the last value that
        was seen more than once and ``missing`` is the smallest value in
        1..n*n that never occurs; either entry is ``None`` when not found.
    """
    side = len(grid)
    seen = set()
    repeated = None
    # Single pass in row-major order: a value already in `seen` is a repeat.
    for row in grid:
        for value in row:
            if value in seen:
                repeated = value
            else:
                seen.add(value)

    # First candidate that never showed up, or None if all are present.
    missing = next(
        (candidate for candidate in range(1, side * side + 1) if candidate not in seen),
        None,
    )
    return [repeated, missing]


print(findErrorNums([[1,3],[2,2]]))  # Output: [2, 4]
print(findErrorNums([[9,1,7],[8,9,2],[3,4,6]]))  # Output: [9, 5]


[2, 4]
[9, 5]


In [88]:
# Sample students table
students = pd.DataFrame(
    dict(
        student_id=[101, 2, 3, 4],
        name=["Ade", "Selam", "Debra", "Song"],
        age=[20, 23, 26, 30],
    )
)

In [96]:
def select_id(students:pd.DataFrame, student_id: int = 101) -> pd.DataFrame:
    """Return the name and age of the student(s) with the given id.

    Args:
        students: frame with ``student_id``, ``name`` and ``age`` columns.
        student_id: id to look up. Defaults to 101, the value that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        A frame with the columns ``name`` and ``age`` containing every row
        whose ``student_id`` equals ``student_id`` (empty when none match).
    """
    matches = students["student_id"] == student_id
    return students.loc[matches, ["name", "age"]]

In [97]:
select_id(students=students)

Unnamed: 0,name,age
0,Ade,20


In [98]:
def dropMissingData(students: pd.DataFrame) -> pd.DataFrame:
    """Return ``students`` without the rows that contain a missing value.

    A row is removed when any of its columns is missing. The input frame
    is left untouched; a new frame is returned.
    """
    return students.dropna(axis=0, how="any")