In [1]:
import pandas as pd

In [33]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emma'],
    'Age': [25, 30, 22, 35, 28],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Boston'],
    'Salary': [50000, 60000, 45000, 70000, 55000],
    'Department': ['HR', 'IT', 'Marketing', 'IT', 'HR']
}

In [36]:
df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25,New York,50000,HR
1,Bob,30,Los Angeles,60000,IT
2,Charlie,22,Chicago,45000,Marketing
3,David,35,Houston,70000,IT
4,Emma,28,Boston,55000,HR


- First 3 rows of the DataFrame.
- List of column names: ['Name', 'Age', 'City', 'Salary', 'Department'].
- Data types for each column.
- Descriptive statistics for Age and Salary.

In [4]:
df.iloc[:3]

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25,New York,50000,HR
1,Bob,30,Los Angeles,60000,IT
2,Charlie,22,Chicago,45000,Marketing


In [5]:
df.columns.tolist()

['Name', 'Age', 'City', 'Salary', 'Department']

In [6]:
df.dtypes

Name          object
Age            int64
City          object
Salary         int64
Department    object
dtype: object

In [91]:
df[['Age', 'Salary']].describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,28.0,56000.0
std,4.949747,9617.692031
min,22.0,45000.0
25%,25.0,50000.0
50%,28.0,55000.0
75%,30.0,60000.0
max,35.0,70000.0


- DataFrame with rows where Salary > 55000.
- DataFrame with rows where Department == 'IT'.
- DataFrame with rows where Age > 25 and Department == 'HR'.

In [8]:
df.loc[df.Salary > 55000]

Unnamed: 0,Name,Age,City,Salary,Department
1,Bob,30,Los Angeles,60000,IT
3,David,35,Houston,70000,IT


In [9]:
df.loc[df.Department == 'IT']

Unnamed: 0,Name,Age,City,Salary,Department
1,Bob,30,Los Angeles,60000,IT
3,David,35,Houston,70000,IT


In [10]:
df.loc[(df.Age > 25) & (df.Department == 'HR')]

Unnamed: 0,Name,Age,City,Salary,Department
4,Emma,28,Boston,55000,HR


- Add a new record
- DataFrame sorted by Salary (highest to lowest).
- DataFrame sorted by Age (lowest to highest), with ties broken by Salary (highest to lowest).

In [11]:
df.loc[5] = ['Nagi', 22, 'Kansai', 60000, 'Ace']

In [12]:
df.sort_values(by='Salary', ascending=False)

Unnamed: 0,Name,Age,City,Salary,Department
3,David,35,Houston,70000,IT
1,Bob,30,Los Angeles,60000,IT
5,Nagi,22,Kansai,60000,Ace
4,Emma,28,Boston,55000,HR
0,Alice,25,New York,50000,HR
2,Charlie,22,Chicago,45000,Marketing


In [13]:
df.sort_values(by=['Age', 'Salary'], ascending=[True, False])

Unnamed: 0,Name,Age,City,Salary,Department
5,Nagi,22,Kansai,60000,Ace
2,Charlie,22,Chicago,45000,Marketing
0,Alice,25,New York,50000,HR
4,Emma,28,Boston,55000,HR
1,Bob,30,Los Angeles,60000,IT
3,David,35,Houston,70000,IT


In [14]:
df.sort_values(by=['Age', 'Salary'], ascending=[False, True])

Unnamed: 0,Name,Age,City,Salary,Department
3,David,35,Houston,70000,IT
1,Bob,30,Los Angeles,60000,IT
4,Emma,28,Boston,55000,HR
0,Alice,25,New York,50000,HR
2,Charlie,22,Chicago,45000,Marketing
5,Nagi,22,Kansai,60000,Ace


- DataFrame with a new Age_Group column ('Young' or 'Adult').
- DataFrame with updated Salary values (original + 5000).
- DataFrame with a new Bonus column (e.g., 10% of Salary).

In [118]:
df['Salary'] = df['Salary'] + 5000

In [93]:
df['Age_Group'] = df.Age.map(lambda age: 'Adult' if age > 25 else 'Young')

In [94]:
df

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group
0,Alice,25,New York,55000,HR,Young
1,Bob,30,Los Angeles,65000,IT,Adult
2,Charlie,22,Chicago,50000,Marketing,Young
3,David,35,Houston,75000,IT,Adult
4,Emma,28,Boston,60000,HR,Adult


In [95]:
df['Bonus'] = df.Salary.map(lambda x: x* 0.1)

In [96]:
df

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group,Bonus
0,Alice,25,New York,55000,HR,Young,5500.0
1,Bob,30,Los Angeles,65000,IT,Adult,6500.0
2,Charlie,22,Chicago,50000,Marketing,Young,5000.0
3,David,35,Houston,75000,IT,Adult,7500.0
4,Emma,28,Boston,60000,HR,Adult,6000.0


- Calculate the average Salary by Department.
- Find the total Salary for each City.
- Count the number of employees in each Department.

In [37]:
df.groupby('Department').Salary.mean()

Department
HR           52500.0
IT           65000.0
Marketing    45000.0
Name: Salary, dtype: float64

In [117]:
df.groupby('City').Salary.sum()

City
Boston         60000
Chicago        50000
Houston        75000
Los Angeles    65000
New York       55000
Name: Salary, dtype: int64

In [41]:
df.groupby('Department').count()

Unnamed: 0_level_0,Name,Age,City,Salary
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HR,2,2,2,2
IT,2,2,2,2
Marketing,1,1,1,1


- Create a copy of the DataFrame and manually set Salary for 'Charlie' and Age for 'David' to NaN.
- Replace missing Salary values with the mean salary of the DataFrame.
- Drop any rows with missing Age values.

In [98]:
import numpy as np
dfn = df.copy()
dfn.loc[dfn.Name == 'Charlie','Salary'] = np.nan
dfn

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group,Bonus
0,Alice,25,New York,55000.0,HR,Young,5500.0
1,Bob,30,Los Angeles,65000.0,IT,Adult,6500.0
2,Charlie,22,Chicago,,Marketing,Young,5000.0
3,David,35,Houston,75000.0,IT,Adult,7500.0
4,Emma,28,Boston,60000.0,HR,Adult,6000.0


In [99]:
dfn.loc[dfn.Name == 'David', 'Age'] = np.nan
dfn

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group,Bonus
0,Alice,25.0,New York,55000.0,HR,Young,5500.0
1,Bob,30.0,Los Angeles,65000.0,IT,Adult,6500.0
2,Charlie,22.0,Chicago,,Marketing,Young,5000.0
3,David,,Houston,75000.0,IT,Adult,7500.0
4,Emma,28.0,Boston,60000.0,HR,Adult,6000.0


In [104]:
dfn.Age

0    25.0
1    30.0
2    22.0
3     NaN
4    28.0
Name: Age, dtype: float64

In [112]:
s = pd.Series([10, 20, 30, 30, 1])
s.idxmax(), s.idxmin()

(2, 4)

In [103]:
dfn.Age.idxmax(),\
dfn.Age.idxmin()

(1, 2)

In [114]:
dfn.Age.isna().idxmax(),\
dfn.Age.isna().idxmin()

(3, 0)

In [120]:
# dfn.drop(dfn.Age.isna().idxmax(), inplace=True)
dfn.dropna(subset=['Age'], inplace=True)
dfn

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group,Bonus
0,Alice,25.0,New York,55000.0,HR,Young,5500.0
1,Bob,30.0,Los Angeles,65000.0,IT,Adult,6500.0
2,Charlie,22.0,Chicago,,Marketing,Young,5000.0
4,Emma,28.0,Boston,60000.0,HR,Adult,6000.0


In [63]:
dfn.fillna(dfn.Salary.mean())

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25.0,New York,50000.0,HR
1,Bob,30.0,Los Angeles,60000.0,IT
2,Charlie,22.0,Chicago,55000.0,Marketing
4,Emma,28.0,Boston,55000.0,HR


In [71]:
dfn.loc[dfn.Salary.notnull(), 'Salary'].sum() / 3

np.float64(55000.0)

- DataFrame with an additional Manager column, where each employee is matched with their department’s manager.

In [116]:
df

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group,Bonus
0,Alice,25,New York,55000,HR,Young,5500.0
1,Bob,30,Los Angeles,65000,IT,Adult,6500.0
2,Charlie,22,Chicago,50000,Marketing,Young,5000.0
3,David,35,Houston,75000,IT,Adult,7500.0
4,Emma,28,Boston,60000,HR,Adult,6000.0


In [72]:
dept_data = {
    'Department': ['HR', 'IT', 'Marketing'],
    'Manager': ['Anna', 'Mike', 'Lisa']
}
dept_df = pd.DataFrame(dept_data)
dept_df

Unnamed: 0,Department,Manager
0,HR,Anna
1,IT,Mike
2,Marketing,Lisa


In [89]:
data1 = {'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']}
df1 = pd.DataFrame(data1)

data2 = {'ID': [2, 3, 4], 'City': ['New York', 'London', 'Paris']}
df2 = pd.DataFrame(data2)

# Inner merge on 'ID'
merged_df = pd.merge(df1, df2, on='ID', how='inner')
merged_df

Unnamed: 0,ID,Name,City
0,2,Bob,New York
1,3,Charlie,London


In [121]:
pd.merge(df, dept_df, on='Department', how='inner')

Unnamed: 0,Name,Age,City,Salary,Department,Age_Group,Bonus,Manager
0,Alice,25,New York,60000,HR,Young,5500.0,Anna
1,Bob,30,Los Angeles,70000,IT,Adult,6500.0,Mike
2,Charlie,22,Chicago,55000,Marketing,Young,5000.0,Lisa
3,David,35,Houston,80000,IT,Adult,7500.0,Mike
4,Emma,28,Boston,65000,HR,Adult,6000.0,Anna
