In [9]:
import pandas as pd
import numpy as np

#### Data
First, create three DataFrames:
- employee data
- department data
- salary data

In [10]:
# Sample tables for the join examples, written one record per line.
# Mismatches planted on purpose:
#   * department 104 (HR) has no employees
#   * emp_id 5 (David) has no salary row
#   * emp_id 6 has a salary row but does not appear in `employees`

# Employees: (emp_id, name, department_id)
employees = pd.DataFrame(
    [
        (1, 'John', 101),
        (2, 'Sarah', 102),
        (3, 'Mike', 101),
        (4, 'Lisa', 103),
        (5, 'David', 102),
    ],
    columns=['emp_id', 'name', 'department_id'],
)

# Departments: (department_id, department_name, location)
departments = pd.DataFrame(
    [
        (101, 'Engineering', 'NYC'),
        (102, 'Marketing', 'SF'),
        (103, 'Sales', 'Chicago'),
        (104, 'HR', 'Boston'),
    ],
    columns=['department_id', 'department_name', 'location'],
)

# Salaries: (emp_id, salary, bonus)
salaries = pd.DataFrame(
    [
        (1, 75000, 5000),
        (2, 82000, 6000),
        (3, 78000, 5500),
        (4, 85000, 7000),
        (6, 77000, 4000),
    ],
    columns=['emp_id', 'salary', 'bonus'],
)

In [11]:
# Show the three source tables, each under its own heading
print("Original DataFrames:")
for heading, frame in (
    ("\nEmployees:", employees),
    ("\nDepartments:", departments),
    ("\nSalaries:", salaries),
):
    print(heading)
    print(frame)

Original DataFrames:

Employees:
   emp_id   name  department_id
0       1   John            101
1       2  Sarah            102
2       3   Mike            101
3       4   Lisa            103
4       5  David            102

Departments:
   department_id department_name location
0            101     Engineering      NYC
1            102       Marketing       SF
2            103           Sales  Chicago
3            104              HR   Boston

Salaries:
   emp_id  salary  bonus
0       1   75000   5000
1       2   82000   6000
2       3   78000   5500
3       4   85000   7000
4       6   77000   4000


In [12]:
# 1. Inner join: a row survives only when its department_id is present in
# both frames, so department 104 (which has no employees) is absent from the result.
inner_join = pd.merge(
    left=employees,
    right=departments,
    how='inner',
    on='department_id',
)
print("\n1. Inner Join (employees & departments):")
print(type(inner_join))
print(inner_join)


1. Inner Join (employees & departments):
<class 'pandas.core.frame.DataFrame'>
   emp_id   name  department_id department_name location
0       1   John            101     Engineering      NYC
1       2  Sarah            102       Marketing       SF
2       3   Mike            101     Engineering      NYC
3       4   Lisa            103           Sales  Chicago
4       5  David            102       Marketing       SF


In [13]:
# 2. Left join: every row of `employees` is kept; an employee whose
# department_id had no match in `departments` would get NaN department columns.
# (Every employee here has a matching department, so the rows equal the inner join.)
left_join = pd.merge(
    left=employees,
    right=departments,
    how='left',
    on='department_id',
)
print("\n2. Left Join (employees & departments):")
print(left_join)


2. Left Join (employees & departments):
   emp_id   name  department_id department_name location
0       1   John            101     Engineering      NYC
1       2  Sarah            102       Marketing       SF
2       3   Mike            101     Engineering      NYC
3       4   Lisa            103           Sales  Chicago
4       5  David            102       Marketing       SF


In [14]:
# 3. Right join: every row of `departments` is kept; department 104 has no
# employees, so its emp_id and name come back as NaN.
right_join = pd.merge(
    left=employees,
    right=departments,
    how='right',
    on='department_id',
)
print("\n3. Right Join (employees & departments):")
print(right_join)


3. Right Join (employees & departments):
   emp_id   name  department_id department_name location
0     1.0   John            101     Engineering      NYC
1     3.0   Mike            101     Engineering      NYC
2     2.0  Sarah            102       Marketing       SF
3     5.0  David            102       Marketing       SF
4     4.0   Lisa            103           Sales  Chicago
5     NaN    NaN            104              HR   Boston


In [15]:
# 4. Outer join: rows from both frames are all kept, with NaN filling the
# columns of whichever side has no match for a given department_id.
outer_join = pd.merge(
    left=employees,
    right=departments,
    how='outer',
    on='department_id',
)
print("\n4. Outer Join (employees & departments):")
print(outer_join)


4. Outer Join (employees & departments):
   emp_id   name  department_id department_name location
0     1.0   John            101     Engineering      NYC
1     3.0   Mike            101     Engineering      NYC
2     2.0  Sarah            102       Marketing       SF
3     5.0  David            102       Marketing       SF
4     4.0   Lisa            103           Sales  Chicago
5     NaN    NaN            104              HR   Boston


In [16]:
# 5. Multiple joins: combine all three tables into one row per employee.
# Both steps are left joins anchored on `employees`, so an employee without a
# salary row (emp_id 5) is kept with NaN salary/bonus, while a salary row for an
# emp_id that is not in `employees` (emp_id 6) is dropped.
# validate='many_to_one' makes pandas raise MergeError if a lookup table
# (`departments` or `salaries`) ever contains duplicate keys, instead of
# silently duplicating employee rows and skewing later aggregations.
complete_data = (
    employees
    .merge(departments, on='department_id', how='left', validate='many_to_one')
    .merge(salaries, on='emp_id', how='left', validate='many_to_one')
)
print("\n5. Multiple table join (employees + departments + salaries):")
print(complete_data)


5. Multiple table join (employees + departments + salaries):
   emp_id   name  department_id department_name location   salary   bonus
0       1   John            101     Engineering      NYC  75000.0  5000.0
1       2  Sarah            102       Marketing       SF  82000.0  6000.0
2       3   Mike            101     Engineering      NYC  78000.0  5500.0
3       4   Lisa            103           Sales  Chicago  85000.0  7000.0
4       5  David            102       Marketing       SF      NaN     NaN


In [17]:
# 6. Advanced: aggregate the joined table per department.
# salary gets four statistics and bonus only its mean, so the result carries
# two-level column labels (column, statistic). Values are rounded to 2 decimals.
# 'count' only counts non-null salaries: Marketing shows 1 because David has
# no salary row.
salary_bonus_stats = {
    'salary': ['mean', 'min', 'max', 'count'],
    'bonus': 'mean',
}
by_department = complete_data.groupby('department_name')
dept_stats = by_department.agg(salary_bonus_stats).round(2)

print("\n6. Department Statistics:")
print(dept_stats)


6. Department Statistics:
                  salary                           bonus
                    mean      min      max count    mean
department_name                                         
Engineering      76500.0  75000.0  78000.0     2  5250.0
Marketing        82000.0  82000.0  82000.0     1  6000.0
Sales            85000.0  85000.0  85000.0     1  7000.0
