# Pandas Data Transformation Exercises

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Pandas Data Transformation Exercises

In [8]:
# Synthetic daily sales records shared by the exercises below.
# Seeding first makes the random columns reproducible; the random draws are
# made in column order, so the generated values depend on that order.
np.random.seed(42)
sales_data = pd.DataFrame(
    dict(
        date=pd.date_range(start='2024-01-01', periods=100, freq='D'),
        product=np.random.choice(['A', 'B', 'C', 'D'], size=100),
        sales_amount=np.random.uniform(low=100, high=1000, size=100),
        quantity=np.random.randint(low=1, high=20, size=100),
        region=np.random.choice(['North', 'South', 'East', 'West'], size=100),
        customer_type=np.random.choice(['Premium', 'Regular', 'Basic'], size=100),
    )
)
# Uncomment to inspect the generated data:
# print(sales_data)

In [10]:
# Synthetic roster of 50 employees used by the employee-related exercises.
# No reseed happens here: the random columns continue from whatever state
# NumPy's global generator is in when this cell runs.
employees = pd.DataFrame(
    dict(
        employee_id=range(1, 51),
        name=[f'Employee_{n}' for n in range(1, 51)],
        department=np.random.choice(['Sales', 'Marketing', 'IT', 'HR'], size=50),
        salary=np.random.uniform(low=40000, high=120000, size=50),
        # One hire every 30 days, starting 2020-01-01.
        hire_date=pd.date_range(start='2020-01-01', periods=50, freq='30D'),
        performance_score=np.random.uniform(low=3.0, high=5.0, size=50),
    )
)
# Uncomment to inspect the generated data:
# print(employees)

## Column Transformation Exercises

In [None]:
### Exercise 1: Basic Column Creation
# Create a new column called `revenue_per_unit` by dividing `sales_amount` by `quantity`.
# **Expected Output**: New column with calculated values


In [None]:
### Exercise 2: Column Renaming
# Rename the columns in the sales_data DataFrame:
# - `sales_amount` → `revenue`
# - `customer_type` → `customer_segment`
# - `quantity` → `units_sold`



In [None]:
### Exercise 3: Conditional Column Creation
# Create a new column `sales_category` based on sales_amount:
# - 'High' if sales_amount > 700
# - 'Medium' if sales_amount is between 300 and 700 (inclusive)
# - 'Low' if sales_amount < 300


In [None]:
### Exercise 4: String Manipulation
# In the employees DataFrame, create a new column `email` by combining the employee name with '@company.com' 
# (replace spaces with dots and make lowercase).

In [None]:
### Exercise 5: Date Extraction
# From the `date` column in sales_data, create three new columns:
# - `year`
# - `month`
# - `day_of_week`

In [None]:
### Exercise 6: Column Transformation with Apply
# Create a new column `adjusted_salary` in the employees DataFrame that increases salary by 10% 
# for employees with performance_score > 4.0, otherwise keeps the original salary.

In [None]:
### Exercise 7: Multiple Column Operations
# Create a new column `efficiency_score` that combines multiple factors:
# - (sales_amount / quantity) * 0.6 + (customer score mapped from customer_type: Premium=5, Regular=3, Basic=1) * 0.4


In [None]:
### Exercise 8: Column Reordering
# Reorder the columns in sales_data to: `date`, `region`, `product`, `quantity`, `sales_amount`, `customer_type`


In [None]:
### Exercise 9: Column Data Type Conversion
# Convert the following columns to appropriate data types:
# - `product` to category
# - `region` to category
# - `sales_amount` to float32
# - `quantity` to int16

In [None]:
### Exercise 10: Column Aggregation
# Create a new column `total_sales_by_product` that shows the total sales amount for each product across all records.


## Row Transformation Exercises

In [None]:
### Exercise 11: Row Filtering
# Filter the sales_data to show only records where:
# - sales_amount > 500
# - region is either 'North' or 'South'
# - customer_type is 'Premium'

In [None]:
### Exercise 12: Row Sorting
# Sort the sales_data by multiple criteria:
# 1. region (ascending)
# 2. sales_amount (descending)
# 3. date (ascending)

In [None]:
### Exercise 13: Row Sampling
# Create a random sample of 20% of the sales_data records, ensuring the sample is reproducible.


In [None]:
### Exercise 14: Row Deduplication
# Remove duplicate rows based on the combination of `product`, `region`, and `customer_type`, 
# keeping the first occurrence.


In [None]:
### Exercise 15: Row Grouping and Aggregation
# Group the sales_data by `region` and `product`, then calculate:
# - Mean sales_amount
# - Total quantity
# - Count of records
# - Standard deviation of sales_amount

In [None]:
### Exercise 16: Row Transformation with Transform
# Add a new column `sales_rank_by_region` that ranks each row's sales_amount 
# within its region (1 = highest sales in region).


In [None]:
### Exercise 17: Row-wise Calculations
# Create a new column `days_since_hire` in the employees DataFrame 
# that calculates the number of days between hire_date and today's date.


In [None]:
### Exercise 18: Advanced Row Filtering
# Filter sales_data to show only the top 3 sales records for each product (based on sales_amount).


In [None]:
### Exercise 19: Row Interpolation
# Introduce some NaN values in the sales_amount column (randomly set 10% to NaN), then fill them using linear interpolation based on the date order.


In [None]:
### Exercise 20: Complex Row and Column Transformation
# Create a summary transformation that:
# 1. Groups data by region and month
# 2. Calculates total sales, average quantity, and the number of unique customer types (`customer_type`; sales_data has no customer ID column)
# 3. Adds a column showing the percentage of total sales each region-month represents
# 4. Filters to show only region-months with > 5% of total sales
# 5. Sorts by percentage descending



## Solutions Template

```python
# Exercise 1 Solution
# Element-wise division; `quantity` is drawn from 1-19, so it is never zero.
sales_data['revenue_per_unit'] = sales_data['sales_amount'] / sales_data['quantity']

# Exercise 2 Solution
# rename() returns a new DataFrame here instead of modifying sales_data in
# place: the remaining exercises (including Exercise 3 below) still refer to
# the original column names and would raise KeyError after an in-place rename.
sales_renamed = sales_data.rename(columns={
    'sales_amount': 'revenue',
    'customer_type': 'customer_segment',
    'quantity': 'units_sold'
})

# Exercise 3 Solution
# Explicit comparisons reproduce the exercise's boundaries exactly:
# > 700 is 'High', 300 to 700 inclusive is 'Medium', < 300 is 'Low'.
# (pd.cut closes every bin on the same side, so it cannot express
# "< 300" and "> 700" at the same time.)
amount = sales_data['sales_amount']
labels = np.select(
    [amount > 700, amount >= 300],  # evaluated in order; first match wins
    ['High', 'Medium'],
    default='Low'
)
# Store the result as an ordered categorical (Low < Medium < High) and leave
# rows with a missing sales_amount uncategorised rather than labelling them 'Low'.
sales_data['sales_category'] = pd.Series(
    pd.Categorical(labels, categories=['Low', 'Medium', 'High'], ordered=True),
    index=sales_data.index
).where(amount.notna())

# Continue with remaining solutions...
```

## Key Learning Objectives

After completing these exercises, you should be able to:
- Create and modify columns using various methods
- Apply conditional logic to create new columns
- Perform string and date manipulations
- Filter and sort data efficiently
- Handle duplicates and missing values
- Group data and perform aggregations
- Combine multiple transformation techniques
- Optimize data types for memory efficiency
