# Notebook 2: Hands-on SQL to Python Translation

This notebook provides practical exercises for translating common SQL operations to Python using pandas, with an emphasis on data manipulation and analysis.

## SECTION 1: DATA SETUP

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a mock "orders" table, standing in for a table in a SQL database.
# The sequence of np.random calls below fixes the generated values, so keep
# their order unchanged.
np.random.seed(42)  # Fixed seed: every run produces the same data

# 100 orders with sequential zero-padded ids (ORD-0001 ... ORD-0100)
num_orders = 100
order_ids = ["ORD-{:04d}".format(n) for n in range(1, num_orders + 1)]

# Each order is assigned to one of 30 customers (randint's upper bound is exclusive)
customer_ids = [
    "CUST-{:03d}".format(n)
    for n in np.random.randint(1, 31, size=num_orders)
]

# Evenly spaced timestamps covering the first quarter of 2023
order_dates = pd.date_range('2023-01-01', '2023-03-31', periods=num_orders)

# Status is drawn with fixed probabilities; most orders are completed
order_statuses = np.random.choice(
    ['Completed', 'Shipped', 'Processing', 'Cancelled'],
    size=num_orders,
    p=[0.7, 0.15, 0.1, 0.05],
)

# Order values: normal around 100, rounded to cents, floored at 10
order_values = np.maximum(
    np.round(np.random.normal(loc=100, scale=50, size=num_orders), 2),
    10,
)

# Assemble the orders DataFrame (keyword order sets the column order)
orders_data = dict(
    order_id=order_ids,
    customer_id=customer_ids,
    order_date=order_dates,
    status=order_statuses,
    order_value=order_values,
)
orders_df = pd.DataFrame(orders_data)

In [None]:
# Build a mock "order_items" table: 200 line items spread at random over the
# existing orders (about 2 per order on average; some orders may get none).
# The sequence of np.random calls below fixes the generated values.
num_items = 200
order_item_ids = [n for n in range(1, num_items + 1)]

# Each item points at a randomly chosen order id
order_refs = np.random.choice(order_ids, size=num_items)

# One of 50 products per item (randint's upper bound is exclusive)
product_ids = [
    "PROD-{:03d}".format(n)
    for n in np.random.randint(1, 51, size=num_items)
]

# Quantity between 1 and 5 inclusive
quantities = np.random.randint(1, 6, size=num_items)

# Unit prices: normal around 50, rounded to cents, floored at 5
unit_prices = np.maximum(
    np.round(np.random.normal(loc=50, scale=25, size=num_items), 2),
    5,
)

# Assemble the order items DataFrame (keyword order sets the column order)
order_items_data = dict(
    item_id=order_item_ids,
    order_id=order_refs,
    product_id=product_ids,
    quantity=quantities,
    unit_price=unit_prices,
)
order_items_df = pd.DataFrame(order_items_data)

In [None]:
# Show the first five rows of each table, each under its own heading
print("ORDERS SAMPLE:", orders_df.head(), sep="\n")
print("\nORDER ITEMS SAMPLE:", order_items_df.head(), sep="\n")

## SECTION 2: BASIC SQL QUERIES IN PYTHON

In [None]:
# SQL: SELECT * FROM orders LIMIT 5
# Python: take the first five rows by position
print("\n1. Select all columns with limit:")
print(orders_df.iloc[:5])

In [None]:
# SQL: SELECT order_id, order_date, order_value FROM orders ORDER BY order_date DESC LIMIT 10
# Python: sort newest-first, keep the three requested columns, take ten rows
print("\n2. Select specific columns, sorted:")
latest_orders = (
    orders_df
    .sort_values('order_date', ascending=False)
    .loc[:, ['order_id', 'order_date', 'order_value']]
    .head(10)
)
print(latest_orders)

In [None]:
# SQL: SELECT * FROM orders WHERE order_value > 150
# Python: a boolean mask plays the role of the WHERE clause
print("\n3. Filtering with WHERE:")
is_high_value = orders_df['order_value'].gt(150)
high_value_orders = orders_df.loc[is_high_value]
print(high_value_orders.head())

In [None]:
# SQL: SELECT status, COUNT(*) as order_count FROM orders GROUP BY status
# Python: size() counts rows per group, like COUNT(*)
print("\n4. Grouping and counting:")
status_counts = (
    orders_df
    .groupby('status')
    .size()
    .rename('order_count')
    .reset_index()
)
print(status_counts)

In [None]:
# SQL: SELECT customer_id, SUM(order_value) as total_spent
#      FROM orders
#      GROUP BY customer_id
#      ORDER BY total_spent DESC
#      LIMIT 5
# Python: aggregate per customer, then sort descending and keep five rows
print("\n5. Aggregation with GROUP BY and ORDER BY:")
customer_totals = (
    orders_df
    .groupby('customer_id', as_index=False)['order_value']
    .sum()
    .rename(columns={'order_value': 'total_spent'})
)
top_spenders = customer_totals.sort_values('total_spent', ascending=False).head(5)
print(top_spenders)

### EXERCISE: Try these SQL translations yourself:
1. Select all cancelled orders
2. Find the average order value by status
3. Count orders by month (hint: you'll need to extract month from order_date)

In [None]:
# Your code here:

# 1. Select all cancelled orders
# SQL: SELECT * FROM orders WHERE status = 'Cancelled'
# Hint: filter orders_df with a boolean mask on the 'status' column


# 2. Find the average order value by status
# SQL: SELECT status, AVG(order_value) as avg_value FROM orders GROUP BY status
# Hint: group orders_df by 'status' and take the mean of 'order_value'


# 3. Count orders by month
# SQL: SELECT EXTRACT(MONTH FROM order_date) as month, COUNT(*) as order_count
#      FROM orders GROUP BY month ORDER BY month
# Hint: orders_df['order_date'].dt.month gives the month number to group on


## SECTION 3: JOINS IN PYTHON

In [None]:
# SQL: SELECT o.order_id, o.order_date, o.order_value, i.product_id, i.quantity, i.unit_price
#      FROM orders o
#      JOIN order_items i ON o.order_id = i.order_id
#      LIMIT 10
# Python: merge() is an inner join by default; it is spelled out here for clarity
print("\n6. Basic JOIN (INNER JOIN):")
joined_data = orders_df.merge(order_items_df, on='order_id', how='inner')
selected_columns = ['order_id', 'order_date', 'order_value', 'product_id', 'quantity', 'unit_price']
print(joined_data[selected_columns].head(10))

In [None]:
# SQL: SELECT o.order_id, o.order_date, COUNT(i.item_id) as item_count, SUM(i.quantity) as total_quantity
#      FROM orders o
#      JOIN order_items i ON o.order_id = i.order_id
#      GROUP BY o.order_id, o.order_date
#      ORDER BY total_quantity DESC
#      LIMIT 5
# Python: join, then aggregate per order with explicitly named output columns
print("\n7. JOIN with aggregation:")
order_summary = (
    orders_df
    .merge(order_items_df, on='order_id')
    .groupby(['order_id', 'order_date'], as_index=False)
    .agg(
        item_count=('item_id', 'count'),
        total_quantity=('quantity', 'sum'),
    )
)
print(order_summary.sort_values('total_quantity', ascending=False).head(5))

## SECTION 4: ADVANCED QUERIES

In [None]:
# SQL: WITH customer_stats AS (
#     SELECT customer_id, COUNT(*) as order_count, SUM(order_value) as total_value
#     FROM orders
#     GROUP BY customer_id
# )
# SELECT *
# FROM customer_stats
# WHERE order_count > 3
# ORDER BY total_value DESC
# Python (equivalent to SQL WITH clause): the intermediate DataFrame is the CTE
print("\n8. Common Table Expression (CTE) / Subquery:")
customer_stats = (
    orders_df
    .groupby('customer_id')
    .agg(
        order_count=('order_id', 'count'),
        total_value=('order_value', 'sum'),
    )
    .reset_index()
)
# Outer query: keep customers with more than 3 orders, highest total first
frequent_customers = customer_stats.loc[customer_stats['order_count'].gt(3)]
print(frequent_customers.sort_values('total_value', ascending=False))

In [None]:
# SQL: SELECT
#     DATE_TRUNC('month', order_date) as month,
#     status,
#     COUNT(*) as order_count,
#     AVG(order_value) as avg_value
# FROM orders
# GROUP BY month, status
# ORDER BY month, status
# Python:
print("\n9. Grouping by date and category:")
# Derive the month on a separate frame instead of adding a column to
# orders_df: mutating the shared table would change what the earlier
# "SELECT *" cells print when they are re-run.
orders_with_month = orders_df.assign(month=orders_df['order_date'].dt.to_period('M'))
# Group by month and status. Named aggregation labels the result columns
# explicitly rather than overwriting .columns by position; groupby sorts the
# group keys by default, which matches ORDER BY month, status.
monthly_status = (
    orders_with_month
    .groupby(['month', 'status'])
    .agg(
        order_count=('order_id', 'count'),
        avg_value=('order_value', 'mean'),
    )
    .reset_index()
)
print(monthly_status.head(10))

In [None]:
# SQL: SELECT
#     o.order_id,
#     o.order_date,
#     o.order_value,
#     (o.order_value - AVG(o.order_value) OVER()) as diff_from_avg,
#     RANK() OVER(ORDER BY o.order_value DESC) as value_rank
# FROM orders o
# LIMIT 10
# Python (using window functions):
print("\n10. Window functions:")
# AVG(...) OVER() has an empty window, i.e. the mean over the whole table
global_avg = orders_df['order_value'].mean()
# Work on a copy so the shared orders_df does not gain extra columns
orders_window = orders_df.copy()
# Add window function calculations
orders_window['diff_from_avg'] = orders_window['order_value'] - global_avg
# method='min' reproduces RANK(): tied values share the lowest rank and the
# following ranks are skipped. pandas returns float ranks, so cast to the
# nullable Int64 dtype to get integer ranks like SQL's RANK().
orders_window['value_rank'] = (
    orders_window['order_value']
    .rank(method='min', ascending=False)
    .astype('Int64')
)
print(orders_window[['order_id', 'order_date', 'order_value', 'diff_from_avg', 'value_rank']].head(10))

## SECTION 5: DATA VISUALIZATION

One of Python's advantages over SQL is easy visualization. Let's create some basic charts from our data.

In [None]:
print("\n11. Visualizing data (a key Python advantage over SQL):")

# Bar chart: number of orders in each status, drawn on an explicit Axes
status_counts_plot = orders_df['status'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
status_counts_plot.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Order Count by Status')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Line chart: how many orders fall in each calendar week
weekly_orders = (
    orders_df['order_date']
    .dt.to_period('W')
    .value_counts()
    .sort_index()  # chronological order for the x-axis
)
fig, ax = plt.subplots(figsize=(12, 6))
weekly_orders.plot(kind='line', marker='o', ax=ax)
ax.set_title('Order Volume by Week')
ax.set_xlabel('Week')
ax.set_ylabel('Number of Orders')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of order values with the mean marked by a dashed red line
mean_order_value = orders_df['order_value'].mean()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(orders_df['order_value'], bins=20, color='green', alpha=0.7)
ax.set_title('Distribution of Order Values')
ax.set_xlabel('Order Value ($)')
ax.set_ylabel('Frequency')
ax.axvline(
    mean_order_value,
    color='red',
    linestyle='dashed',
    linewidth=2,
    label=f'Mean: ${mean_order_value:.2f}',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### FINAL EXERCISES:
Try these more challenging SQL-to-Python translations:
1. Find the top 3 products by total quantity ordered
2. Calculate the percentage of cancelled orders by customer
3. Find orders where the total item value (the sum of quantity * unit_price over all of the order's items) differs from order_value

In [None]:
# Your code here:

# 1. Find the top 3 products by total quantity ordered
# SQL: SELECT product_id, SUM(quantity) as total_quantity
#      FROM order_items
#      GROUP BY product_id
#      ORDER BY total_quantity DESC
#      LIMIT 3


# 2. Calculate the percentage of cancelled orders by customer
# SQL: SELECT customer_id,
#             100.0 * SUM(CASE WHEN status = 'Cancelled' THEN 1 ELSE 0 END) / COUNT(*) as cancelled_pct
#      FROM orders
#      GROUP BY customer_id


# 3. Find orders where the total item value differs from order_value
# SQL: SELECT o.order_id, o.order_value, SUM(i.quantity * i.unit_price) as items_total
#      FROM orders o
#      JOIN order_items i ON o.order_id = i.order_id
#      GROUP BY o.order_id, o.order_value
#      HAVING SUM(i.quantity * i.unit_price) <> o.order_value
# Hint: both values are floats, so consider comparing with a small tolerance


## Summary

In this notebook, we've explored how to translate common SQL operations to Python using pandas:

1. **Basic Querying**: Selection, filtering, sorting
2. **Aggregations**: GROUP BY, COUNT, SUM, AVG
3. **Joins**: Combining tables with merge()
4. **Advanced Queries**: CTEs, window functions
5. **Visualization**: Creating charts from query results

These skills will be especially valuable when working with the Olist e-commerce dataset, where you'll often need to combine and analyze data from multiple tables.