# Module 6: SQL vs Pandas — Analytical Parity

## Objective
Solve identical business problems using SQL and Pandas to understand when each tool is more appropriate in real-world data analysis workflows.

In [13]:
import pandas as pd
import sqlite3

In [14]:
# Read the cleaned retail sales CSV into a DataFrame.
df = pd.read_csv("../data/retail_sales_cleaned.csv")
# Derive a per-row revenue column as unit price times quantity.
df["revenue"] = df["unit_price"] * df["quantity"]


In [15]:
# Open a connection to the SQLite database file used by the SQL cells.
# NOTE(review): no conn.close() appears in the cells shown here — confirm the
# connection is closed once the analysis is finished.
conn = sqlite3.connect(database="../data/retail_sales.db")

In [16]:
# Write the DataFrame to the `sales` table, replacing the table if it already
# exists and leaving the DataFrame index out of the stored columns.
df.to_sql(name="sales", con=conn, if_exists="replace", index=False)

6

In [17]:
# Display the DataFrame that was written to the `sales` table.
df

Unnamed: 0,order_id,order_date,region,product,category,quantity,unit_price,revenue
0,1001,2023-01-05,North,Laptop,Electronics,2.0,750.0,1500.0
1,1002,2023-01-07,South,Mobile,Electronics,8.0,300.0,2400.0
2,1003,2023-01-10,East,Chair,Furniture,10.0,45.0,450.0
3,1004,2023-01-15,West,Table,Furniture,8.0,120.0,960.0
4,1005,2023-01-20,Unknown,Headphones,Electronics,8.0,60.0,480.0
5,1006,2023-01-22,South,Sofa,Furniture,1.0,90.0,90.0


In [20]:
# Read every row of the `sales` table back into a DataFrame.
query = """SELECT * FROM sales;"""
pd.read_sql_query(sql=query, con=conn)

Unnamed: 0,order_id,order_date,region,product,category,quantity,unit_price,revenue
0,1001,2023-01-05,North,Laptop,Electronics,2.0,750.0,1500.0
1,1002,2023-01-07,South,Mobile,Electronics,8.0,300.0,2400.0
2,1003,2023-01-10,East,Chair,Furniture,10.0,45.0,450.0
3,1004,2023-01-15,West,Table,Furniture,8.0,120.0,960.0
4,1005,2023-01-20,Unknown,Headphones,Electronics,8.0,60.0,480.0
5,1006,2023-01-22,South,Sofa,Furniture,1.0,90.0,90.0


In [None]:
# Top 3 Products by Revenue

In [27]:
# SQL version: total revenue per product, sorted descending, top three rows.
query = """ SELECT product, SUM(revenue) AS total_revenue
FROM sales GROUP BY product ORDER BY SUM(revenue) DESC LIMIT 3;"""
pd.read_sql_query(sql=query, con=conn)

Unnamed: 0,product,total_revenue
0,Mobile,2400.0
1,Laptop,1500.0
2,Table,960.0


In [28]:
# Pandas version: total revenue per product, keeping the three largest totals.
# dropna=False keeps rows with a missing product as their own group, the way
# SQL GROUP BY keeps a NULL group; the pandas default would silently drop them.
(
    df.groupby("product", dropna=False)["revenue"]
    .sum()
    .nlargest(3)
)

product
Mobile    2400.0
Laptop    1500.0
Table      960.0
Name: revenue, dtype: float64

In [None]:
# Total revenue by category

In [29]:
# SQL version: total revenue per category, highest total first.
query = """ SELECT category, SUM(revenue) AS total_revenue
FROM sales GROUP BY category ORDER BY SUM(revenue) DESC;"""
pd.read_sql_query(sql=query, con=conn)

Unnamed: 0,category,total_revenue
0,Electronics,4380.0
1,Furniture,1500.0


In [31]:
# Pandas version: total revenue per category, highest total first.
# dropna=False keeps rows with a missing category as their own group, the way
# SQL GROUP BY keeps a NULL group; the pandas default would silently drop them.
(
    df.groupby("category", dropna=False)["revenue"]
    .sum()
    .sort_values(ascending=False)
)

category
Electronics    4380.0
Furniture      1500.0
Name: revenue, dtype: float64

In [None]:
# Regions whose average revenue is below the average revenue of the whole table

In [33]:
# SQL version: average revenue per region, keeping only the regions whose
# average is below the average over all rows (computed by the subquery).
query = """ SELECT region, AVG(revenue) AS avg_revenue
FROM sales GROUP BY region 
HAVING AVG(revenue) < (SELECT AVG(revenue) FROM sales);"""
pd.read_sql_query(sql=query, con=conn)

Unnamed: 0,region,avg_revenue
0,East,450.0
1,Unknown,480.0
2,West,960.0


In [34]:
# Pandas version: mean revenue over all rows, then the per-region means that
# fall below it.
overall_avg = df["revenue"].mean()
# dropna=False keeps rows with a missing region as their own group, the way
# SQL GROUP BY keeps a NULL group; the pandas default would silently drop them.
(
    df.groupby("region", dropna=False)["revenue"]
    .mean()[lambda region_avg: region_avg < overall_avg]
)

region
East       450.0
Unknown    480.0
West       960.0
Name: revenue, dtype: float64

In [None]:
# Takeaways:
# - Use SQL to pull data that already lives in a database.
# - Use Pandas for complex cleaning fixes and for creating new features.
# - SQL is a good fit for comparisons across the whole table and for subqueries.
# - Pandas is quicker for interactive exploration and for testing ideas.