In [5]:

# Group By and Rollup
# This is a technique that lets you see both the fine details and the big picture at the same time, by combining:
# - Detailed Breakdown
# - Subtotals
# - Grand Total

# This is very useful for:
# Financial Reporting
# Sales Analysis
# Inventory Management
# Many, many other things...


In [22]:

# in Python world
import pandas as pd

# Toy dataset: one sales figure for every (region, product) combination
data = {
    'Region': ['North', 'North', 'South', 'South', 'East', 'East', 'West', 'West'],
    'Product': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'Sales': [100, 150, 200, 250, 300, 350, 400, 450]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Plain GROUP BY: one summed Sales value per region
region_sales = df.groupby('Region', as_index=False)['Sales'].sum()

print("\nGrouped by Region:")
print(region_sales)

# --- Rollup: stack three aggregation levels into a single table ---

# Level 1 (detail): Sales summed per (Region, Product) pair
grouped = df.groupby(['Region', 'Product'], as_index=False)['Sales'].sum()

# Level 2 (subtotal): Sales summed per Region; Product is filled with the
# placeholder 'All' so these rows line up with the detail columns
region_subtotals = (
    df.groupby('Region', as_index=False)['Sales']
    .sum()
    .assign(Product='All')
)

# Level 3 (grand total): a single row covering every region and product
overall_sales = df['Sales'].sum()
grand_total = pd.DataFrame(
    [{'Region': 'All', 'Product': 'All', 'Sales': overall_sales}]
)

# Concatenate detail rows, then subtotals, then the grand total, renumbering
# the index from 0; columns are matched by name across the three frames
rollup_levels = [grouped, region_subtotals, grand_total]
result_rollup = pd.concat(rollup_levels, ignore_index=True)

print("\nResult with Rollup:")
print(result_rollup)


Original DataFrame:
  Region Product  Sales
0  North       A    100
1  North       B    150
2  South       A    200
3  South       B    250
4   East       A    300
5   East       B    350
6   West       A    400
7   West       B    450

Grouped by Region:
  Region  Sales
0   East    650
1  North    250
2  South    450
3   West    850

Result with Rollup:
   Region Product  Sales
0    East       A    300
1    East       B    350
2   North       A    100
3   North       B    150
4   South       A    200
5   South       B    250
6    West       A    400
7    West       B    450
8    East     All    650
9   North     All    250
10  South     All    450
11   West     All    850
12    All     All   2200


In [21]:

# Region   Product  Sales
# ------------------------
# East     A        300    <- Details: Sales of Product A in East
# East     B        350    <- Details: Sales of Product B in East
# North    A        100    <- Details: Sales of Product A in North
# North    B        150    <- Details: Sales of Product B in North
# South    A        200    <- Details: Sales of Product A in South
# South    B        250    <- Details: Sales of Product B in South
# West     A        400    <- Details: Sales of Product A in West
# West     B        450    <- Details: Sales of Product B in West
# ------------------------
# East     All      650    <- Subtotal: Total sales in East (A + B)
# North    All      250    <- Subtotal: Total sales in North (A + B)
# South    All      450    <- Subtotal: Total sales in South (A + B)
# West     All      850    <- Subtotal: Total sales in West (A + B)
# ------------------------
# All      All     2200    <- Grand Total: Total sales in all regions and products


In [18]:


# -- in the SQL world
# --DROP TABLE SalesData
-- Demo table: one row per (Region, Product) pair with its sales figure.
CREATE TABLE SalesData (
    Region VARCHAR(50),
    Product VARCHAR(50),
    Sales INT
);

-- Load the same eight rows that the pandas example uses.
INSERT INTO SalesData (Region, Product, Sales) VALUES
('North', 'A', 100),
('North', 'B', 150),
('South', 'A', 200),
('South', 'B', 250),
('East', 'A', 300),
('East', 'B', 350),
('West', 'A', 400),
('West', 'B', 450);


-- Rollup report: detail rows, one subtotal row per region, and a grand total.
SELECT
    -- ROLLUP leaves the rolled-up columns NULL in subtotal / grand-total rows;
    -- COALESCE relabels those NULLs as 'All'.
    -- NOTE(review): a genuine NULL Region or Product in the data would also
    -- be shown as 'All' and be indistinguishable from a rollup row.
    COALESCE(Region, 'All') AS Region,
    COALESCE(Product, 'All') AS Product,
    SUM(Sales) AS TotalSales
FROM
    SalesData
GROUP BY
    -- Grouping levels produced: (Region, Product), (Region), and ().
    ROLLUP (Region, Product)
ORDER BY
    -- In the result shown below, 'All' sorts alphabetically alongside the
    -- real region / product values, so subtotals appear between detail rows.
    Region,
    Product;

# Result
Region  Product  TotalSales
All	    All	     2200
East    A	     300
East	All	     650
East	B	     350
North	A	     100
North	All	     250
North	B	     150
South	A	     200
South	All	     450
South	B	     250
West	A	     400
West	All	     850
West	B	     450

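# ROLLUP (Region, Product):
# This performs a hierarchical grouping. It first groups by Region and Product (the detail rows), then by Region only
# (one subtotal row per region), and finally it provides a grand total over all rows.
# In the subtotal and grand-total rows the rolled-up columns come back as NULL, which is why the query wraps them in COALESCE(..., 'All').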


In [19]:

# In conclusion:
# Pandas: Provides extensive functionality beyond SQL, such as handling time series data, applying custom 
# functions row-wise or element-wise, and integration with other Python libraries.

# SQL: Primarily focused on data querying and manipulation, lacks some of the advanced data analysis and 
# transformation functions available in Pandas.

# Pandas: More flexible in terms of applying custom functions, handling complex data structures, and 
# integrating with other parts of the Python ecosystem.

# SQL: More rigid but ensures data integrity and consistency, especially important for transaction-based systems.


In [20]:

# END!!!
