# Companion Notebook: Data Visualization with Pandas

<a href="https://colab.research.google.com/github/bradleyboehmke/uc-bana-4080/blob/main/example-notebooks/12_data_viz_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook follows the content from *Chapter 13: Introduction to Data Visualization with Pandas*. It provides code examples to help you explore and visualize data using Pandas' built-in `.plot()` capabilities.

## Setup

In [None]:
import pandas as pd
from completejourney_py import get_data

# Fetch every table exposed by get_data() and keep the three used below
cj_data = get_data()
transactions, products, demographics = (
    cj_data[table_name]
    for table_name in ('transactions', 'products', 'demographics')
)

# Left joins: every transaction row is kept, with product attributes attached
# via product_id and household attributes attached via household_id
df = pd.merge(
    transactions.merge(products, how='left', on='product_id'),
    demographics,
    how='left',
    on='household_id',
)


## Using `.plot` in Pandas

In [None]:
# Example histogram: the generic `.plot(kind=...)` entry point, with 20 bins
# and a log-scaled count axis.
# The trailing semicolon keeps the cell from echoing the returned Axes object,
# so only the figure is rendered (matching the other plotting cells).
df['sales_value'].plot(kind='hist', bins=20, log=True);

## Univariate Visualizations

In [None]:
# Descriptive statistics for the sales_value column; the result is the cell's
# last expression, so the notebook displays it
df.loc[:, 'sales_value'].describe()

In [None]:
# Basic histogram of sales_value using the pandas defaults (no binning or
# scaling options supplied).
# The trailing semicolon suppresses the Axes repr so only the figure is shown,
# consistent with the other plotting cells in this notebook.
df['sales_value'].plot.hist();

In [None]:
# Adjusted histogram: keep only strictly positive sale amounts, then plot
# 30 bins with a log-scaled count axis
(
    df['sales_value']
    .loc[lambda amounts: amounts > 0]
    .plot(kind='hist', bins=30, log=True, title='Distribution of Sales Values')
);

In [None]:
# Total sales_value per store_id, summarised as a single box plot on a
# log-scaled y axis
sales_by_store = df.groupby('store_id')['sales_value'].agg('sum')
sales_by_store.plot(
    kind='box',
    logy=True,
    title='Distribution of total sales across all stores',
);

In [None]:
# Kernel density estimate of the per-store sales totals held in sales_by_store
sales_by_store.plot(
    kind='kde',
    title='Distribution of total sales across all stores',
);

## Time Series Visualizations

In [None]:
# sales_value as a Series indexed by transaction_timestamp, ready for resampling
sales = (
    df[['transaction_timestamp', 'sales_value']]
    .set_index('transaction_timestamp')
    ['sales_value']
)

In [None]:
# Line chart of sales_value summed into hourly buckets
(
    sales
    .resample('h')
    .sum()
    .plot(kind='line', figsize=(10, 4))
);

In [None]:
# Line chart of sales_value summed into calendar-day buckets
(
    sales
    .resample('D')
    .sum()
    .plot(kind='line', figsize=(10, 4))
);

In [None]:
# Quartiles of daily sales totals, grouped by day of the week.
# Steps: sum sales per calendar day -> relabel each day with its weekday name
# -> take the 25th/50th/75th percentiles within each weekday -> pivot the
# quantile level into columns -> order rows Monday through Sunday.
day_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
total_sales_by_weekday = (
    sales
    .resample('D')
    .sum()
    .rename(index=lambda day: day.day_name())
    # the relabelled index keeps its original name, so it can be grouped as a level
    .groupby(level='transaction_timestamp')
    .quantile(q=[0.25, 0.5, 0.75])
    .unstack(level=-1)
    .reindex(index=day_order)
)
total_sales_by_weekday.plot(
    kind='line',
    title='Median and IQR of total sales by weekday',
    figsize=(10, 4),
);

In [None]:
# Bar chart of the 0.5-quantile (median) column from the weekday quartile table
median_sales_by_weekday = total_sales_by_weekday.loc[:, 0.5]
median_sales_by_weekday.plot(
    kind='bar',
    title='Median total sales by weekday',
    figsize=(8, 4),
);

## Bivariate Visualizations

In [None]:
# One point per row of df: quantity on the x axis, sales_value on the y axis
df.plot(
    kind='scatter',
    x='quantity',
    y='sales_value',
    title='Sales versus quantity',
    figsize=(8, 4),
);

In [None]:
# Horizontal bar chart of the ten departments with the largest summed sales_value
dept_sales = (
    df
    .groupby('department', as_index=False)['sales_value']
    .sum()
    .nlargest(n=10, columns='sales_value')
    .reset_index(drop=True)
)
# Ascending sort before plotting puts the largest department at the top of the chart
(
    dept_sales
    .sort_values(by='sales_value')
    .plot(kind='barh', x='department', y='sales_value', color='red')
);

In [None]:
# Grouped horizontal bars: summed sales_value and quantity for the ten
# departments with the largest sales, after dropping FUEL and MISCELLANEOUS
dept_totals = (
    df
    .query("department != 'FUEL' & department != 'MISCELLANEOUS'")
    .groupby('department', as_index=False)[['sales_value', 'quantity']]
    .sum()
    .nlargest(n=10, columns='sales_value')
    .reset_index(drop=True)
)
# Ascending sort puts the largest department at the top; the legend is moved
# to the lower-right corner of the axes
(
    dept_totals
    .sort_values(by='sales_value')
    .plot(kind='barh', x='department', y=['sales_value', 'quantity'])
    .legend(loc='lower right')
);

## Multi-Series Time Series Plots

In [None]:
# Daily totals of the three discount columns for GROCERY rows only,
# drawn as one line per discount column on a log-scaled y axis
total_daily_discounts = (
    df
    .query("department == 'GROCERY'")
    [['transaction_timestamp', 'retail_disc', 'coupon_disc', 'coupon_match_disc']]
    .set_index('transaction_timestamp')
    .resample('D')
    .sum()
)
total_daily_discounts.plot(kind='line', logy=True, figsize=(10, 4));

In [None]:
# Kernel density estimate for each discount column, on a log-scaled x axis
total_daily_discounts.plot(kind='kde', logx=True);