In [0]:
%pip install openpyxl
%pip install pytest

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


1. Read the provided files into dataframes

In [0]:
import pandas as pd
import json
from datetime import datetime

# Load Orders JSON file
with open('/Volumes/workspace/default/temp/Orders.json', 'r') as f:
    orders_data = json.load(f)
orders_df = pd.DataFrame(orders_data)

# Load Products CSV
products_df = pd.read_csv('/Volumes/workspace/default/temp/Products.csv')

# Load Customer Excel file  
customers_df = pd.read_excel('/Volumes/workspace/default/temp/Customer.xlsx')


In [0]:
customers_df= customers_df.drop('phone', axis=1)  # dropping column with bad data (unneccasry, as not being used anywhere)

2. Data Cleaning and Date Processing

In [0]:
# Parse date function
def parse_date(date_str):
    try:
        return pd.to_datetime(date_str, format='%d/%m/%Y')
    except:
        return pd.NaT

# Transform dates and extract year
orders_df['Order Date'] = orders_df['Order Date'].apply(parse_date)
orders_df['Ship Date'] = orders_df['Ship Date'].apply(parse_date)
orders_df['Year'] = orders_df['Order Date'].dt.year


In [0]:

%sql
Drop Table orders;
Drop Table products;
Drop Table customers;



In [0]:
from pyspark.sql.functions import trim, regexp_replace, when, col


# save raw tables
#Rename Columns to remove spaces

orders_df = orders_df.rename(columns=lambda x: x.replace(" ", "_"))
products_df = products_df.rename(columns=lambda x: x.replace(" ", "_"))
customers_df = customers_df.rename(columns=lambda x: x.replace(" ", "_"))

spark_orders_df = spark.createDataFrame(orders_df)
spark_products_df = spark.createDataFrame(products_df)
spark_customers_df = spark.createDataFrame(customers_df)

spark_orders_df = spark_orders_df.dropna()
spark_customers_df = spark_customers_df.dropna()

spark_customers_df = spark_customers_df.withColumn(
    "Customer_Name",
    trim(
        regexp_replace(
            regexp_replace("Customer_Name", "[^a-zA-Z ]", ""),  # remove special chars and digits from customer name
            " +",  # regex for one or more spaces
            " "    # replace with single space
        )
    )
)



spark_orders_df.write.mode("overwrite").saveAsTable("orders")
spark_products_df.write.mode("overwrite").saveAsTable("products")
spark_customers_df.write.mode("overwrite").saveAsTable("customers")

3. Unit Tests For Data Cleaning
Test the Data Cleaning logic, including column renaming and handling missing values

In [0]:
import pytest

def test_column_renaming():
    # Test that column renaming works correctly for already built DataFrames
    assert "Order_ID" in spark_orders_df.columns, "Column renaming failed for Orders DataFrame."
    assert "Product_ID" in spark_products_df.columns, "Column renaming failed for Products DataFrame."
    assert "Customer_ID" in spark_customers_df.columns, "Column renaming failed for Customers DataFrame."

def test_dropna_orders():
    # Test that rows with missing values are dropped in orders DataFrame
    total_rows = spark.createDataFrame(orders_df).count()
    cleaned_rows = spark_orders_df.count()
    assert cleaned_rows <= total_rows, "Dropna failed to clean missing rows in Orders DataFrame."

def test_dropna_customers():
    # Test that rows with missing values are dropped in customers DataFrame
    total_rows = spark.createDataFrame(customers_df).count()
    cleaned_rows = spark_customers_df.count()
    assert cleaned_rows <= total_rows, "Dropna failed to clean missing rows in Customers DataFrame."

4. Create tables with additional details and Respective unit tests


In [0]:
# Ensure spark_orders_df is defined by reading from the saved table if not already present
if 'spark_orders_df' not in locals():
    spark_orders_df = spark.table("orders")
if 'spark_customers_df' not in locals():
    spark_customers_df = spark.table("customers")

from pyspark.sql import functions as F

# Aggregate customer metrics - To get order information
customer_metrics = spark_orders_df.groupby('Customer_ID').agg(
    F.countDistinct('Order_ID').alias('Total_Orders'),
    F.sum('Price').alias('Total_Sales'),
    F.sum('Profit').alias('Total_Profit'),
    F.sum('Quantity').alias('Total_Quantity'),
    F.min('Order_Date').alias('First_Order_Date'),
    F.max('Order_Date').alias('Last_Order_Date')
)

# Round the numeric columns
customer_metrics = customer_metrics.withColumn('Total_Sales', F.round('Total_Sales', 2)) \
                                   .withColumn('Total_Profit', F.round('Total_Profit', 2)) \
                                   .withColumn('Total_Quantity', F.round('Total_Quantity', 2))

# Join with customer details, Left join with Customer and Customer Metrics  
enriched_customers = spark_customers_df.join(
    customer_metrics,
    on='Customer_ID',
    how='left'
)

display(enriched_customers)

# Additional unit tests
def test_customer_metrics_columns():
    expected_columns = {
        'Customer_ID', 'Total_Orders', 'Total_Sales', 'Total_Profit',
        'Total_Quantity', 'First_Order_Date', 'Last_Order_Date'
    }
    actual_columns = set(customer_metrics.columns)
    assert expected_columns.issubset(actual_columns), "Customer metrics columns missing."

def test_enriched_customers_join():
    # All Customer_IDs in spark_customers_df should be present in enriched_customers
    customer_ids = set(row['Customer_ID'] for row in spark_customers_df.select('Customer_ID').collect())
    enriched_ids = set(row['Customer_ID'] for row in enriched_customers.select('Customer_ID').collect())
    assert customer_ids.issubset(enriched_ids), "Left join failed: missing Customer_IDs in enriched_customers."

def test_total_orders_non_negative():
    # Total_Orders should be non-negative
    min_orders = customer_metrics.agg(F.min('Total_Orders')).collect()[0][0]
    assert min_orders >= 0, "Total_Orders contains negative values."

def test_total_sales_profit_quantity_non_negative():
    # Total_Sales, Total_Profit, and Total_Quantity should be non-negative or null
    for col_name in ['Total_Sales', 'Total_Profit', 'Total_Quantity']:
        min_value = customer_metrics.agg(F.min(col_name)).collect()[0][0]
        assert (min_value is None) or (min_value >= 0), f"{col_name} contains negative values."

def test_first_last_order_date_order():
    # First_Order_Date should be less than or equal to Last_Order_Date
    rows = customer_metrics.select('First_Order_Date', 'Last_Order_Date').collect()
    for row in rows:
        if row['First_Order_Date'] and row['Last_Order_Date']:
            assert row['First_Order_Date'] <= row['Last_Order_Date'], "First_Order_Date is after Last_Order_Date."

Customer_ID,Customer_Name,email,address,Segment,Country,City,State,Postal_Code,Region,Total_Orders,Total_Sales,Total_Profit,Total_Quantity,First_Order_Date,Last_Order_Date
RD-19585,Rob Dowd,danielleware947@gmail.com,"1055 Leslie Squares Apt. 640 North Jacob, AZ 25423",Consumer,United States,Dubuque,Iowa,52001,Central,10,2912.66,734.52,73,2014-04-02T00:00:00.000Z,2017-07-03T00:00:00.000Z
MM-18055,Michelle Moray,andrewhays420@gmail.com,"1127 Cole Skyway Lake Steven, OK 62340",Consumer,United States,San Francisco,California,94110,West,8,2749.86,-520.46,73,2014-04-11T00:00:00.000Z,2017-10-30T00:00:00.000Z
FO-14305,Frank Olsen,kendraholder796@gmail.com,"0543 Martinez Mount Suite 497 South Johnfort, NM 85655",Consumer,United States,Philadelphia,Pennsylvania,19143,East,10,2678.43,215.58,60,2014-06-09T00:00:00.000Z,2017-11-22T00:00:00.000Z
MP-17470,Mark Packer,nicholasrobinson191@gmail.com,"143 Kyle Throughway Suite 713 North Jacobville, SC 41104",Home Office,United States,New York City,New York,10035,East,7,3205.77,600.3,77,2014-03-30T00:00:00.000Z,2016-01-11T00:00:00.000Z
CM-12115,Chad McGuire,sharonwarner980@gmail.com,"050 Anna Camp South Jonathan, WY 45473",Consumer,United States,New York City,New York,10011,East,4,1661.61,408.58,35,2015-03-29T00:00:00.000Z,2017-12-22T00:00:00.000Z
KN-16705,Kristina Nunn,sabrinahayes269@gmail.com,"04950 Joseph Meadow East Kyleville, FM 82237",Home Office,United States,Sparks,Nevada,89431,West,8,2280.58,329.77,55,2014-03-02T00:00:00.000Z,2017-09-23T00:00:00.000Z
JG-15310,Jason Gross,danielpaul555@gmail.com,"04852 Wise Row Davidfurt, AR 32041",Corporate,United States,Newark,Ohio,43055,East,6,2240.58,3.59,46,2016-03-03T00:00:00.000Z,2017-12-28T00:00:00.000Z
VM-21685,Valerie Mitchum,pamelathompson911@gmail.com,"0592 James Landing Apt. 950 East Steven, NJ 78538",Home Office,United States,Seattle,Washington,98105,West,7,2454.12,513.62,37,2014-04-07T00:00:00.000Z,2017-11-06T00:00:00.000Z
TS-21085,Thais Sissman,jamesjohnson660@gmail.com,"04890 Sweeney Turnpike East Mariaside, DC 09754",Consumer,United States,Ormond Beach,Florida,32174,South,2,4.83,-3.32,4,2015-07-19T00:00:00.000Z,2017-01-07T00:00:00.000Z
FM-14290,Frank Merwin,denisecook866@gmail.com,"143 Danielle Route New Seanburgh, FM 77823",Home Office,United States,Quincy,Massachusetts,2169,East,9,3736.23,197.89,79,2014-08-26T00:00:00.000Z,2017-11-24T00:00:00.000Z


In [0]:
# Ensure spark_products_df is defined by reading from the saved table if not already present
if 'spark_products_df' not in locals():
    spark_products_df = spark.table("products")

product_metrics = spark_orders_df.groupby('Product_ID').agg(
    F.countDistinct('Order_ID').alias('Total_Orders'),
    F.sum('Price').alias('Total_Sales'),
    F.sum('Profit').alias('Total_Profit'),
    F.sum('Quantity').alias('Total_Quantity_Sold'),
    F.countDistinct('Customer_ID').alias('Unique_Customers')
)

product_metrics = product_metrics.withColumn('Total_Sales', F.round('Total_Sales', 2)) \
                                .withColumn('Total_Profit', F.round('Total_Profit', 2))

# Join with product details, Left join 
enriched_products = spark_products_df.join(product_metrics,
                                    on='Product_ID',
                                    how='left')

display(enriched_products)

# Pytest unit tests for product_metrics and enriched_products
def test_product_metrics_columns():
    expected_columns = {
        'Product_ID', 'Total_Orders', 'Total_Sales', 'Total_Profit',
        'Total_Quantity_Sold', 'Unique_Customers'
    }
    actual_columns = set(product_metrics.columns)
    assert expected_columns.issubset(actual_columns), "Product metrics columns missing."

def test_enriched_products_join():
    # All Product_IDs in spark_products_df should be present in enriched_products
    product_ids = set(row['Product_ID'] for row in spark_products_df.select('Product_ID').collect())
    enriched_ids = set(row['Product_ID'] for row in enriched_products.select('Product_ID').collect())
    assert product_ids.issubset(enriched_ids), "Left join failed: missing Product_IDs in enriched_products."

def test_total_orders_non_negative_products():
    # Total_Orders should be non-negative
    min_orders = product_metrics.agg(F.min('Total_Orders')).collect()[0][0]
    assert min_orders >= 0, "Total_Orders contains negative values in product_metrics."

def test_total_sales_profit_non_negative_products():
    # Total_Sales and Total_Profit should be non-negative or null
    for col_name in ['Total_Sales', 'Total_Profit']:
        min_value = product_metrics.agg(F.min(col_name)).collect()[0][0]
        assert (min_value is None) or (min_value >= 0), f"{col_name} contains negative values in product_metrics."

def test_total_quantity_sold_non_negative():
    # Total_Quantity_Sold should be non-negative or null
    min_value = product_metrics.agg(F.min('Total_Quantity_Sold')).collect()[0][0]
    assert (min_value is None) or (min_value >= 0), "Total_Quantity_Sold contains negative values in product_metrics."

def test_unique_customers_non_negative():
    # Unique_Customers should be non-negative
    min_value = product_metrics.agg(F.min('Unique_Customers')).collect()[0][0]
    assert (min_value is None) or (min_value >= 0), "Unique_Customers contains negative values in product_metrics."

Product_ID,Category,Sub-Category,Product_Name,State,Price_per_product,Total_Orders,Total_Sales,Total_Profit,Total_Quantity_Sold,Unique_Customers
OFF-ST-10001128,Office Supplies,Storage,"Carina Mini System Audio Rack, Model AR050B",Indiana,110.75,6,2729.19,-68.8,26,6
OFF-BI-10001249,Office Supplies,Binders,Avery Heavy-Duty EZD View Binder with Locking Rings,Illinois,1.276,10,170.99,29.72,41,10
OFF-PA-10004243,Office Supplies,Paper,Xerox 1939,New Jersey,18.97,5,417.34,200.33,22,5
OFF-PA-10001800,Office Supplies,Paper,Xerox 220,California,6.48,8,145.15,60.91,25,8
OFF-PA-10001790,Office Supplies,Paper,Xerox 1910,Illinois,38.432,5,586.09,236.36,14,5
TEC-PH-10002824,Technology,Phones,Jabra SPEAK 410 Multidevice Speakerphone,Ohio,123.3333333,3,2017.96,10.3,13,3
OFF-AP-10000358,Office Supplies,Appliances,Fellowes Basic Home/Office Series Surge Protectors,Illinois,2.596,9,436.27,58.69,41,9
OFF-BI-10001308,Office Supplies,Binders,GBC Standard Plastic Binding Systems' Combs,Texas,1.256,8,108.65,5.21,30,8
FUR-TA-10004619,Furniture,Tables,Hon Non-Folding Utility Tables,Texas,111.517,5,3775.52,541.66,29,5
OFF-BI-10001097,Office Supplies,Binders,Avery Hole Reinforcements,Washington,4.666666667,9,115.57,15.15,32,9


5. Enriched Orders and Unit Testing

In [0]:
# Join orders with customer and product details
from pyspark.sql.functions import round
spark_orders_df = spark_orders_df.withColumn('Profit', round(spark_orders_df['Profit'], 2))

enriched_orders = spark_orders_df.join(
    enriched_customers.select('Customer_ID', 'Customer_Name', 'Country'), 
    on='Customer_ID', how='inner'
).join(
    enriched_products.select('Product_ID', 'Product_Name', 'Category', 'Sub-Category'), 
    on='Product_ID', how='inner'
)

display(enriched_orders)

# Unit tests for dataframes and functions

import pytest

def test_enriched_orders_columns():
    expected_columns = {
        'Row_ID', 'Order_ID', 'Order_Date', 'Ship_Date', 'Ship_Mode', 'Customer_ID',
        'Product_ID', 'Quantity', 'Price', 'Discount', 'Profit', 'Year',
        'Customer_Name', 'Country', 'Product_Name', 'Category', 'Sub-Category'
    }
    actual_columns = set(enriched_orders.columns)
    assert expected_columns.issubset(actual_columns), "enriched_orders missing expected columns."

def test_enriched_orders_no_nulls_in_keys():
    # Ensure no nulls in join keys
    null_customer = enriched_orders.filter(enriched_orders.Customer_ID.isNull()).count()
    null_product = enriched_orders.filter(enriched_orders.Product_ID.isNull()).count()
    assert null_customer == 0, "Null Customer_IDs found in enriched_orders."
    assert null_product == 0, "Null Product_IDs found in enriched_orders."

def test_profit_rounded():
    # Ensure Profit is rounded to 2 decimals
    from pyspark.sql.functions import length, split
    profits = enriched_orders.select('Profit').rdd.flatMap(lambda x: x).filter(lambda x: x is not None)
    for profit in profits.collect():
        assert round(profit, 2) == profit, "Profit is not rounded to 2 decimals."

def test_parse_date_valid():
    # Test parse_date function for valid and invalid input
    assert parse_date('01/01/2020') == pd.Timestamp('2020-01-01')
    assert pd.isna(parse_date('invalid-date'))

def test_orders_df_year_extraction():
    # Ensure Year column matches Order Date year
    for row in orders_df.itertuples():
        if pd.notna(row._asdict()['Order_Date']):
            assert row._asdict()['Year'] == row._asdict()['Order_Date'].year

def test_customers_df_no_phone_column():
    assert 'phone' not in customers_df.columns, "phone column not dropped from customers_df."

def test_products_df_column_renaming():
    assert "Product_ID" in products_df.columns, "Column renaming failed for Products DataFrame."

def test_orders_df_column_renaming():
    assert "Order_ID" in orders_df.columns, "Column renaming failed for Orders DataFrame."

Product_ID,Customer_ID,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Quantity,Price,Discount,Profit,Year,Customer_Name,Country,Product_Name,Category,Sub-Category
FUR-CH-10002961,JK-15370,1,CA-2016-122581,2016-08-21T00:00:00.000Z,2016-08-25T00:00:00.000Z,Standard Class,7,573.17,0.3,63.69,2016,Jay Kimmel,United States,"Leather Task Chair, Black",Furniture,Chairs
TEC-AC-10004659,BD-11320,2,CA-2017-117485,2017-09-23T00:00:00.000Z,2017-09-29T00:00:00.000Z,Standard Class,4,291.96,0.0,102.19,2017,Bil Donatelli,United States,Imation Secure+ Hardware Encrypted USB 2.0 Flash Drive; 16GB,Technology,Accessories
OFF-BI-10002824,LB-16795,3,US-2016-157490,2016-10-06T00:00:00.000Z,2016-10-07T00:00:00.000Z,First Class,4,17.0,0.7,-14.92,2016,Laurel Beltran,United States,Recycled Easel Ring Binders,Office Supplies,Binders
OFF-PA-10003349,KB-16315,4,CA-2015-111703,2015-07-02T00:00:00.000Z,2015-07-09T00:00:00.000Z,Standard Class,3,15.55,0.2,5.64,2015,Karl Braun,United States,Xerox 1957,Office Supplies,Paper
TEC-AC-10003023,DO-13435,5,CA-2014-108903,2014-10-03T00:00:00.000Z,2014-10-03T00:00:00.000Z,Same Day,3,142.49,0.2,-3.0,2014,Denny Ordway,United States,Logitech G105 Gaming Keyboard,Technology,Accessories
OFF-BI-10004233,CB-12025,6,CA-2016-117583,2016-11-27T00:00:00.000Z,2016-11-30T00:00:00.000Z,First Class,5,79.95,0.0,38.38,2016,Cassandra Brandow,United States,"GBC Pre-Punched Binding Paper, Plastic, White, 8-1/2"" x 11""",Office Supplies,Binders
OFF-PA-10004470,SM-20005,7,CA-2014-148488,2014-12-10T00:00:00.000Z,2014-12-15T00:00:00.000Z,Standard Class,2,11.0,0.0,5.23,2014,Sally Matthias,United States,"Adams Write n' Stick Phone Message Book, 11"" X 5 1/4"", 200 Messages",Office Supplies,Paper
FUR-FU-10001196,RD-19480,8,CA-2016-136434,2016-12-01T00:00:00.000Z,2016-12-07T00:00:00.000Z,Standard Class,3,17.31,0.0,5.19,2016,Rick Duston,United States,DAX Cubicle Frames - 8x10,Furniture,Furnishings
OFF-ST-10000585,JM-16195,9,CA-2014-160094,2014-04-30T00:00:00.000Z,2014-05-02T00:00:00.000Z,First Class,5,826.0,0.0,214.0,2014,Justin MacKendrick,United States,Economy Rollaway Files,Office Supplies,Storage
OFF-ST-10003996,SC-20230,10,CA-2017-141747,2017-08-03T00:00:00.000Z,2017-08-08T00:00:00.000Z,Second Class,1,16.06,0.0,4.18,2017,Scot Coram,United States,"Letter/Legal File Tote with Clear Snap-On Lid, Black Granite",Office Supplies,Storage


In [0]:
enriched_orders.write.mode("overwrite").saveAsTable("enriched_orders")

6. Calculate Profit and Unit Testing

In [0]:
from pyspark.sql import functions as F

# Annual profit analysis
profit_by_year = enriched_orders.groupby('Year').agg(
    F.sum('Profit').alias('Total_Profit')
)

# Product Category profit analysis
profit_by_category = enriched_orders.groupby('Category').agg(
    F.sum('Profit').alias('Total_Profit')
)

# Product Sub-Category profit analysis
profit_by_sub_category = enriched_orders.groupby('Sub-Category').agg(
    F.sum('Profit').alias('Total_Profit')
)

# Product Sub-Category profit analysis
profit_by_cust = enriched_orders.groupby('Customer_Name').agg(
    F.sum('Profit').alias('Total_Profit')
)

display(profit_by_year)
display(profit_by_category)
display(profit_by_sub_category)
display(profit_by_cust)

# Unit tests for the above dataframes and aggregations

import pytest

def test_profit_by_year_columns():
    expected_columns = {'Year', 'Total_Profit'}
    actual_columns = set(profit_by_year.columns)
    assert expected_columns == actual_columns, "profit_by_year columns mismatch."

def test_profit_by_category_columns():
    expected_columns = {'Category', 'Total_Profit'}
    actual_columns = set(profit_by_category.columns)
    assert expected_columns == actual_columns, "profit_by_category columns mismatch."

def test_profit_by_sub_category_columns():
    expected_columns = {'Sub-Category', 'Total_Profit'}
    actual_columns = set(profit_by_sub_category.columns)
    assert expected_columns == actual_columns, "profit_by_sub_category columns mismatch."

def test_profit_by_cust_columns():
    expected_columns = {'Customer_Name', 'Total_Profit'}
    actual_columns = set(profit_by_cust.columns)
    assert expected_columns == actual_columns, "profit_by_cust columns mismatch."

def test_total_profit_non_negative():
    # Total_Profit should not be negative in any aggregation (or can be zero/positive)
    for df in [profit_by_year, profit_by_category, profit_by_sub_category, profit_by_cust]:
        min_profit = df.agg(F.min('Total_Profit')).collect()[0][0]
        assert (min_profit is None) or (min_profit >= 0), "Total_Profit contains negative values."

def test_years_match_orders():
    # All years in profit_by_year should be present in enriched_orders
    years_orders = set(row['Year'] for row in enriched_orders.select('Year').distinct().collect())
    years_agg = set(row['Year'] for row in profit_by_year.select('Year').distinct().collect())
    assert years_agg.issubset(years_orders), "profit_by_year contains years not in enriched_orders."

def test_categories_match_orders():
    # All categories in profit_by_category should be present in enriched_orders
    cats_orders = set(row['Category'] for row in enriched_orders.select('Category').distinct().collect())
    cats_agg = set(row['Category'] for row in profit_by_category.select('Category').distinct().collect())
    assert cats_agg.issubset(cats_orders), "profit_by_category contains categories not in enriched_orders."

def test_sub_categories_match_orders():
    # All sub-categories in profit_by_sub_category should be present in enriched_orders
    subcats_orders = set(row['Sub-Category'] for row in enriched_orders.select('Sub-Category').distinct().collect())
    subcats_agg = set(row['Sub-Category'] for row in profit_by_sub_category.select('Sub-Category').distinct().collect())
    assert subcats_agg.issubset(subcats_orders), "profit_by_sub_category contains sub-categories not in enriched_orders."

def test_customers_match_orders():
    # All customer names in profit_by_cust should be present in enriched_orders
    cust_orders = set(row['Customer_Name'] for row in enriched_orders.select('Customer_Name').distinct().collect())
    cust_agg = set(row['Customer_Name'] for row in profit_by_cust.select('Customer_Name').distinct().collect())
    assert cust_agg.issubset(cust_orders), "profit_by_cust contains customer names not in enriched_orders."

Year,Total_Profit
2016,67872.71
2015,63588.14999999997
2017,126920.96000000008
2014,40490.76


Category,Total_Profit
Office Supplies,128363.46999999996
Furniture,8583.789999999997
Technology,161925.32


Sub-Category,Total_Profit
Art,6379.9
Accessories,49076.42999999999
Binders,29671.890000000007
Fasteners,738.46
Bookcases,-3287.659999999999
Envelopes,6201.64
Supplies,-1224.87
Appliances,24009.730000000007
Machines,-13501.52
Paper,35538.630000000005


Customer_Name,Total_Profit
Craig Yedwab,60.640000000000015
Odella Nelson,-5.529999999999996
Keith Dawkins,3030.05
Philip Brown,263.27
Sean Braxton,-2082.76
Vivek Grady,-52.33
Craig Carroll,800.0600000000001
James Galang,1972.33
Art Foster,-163.13
Daniel Lacy,4.389999999999933


In [0]:
%sql
-- Profit by year
SELECT Year, SUM(Profit) AS Total_Profit
FROM enriched_orders
GROUP BY Year;


Year,Total_Profit
2016,67872.71
2015,63588.14999999997
2017,126920.96000000004
2014,40490.76


In [0]:
%sql

-- Profit by year and product category
SELECT Year, Category, SUM(Profit) AS Total_Profit
FROM enriched_orders
GROUP BY Year, Category;


Year,Category,Total_Profit
2015,Office Supplies,24724.23
2016,Furniture,7877.399999999998
2017,Furniture,3426.23
2014,Technology,23359.75
2014,Furniture,-5363.76
2014,Office Supplies,22494.770000000004
2016,Technology,24087.000000000004
2016,Office Supplies,35908.310000000005
2017,Technology,78258.56999999999
2015,Furniture,2643.9200000000005


In [0]:
%sql

-- Profit by customer
SELECT Customer_Name, SUM(Profit) AS Total_Profit
FROM enriched_orders
GROUP BY Customer_Name;


Customer_Name,Total_Profit
Craig Yedwab,60.64000000000002
Odella Nelson,-5.529999999999987
Keith Dawkins,3030.05
Philip Brown,263.27
Sean Braxton,-2082.76
Vivek Grady,-52.33
Craig Carroll,800.0600000000001
James Galang,1972.33
Art Foster,-163.13
Daniel Lacy,4.389999999999933


In [0]:
%sql

-- Profit by customer and year
SELECT Customer_Name, Year, SUM(Profit) AS Total_Profit
FROM enriched_orders
GROUP BY Customer_Name, Year;

Customer_Name,Year,Total_Profit
Anna Chung,2014,-4.970000000000001
Marc Harrigan,2014,12.21
Arthur Wiediger,2017,164.78
Jonathan Doherty,2015,113.76
Chloris Kastensmidt,2017,223.08
Stewart Carmichael,2015,147.72
Clay Cheatham,2017,34.3
Ken Black,2015,66.42
Stuart Calhoun,2017,2.58
Magdelene Morse,2016,166.51


In [0]:
pip install pytest pytest-ipynb

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
