In [1]:
import os
import sys
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import unittest
import json
import csv
from datetime import datetime
from decimal import Decimal, ROUND_HALF_UP

In [2]:
# Show the kernel's current working directory.
print(os.getcwd())

/content


In [3]:
import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet
spark = (
    SparkSession.builder
    .appName("EcommerceAnalysis")
    .getOrCreate()
)

# Directory that holds the input data files
base_path = "/content"

In [4]:
# Read the product catalogue from CSV: first row is the header, and Spark
# is asked to infer the column types from the data.
products_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"{base_path}/Products.csv")
)


In [5]:
# Display the products DataFrame object; per the recorded output this renders
# only its column names and types here, not any rows.
display(products_df)

DataFrame[Product ID: string, Category: string, Sub-Category: string, Product Name: string, State: string, Price per product: string]

In [6]:
# Read the orders from JSON; multiLine lets a single record (or the whole
# document) span several lines of the file.
orders_df = (
    spark.read
    .format("json")
    .option("multiLine", "true")
    .load(f"{base_path}/Orders.json")
)

In [7]:
# Display the orders DataFrame object; per the recorded output this renders
# only its column names and types here, not any rows.
display(orders_df)

DataFrame[Customer ID: string, Discount: double, Order Date: string, Order ID: string, Price: double, Product ID: string, Profit: double, Quantity: bigint, Row ID: bigint, Ship Date: string, Ship Mode: string]

In [8]:
# Read the customer workbook with pandas, then convert the resulting pandas
# frame into a Spark DataFrame.
customer_pd = pd.read_excel(f"{base_path}/Customer.xlsx")
customer_df = spark.createDataFrame(customer_pd)

In [9]:
# Display the customer DataFrame object; per the recorded output this renders
# only its column names and types here, not any rows.
display(customer_df)

DataFrame[Customer ID: string, Customer Name: string, email: string, phone: string, address: string, Segment: string, Country: string, City: string, State: string, Postal Code: bigint, Region: string]

In [10]:
# Preview the first five order rows. Note that Order Date / Ship Date are
# loaded as strings (e.g. 21/8/2016), not as date values.
orders_df.show(5)

+-----------+--------+----------+--------------+------+---------------+------+--------+------+---------+--------------+
|Customer ID|Discount|Order Date|      Order ID| Price|     Product ID|Profit|Quantity|Row ID|Ship Date|     Ship Mode|
+-----------+--------+----------+--------------+------+---------------+------+--------+------+---------+--------------+
|   JK-15370|     0.3| 21/8/2016|CA-2016-122581|573.17|FUR-CH-10002961| 63.69|       7|     1|25/8/2016|Standard Class|
|   BD-11320|     0.0| 23/9/2017|CA-2017-117485|291.96|TEC-AC-10004659|102.19|       4|     2|29/9/2017|Standard Class|
|   LB-16795|     0.7| 6/10/2016|US-2016-157490|  17.0|OFF-BI-10002824|-14.92|       4|     3|7/10/2016|   First Class|
|   KB-16315|     0.2|  2/7/2015|CA-2015-111703| 15.55|OFF-PA-10003349|  5.64|       3|     4| 9/7/2015|Standard Class|
|   DO-13435|     0.2| 3/10/2014|CA-2014-108903|142.49|TEC-AC-10003023|  -3.0|       3|     5|3/10/2014|      Same Day|
+-----------+--------+----------+-------

In [None]:
# Test cases

In [46]:
import unittest
from pyspark.sql import SparkSession
import pandas as pd

# Obtain a Spark session for the tests.
# NOTE(review): if the session created by the analysis cells is still alive,
# getOrCreate() presumably returns that one, so this appName may not take
# effect - confirm if the name matters.
spark = (
    SparkSession.builder
    .appName("TestingEcommerceAnalysis")
    .getOrCreate()
)

# Load the raw customer and order data that the test classes inspect
base_path = "/content"
customer_pd_test = pd.read_excel(f"{base_path}/Customer.xlsx")
customer_df_test = spark.createDataFrame(customer_pd_test)

orders_df_test = (
    spark.read
    .format("json")
    .option("multiLine", "true")
    .load(f"{base_path}/Orders.json")
)

print("Testing environment setup and dataframes loaded.")

Testing environment setup and dataframes loaded.


In [76]:
class TestRawDataQuality(unittest.TestCase):

    def test_customer_df_null_values(self):
        print("\nTesting for null values in customer_df_test...")
        # Check for null values in key columns
        null_counts = customer_df_test.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in customer_df_test.columns])
        null_counts_pd = null_counts.toPandas()
        print("Null counts per column in customer_df_test:")
        print(null_counts_pd)
        # Assert that there are no nulls in 'Customer ID' and 'Customer Name'
        self.assertEqual(null_counts_pd['Customer ID'][0], 0, "Customer ID column should not have null values")
        self.assertEqual(null_counts_pd['Customer Name'][0], 0, "Customer Name column should not have null values")
        print("✓ Null value test passed for key columns in customer_df_test.")

    def test_orders_df_null_values(self):
        print("\nTesting for null values in orders_df_test...")
        # Check for null values in key columns
        null_counts = orders_df_test.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in orders_df_test.columns])
        null_counts_pd = null_counts.toPandas()
        print("Null counts per column in orders_df_test:")
        print(null_counts_pd)
        # Assert that there are no nulls in 'Order ID', 'Customer ID', and 'Product ID'
        self.assertEqual(null_counts_pd['Order ID'][0], 0, "Order ID column should not have null values")
        self.assertEqual(null_counts_pd['Customer ID'][0], 0, "Customer ID column should not have null values")
        self.assertEqual(null_counts_pd['Product ID'][0], 0, "Product ID column should not have null values")
        print("✓ Null value test passed for key columns in orders_df_test.")

    def test_products_df_null_values(self):
        print("\nTesting for null values in products_df...")
        # Check for null values in key columns
        null_counts = products_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in products_df.columns])
        null_counts_pd = null_counts.toPandas()
        print("Null counts per column in products_df:")
        print(null_counts_pd)
        # Assert that there are no nulls in 'Product ID' and 'Product Name'
        self.assertEqual(null_counts_pd['Product ID'][0], 0, "Product ID column should not have null values")
        self.assertEqual(null_counts_pd['Product Name'][0], 0, "Product Name column should not have null values")
        print("✓ Null value test passed for key columns in products_df.")


    def test_customer_df_data_types(self):
        print("\nTesting data types in customer_df_test...")
        expected_types = {
            'Customer ID': 'string',
            'Customer Name': 'string',
            'email': 'string',
            'phone': 'string',
            'address': 'string',
            'Segment': 'string',
            'Country': 'string',
            'City': 'string',
            'State': 'string',
            'Postal Code': 'bigint',
            'Region': 'string'
        }
        actual_types = {col: dtype for col, dtype in customer_df_test.dtypes}
        print("Actual data types in customer_df_test:")
        print(actual_types)
        for col, expected_type in expected_types.items():
            self.assertEqual(actual_types.get(col), expected_type, f"Column '{col}' should be of type {expected_type}")
        print("✓ Data type test passed for customer_df_test.")

    def test_orders_df_data_types(self):
        print("\nTesting data types in orders_df_test...")
        expected_types = {
            'Customer ID': 'string',
            'Discount': 'double',
            'Order Date': 'string',
            'Order ID': 'string',
            'Price': 'double',
            'Product ID': 'string',
            'Profit': 'double',
            'Quantity': 'bigint',
            'Row ID': 'bigint',
            'Ship Date': 'string',
            'Ship Mode': 'string'
        }
        actual_types = {col: dtype for col, dtype in orders_df_test.dtypes}
        print("Actual data types in orders_df_test:")
        print(actual_types)
        for col, expected_type in expected_types.items():
            self.assertEqual(actual_types.get(col), expected_type, f"Column '{col}' should be of type {expected_type}")
        print("✓ Data type test passed for orders_df_test.")

    def test_products_df_data_types(self):
        print("\nTesting data types in products_df...")
        expected_types = {
            'Product ID': 'string',
            'Category': 'string',
            'Sub-Category': 'string',
            'Product Name': 'string',
            'State': 'string',
            'Price per product': 'string'
        }
        actual_types = {col: dtype for col, dtype in products_df.dtypes}
        print("Actual data types in products_df:")
        print(actual_types)
        for col, expected_type in expected_types.items():
            self.assertEqual(actual_types.get(col), expected_type, f"Column '{col}' should be of type {expected_type}")
        print("✓ Data type test passed for products_df.")

In [77]:
if __name__ == '__main__':
    # Collect the tests with TestLoader: unittest.makeSuite() is deprecated
    # (it emits the warning seen in earlier runs) and is removed in Python 3.13.
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestRawDataQuality))
    runner = unittest.TextTestRunner()
    runner.run(suite)

  suite.addTest(unittest.makeSuite(TestRawDataQuality))
.


Testing data types in customer_df_test...
Actual data types in customer_df_test:
{'Customer ID': 'string', 'Customer Name': 'string', 'email': 'string', 'phone': 'string', 'address': 'string', 'Segment': 'string', 'Country': 'string', 'City': 'string', 'State': 'string', 'Postal Code': 'bigint', 'Region': 'string'}
✓ Data type test passed for customer_df_test.

Testing for null values in customer_df_test...


..

Null counts per column in customer_df_test:
   Customer ID  Customer Name  email  phone  address  Segment  Country  City  \
0            0              0      0      0        0        0        0     0   

   State  Postal Code  Region  
0      0            0       0  
✓ Null value test passed for key columns in customer_df_test.

Testing data types in orders_df_test...
Actual data types in orders_df_test:
{'Customer ID': 'string', 'Discount': 'double', 'Order Date': 'string', 'Order ID': 'string', 'Price': 'double', 'Product ID': 'string', 'Profit': 'double', 'Quantity': 'bigint', 'Row ID': 'bigint', 'Ship Date': 'string', 'Ship Mode': 'string'}
✓ Data type test passed for orders_df_test.

Testing for null values in orders_df_test...


..

Null counts per column in orders_df_test:
   Customer ID  Discount  Order Date  Order ID  Price  Product ID  Profit  \
0            0         0           0         0      0           0       0   

   Quantity  Row ID  Ship Date  Ship Mode  
0         0       0          0          0  
✓ Null value test passed for key columns in orders_df_test.

Testing data types in products_df...
Actual data types in products_df:
{'Product ID': 'string', 'Category': 'string', 'Sub-Category': 'string', 'Product Name': 'string', 'State': 'string', 'Price per product': 'string'}
✓ Data type test passed for products_df.

Testing for null values in products_df...


.
----------------------------------------------------------------------
Ran 6 tests in 1.269s

OK


Null counts per column in products_df:
   Product ID  Category  Sub-Category  Product Name  State  Price per product
0           0         0             0             0      0                  0
✓ Null value test passed for key columns in products_df.


In [74]:
# Aggregation validation tests



In [94]:
class TestValidationOrders(unittest.TestCase):

    def test_orders_dataframe_not_empty(self):
        print("\nTesting that orders_df_test is not empty...")
        # Assert that the loaded orders dataframe has more than 0 rows
        self.assertGreater(orders_df_test.count(), 0, "orders_df_test should not be empty")
        print("✓ orders_df_test is not empty.")

    def test_total_orders_count(self):
        print("\nTesting total count of orders_df_test...")
        # Get the total count of rows in the loaded orders dataframe
        total_orders = orders_df_test.count()
        print(f"Total orders count: {total_orders}")
        # Assert that the total count is a positive number
        self.assertGreater(total_orders, 0, "Total orders count should be greater than 0")
        print("✓ Total count of orders_df_test is positive.")

    def test_aggregation_on_loaded_orders(self):
        print("\nTesting aggregation on orders_df_test...")
        # Calculate the sum of Quantity for all orders
        total_quantity = orders_df_test.agg(F.sum("Quantity").alias("TotalQuantity")).collect()[0][0]
        print(f"Total quantity sold: {total_quantity}")

        # Validate that the total quantity is a positive number
        self.assertGreater(total_quantity, 0, "Total quantity sold should be a positive number")
        print("✓aggregation (total quantity) on orders_df_test passed validation.")



In [95]:
if __name__ == '__main__':
    # Collect the tests with TestLoader: unittest.makeSuite() is deprecated
    # (it emits the warning seen in earlier runs) and is removed in Python 3.13.
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestValidationOrders))
    runner = unittest.TextTestRunner()
    runner.run(suite)

  suite.addTest(unittest.makeSuite(TestValidationOrders))



Testing aggregation on orders_df_test...


.

Total quantity sold: 37873
✓aggregation (total quantity) on orders_df_test passed validation.

Testing that orders_df_test is not empty...


.

✓ orders_df_test is not empty.

Testing total count of orders_df_test...


.
----------------------------------------------------------------------
Ran 3 tests in 0.694s

OK


Total orders count: 9994
✓ Total count of orders_df_test is positive.


In [96]:
class TestValidationCustomers(unittest.TestCase):

    def test_customers_dataframe_not_empty(self):
      print("\nTesting that customers_df_test is not empty...")
      # Assert that the loaded customers dataframe has more than 0 rows
      self.assertGreater(customer_df_test.count(), 0, "customers_df_test should not be empty")
      print("✓ customers_df_test is not empty.")

    def test_total_customers_count(self):
        print("\nTesting total count of customers_df_test...")
        # Get the total count of rows in the loaded customers dataframe
        total_customers = customer_df_test.count()
        print(f"Total customers count: {total_customers}")
        # Assert that the total count is a positive number
        self.assertGreater(total_customers, 0, "Total customers count should be greater than 0")
        print("✓ Total count of customers_df_test is positive.")

    def test_aggregation_on_loaded_customers(self):
        print("\nTesting aggregation on customers_df_test...")
        # For example, count unique CustomerID values
        unique_customers = customer_df_test.select("Customer Name").distinct().count()
        print(f"Number of unique customers: {unique_customers}")

        # Validate that there are unique customers
        self.assertGreater(unique_customers, 0, "Number of unique customers should be a positive number")
        print("✓aggregation (unique customers) on customers_df_test passed validation.")


In [97]:
if __name__ == '__main__':
    # Collect the tests with TestLoader: unittest.makeSuite() is deprecated
    # (it emits the warning seen in earlier runs) and is removed in Python 3.13.
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestValidationCustomers))
    runner = unittest.TextTestRunner()
    runner.run(suite)

  suite.addTest(unittest.makeSuite(TestValidationCustomers))



Testing aggregation on customers_df_test...


.

Number of unique customers: 786
✓aggregation (unique customers) on customers_df_test passed validation.

Testing that customers_df_test is not empty...


.

✓ customers_df_test is not empty.

Testing total count of customers_df_test...


.
----------------------------------------------------------------------
Ran 3 tests in 1.489s

OK


Total customers count: 793
✓ Total count of customers_df_test is positive.


In [99]:
class TestValidationProducts(unittest.TestCase):

    def test_products_dataframe_not_empty(self):
        print("\nTesting that products_df is not empty...")
        # Assert that the loaded products dataframe has more than 0 rows
        self.assertGreater(products_df.count(), 0, "products_df should not be empty")
        print("✓ products_df is not empty.")

    def test_total_products_count(self):
        print("\nTesting total count of products_df...")
        # Get the total count of rows in the loaded products dataframe
        total_products = products_df.count()
        print(f"Total products count: {total_products}")
        # Assert that the total count is a positive number
        self.assertGreater(total_products, 0, "Total products count should be greater than 0")
        print("✓ Total count of products_df is positive.")

    def test_aggregation_on_loaded_products(self):
        print("\nTesting aggregation on products_df...")
        # For example, count unique ProductID values
        unique_products = products_df.select("Product ID").distinct().count()
        print(f"Number of unique products: {unique_products}")

        # Validate that there are unique products
        self.assertGreater(unique_products, 0, "Number of unique products should be a positive number")
        print("✓ aggregation (unique products) on products_df passed validation.")


In [100]:

if __name__ == '__main__':
    # Collect the tests with TestLoader: unittest.makeSuite() is deprecated
    # (it emits the warning seen in earlier runs) and is removed in Python 3.13.
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestValidationProducts))
    runner = unittest.TextTestRunner()
    runner.run(suite)

  suite.addTest(unittest.makeSuite(TestValidationProducts))



Testing aggregation on products_df...


.

Number of unique products: 1818
✓ aggregation (unique products) on products_df passed validation.

Testing that products_df is not empty...


..

✓ products_df is not empty.

Testing total count of products_df...
Total products count: 1851
✓ Total count of products_df is positive.



----------------------------------------------------------------------
Ran 3 tests in 1.028s

OK


In [None]:
# Added Test Cases