In [2]:
import unittest
import yaml
import datetime
import pandas as pd
from pyspark.sql import SparkSession
#spark = SparkSession.builder.appName("TestCharStats").getOrCreate()

class TestCharStats(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Loading configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Create a Spark session
        cls.spark = SparkSession.builder.appName("TestCharStats").getOrCreate()
        
        # Assign configuration variables
        cls.table_name = config["table_name"]
        cls.primary_key_columns = config["primary_key_columns"]
        cls.threshold_percentage = config["threshold_percentage"]
        print(f"printing table name:{cls.table_name}")

    @classmethod
    def tearDownClass(cls):
        # Stop the Spark session
        spark.stop()

    def log_test_result(self, test_name, status, reason=""):
        timestamp = datetime.datetime.now()
        # Creating a dictionary for the test result
        result = {
            "Table Name":table_name,
            "Test Name": test_name,
            "Status": status,
            "Reason": reason,
            "Timestamp": timestamp
        }
        # Append the result to the test_results list
        test_results.append(result)

    def test_primary_key(self):
        # Loading the current data
        df = cls.spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")  

        # Checking primary key columns to make sure no duplicates
        num_rows = df.count()
        num_distinct_rows = df.select(*primary_key_columns).distinct().count()
        #self.assertEqual(num_rows, num_distinct_rows)
        if num_rows == num_distinct_rows:
            self.log_test_result("test_primary_key", "Pass")
        else:
            self.log_test_result("test_primary_key", "Fail", "Duplicate primary key values found")

    def test_count_increase(self):
        # Loading the today's and previous day data for the comparision of countyour DataFrames for previous and current loads
        df_previous = cls.spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")
        df_current = cls.spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")  # Replace with the actual path

        # Calculating the count of increased percentage
        count_previous = df_previous.count()
        count_current = df_current.count()
        increase_percentage = (count_current - count_previous) / count_previous * 100

        if increase_percentage >= cls.threshold_percentage:
            self.log_test_result("test_count_increase", "Pass")
        else:
            self.log_test_result("test_count_increase", "Fail", "Count increase percentage is below threshold")

if __name__ == '__main__':
    # Creating an empty list to store test results
    test_results = []

    # Creating a test suite
    test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCharStats)

    # Running the tests
    test_runner = unittest.TextTestRunner()

    # Run each test and loggig the results
    for test_case in test_suite:
        test_result = test_runner.run(test_case)

    # Converting the test results to write into a file
    results_df = pd.DataFrame(test_results)

    # Write the results to a CSV file
    results_df.to_csv("test_results.csv", index=False)


E
ERROR: test_count_increase (__main__.TestCharStats.test_count_increase)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_5672/3399858392.py", line 57, in test_count_increase
    df_previous = cls.spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")
                  ^^^
NameError: name 'cls' is not defined

----------------------------------------------------------------------
Ran 1 test in 0.003s

FAILED (errors=1)
E
ERROR: test_primary_key (__main__.TestCharStats.test_primary_key)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_5672/3399858392.py", line 44, in test_primary_key
    df = cls.spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")
       ^^^
NameError: name 'cls' is not defined

----------------------------------------------------------------------
Ran 1 test in 0.002s

FAILED (errors=

In [62]:
import unittest
import yaml
import datetime
import pandas as pd
from pyspark.sql import SparkSession
import logging

# Set up logging: one log file per calendar day.
log_file_name = f"UnitTest1_{datetime.datetime.now().strftime('%Y-%m-%d')}.log"
# basicConfig does nothing when the root logger already has handlers (for
# example after this cell, or another cell that configures logging, has
# already run in the same kernel). force=True replaces those handlers so the
# file named above is the one actually written to.
logging.basicConfig(filename=log_file_name, level=logging.INFO, force=True)

class TestCharStats(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Loading configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Create a Spark session
        cls.spark = SparkSession.builder.appName("TestCharStats").getOrCreate()
        
        # Assign configuration variables
        cls.table_name = config["table_name"]
        cls.primary_key_columns = config["primary_key_columns"]
        cls.threshold_percentage = config["threshold_percentage"]

    @classmethod
    def tearDownClass(cls):
        # Stop the Spark session
        cls.spark.stop()

    def log_test_result(self, test_name, status, reason=""):
        timestamp = datetime.datetime.now()
        # Creating a dictionary for the test result
        result = {
            "Table Name": table_name,
            "Test Name": test_name,
            "Status": status,
            "Reason": reason,
            "Timestamp": timestamp
        }
        # Append the result to the test_results list
        test_results.append(result)

        # Logging the result
        log_message = f"Test Name: {test_name}, Status: {status}, Reason: {reason}"
        logging.info(log_message)

    def test_primary_key(self):
        # Loading the current data
        df = spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")  

        # Checking primary key columns to make sure no duplicates
        num_rows = df.count()
        num_distinct_rows = df.select(*primary_key_columns).distinct().count()

        if num_rows == num_distinct_rows:
            self.log_test_result("test_primary_key", "Pass")
        else:
            self.log_test_result("test_primary_key", "Fail", "Duplicate primary key values found")

    def test_count_increase(self):
        # Loading the today's and previous day data for the comparison of count
        df_previous = spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")
        df_current = spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")  # Replace with the actual path

        # Calculating the count of increased percentage
        count_previous = df_previous.count()
        count_current = df_current.count()
        increase_percentage = (count_current - count_previous) / count_previous * 100

        if increase_percentage >= threshold_percentage:
            self.log_test_result("test_count_increase", "Pass")
        else:
            self.log_test_result("test_count_increase", "Fail", "Count increase percentage is below threshold")

if __name__ == '__main__':
    # Creating an empty list to store test results
    test_results = []

    # Creating a test suite
    test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCharStats)

    # Running the tests
    test_runner = unittest.TextTestRunner()

    # Run each test and logging the results
    for test_case in test_suite:
        test_result = test_runner.run(test_case)

    # Converting the test results to write into a file
    results_df = pd.DataFrame(test_results)
    results_df.to_csv("test_results.csv", index=False)


.
----------------------------------------------------------------------
Ran 1 test in 1.676s

OK
.
----------------------------------------------------------------------
Ran 1 test in 1.325s

OK


In [59]:
import unittest
import yaml
import datetime
import pandas as pd
from pyspark.sql import SparkSession
import logging

# Set up logging: one log file per calendar day.
log_file_name = f"UnitTest_{datetime.datetime.now().strftime('%Y-%m-%d')}.log"
# basicConfig does nothing when the root logger already has handlers (for
# example after this cell, or another cell that configures logging, has
# already run in the same kernel). force=True replaces those handlers so the
# file named above is the one actually written to.
logging.basicConfig(filename=log_file_name, level=logging.INFO, force=True)

class TestCharStats(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Loading configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Create a Spark session
        cls.spark = SparkSession.builder.appName("TestCharStats").getOrCreate()
        
        # Assign configuration variables
        cls.table_name = config["table_name"]
        cls.primary_key_columns = config["primary_key_columns"]
        cls.threshold_percentage = config["threshold_percentage"]

    @classmethod
    def tearDownClass(cls):
        # Stop the Spark session
        cls.spark.stop()

    def log_message(self, message):
        # Log messages, including comments
        logging.info(message)

    def log_test_result(self, test_name, status, reason=""):
        timestamp = datetime.datetime.now()
        # Creating a dictionary for the test result
        result = {
            "Table Name": table_name,
            "Test Name": test_name,
            "Status": status,
            "Reason": reason,
            "Timestamp": timestamp
        }
        # Append the result to the test_results list
        test_results.append(result)

        # Logging the test result
        log_message = f"Test Name: {test_name}, Status: {status}, Reason: {reason}"
        logging.info(log_message)

    def test_primary_key(self):
        # Log comment
        self.log_message("Loading the current data")

        # Loading the current data
        df = spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")  

        # Checking primary key columns to make sure no duplicates
        num_rows = df.count()
        num_distinct_rows = df.select(*primary_key_columns).distinct().count()

        if num_rows == num_distinct_rows:
            self.log_test_result("test_primary_key", "Pass")
        else:
            self.log_test_result("test_primary_key", "Fail", "Duplicate primary key values found")

    def test_count_increase(self):
        # Log comment
        self.log_message("Loading the today's and previous day data for the comparison of count")

        # Loading the today's and previous day data for the comparison of count
        df_previous = spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")
        df_current = spark.read.parquet("/home/jovyan/work/digital/target/char_stats/")  # Replace with the actual path

        # Calculating the count of increased percentage
        count_previous = df_previous.count()
        count_current = df_current.count()
        increase_percentage = (count_current - count_previous) / count_previous * 100

        if increase_percentage >= threshold_percentage:
            self.log_test_result("test_count_increase", "Pass")
        else:
            self.log_test_result("test_count_increase", "Fail", "Count increase percentage is below threshold")

if __name__ == '__main__':
    # Creating an empty list to store test results
    test_results = []

    # Creating a test suite
    test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCharStats)

    # Running the tests
    test_runner = unittest.TextTestRunner()

    # Run each test and logging the results
    for test_case in test_suite:
        test_result = test_runner.run(test_case)

    # Converting the test results to write into a file
    results_df = pd.DataFrame(test_results)
    results_df.to_csv("test_results.csv", index=False)


.
----------------------------------------------------------------------
Ran 1 test in 1.465s

OK
.
----------------------------------------------------------------------
Ran 1 test in 1.123s

OK


In [65]:
# IPython shell escape: runs UnitTesting.py through the shell's `python` command.
!python UnitTesting.py