In [29]:
import unittest
import yaml
import datetime
import pandas as pd
from pyspark.sql import SparkSession

class TestCharStats(unittest.TestCase):
    """Data-quality checks for the parquet table named in ``config.yaml``.

    The checks never assert: each one records a PASS/FAIL row through
    :meth:`log_test_result`, so the caller can export every outcome instead of
    stopping at the first failed check.
    """

    def setUp(self):
        """Read ``config.yaml`` and start the Spark session used by a check.

        Raises:
            FileNotFoundError: if ``config.yaml`` is not in the working directory.
            KeyError: if a required configuration key is missing.
        """
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Pull every setting out of the config *before* starting Spark:
        # unittest does not call tearDown when setUp raises, so a missing key
        # must fail while there is no session that would be left running.
        self.table_name = config["table_name"]
        primary_key_columns = config["primary_key_columns"]
        # A single key column written as a plain string would otherwise be
        # unpacked character by character in ``select(*columns)``.
        if isinstance(primary_key_columns, str):
            primary_key_columns = [primary_key_columns]
        self.primary_key_columns = list(primary_key_columns)
        self.threshold_percentage = config["threshold_percentage"]
        self.output_path = config["output_path"]
        self.absolute_table_name = f"{self.output_path}/{self.table_name}"

        self.spark = SparkSession.builder.appName("TestCharStats").getOrCreate()
        # Keep Spark's console output down to errors ("FATAL" is stricter).
        self.spark.sparkContext.setLogLevel("ERROR")

    def tearDown(self):
        """Stop the Spark session created in :meth:`setUp`."""
        self.spark.stop()

    def log_test_result(self, test_name, status, reason=""):
        """Append one result row to the module-level ``test_results`` list.

        Args:
            test_name: Name of the check being reported.
            status: Outcome label; the checks in this class use "PASS"/"FAIL".
            reason: Optional explanation of the outcome.
        """
        result = {
            "Table Name": self.table_name,
            "Test Name": test_name,
            "Status": status,
            "Reason": reason,
            "Timestamp": datetime.datetime.now(),
        }
        # The list is normally created by the ``__main__`` script. Create it on
        # demand so the checks also run under ``python -m unittest`` or pytest
        # instead of failing with NameError.
        globals().setdefault("test_results", []).append(result)

    @staticmethod
    def _percentage_increase(count_previous, count_current):
        """Return the row-count change from previous to current, in percent.

        An empty previous load has no meaningful ratio: the change is reported
        as 0.0 when the current load is empty as well, and as ``inf`` when rows
        appeared, so it exceeds any finite threshold.
        """
        if count_previous == 0:
            return 0.0 if count_current == 0 else float("inf")
        return (count_current - count_previous) / count_previous * 100

    def test_primary_key(self):
        """Log PASS when the primary-key columns contain no duplicate rows."""
        df = self.spark.read.parquet(self.absolute_table_name)

        num_rows = df.count()
        num_distinct_rows = df.select(*self.primary_key_columns).distinct().count()

        if num_rows == num_distinct_rows:
            self.log_test_result("test_primary_key", "PASS")
        else:
            count_duplicate_key = num_rows - num_distinct_rows
            self.log_test_result("test_primary_key", "FAIL", f"total of {count_duplicate_key} Duplicate primary key values found.")

    def test_count_increase(self):
        """Log FAIL when the row count grew by at least the configured threshold."""
        # NOTE(review): both frames are read from the same path, so the computed
        # change is always 0%. Point df_previous at the previous load's
        # snapshot once its location is known.
        df_previous = self.spark.read.parquet(self.absolute_table_name)
        df_current = self.spark.read.parquet(self.absolute_table_name)

        increase_percentage = self._percentage_increase(
            df_previous.count(), df_current.count()
        )

        if increase_percentage >= self.threshold_percentage:
            self.log_test_result("test_count_increase", "FAIL", "Count_increase percentage is more than threshold")
        else:
            self.log_test_result("test_count_increase", "PASS", "Count_increase percentage is below threshold")

def build_load_status(results_df):
    """Summarise per-test results into one load-status row per table.

    Args:
        results_df: DataFrame of test results with at least the columns
            "Table Name" and "Status".

    Returns:
        DataFrame with the columns "Table Name", "Status", "Test Type" and
        "Date". "Status" is "FAIL" when any test of that table has the status
        "FAIL" (compared case-insensitively) and "PASS" otherwise. An empty
        input yields an empty frame with the same columns.
    """
    summary_columns = ["Table Name", "Status", "Test Type", "Date"]
    if results_df.empty:
        return pd.DataFrame(columns=summary_columns)

    # A string max() would rank "PASS" above "FAIL" and hide failures, so
    # reduce an explicit boolean "failed" flag per table instead.
    failed = results_df["Status"].str.upper().eq("FAIL")
    load_status_df = (
        failed.groupby(results_df["Table Name"])
        .any()
        .map({True: "FAIL", False: "PASS"})
        .reset_index(name="Status")
    )
    load_status_df["Test Type"] = "Data Quality Checks"
    # Date only (no time component) of the run.
    load_status_df["Date"] = pd.to_datetime('today').date()
    return load_status_df[summary_columns]


if __name__ == '__main__':
    # Module-level list that TestCharStats.log_test_result appends to.
    test_results = []

    test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCharStats)
    test_runner = unittest.TextTestRunner()

    # Run each check on its own so every one gets its own runner report.
    for test_case in test_suite:
        outcome = test_runner.run(test_case)
        if not outcome.wasSuccessful():
            # The check raised before it could log a row. Record it as a
            # failure so a crashed check cannot be reported as a passing load.
            test_results.append({
                "Table Name": getattr(test_case, "table_name", "UNKNOWN"),
                "Test Name": test_case.id().rsplit(".", 1)[-1],
                "Status": "FAIL",
                "Reason": "Test did not complete; see the unittest output for the error.",
                "Timestamp": datetime.datetime.now(),
            })

    # Fixing the columns keeps the CSV header stable even with no results.
    result_columns = ["Table Name", "Test Name", "Status", "Reason", "Timestamp"]
    results_df = pd.DataFrame(test_results, columns=result_columns)
    results_df.to_csv("test_results.csv", index=False)

    # One summary row per table, written next to the detailed results.
    load_status_df = build_load_status(results_df)
    load_status_df.to_csv("load_status.csv", index=False)
    


.
----------------------------------------------------------------------
Ran 1 test in 1.618s

OK
.
----------------------------------------------------------------------
Ran 1 test in 1.628s

OK
