In [None]:
%run "./01_word_count"

In [None]:
class BaseWordCountTests:
    """Shared helpers for the word-count test suites.

    Every method other than ``__init__`` and ``run_tests`` relies on the
    notebook-provided ``spark`` and/or ``dbutils`` globals.
    """

    def __init__(self):
        # Root folder under which the source datasets, the landing directory
        # and the checkpoint directory are addressed.
        self.base_dir = "/FileStore/tables/"

    def init_env(self):
        """Drop the result table and recreate an empty landing directory."""
        spark.sql("drop table if exists word_count_table")

        # NOTE(review): the "chekpoint" spelling presumably matches the path
        # used by the notebook under test - confirm before renaming it.
        stale_paths = (
            "/user/default/word_count_table",
            f"{self.base_dir}/chekpoint",
            f"{self.base_dir}/data/text",
        )
        for stale in stale_paths:
            # Second argument True: remove the directory and its contents.
            dbutils.fs.rm(stale, True)

        dbutils.fs.mkdirs(f"{self.base_dir}/data/text")

    def ingest_df(self, itr):
        """Copy ``text_data_<itr>.txt`` from the datasets folder into the landing directory.

        Args:
            itr: Value substituted into the source file name.
        """
        source_file = f"{self.base_dir}/datasets/text/text_data_{itr}.txt"
        landing_dir = f"{self.base_dir}/data/text/"
        dbutils.fs.cp(source_file, landing_dir)

    def assert_result(self, expected_count):
        """Check the summed ``count`` of words whose first character is 's'.

        Args:
            expected_count: Value the query result must equal.

        Raises:
            AssertionError: If the value read from ``word_count_table``
                differs from ``expected_count``.
        """
        query = "select sum(count) from word_count_table where substr(word, 1, 1) == 's'"
        first_row = spark.sql(query).collect()[0]
        actual_count = first_row[0]
        assert expected_count == actual_count, f"Test failed! actual count is {actual_count}"

    def run_tests(self):
        """Placeholder; subclasses override this with their scenario."""
        return None

In [None]:
class BatchWordCountTests(BaseWordCountTests):
    """Run ``BatchWordCount`` through three ingest-and-verify rounds."""

    def run_tests(self):
        """Reset the environment, then ingest one file per round and verify.

        Each round copies the next input file into the landing directory,
        calls ``BatchWordCount.word_count()`` and checks the result with
        ``assert_result``.

        Raises:
            AssertionError: If any round's count does not match.
        """
        self.init_env()
        wc = BatchWordCount()

        # (label, input file number, expected summed count) for each round.
        # Driving the rounds from one table keeps the start/finish banners
        # consistent; the second round's start banner used to be missing.
        rounds = (
            ("first", 1, 25),
            ("second", 2, 32),
            ("third", 3, 37),
        )
        for label, itr, expected_count in rounds:
            print(f"Testing {label} iteration of batch word count...")
            self.ingest_df(itr)
            wc.word_count()
            self.assert_result(expected_count)
            print(f"{label.capitalize()} iteration of batch word count completed.\n")

In [None]:
# Instantiate the batch test suite and run all of its iterations.
tests = BatchWordCountTests()
tests.run_tests()

In [None]:
class StreamWordCountTests(BaseWordCountTests):
    """Run ``StreamWordCount`` through three ingest-and-verify rounds."""

    def run_tests(self):
        """Start the streaming word count, feed it three files and verify.

        After each file is copied into the landing directory the test waits
        a fixed ``sleep_time`` seconds before checking the result with
        ``assert_result``. The object returned by
        ``StreamWordCount.word_count()`` is always stopped at the end, even
        when a check fails, so a failed run does not leave it running.

        Raises:
            AssertionError: If any round's count does not match.
        """
        import time

        # Seconds to wait after each ingest before asserting.
        # NOTE(review): a fixed wait can be flaky on a slow cluster - confirm
        # 30s is enough for a micro-batch to complete.
        sleep_time = 30

        # (label, input file number, expected summed count) for each round.
        rounds = (
            ("first", 1, 25),
            ("second", 2, 32),
            ("third", 3, 37),
        )

        self.init_env()
        wc = StreamWordCount()
        query = wc.word_count()

        try:
            for label, itr, expected_count in rounds:
                print(f"Testing {label} iteration of stream word count...")
                self.ingest_df(itr)
                time.sleep(sleep_time)
                self.assert_result(expected_count)
                print(f"{label.capitalize()} iteration of stream word count completed.\n")
        finally:
            # Stop the query on success and on failure alike.
            query.stop()

In [None]:
# Instantiate the streaming test suite and run all of its iterations.
streaming_tests = StreamWordCountTests()
streaming_tests.run_tests()