In [None]:
class BatchWordCount():
    """Batch word-count pipeline: load text, clean the words, count them, save a table."""

    def __init__(self):
        # Root folder under which the input text is looked up.
        self.base_dir = "/FileStore/tables/"

    def get_raw_data(self):
        """Load the input text and return a DataFrame with one ``word`` per row.

        Records are delimited by ``.`` (the ``lineSep`` option) and every record
        is split on single spaces. Relies on a global ``spark`` session
        (presumably supplied by the notebook runtime).
        """
        from pyspark.sql.functions import explode, split

        source_path = f"{self.base_dir}/data/text"
        reader = spark.read.format('text').option('lineSep', '.')
        lines = reader.load(source_path)
        words = explode(split(lines.value, " "))
        return lines.select(words.alias('word'))

    def get_quality_data(self, df):
        """Normalise the ``word`` column of ``df`` and drop unusable rows.

        Words are trimmed and lower-cased; null values and values that do not
        match the regex ``[a-z]`` are filtered out.
        """
        from pyspark.sql.functions import trim, lower

        normalised = df.select(lower(trim(df.word)).alias('word'))
        non_null = normalised.where("word is not null")
        return non_null.where('word rlike "[a-z]"')

    def get_word_count(self, df):
        """Return the number of rows per distinct ``word`` in ``df``."""
        grouped = df.groupBy('word')
        return grouped.count()

    def write_data(self, df):
        """Save ``df`` as the Delta table ``word_count_table``, overwriting it."""
        writer = df.write.format('delta').mode('overwrite')
        writer.saveAsTable('word_count_table')

    def word_count(self):
        """Run the whole pipeline: read, clean, count and persist."""
        cleaned = self.get_quality_data(self.get_raw_data())
        self.write_data(self.get_word_count(cleaned))

In [None]:
class StreamWordCount():
    """Streaming word-count pipeline: read text as a stream, clean, count, write a table."""

    def __init__(self):
        # Root folder for the input text and the checkpoint directory.
        # It ends with "/", so the joined paths below contain "//".
        self.base_dir = "/FileStore/tables/"

    def get_raw_data(self):
        """Open the input text as a stream and return one ``word`` per row.

        Records are delimited by ``.`` (the ``lineSep`` option) and every record
        is split on single spaces. Relies on a global ``spark`` session
        (presumably supplied by the notebook runtime).
        """
        from pyspark.sql.functions import explode, split

        lines = spark.readStream.format('text').option('lineSep', '.').load(f"{self.base_dir}/data/text")
        return lines.select(explode(split(lines.value, " ")).alias('word'))

    def get_quality_data(self, df):
        """Normalise the ``word`` column of ``df`` and drop unusable rows.

        Words are trimmed and lower-cased; null values and values that do not
        match the regex ``[a-z]`` are filtered out.
        """
        from pyspark.sql.functions import trim, lower

        return (df.select(lower(trim(df.word)).alias('word'))
                    .where("word is not null")
                    .where('word rlike "[a-z]"'))

    def get_word_count(self, df):
        """Return the number of rows per distinct ``word`` in ``df``."""
        return df.groupBy('word').count()

    def write_data(self, df):
        """Start writing ``df`` to the Delta table ``word_count_table``.

        Uses ``complete`` output mode and a checkpoint directory under
        ``base_dir``.

        Returns:
            The value returned by ``toTable`` (the handle of the started
            streaming query).
        """
        # NOTE: "chekpoint" is misspelled but kept as is; renaming it would
        # point the query at a different checkpoint location.
        return (df.writeStream
            .format('delta')
            .option("checkpointLocation", f"{self.base_dir}/chekpoint/word_count")
            .outputMode('complete')
            .toTable('word_count_table'))

    def word_count(self):
        """Run the whole pipeline and return the started streaming query.

        Returns:
            The handle returned by ``write_data``, so the caller can monitor
            or stop the stream.
        """
        raw_df = self.get_raw_data()
        quality_df = self.get_quality_data(raw_df)
        result_df = self.get_word_count(quality_df)
        # Keep the handle from write_data: it is the running query, whereas
        # result_df is only the aggregated DataFrame.
        streaming_query = self.write_data(result_df)

        return streaming_query