In [1]:
from pyspark.sql import SparkSession

# Spark session & context
spark = (SparkSession
         .builder
         .master('local')
         .appName('wiki-changes-dataviz')
         .config("spark.sql.streaming.schemaInference", True) #Stream dataframe infers schema
         .getOrCreate())
sc = spark.sparkContext

In [2]:
# Read parquet stream
df_stream = (
    spark
    .readStream
    .format("parquet")
    .load("/home/jovyan/work/data-lake/wiki-changes")
)


In [3]:
# Create dataframe grouping by window 
from pyspark.sql.functions import window, col, current_timestamp

df_count = (
    df_stream
    .withWatermark("page_create_event_time", "10 minutes") # Don't aggregate events arriving more than 10 minutes late
    .groupBy(
        window(col("page_create_event_time"), "10 minutes", "10 minutes"), # 10 minute window, updating every 10 minutes
        col("user_isa_bot"))
    .count()
)



In [4]:
# Create query stream with memory sink
queryStream = (df_count
 .writeStream
 .format("memory")
 .queryName("wiki_changes")
 .outputMode("update")
 .start())

newStream = (df_stream
 .writeStream
 .format("memory")
 .queryName("wiki_changes3")
 .outputMode("update")
 .start())

In [None]:
from time import sleep
from IPython.display import clear_output
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rc('font', family='DejaVu Sans')
sns.set(style="whitegrid")


try:
    i=1
    while True:
        # Clear output
        clear_output(wait=True)
        print("**********************")
        print("General Info")
        print("**********************")
        print("Run:{}".format(i))
        if (len(queryStream.recentProgress) > 0):
            print("Stream timestamp:{}".format(queryStream.lastProgress["timestamp"]))
            print("Watermark:{}".format(queryStream.lastProgress["eventTime"]["watermark"]))
            print("Total Rows:{}".format(queryStream.lastProgress["stateOperators"][0]["numRowsTotal"]))
            print("Updated Rows:{}".format(queryStream.lastProgress["stateOperators"][0]["numRowsUpdated"]))
            print("Memory used MB:{}".format((queryStream.lastProgress["stateOperators"][0]["memoryUsedBytes"]) * 0.000001))
            
        df = spark.sql(
                """
                    select
                        window.start
                        ,window.end
                        ,user_isa_bot
                        ,sum(count) count
                    from
                        wiki_changes
                    where
                        window.start = (select max(window.start) from wiki_changes)
                    group by
                        window.start
                        ,window.end
                        ,user_isa_bot
                    order by
                        4 desc
                    limit 10
                """
        ).toPandas()

        # Plot
        sns.set_color_codes("muted")

        # Initialize the matplotlib figure
        plt.figure(figsize=(8,6))

        print("**********************")
        print("Graph - Wiki Pages Created by Bots vs Humans in Near-Real Time")
        print("**********************")
        try:
            # Barplot
            sns.barplot(x="count", y="user_isa_bot", data=df)

            # Show barplot
            plt.show()
        except ValueError:
            # If Dataframe is empty, pass
            pass

        print("**********************")
        print("Table - Wiki Pages Created by Bots vs Humans in Near-Real Time")
        print("**********************")
        display(df)
        
        print("**********************")
        print("Table - Top 10 Largest Windows Between User Registration and Latest Wiki Page Creation")
        print("**********************")
        df1 = spark.sql("""select page_create_event_time, date_user_registered, (datediff(page_create_event_time, date_user_registered))/365 as YEARS_BETWEEN_REGISTRATION_AND_POST from wiki_changes3
                        order by datediff(page_create_event_time, date_user_registered) desc limit 10""").toPandas()
        
        display(df1)
        
        sleep(10)
        i=i+1
except KeyboardInterrupt:
    print("process interrupted.")

**********************
General Info
**********************
Run:1


In [8]:
# Check active streams
for s in spark.streams.active:
    print("ID:{} | NAME:{}".format(s.id, s.name))

ID:157e54b5-d2d2-4bce-af08-59c53cfbf2a8 | NAME:wiki_changes3


In [7]:
# Stop stream
queryStream.stop()

In [10]:
sc.stop()