# Computation

In [25]:
#import libraries cell

import json
import time
import findspark
import numpy as np
from numpy import arange
from numpy import linspace 
import pandas as pd
from kafka import KafkaProducer
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType
from pyspark.sql.functions import from_json, col, max, min

import pyspark
from pyspark import SparkConf, SparkContext

## Spark setup

In [2]:
#!ls $SPARK_HOME/sbin/

In [3]:
#!$SPARK_HOME/sbin/stop-all.sh --host localhost --port 7077 --webui-port 8080

In [4]:
#!$SPARK_HOME/sbin/start-all.sh --host localhost --port 7077 --webui-port 8080

In [5]:
# sc.stop()
# spark.stop()

In [6]:
# Point findspark at the Spark installation under /usr/local/spark.
# NOTE(review): pyspark is already imported in the imports cell before this call runs,
# so this presumably does not affect those imports — confirm whether it is still needed.
findspark.init('/usr/local/spark')

In [7]:
# Build (or reuse) the SparkSession for this notebook:
#   - application name
#   - Arrow-based pandas conversion, with the non-Arrow fallback disabled
#   - forced deletion of temporary streaming checkpoint directories
#   - Kafka source/sink connector package
#   - web UI port
spark = (
    SparkSession.builder
    .appName("Project_MAPDB_application")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
    .config("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .config("spark.ui.port", "4041")
    .getOrCreate()
)

#default parallelism setting to shuffle different partitions between workers (for join operation).
#spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism) #15 partitions

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0dee6117-f97e-41a7-abe2-6791abbee659;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 1031ms :: artifacts dl 46ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.8-1 from central in [default]
	org.apache.commons#commons-pool2;2.6.2 from central in [default]

22/09/11 14:12:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Display the SparkSession created above.
spark

In [9]:
# Keep a handle on the SparkContext behind the session and display it.
sc = spark.sparkContext
sc

## Kafka Setup

In [10]:
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic

# Address of the Kafka broker; shared by the producer, the admin client and the Spark reader.
KAFKA_BOOTSTRAP_SERVERS = 'localhost:9092'

# Producer used by analysis() to publish each batch summary to the 'results' topic.
producer = KafkaProducer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)


In [11]:
# Admin client used in the following cells to list, delete and create topics.
kafka_admin = KafkaAdminClient(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)


In [12]:
# Topics currently present on the broker.
kafka_admin.list_topics()

['stream', 'topic_stream', 'results', '__consumer_offsets']

In [14]:
# Start from a clean slate by dropping the topics left over from a previous run.
# Only topics that actually exist are deleted: asking the broker to delete an
# unknown topic raises, which would break a fresh "Restart & Run All".
topics_to_reset = ['stream', 'results']
existing_topics = set(kafka_admin.list_topics())
stale_topics = [name for name in topics_to_reset if name in existing_topics]

# Last expression is shown as the cell output (None when nothing had to be deleted).
kafka_admin.delete_topics(stale_topics) if stale_topics else None


DeleteTopicsResponse_v3(throttle_time_ms=0, topic_error_codes=[(topic='stream', error_code=0), (topic='results', error_code=0)])

In [15]:
# List the topics again to inspect the result of the deletion.
kafka_admin.list_topics()

['stream', 'topic_stream', '__consumer_offsets']

In [16]:
# Create a fresh 'results' topic (one partition, replication factor 1).
results_topic = NewTopic(
    name='results',
    num_partitions=1,
    replication_factor=1,
)
kafka_admin.create_topics(new_topics=[results_topic])


CreateTopicsResponse_v3(throttle_time_ms=0, topic_errors=[(topic='results', error_code=0, error_message=None)])

In [17]:
# List the topics again to check that 'results' is present.
kafka_admin.list_topics()

['stream', 'topic_stream', 'results', '__consumer_offsets']

In [18]:
# Streaming source: subscribe to the 'topic_stream' Kafka topic on the configured broker.
inputDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option('subscribe', 'topic_stream')
    .load()
)

# Show the Kafka source columns (key, value, topic, partition, offset, timestamp, ...).
inputDF.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [19]:
# Schema of one JSON message. The three counters are declared as doubles
# (the original note says this is to avoid overflow issues).
schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("HEAD",        IntegerType()),
        ("FPGA",        IntegerType()),
        ("TDC_CHANNEL", IntegerType()),
        ("ORBIT_CNT",   DoubleType()),
        ("BX_COUNTER",  DoubleType()),
        ("TDC_MEAS",    DoubleType()),
    )
])

In [20]:
# Cast the binary Kafka `value` to a string and parse it as JSON against `schema`;
# the parsed record ends up in a single struct column named 'value'.
jsonDF = inputDF.select(
    from_json(col("value").cast("string"), schema).alias('value')
)

jsonDF.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- HEAD: integer (nullable = true)
 |    |-- FPGA: integer (nullable = true)
 |    |-- TDC_CHANNEL: integer (nullable = true)
 |    |-- ORBIT_CNT: double (nullable = true)
 |    |-- BX_COUNTER: double (nullable = true)
 |    |-- TDC_MEAS: double (nullable = true)



In [21]:
# Flatten the parsed struct: every field of `value` becomes a top-level column
# (HEAD, FPGA, TDC_CHANNEL, ORBIT_CNT, BX_COUNTER, TDC_MEAS).
flatDF = jsonDF.select("value.*")

flatDF.printSchema()

root
 |-- HEAD: integer (nullable = true)
 |-- FPGA: integer (nullable = true)
 |-- TDC_CHANNEL: integer (nullable = true)
 |-- ORBIT_CNT: double (nullable = true)
 |-- BX_COUNTER: double (nullable = true)
 |-- TDC_MEAS: double (nullable = true)



In [33]:
# Only rows whose HEAD equals this value survive the cleansing step.
VALID_HEAD = 2

# Chamber definitions in output order (chamber 0..3):
# (FPGA, first TDC_CHANNEL, last TDC_CHANNEL), channel bounds inclusive.
CHAMBER_CUTS = (
    (0,  0,  63),
    (0, 64, 127),
    (1,  0,  63),
    (1, 64, 127),
)

# (FPGA, TDC_CHANNEL) pair whose hits mark an orbit as containing a scintillator signal.
SCINTILLATOR_FPGA = 1
SCINTILLATOR_CHANNEL = 128


def _histogram(df, key):
    """Count the rows of ``df`` per distinct value of column ``key``.

    Returns ``[values, counts]``: two parallel Python lists, in the order
    Spark returns the groups (no sorting is applied).
    """
    grouped = df.groupBy(key).count().toPandas()
    return [grouped[key].tolist(), grouped['count'].tolist()]


def _chamber_summary(df_chamber, scint_orbits):
    """Compute the per-chamber statistics published by ``analysis``.

    Parameters
    ----------
    df_chamber : Spark DataFrame already restricted to one chamber.
    scint_orbits : list of ORBIT_CNT values that contain a scintillator hit.

    Returns
    -------
    dict with keys ``hits`` (row count), ``hist`` (hits per TDC_CHANNEL),
    ``hist_orbit`` (number of orbits per count of distinct active channels)
    and ``hist_scin`` (hits per TDC_CHANNEL, scintillator orbits only).
    """
    hist = _histogram(df_chamber, 'TDC_CHANNEL')

    # Explicit alias: do not depend on the auto-generated name of the
    # countDistinct column.
    channels_per_orbit = (
        df_chamber
        .groupBy('ORBIT_CNT')
        .agg(F.countDistinct('TDC_CHANNEL').alias('active_channels'))
    )

    in_scint_orbits = df_chamber.where(col('ORBIT_CNT').isin(scint_orbits))

    return {
        # Every row of a chamber has a non-null TDC_CHANNEL (it passed the range
        # cut), so the per-channel counts add up to the chamber's row count.
        'hits': sum(hist[1]),
        'hist': hist,
        'hist_orbit': _histogram(channels_per_orbit, 'active_channels'),
        'hist_scin': _histogram(in_scint_orbits, 'TDC_CHANNEL'),
    }


def analysis(df, epoch_id):
    """Summarise one micro-batch and publish the summary to the 'results' topic.

    Intended as a ``foreachBatch`` callback.

    Parameters
    ----------
    df : Spark DataFrame holding the micro-batch; the columns HEAD, FPGA,
        TDC_CHANNEL and ORBIT_CNT are used.
    epoch_id : batch identifier supplied by ``foreachBatch``; not used.

    The JSON message sent through the module-level ``producer`` contains:
    ``tot_import`` (rows in the batch), ``hits`` (rows with HEAD == 2),
    ``hitsPerChamber`` and, for each chamber N in 0..3, ``hist_chN``,
    ``hist_orbit_chN`` and ``hist_scin_chN`` as ``[values, counts]`` pairs.
    """
    # Each statistic below is a separate Spark action; without caching, every
    # one of them would recompute the micro-batch from its source.
    df.persist()
    try:
        # total events
        tot = df.count()

        # data cleansing
        df_clean = df.where(col('HEAD') == VALID_HEAD)

        # total number of processed hits, post-cleansing
        tot_hits = df_clean.count()

        # Orbits with at least one scintillator hit. Only the distinct,
        # non-null orbit numbers are brought back to the driver.
        scint_rows = (
            df_clean
            .where(col('FPGA') == SCINTILLATOR_FPGA)
            .where(col('TDC_CHANNEL') == SCINTILLATOR_CHANNEL)
            .where(col('ORBIT_CNT').isNotNull())
            .select('ORBIT_CNT')
            .distinct()
            .collect()
        )
        scint_orbits = [row['ORBIT_CNT'] for row in scint_rows]

        # Same set of statistics for each of the four chambers.
        summaries = []
        for fpga, first_channel, last_channel in CHAMBER_CUTS:
            df_chamber = (
                df_clean
                .where(col('FPGA') == fpga)
                .where(col('TDC_CHANNEL').between(first_channel, last_channel))
            )
            summaries.append(_chamber_summary(df_chamber, scint_orbits))

        outputJson = {
            'tot_import': tot,
            'hits': tot_hits,
            'hitsPerChamber': [summary['hits'] for summary in summaries],
        }
        # Keys are emitted grouped by histogram type, then by chamber:
        # hist_ch0..3, hist_orbit_ch0..3, hist_scin_ch0..3.
        for prefix in ('hist', 'hist_orbit', 'hist_scin'):
            for chamber, summary in enumerate(summaries):
                outputJson['{}_ch{}'.format(prefix, chamber)] = summary[prefix]

        producer.send('results', json.dumps(outputJson).encode('utf-8'))
        producer.flush()
    finally:
        # Release the cached batch even if one of the actions above failed.
        df.unpersist()

In [34]:
# Check that flatDF is a streaming DataFrame before attaching a streaming sink.
flatDF.isStreaming


True

In [None]:
# Run analysis() on every micro-batch of flatDF and block until the query ends.
# The query handle is kept so the stream is stopped whenever this cell exits
# (query failure or kernel interrupt). Without it, an interrupted cell leaves the
# query running in the JVM and a re-run starts a second one publishing to 'results'.
query = flatDF.writeStream.foreachBatch(analysis).start()
try:
    query.awaitTermination()
finally:
    query.stop()

22/09/11 17:43:21 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6a66e9c5-4455-452e-bd38-48e6447d6e91. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/09/11 17:43:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

In [None]:
# Shut Spark down at the end of the notebook: stop the SparkContext, then the session.
sc.stop()
spark.stop()