In [1]:
#importing the jar files and SparkSession

from pyspark.sql import SparkSession
import pyspark
import os
# Arguments passed to spark-submit: the local ABRiS jar, the Kafka source
# package for Spark SQL, and the pyspark-shell entry point. This is set before
# the SparkSession is created in the next cell.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars ./abris_2.12-3.2.0.jar '
    '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 '
    'pyspark-shell'
)

In [2]:
# Build the SparkSession named 'spark_app' (getOrCreate reuses an existing
# session if there is one) and print it to confirm it was obtained.

spark = (
    SparkSession.builder
    .appName('spark_app')
    .getOrCreate()
)
print(spark)



:: loading settings :: url = jar:file:/usr/local/spark-3.2.0-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0c2eaadc-a74a-47a1-92e2-fdeddeffa564;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

<pyspark.sql.session.SparkSession object at 0x7f457529be20>


In [3]:
# Open a streaming read on the Kafka topic "newtopic" through the broker at
# broker:29092, with startingOffsets set to "earliest", and keep only the
# message value cast to a string.

readDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "newtopic")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(value AS STRING)")
)



In [4]:
# Sample record from the topic (note that the text ends with a closing "}"):
# Zip=601,Latitude=18.18005,Longitude=-66.75218,City=Adjuntas,State_ID=PR,State_Name=Puerto Rico,Population=17113.0,Population_Density=102.7,County_Code=72001}

In [5]:
# Importing the SQL functions and types libraries for type casting and the split function

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [6]:
# Print the schema of readDF; after the CAST in the read cell it should hold
# only the string column "value".
readDF.printSchema()

root
 |-- value: string (nullable = true)



In [7]:
# Show the Python type of readDF.
print(type(readDF))

<class 'pyspark.sql.dataframe.DataFrame'>


In [8]:
# Defining the schema and changing the values from json to struct format for processing in SQL, as it uses SQL Struct types

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Schema for one record: nine nullable fields, every one typed as a string.
# The fields are generated from the list of names, in this order.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            'Zip',
            'Latitude',
            'Longitude',
            'City',
            'State_ID',
            'State_Name',
            'Population',
            'Population_Density',
            'County_Code',
        )
    ]
)

# Parse "value" with the schema above and flatten the resulting struct into
# one column per field. withColumn()/select() return a NEW DataFrame and leave
# readDF untouched, so the result is bound to a name here; previously it was
# only displayed and then discarded.
# NOTE(review): from_json yields null for text that is not valid JSON, and the
# sample record shown earlier in this notebook is key=value text rather than
# JSON - confirm the payload format before relying on jsonDF.
jsonDF = (
    readDF
    .withColumn("value", from_json("value", schema))
    .select(col('value.*'))
)

# Last expression of the cell, so the notebook still displays the frame.
jsonDF

DataFrame[Zip: string, Latitude: string, Longitude: string, City: string, State_ID: string, State_Name: string, Population: string, Population_Density: string, County_Code: string]

In [9]:
# readDF still has only the string column "value": the from_json
# transformation in the previous cell returned a new DataFrame and did not
# modify readDF.
readDF.printSchema()

root
 |-- value: string (nullable = true)



In [10]:
# At this point each value looks like Struct{Zip=601,Latitude=18.18005....}
# Separate the record on "," so that every piece gets its own column; the
# pieces are still in "key=value" form after this step.
#
# All nine columns are produced by one select() instead of nine chained
# withColumn() calls: the position of a name in the tuple is the index of the
# piece it receives, and the original "value" column is kept first, so the
# resulting columns and their order are the same as before.
# NOTE(review): this assumes no field value itself contains a comma - confirm.

newDF = readDF.select(
    "value",
    *[
        split(col("value"), ",").getItem(position).alias(field_name)
        for position, field_name in enumerate((
            "Zip",
            "Latitude",
            "Longitude",
            "City",
            "State_ID",
            "State_Name",
            "Population",
            "Population_Density",
            "County_Code",
        ))
    ]
)



In [11]:
# After the comma split each column still holds "key=value" text, e.g.
# Struct{Zip=601 | Latitude=18.18005 | ...
# Separate each one on "=" and keep the part after it to get the bare value.
#
# The columns are rebuilt in one select() instead of ten chained withColumn()
# calls; "value" stays first and every field keeps its name and position.

DF = newDF.select(
    "value",
    *[
        split(col(field_name), "=").getItem(1).alias(field_name)
        for field_name in (
            "Zip",
            "Latitude",
            "Longitude",
            "City",
            "State_ID",
            "State_Name",
            "Population",
            "Population_Density",
        )
    ],
    # County_Code is the last field of the record, so its value still ends
    # with the closing "}"; keep only what precedes the brace.
    split(split(col("County_Code"), "=").getItem(1), "}").getItem(0).alias("County_Code")
)

In [12]:
# Print the schema of DF: the original "value" column plus the nine
# extracted fields.
DF.printSchema()

root
 |-- value: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State_ID: string (nullable = true)
 |-- State_Name: string (nullable = true)
 |-- Population: string (nullable = true)
 |-- Population_Density: string (nullable = true)
 |-- County_Code: string (nullable = true)



In [13]:
# Keep only the nine parsed fields; the raw "value" column is left out.
finalDF = DF.select(
    [
        "Zip",
        "Latitude",
        "Longitude",
        "City",
        "State_ID",
        "State_Name",
        "Population",
        "Population_Density",
        "County_Code",
    ]
)

In [14]:
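# Unused alternative, kept for reference: explicit casts to STRING. DF.printSchema()
# above already reports every column as a string, so the plain select() is used instead.
# finalDF = DF.selectExpr("CAST(Zip AS STRING)",\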
#                    "CAST(Latitude AS STRING)",\
#                    "CAST(Longitude AS STRING)",\
#                    "CAST(City AS STRING)",\
#                    "CAST(State_ID AS STRING)",\
#                    "CAST(State_Name AS STRING)",\
#                    "CAST(Population AS STRING)",\
#                    "CAST(Population_Density AS STRING)",\
#                    "CAST(County_Code AS STRING)")

In [15]:
# Print the schema of finalDF; only the nine selected fields remain.
finalDF.printSchema()

root
 |-- Zip: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State_ID: string (nullable = true)
 |-- State_Name: string (nullable = true)
 |-- Population: string (nullable = true)
 |-- Population_Density: string (nullable = true)
 |-- County_Code: string (nullable = true)



In [17]:
# Stream finalDF to the console sink in append mode, printing up to 20
# untruncated rows per micro-batch. Progress is stored under "checkpoint/",
# so a re-run resumes from the committed offsets found there.
#
# resultDF holds the StreamingQuery returned by start(). Previously it was
# bound to the return value of awaitTermination(), which is None when no
# timeout is given, so the query handle was lost.
resultDF = (
    finalDF.writeStream
    .format("console")
    .outputMode("append")
    .option("checkpointLocation", "checkpoint/")
    .option("truncate", "false")
    .option("numRows", 20)
    .start()
)

# Block until the query ends. If the cell is interrupted (or the query fails)
# the exception still propagates, but the query is stopped first so it does
# not keep running in the background.
try:
    resultDF.awaitTermination()
finally:
    resultDF.stop()

21/11/21 23:04:31 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
21/11/21 23:04:31 INFO ResolveWriteToStream: Checkpoint root checkpoint resolved to file:/home/jovyan/checkpoint.
21/11/21 23:04:31 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
21/11/21 23:04:31 INFO MicroBatchExecution: Starting [id = de6b4713-0636-43b2-8a54-ddd329944eb8, runId = 0ac98023-713c-4bfe-8e9c-5d158a4badc9]. Use file:/home/jovyan/checkpoint to store the query checkpoint.
21/11/21 23:04:31 INFO MicroBatchExecution: Reading table [org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@4c77f088] from DataSourceV2 named 'kafka' [org.apache.spark.sql.kafka010.KafkaSourceProvider@1a228222]
21/11/21 23:04:31 INFO MicroBatchExecution: Resuming at batch 8 with committed offsets {KafkaV2[Subscribe[newtopic]]: {"newtopic":{"0":264968}}} and available offsets {KafkaV2[Subscribe[newtopic]]: {"newtopic":{"0":26496

-------------------------------------------
Batch: 8
-------------------------------------------


21/11/21 23:04:41 INFO CodeGenerator: Code generated in 33.3748 ms
21/11/21 23:04:41 INFO CodeGenerator: Code generated in 42.9789 ms
21/11/21 23:04:41 INFO WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@588b2983 committed.
21/11/21 23:04:41 INFO CheckpointFileManager: Writing atomically to file:/home/jovyan/checkpoint/commits/8 using temp file file:/home/jovyan/checkpoint/commits/.8.6c6299dd-6882-46d3-b6d2-f8e9cc9a6bc1.tmp
21/11/21 23:04:41 INFO CheckpointFileManager: Renamed temp file file:/home/jovyan/checkpoint/commits/.8.6c6299dd-6882-46d3-b6d2-f8e9cc9a6bc1.tmp to file:/home/jovyan/checkpoint/commits/8


+---+--------+---------+-------------+--------+-----------+----------+------------------+-----------+
|Zip|Latitude|Longitude|City         |State_ID|State_Name |Population|Population_Density|County_Code|
+---+--------+---------+-------------+--------+-----------+----------+------------------+-----------+
|601|18.18005|-66.75218|Adjuntas     |PR      |Puerto Rico|17113.0   |102.7             |72001      |
|602|18.36074|-67.17519|Aguada       |PR      |Puerto Rico|37751.0   |476.0             |72003      |
|603|18.4544 |-67.12201|Aguadilla    |PR      |Puerto Rico|47081.0   |574.9             |72005      |
|606|18.16721|-66.93828|Maricao      |PR      |Puerto Rico|6392.0    |58.3              |72093      |
|610|18.29032|-67.12244|Anasco       |PR      |Puerto Rico|26686.0   |286.9             |72011      |
|612|18.40699|-66.70805|Arecibo      |PR      |Puerto Rico|59369.0   |339.1             |72013      |
|616|18.41752|-66.66814|Bajadero     |PR      |Puerto Rico|10022.0   |335.6       

21/11/21 23:04:41 INFO MicroBatchExecution: Streaming query made progress: {
  "id" : "de6b4713-0636-43b2-8a54-ddd329944eb8",
  "runId" : "0ac98023-713c-4bfe-8e9c-5d158a4badc9",
  "name" : null,
  "timestamp" : "2021-11-21T23:04:31.594Z",
  "batchId" : 8,
  "numInputRows" : 33121,
  "inputRowsPerSecond" : 0.0,
  "processedRowsPerSecond" : 3276.387377584331,
  "durationMs" : {
    "addBatch" : 7682,
    "getBatch" : 4,
    "latestOffset" : 1201,
    "queryPlanning" : 953,
    "triggerExecution" : 10106,
    "walCommit" : 95
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "KafkaV2[Subscribe[newtopic]]",
    "startOffset" : {
      "newtopic" : {
        "0" : 264968
      }
    },
    "endOffset" : {
      "newtopic" : {
        "0" : 298089
      }
    },
    "latestOffset" : {
      "newtopic" : {
        "0" : 298089
      }
    },
    "numInputRows" : 33121,
    "inputRowsPerSecond" : 0.0,
    "processedRowsPerSecond" : 3276.387377584331,
    "metrics" : {
      

KeyboardInterrupt: 

ition FetchPosition{offset=298089, offsetEpoch=Optional.empty, currentLeader=LeaderAndEpoch{leader=Optional[broker:29092 (id: 1 rack: null)], epoch=0}}.
21/11/21 23:04:50 INFO SubscriptionState: [Consumer clientId=consumer-spark-kafka-source-d9988708-2040-4d0c-a1b9-4fc3d3d5151e--587948755-driver-0-1, groupId=spark-kafka-source-d9988708-2040-4d0c-a1b9-4fc3d3d5151e--587948755-driver-0] Seeking to LATEST offset of partition newtopic-0
21/11/21 23:04:50 INFO SubscriptionState: [Consumer clientId=consumer-spark-kafka-source-d9988708-2040-4d0c-a1b9-4fc3d3d5151e--587948755-driver-0-1, groupId=spark-kafka-source-d9988708-2040-4d0c-a1b9-4fc3d3d5151e--587948755-driver-0] Resetting offset for partition newtopic-0 to position FetchPosition{offset=298089, offsetEpoch=Optional.empty, currentLeader=LeaderAndEpoch{leader=Optional[broker:29092 (id: 1 rack: null)], epoch=0}}.
