In [None]:
import logging

from cassandra.cluster import Cluster
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Clear the handle first: if the build below raises, s_conn is left as None
# rather than bound to whatever an earlier run of this cell produced.
s_conn = None

# `spark.jars.packages` takes one comma-separated string; the two adjacent
# literals below are concatenated into it (Cassandra connector + Kafka source).
s_conn = (
    SparkSession.builder
    .appName('SparkDataStreaming')
    .config(
        'spark.jars.packages',
        "com.datastax.spark:spark-cassandra-connector_2.12:3.4.1,"
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
    )
    .config('spark.cassandra.connection.host', 'localhost')
    .getOrCreate()
)

# Restrict Spark's own logging to ERROR level for the rest of the notebook.
s_conn.sparkContext.setLogLevel("ERROR")

In [2]:
# Bind the session to the name the streaming cells read from, then show it.
spark_conn = s_conn
print(spark_conn)

<pyspark.sql.session.SparkSession object at 0x000001907F353B10>


In [3]:
# Streaming source: subscribe to the `users_created` topic on the broker at
# localhost:9092, with `startingOffsets` set to 'earliest'.
spark_df = (
    spark_conn.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'localhost:9092')
    .option('subscribe', 'users_created')
    .option('startingOffsets', 'earliest')
    .load()
)

In [4]:
# Every field of the user payload is declared as a string.  The names are
# listed once, in column order, instead of repeating the StructField call.
schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in (
        "id",
        "first_name",
        "last_name",
        "gender",
        "address",
        "post_code",
        "email",
        "username",
        "registered_date",
        "phone",
        "picture",
    )
])

# Cast the Kafka `value` column to a string, parse it as JSON against
# `schema`, and flatten the resulting struct into top-level columns.
#
# from_json returns null for a message it cannot parse, which turns into a
# row whose columns are all null.  `id` is the primary key of the target
# Cassandra table, so such a row cannot be written; drop it here instead of
# letting one malformed message fail the streaming query.
sel = (
    spark_df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col('value'), schema).alias('data'))
    .select("data.*")
    .where(col("id").isNotNull())
)
print(sel)

DataFrame[id: string, first_name: string, last_name: string, gender: string, address: string, post_code: string, email: string, username: string, registered_date: string, phone: string, picture: string]


In [5]:
# Open a session against the Cassandra node on localhost.
#
# A connection failure is logged and then re-raised.  Swallowing it would
# leave `cas_session` undefined (or still bound to a session from an earlier
# run of this cell), so the cells below would fail with an unrelated
# NameError or silently talk to a stale session.
try:
    cluster = Cluster(['localhost'])
    cas_session = cluster.connect()
except Exception as e:
    logging.error(f"Could not create cassandra connection due to {e}")
    raise

In [6]:
# CQL for the keyspace that holds the streamed data.  IF NOT EXISTS makes the
# statement safe to re-run when the keyspace is already there.
keyspace_cql = """
    CREATE KEYSPACE IF NOT EXISTS spark_streams
    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
"""
cas_session.execute(keyspace_cql)

print("Keyspace created successfully!")

Keyspace created successfully!


In [9]:
# CQL for the sink table: one TEXT column per field of the parsed user
# payload, keyed on `id`.  IF NOT EXISTS makes the statement safe to re-run.
table_cql = """
    CREATE TABLE IF NOT EXISTS spark_streams.created_users2 (
        id TEXT PRIMARY KEY,
        first_name TEXT,
        last_name TEXT,
        gender TEXT,
        address TEXT,
        post_code TEXT,
        email TEXT,
        username TEXT,
        registered_date TEXT,
        phone TEXT,
        picture TEXT);
"""
cas_session.execute(table_cql)

print("Table created successfully!")

Table created successfully!


In [None]:
# Configure the Cassandra sink for the parsed stream: target keyspace/table
# plus the checkpoint directory the query is started with.
cassandra_writer = (
    sel.writeStream
    .format("org.apache.spark.sql.cassandra")
    .option('checkpointLocation', '/tmp/checkpoint')
    .option('keyspace', 'spark_streams')
    .option('table', 'created_users2')
)
streaming_query = cassandra_writer.start()

# Keep this cell running for as long as the streaming query is active.
streaming_query.awaitTermination()