In [1]:
from pyspark.sql import SparkSession

In [2]:
jars = [
    "/usr/local/spark/jars/hadoop-aws-3.2.0.jar",
    "/usr/local/spark/jars/aws-java-sdk-bundle-1.11.375.jar",
    "/usr/local/spark/jars/spark-cassandra-connector-assembly_2.12-3.0.0.jar",
    "/usr/local/spark/jars/spark-sql-kafka-0-10_2.12-3.0.0.jar",
    "/usr/local/spark/jars/kafka-clients-2.4.1.jar",
    "/usr/local/spark/jars/commons-pool2-2.6.2.jar",
    "/usr/local/spark/jars/spark-token-provider-kafka-0-10_2.12-3.0.0.jar"
]

In [3]:
spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "480m").\
        config("spark.jars", ",".join(jars)).\
        getOrCreate()

In [4]:
spark

In [5]:
sc = spark.sparkContext

In [6]:
rdd = sc.parallelize([1,2,3,4])

In [7]:
rdd.sum()

10

## conf for minio

In [10]:
import os

In [11]:
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://minio:9000")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

In [12]:
df = spark.read.csv("s3a://realtime-ml/data/raw/transactions.csv", header=True, inferSchema=True)

In [13]:
df.show(1, vertical=True)

-RECORD 0--------------------------
 cc_num     | 180094108369013      
 first      | John                 
 last       | Holland              
 trans_num  | 80f5177be11f0bcd7... 
 trans_date | 2011-12-31 18:30:00  
 trans_time | 00:12:15             
 unix_time  | 1325376735           
 category   | personal_care        
 merchant   | Hills-Boyer          
 amt        | 64                   
 merch_lat  | 39.011566            
 merch_long | -119.937831          
 is_fraud   | 0                    
only showing top 1 row



In [14]:
df2 = spark.read.csv("s3a://realtime-ml/data/raw/customer.csv", header=True, inferSchema=True)

In [15]:
df2.show(1, vertical=True)

-RECORD 0----------------------
 cc_num | 3526015186182660     
 first  | Carl                 
 last   | Gomez                
 gender | M                    
 street | 204 Cohen Meadow ... 
 city   | Hathaway Pines       
 state  | CA                   
 zip    | 95233                
 lat    | 38.1919              
 long   | -120.3644            
 job    | Data processing m... 
 dob    | 1958-10-11 18:30:00  
only showing top 1 row



In [16]:
df2.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: timestamp (nullable = true)



## cassandra with spark

In [17]:
hosts = {"spark.cassandra.connection.host": 'cassandra'}

In [18]:
df = spark.read.format("org.apache.spark.sql.cassandra")\
   .options(table="customer", keyspace="creditcard").options(**hosts).load()

In [19]:
df.show()

+------+----+---+-----+------+---+----+---+----+-----+------+---+
|cc_num|city|dob|first|gender|job|last|lat|long|state|street|zip|
+------+----+---+-----+------+---+----+---+----+-----+------+---+
+------+----+---+-----+------+---+----+---+----+-----+------+---+



## kafka consumer

In [85]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [86]:
df = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:29092")
      .option("startingOffsets", "earliest")
      .option("subscribe", "realtime-ml")
      .load()
      .select("timestamp", "value")
     )

In [87]:
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: binary (nullable = true)



In [88]:
[x.stop() for x in spark.streams.active]

[]

In [89]:
query = (
        df.withWatermark("timestamp", "1 minute")
        .writeStream
        .outputMode("append")
        .format("memory")
        .queryName("test_query")
)

In [90]:
query.start()

<pyspark.sql.streaming.StreamingQuery at 0x7f94076fefd0>

In [91]:
from IPython.display import display, clear_output
from time import sleep

while True:
    clear_output(wait=True)
    display(spark.sql("select * FROM test_query").tail(5))
    sleep(1)

[Row(timestamp=datetime.datetime(2023, 7, 13, 11, 5, 38, 763000), value=bytearray(b'{"cc_num": "4037295225657274", "first": "Antonio", "last": "Garcia", "trans_num": "9d88e9ebbc31b218c192086fbfe9dd5e", "trans_time": "2023-07-13 16:35:38.763031", "category": "1325379228", "merchant": "shopping_pos", "amt": "\\"Nitzsche", "merch_lat": " Kessler and Wolff\\"", "merch_long": "176", "distance": "40.078255", "age": "-102.402743"}')),
 Row(timestamp=datetime.datetime(2023, 7, 13, 11, 5, 38, 764000), value=bytearray(b'{"cc_num": "4368593032190508", "first": "Carla", "last": "Fleming", "trans_num": "9e3ee495dee3ac4577269c93c60fe0a4", "trans_time": "2023-07-13 16:35:38.763759", "category": "1325381126", "merchant": "health_fitness", "amt": "\\"Klocko", "merch_lat": " Runolfsdottir and Breitenberg\\"", "merch_long": "229", "distance": "40.203391", "age": "-75.552377"}')),
 Row(timestamp=datetime.datetime(2023, 7, 13, 11, 5, 38, 764000), value=bytearray(b'{"cc_num": "370763211656868", "first": "Je

KeyboardInterrupt: 