In [1]:
from pyspark.sql import SparkSession

In [2]:
jars = [
    "/usr/local/spark/jars/hadoop-aws-3.2.0.jar",
    "/usr/local/spark/jars/aws-java-sdk-bundle-1.11.375.jar",
    "/usr/local/spark/jars/spark-cassandra-connector-assembly_2.12-3.0.0.jar",
    "/usr/local/spark/jars/spark-sql-kafka-0-10_2.12-3.0.0.jar",
    "/usr/local/spark/jars/kafka-clients-2.4.1.jar",
    "/usr/local/spark/jars/commons-pool2-2.6.2.jar",
    "/usr/local/spark/jars/spark-token-provider-kafka-0-10_2.12-3.0.0.jar"
]

In [3]:
from pathlib import Path

In [4]:
for jar in jars:
    print(Path(jar).exists())

True
True
True
True
True
True
True


In [5]:
spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "480m").\
        config("spark.jars", ",".join(jars)).\
        getOrCreate()

In [6]:
spark

In [7]:
sc = spark.sparkContext

In [8]:
rdd = sc.parallelize([1,2,3,4])

In [9]:
rdd.sum()

10

## conf for minio

In [10]:
import os

In [11]:
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://minio:9000")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")

In [12]:
df = spark.read.csv("s3a://realtime-ml/data/raw/transactions.csv", header=True, inferSchema=True)

In [13]:
df.show(1, vertical=True)

-RECORD 0--------------------------
 cc_num     | 180094108369013      
 first      | John                 
 last       | Holland              
 trans_num  | 80f5177be11f0bcd7... 
 trans_date | 2011-12-31 18:30:00  
 trans_time | 00:12:15             
 unix_time  | 1325376735           
 category   | personal_care        
 merchant   | Hills-Boyer          
 amt        | 64                   
 merch_lat  | 39.011566            
 merch_long | -119.937831          
 is_fraud   | 0                    
only showing top 1 row



In [14]:
df2 = spark.read.csv("s3a://realtime-ml/data/raw/customer.csv", header=True, inferSchema=True)

In [15]:
df2.show(1, vertical=True)

-RECORD 0----------------------
 cc_num | 3526015186182660     
 first  | Carl                 
 last   | Gomez                
 gender | M                    
 street | 204 Cohen Meadow ... 
 city   | Hathaway Pines       
 state  | CA                   
 zip    | 95233                
 lat    | 38.1919              
 long   | -120.3644            
 job    | Data processing m... 
 dob    | 1958-10-11 18:30:00  
only showing top 1 row



In [16]:
df2.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: timestamp (nullable = true)



## cassandra with spark

In [17]:
hosts = {"spark.cassandra.connection.host": 'cassandra'}

In [18]:
df = spark.read.format("org.apache.spark.sql.cassandra")\
   .options(table="customer", keyspace="creditcard").options(**hosts).load()

In [19]:
df.show()

+------+----+---+-----+------+---+----+---+----+-----+------+---+
|cc_num|city|dob|first|gender|job|last|lat|long|state|street|zip|
+------+----+---+-----+------+---+----+---+----+-----+------+---+
+------+----+---+-----+------+---+----+---+----+-----+------+---+



## kafka consumer

In [70]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [71]:
df = (spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:29092")
      .option("startingOffsets", "earliest")
      .option("subscribe", "test-topic")
      .load()
      .withColumn("transformed", F.from_json(F.col("value").cast(T.StringType()), test_schema)
     )

In [72]:
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: binary (nullable = true)



In [93]:
[x.stop() for x in spark.streams.active]

[]

In [94]:
query = (
        df.withWatermark("timestamp", "1 minute")
        .writeStream
        .outputMode("append")
        .format("memory")
        .queryName("test_query")
)

In [95]:
query.start()

<pyspark.sql.streaming.StreamingQuery at 0x7fba65d76d50>

In [96]:
from IPython.display import display, clear_output
from time import sleep

while True:
    clear_output(wait=True)
    display(spark.sql("select * FROM test_query").tail(5))
    sleep(1)

[Row(timestamp=datetime.datetime(2023, 7, 24, 9, 15, 46, 801000), value=bytearray(b'{"x": 16, "y": 8, "z": "3"}')),
 Row(timestamp=datetime.datetime(2023, 7, 24, 9, 15, 51, 802000), value=bytearray(b'{"x": 18, "y": 9, "z": "3"}')),
 Row(timestamp=datetime.datetime(2023, 7, 24, 9, 15, 56, 803000), value=bytearray(b'{"x": 20, "y": 10, "z": "3"}')),
 Row(timestamp=datetime.datetime(2023, 7, 24, 9, 16, 1, 804000), value=bytearray(b'{"x": 22, "y": 11, "z": "3"}')),
 Row(timestamp=datetime.datetime(2023, 7, 24, 9, 16, 6, 805000), value=bytearray(b'{"x": 24, "y": 12, "z": "3"}'))]

KeyboardInterrupt: 

In [97]:
spark.sql("select * FROM test_query order by timestamp desc").show(n=10)

+--------------------+--------------------+
|           timestamp|               value|
+--------------------+--------------------+
|2023-07-24 09:16:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:16:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:16:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
|2023-07-24 09:15:...|[7B 22 78 22 3A 2...|
+--------------------+--------------------+
only showing top 10 rows



In [98]:
test_df = spark.sql("select * FROM test_query order by timestamp desc").limit(5)

In [99]:
test_df.count()

5

In [112]:
test_schema = T.StructType()

In [113]:
test_schema.add("x", T.DoubleType(), True) \
            .add("y", T.DoubleType(), True) \
            .add("z", T.StringType(), True)

StructType(List(StructField(x,DoubleType,true),StructField(y,DoubleType,true),StructField(z,StringType,true)))

In [117]:
mapped = test_df.select(F.col("value").cast(T.StringType())) \
       .withColumn("transformed", F.from_json(F.col("value").cast(T.StringType()), test_schema))

In [121]:
mapped.select("transformed.*").show()

+-----+----+---+
|    x|   y|  z|
+-----+----+---+
|170.0|85.0|  3|
|168.0|84.0|  3|
|166.0|83.0|  3|
|164.0|82.0|  3|
|162.0|81.0|  3|
+-----+----+---+



In [103]:
json_schema = spark.read.json(test_df.rdd.map(mapper)).schema

In [104]:
json_schema

StructType(List(StructField(x,LongType,true),StructField(y,LongType,true),StructField(z,StringType,true)))

In [105]:
test_df.rdd.map(lambda row: row.value.decode()).collect()

['{"x": 28, "y": 14, "z": "3"}',
 '{"x": 26, "y": 13, "z": "3"}',
 '{"x": 24, "y": 12, "z": "3"}',
 '{"x": 22, "y": 11, "z": "3"}',
 '{"x": 20, "y": 10, "z": "3"}']

In [122]:
test_schema

StructType(List(StructField(x,DoubleType,true),StructField(y,DoubleType,true),StructField(z,StringType,true)))

[StructField(x,DoubleType,true),
 StructField(y,DoubleType,true),
 StructField(z,StringType,true)]