In [1]:
from pyspark.sql import SparkSession

In [2]:
# Extra JARs for the Spark session; the list is comma-joined into `spark.jars`
# when the session is built.
jars = [
    # hadoop-aws plus its AWS SDK bundle (presumably what backs the s3a:// reads)
    "/usr/local/spark/jars/hadoop-aws-3.2.0.jar",
    "/usr/local/spark/jars/aws-java-sdk-bundle-1.11.375.jar",
    # Spark <-> Cassandra connector assembly
    "/usr/local/spark/jars/spark-cassandra-connector-assembly_2.12-3.0.0.jar",
]

In [3]:
# Create (or reuse, via getOrCreate) the notebook's SparkSession against the
# standalone master, with a 480 MB executor heap and the extra JARs attached.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "480m")
    .config("spark.jars", ",".join(jars))
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession as this cell's output.
spark

## Configure S3A access to MinIO

In [5]:
import os

In [6]:
# Keep a handle on the underlying SparkContext; it is used to reach the
# Hadoop configuration when setting the fs.s3a.* properties.
sc = spark.sparkContext

In [7]:
# Configure the S3A filesystem connector to talk to the MinIO service.
#
# These properties are written straight into the Hadoop Configuration, so the
# keys must be the plain `fs.s3a.*` names. The `spark.hadoop.` prefix is only
# meaningful on SparkConf (where Spark strips it before copying the property
# into the Hadoop configuration); used here it creates keys S3A never reads.
#
# Credentials come from the environment; a missing variable raises KeyError.
s3a_settings = {
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.access.key": os.environ["AWS_ACCESS_KEY_ID"],
    "fs.s3a.secret.key": os.environ["AWS_SECRET_ACCESS_KEY"],
    # Address buckets as http://host/bucket rather than http://bucket.host
    "fs.s3a.path.style.access": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # The endpoint above is plain HTTP
    "fs.s3a.connection.ssl.enabled": "false",
}

hadoop_conf = sc._jsc.hadoopConfiguration()
for s3a_key, s3a_value in s3a_settings.items():
    hadoop_conf.set(s3a_key, s3a_value)

In [8]:
# Load the raw transactions CSV from the MinIO bucket; the first row is the
# header and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("s3a://realtime-ml/data/raw/transactions.csv")
)

In [9]:
# Peek at a single record, one field per line.
df.show(n=1, vertical=True)

-RECORD 0--------------------------
 cc_num     | 180094108369013      
 first      | John                 
 last       | Holland              
 trans_num  | 80f5177be11f0bcd7... 
 trans_date | 2011-12-31 18:30:00  
 trans_time | 00:12:15             
 unix_time  | 1325376735           
 category   | personal_care        
 merchant   | Hills-Boyer          
 amt        | 64                   
 merch_lat  | 39.011566            
 merch_long | -119.937831          
 is_fraud   | 0                    
only showing top 1 row



In [10]:
# Load the raw customer CSV from the MinIO bucket; the first row is the
# header and column types are inferred from the data.
df2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("s3a://realtime-ml/data/raw/customer.csv")
)

In [11]:
# Peek at a single customer record, one field per line.
# NOTE(review): the rendered output contains personal details (name, address,
# date of birth) -- clear outputs before sharing if this is real data.
df2.show(n=1, vertical=True)

-RECORD 0----------------------
 cc_num | 3526015186182660     
 first  | Carl                 
 last   | Gomez                
 gender | M                    
 street | 204 Cohen Meadow ... 
 city   | Hathaway Pines       
 state  | CA                   
 zip    | 95233                
 lat    | 38.1919              
 long   | -120.3644            
 job    | Data processing m... 
 dob    | 1958-10-11 18:30:00  
only showing top 1 row



In [12]:
# Print the column names and the types produced by inferSchema for the customer data.
df2.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: timestamp (nullable = true)



## Reading from Cassandra with Spark

In [18]:
# Connector option naming the Cassandra contact host; unpacked into the
# reader's options when loading a table.
hosts = {
    "spark.cassandra.connection.host": "cassandra",
}

In [19]:
# Read the `dummy.students` table through the Spark-Cassandra connector.
# NOTE(review): this rebinds `df`, which earlier held the transactions frame;
# consider a distinct name (and renaming its later uses) to avoid hidden state.
df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="students", keyspace="dummy", **hosts)
    .load()
)

In [20]:
# Show the rows read from Cassandra (show() prints at most 20 rows by default).
df.show()

+---+----+
|idx|name|
+---+----+
|  1|  AJ|
+---+----+

