# Aerospike Connect for Spark Tutorial for Python
## Tested with Java 8, Spark 2.4.0, and Python 3.7

In [1]:
import findspark
findspark.init()

In [2]:
# IP Address or DNS name for one host in your Aerospike cluster
AS_HOST ="10.0.2.106"
# Name of one of your namespaces. Type 'show namespaces' at the aql prompt if you are not sure
AS_NAMESPACE = "test" 

AS_PORT = 3000 # Usually 3000, but change here if not
AS_CONNECTION_STRING = AS_HOST + ":"+ str(AS_PORT)

# Download the Aerospike Spark jar
import urllib
import os



AEROSPIKE_SPARK_JAR_VERSION="2.4.0"

def aerospike_spark_jar_download_url(version=AEROSPIKE_SPARK_JAR_VERSION):
    DOWNLOAD_PREFIX="https://www.aerospike.com/enterprise/download/connectors/aerospike-spark/"
    DOWNLOAD_SUFFIX="/artifact/jar"
    AEROSPIKE_SPARK_JAR_DOWNLOAD_URL = DOWNLOAD_PREFIX+AEROSPIKE_SPARK_JAR_VERSION+DOWNLOAD_SUFFIX
    return AEROSPIKE_SPARK_JAR_DOWNLOAD_URL

def download_aerospike_spark_jar(version=AEROSPIKE_SPARK_JAR_VERSION):
    JAR_NAME="aerospike-spark-assembly-"+AEROSPIKE_SPARK_JAR_VERSION+".jar"
    if(not(os.path.exists(JAR_NAME))) :
        urllib.request.urlretrieve(aerospike_spark_jar_download_url(),JAR_NAME)
    else :
        print(JAR_NAME+" already downloaded")
    return os.path.join(os.getcwd(),JAR_NAME)

AEROSPIKE_JAR_PATH=download_aerospike_spark_jar()
os.environ["PYSPARK_SUBMIT_ARGS"] = '--jars ' + AEROSPIKE_JAR_PATH + ' pyspark-shell'



In [3]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, IntegerType, MapType, LongType

Set up spark and point aerospike db to AS_HOST

In [4]:
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
spark.conf.set("aerospike.namespace", AS_NAMESPACE)
spark.conf.set("aerospike.seedhost", AS_CONNECTION_STRING)

## Create sample data and write to Aerospike database

In [6]:
import random
num_records=1000

schema = StructType( 
    [
        StructField("_id", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("salary",IntegerType(), True)
    ]
)


inputBuf = []
for  i in range(1, num_records) :
         name = "name"  + str(i)
         age = i%100
         salary = 50000 + random.randint(1,50000)
         id_ = i 
         inputBuf.append((id_, name, age,salary))
    
inputRDD = spark.sparkContext.parallelize(inputBuf)
inputDF=spark.createDataFrame(inputRDD,schema)


#Write the Sample Data to Aerospike
inputDF \
.write \
.mode('overwrite') \
.format("com.aerospike.spark.sql")  \
.option("aerospike.namespace", AS_NAMESPACE) \
.option("aerospike.set", "py_input_data")\
.option("aerospike.updateByKey", "_id") \
.save()

## Schema in the Spark Connector

-  Aerospike is schemaless, however spark adher to schema. After the schema is decided upon (either through inference or given), data within the bins must honor the types. 

- To infer schema, the connector samples a set of records (configurable through `aerospike.schema.scan`) to decide the name of bins/columns and their types. This implies that the derived schema depends entirely upon sampled records.  

- Note that `__key` was not part of provided schema. So how can one query using `__key`? We can just add `__key` in provided schema with appropriate type. Similarly we can add `__gen` or `__ttl` etc.  
         
      schemaWithPK =  StructType([
                StructField("__key",IntegerType(), False),    
                StructField("id", IntegerType(), False),
                StructField("name", StringType(), False),
                StructField("age", IntegerType(), False),
                StructField("salary",IntegerType(), False)])
                
- We recommend that you provide schema for queries that involve complex data types such as lists, maps, and mixed types. 

## Load data into a DataFrame without specifying any Schema (using connector schema inference)

In [8]:
# Create a Spark DataFrame by using the Connector Schema inference mechanism

loadedDFWithoutSchema = (
    spark.read.format("com.aerospike.spark.sql") \
    .option("aerospike.seedhost",AS_HOST) \
    .option ("aerospike.namespace", AS_NAMESPACE) \
    .option("aerospike.keyPath", "/etc/aerospike/features.conf") \
    .option("aerospike.set", "py_input_data") \
    .load()
)


loadedDFWithoutSchema.show()

+-----+--------------------+--------+------------+-----+---+-------+------+---+
|__key|            __digest|__expiry|__generation|__ttl|age|   name|salary|_id|
+-----+--------------------+--------+------------+-----+---+-------+------+---+
| null|[02 50 2D 45 89 D...|       0|           1|   -1|  4|name604| 61568|604|
| null|[08 50 66 A1 68 1...|       0|           1|   -1| 10| name10| 88282| 10|
| null|[0B 70 8A F2 9F A...|       0|           1|   -1| 86|name586| 94012|586|
| null|[13 D0 09 FD 8E E...|       0|           1|   -1| 77|name477| 97299|477|
| null|[13 50 C4 E1 3F 6...|       0|           1|   -1| 50| name50| 90621| 50|
| null|[17 B0 1E 54 2C 9...|       0|           1|   -1| 54|name554| 54781|554|
| null|[18 10 E6 C8 15 6...|       0|           1|   -1| 64|name964| 71373|964|
| null|[1C 30 1B 8B DC E...|       0|           1|   -1| 80|name880| 54376|880|
| null|[25 B0 10 82 C7 6...|       0|           1|   -1| 59|name859| 99939|859|
| null|[26 90 85 67 25 2...|       0|   

## Load data into a DataFrame with user specified schema 

In [9]:
#Data can be loaded with known schema as well.

loadedDFWithSchema=spark \
.read \
.format("com.aerospike.spark.sql") \
.schema(schema) \
.option("aerospike.seedhost",AS_HOST) \
.option("aerospike.featurekey", "/etc/aerospike/features.conf") \
.option ("aerospike.namespace", AS_NAMESPACE) \
.option("aerospike.set", "py_input_data").load()

loadedDFWithSchema.show(5)



+---+-------+---+------+
|_id|   name|age|salary|
+---+-------+---+------+
|604|name604|  4| 61568|
| 10| name10| 10| 88282|
|586|name586| 86| 94012|
|477|name477| 77| 97299|
| 50| name50| 50| 90621|
+---+-------+---+------+
only showing top 5 rows



## SparkML with Aerospike

### The purpose of this ML model is to illustrate how data in Aerospike can be used for training and inference using SparkML 
#### A K-Means clustering model is used to create several segments based on age and salary.

In [11]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler


#All machine learning algorithms in Spark take as input a Vector type, which must be a set of numerical values.
assembler = VectorAssembler(
    inputCols=["_id", "age", "salary"],
    outputCol="features")
data_2 = assembler.transform(loadedDFWithSchema)
data_2.show()

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(data_2)

# Make predictions
predictions = model.transform(data_2)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

+---+-------+---+------+--------------------+
|_id|   name|age|salary|            features|
+---+-------+---+------+--------------------+
|604|name604|  4| 61568| [604.0,4.0,61568.0]|
| 10| name10| 10| 88282| [10.0,10.0,88282.0]|
|586|name586| 86| 94012|[586.0,86.0,94012.0]|
|477|name477| 77| 97299|[477.0,77.0,97299.0]|
| 50| name50| 50| 90621| [50.0,50.0,90621.0]|
|554|name554| 54| 54781|[554.0,54.0,54781.0]|
|964|name964| 64| 71373|[964.0,64.0,71373.0]|
|880|name880| 80| 54376|[880.0,80.0,54376.0]|
|859|name859| 59| 99939|[859.0,59.0,99939.0]|
|227|name227| 27| 75497|[227.0,27.0,75497.0]|
|812|name812| 12| 94863|[812.0,12.0,94863.0]|
|780|name780| 80| 71442|[780.0,80.0,71442.0]|
|555|name555| 55| 82947|[555.0,55.0,82947.0]|
|185|name185| 85| 79060|[185.0,85.0,79060.0]|
|221|name221| 21| 82627|[221.0,21.0,82627.0]|
|117|name117| 17| 93491|[117.0,17.0,93491.0]|
|706|name706|  6| 69972| [706.0,6.0,69972.0]|
|358|name358| 58| 78507|[358.0,58.0,78507.0]|
|438|name438| 38| 64639|[438.0,38.

## Writing Sample Complex Data Types (CDT) data into Aerospike

In [14]:
complex_data_json="resources/nested_data.json"
alias=  StructType( [
    StructField("first_name",StringType(), False),
    StructField("last_name",StringType(), False)]
)

name= StructType([
    StructField("first_name",StringType(), False), 
    StructField("aliases",ArrayType(alias), False)]
)
street_adress= StructType([StructField("street_name", StringType(), False), StructField("apt_number", IntegerType(), False)])
address = StructType([StructField("zip", LongType(), False), StructField("street", street_adress, False), StructField("city", StringType(), False)])

workHistory = StructType([StructField ("company_name" , StringType(), False),
                              StructField( "company_address" , address, False),
                              StructField("worked_from", StringType(), False)]
                        )

person= StructType([StructField("name", name, False),
                        StructField("SSN", StringType(), False),
                        StructField("home_address", ArrayType(address), False),
                        StructField("work_history", ArrayType(workHistory), False)]
                  )

cmplx_data_with_schema=spark.read.schema(person).json(complex_data_json)
cmplx_data_with_schema.printSchema()

cmplx_data_with_schema \
.write \
.mode('overwrite') \
.format("com.aerospike.spark.sql")  \
.option("aerospike.seedhost", AS_HOST) \
.option("aerospike.namespace", AS_NAMESPACE) \
.option("aerospike.writeset", "complex_input_data") \
.option("aerospike.updateByKey", "name.first_name") \
.save()

root
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- aliases: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- first_name: string (nullable = true)
 |    |    |    |-- last_name: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- home_address: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- zip: long (nullable = true)
 |    |    |-- street: struct (nullable = true)
 |    |    |    |-- street_name: string (nullable = true)
 |    |    |    |-- apt_number: integer (nullable = true)
 |    |    |-- city: string (nullable = true)
 |-- work_history: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- company_address: struct (nullable = true)
 |    |    |    |-- zip: long (nullable = true)
 |    |    |    |-- street: struct (nullable = true)
 |    |    |    |    

## Load Complex Data Types (CDT) into a DataFrame without specifying any schema (using connector schema inference)¶

In [15]:
loadedComplexDFWithoutSchema=spark \
.read \
.format("com.aerospike.spark.sql") \
.option("aerospike.seedhost", AS_HOST) \
.option("aerospike.keyPath", "/etc/aerospike/features.conf") \
.option ("aerospike.namespace", "test") \
.option("aerospike.set", "complex_input_data") \
.load() 
loadedComplexDFWithoutSchema.printSchema()

root
 |-- __key: string (nullable = true)
 |-- __digest: binary (nullable = false)
 |-- __expiry: integer (nullable = false)
 |-- __generation: integer (nullable = false)
 |-- __ttl: integer (nullable = false)
 |-- SSN: string (nullable = true)
 |-- work_history: array (nullable = true)
 |    |-- element: binary (containsNull = true)
 |-- name: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- home_address: array (nullable = true)
 |    |-- element: binary (containsNull = true)



## Load Complex Data Types (CDT) into a DataFrame with user specified schema 

In [16]:
loadedComplexDFWithSchema=spark \
.read \
.format("com.aerospike.spark.sql") \
.option("aerospike.seedhost", AS_HOST) \
.option("aerospike.keyPath", "/etc/aerospike/features.conf") \
.option ("aerospike.namespace", "test") \
.option("aerospike.set", "complex_input_data") \
.schema(person) \
.load() 
loadedComplexDFWithSchema.printSchema()
#Please note the difference in types of loaded data in both cases. With schema, we extactly infer complex types.

root
 |-- name: struct (nullable = false)
 |    |-- first_name: string (nullable = false)
 |    |-- aliases: array (nullable = false)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- first_name: string (nullable = false)
 |    |    |    |-- last_name: string (nullable = false)
 |-- SSN: string (nullable = false)
 |-- home_address: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- zip: long (nullable = false)
 |    |    |-- street: struct (nullable = false)
 |    |    |    |-- street_name: string (nullable = false)
 |    |    |    |-- apt_number: integer (nullable = false)
 |    |    |-- city: string (nullable = false)
 |-- work_history: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- company_name: string (nullable = false)
 |    |    |-- company_address: struct (nullable = false)
 |    |    |    |-- zip: long (nullable = false)
 |    |    |    |-- street: struct (nullable = false)
 |  

## Data Exploration with Aerospike 

In [17]:
#Install packages, if not installed.
!pip install numpy 
!pip install matplotlib
!pip install pandas

You should consider upgrading via the '/home/ec2-user/venv/bin/python -m pip install --upgrade pip' command.[0m
Collecting matplotlib
  Downloading matplotlib-3.3.2-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 17.2 MB/s eta 0:00:01
Collecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (88 kB)
[K     |████████████████████████████████| 88 kB 20.7 MB/s eta 0:00:01
[?25hCollecting pillow>=6.2.0
  Downloading Pillow-7.2.0-cp37-cp37m-manylinux1_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 60.9 MB/s eta 0:00:01
[?25hCollecting certifi>=2020.06.20
  Downloading certifi-2020.6.20-py2.py3-none-any.whl (156 kB)
[K     |████████████████████████████████| 156 kB 83.4 MB/s eta 0:00:01
Installing collected packages: cycler, kiwisolver, pillow, certifi, matplotlib
Successfully installed certifi-2020.6.20 c

In [18]:
import pandas
import matplotlib
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named '_bz2'

In [19]:
#convert spark df to pandas df
pdf = loadedDFWithSchema.toPandas()
#print(pdf)

#Histogram
pdf[['age']].head(20).plot(kind='hist',bins=[0,20,40,60,80,100],rwidth=0.8)
plt.show()

#Bar graph
pdf.head(20).groupby('age')['name'].nunique().plot(kind='bar')
plt.show()

#pie chart
pdf.head(20).groupby(['age']).sum().plot(kind='pie', y='salary')
plt.show()

#Line Plot
# gca stands for 'get current axis'
ax = plt.gca()
pdf.tail(20).plot(kind='line',x='name',y='salary', color='red', ax=ax)
plt.show()

ImportError: Pandas >= 0.19.2 must be installed; however, it was not found.

# Querying Aerospike Data using SparkSQL
#### Things to keep in mind
   1. Queries that involve Primary Key in the predicate trigger aerospike_batch_get()[hyper link: https://www.aerospike.com/docs/client/c/usage/kvs/batch.html] and run extremely fast. For e.g. a query containing `__key` with, with no `OR` between two bins.
   2. All other queries may entail a full scan of the Aerospike DB if they can’t be converted to Aerospike batchget. 

## Queries that include Primary Key in the Predicate

In case of batchget queries we can also apply filters upon metadata columns like `__gen` or `__ttl` etc. To do so, these columns should be exposed through schema (if schema provided). 

In [16]:
batchGet1= spark \
.read \
.format("com.aerospike.spark.sql") \
.option("aerospike.seedhost", AS_HOST) \
.option("aerospike.featurekey", "/etc/aerospike/features.conf") \
.option ("aerospike.namespace", namespace) \
.option("aerospike.set", "py_input_data") \
.option("aerospike.keyType", "int") \
.load().where("__key = 829") \

batchGet1.show()
#Please be aware ASDB only supports equality test with PKs in primary key query. 
#So, a where clause with "__key >10", would result in scan query!

+-----+--------------------+---------+------------+-------+---+-------+------+---+
|__key|            __digest| __expiry|__generation|  __ttl|age|   name|salary|_id|
+-----+--------------------+---------+------------+-------+---+-------+------+---+
|  829|[9A E9 5B 0A 11 6...|340420991|          14|2591946| 29|name829| 55725|829|
+-----+--------------------+---------+------------+-------+---+-------+------+---+



In [17]:
#In this query we are doing *OR* between PK subqueries 
from pyspark.sql.functions import *
somePrimaryKeys= list(range(1,10))
someMoreKeys= list(range(12,14))
batchGet2= spark \
.read \
.format("com.aerospike.spark.sql") \
.option("aerospike.seedhost",AS_HOST) \
.option("aerospike.featurekey", "/etc/aerospike/features.conf") \
.option ("aerospike.namespace", namespace) \
.option("aerospike.set", "py_input_data") \
.option("aerospike.keyType", "int") \
.load().where((col("__key").isin(somePrimaryKeys)) | ( col("__key").isin(someMoreKeys))) 

batchGet2.show(5)
#We should got in total 'len(somePrimaryKeys) + len(someMoreKeys)' records.

+-----+--------------------+---------+------------+-------+---+------+------+---+
|__key|            __digest| __expiry|__generation|  __ttl|age|  name|salary|_id|
+-----+--------------------+---------+------------+-------+---+------+------+---+
|    4|[FE E0 77 E4 17 F...|340420991|          14|2591946|  4| name4| 87361|  4|
|    5|[FF 00 39 4A 07 0...|340420991|          14|2591946|  5| name5| 96477|  5|
|    7|[8B F3 60 83 F9 6...|340420991|          14|2591946|  7| name7| 87085|  7|
|   13|[D7 B4 65 3D FA 4...|340420991|          14|2591946| 13|name13| 67846| 13|
|    3|[75 25 0A 1D C0 4...|340420991|          14|2591946|  3| name3| 92078|  3|
+-----+--------------------+---------+------------+-------+---+------+------+---+
only showing top 5 rows



## Queries that do not include Primary Key in the Predicate

In [18]:
somePrimaryKeys= list(range(1,10))
scanQuery1= spark \
.read \
.format("com.aerospike.spark.sql") \
.option("aerospike.seedhost", AS_HOST) \
.option ("aerospike.namespace", namespace) \
.option("aerospike.featurekey", "/etc/aerospike/features.conf") \
.option("aerospike.set", "py_input_data") \
.option("aerospike.keyType", "int") \
.load().where((col("__key").isin(somePrimaryKeys)) | ( col("age") >50 ))

scanQuery1.show()

#Since there is OR between PKs and Bin. It will be treated as Scan query. 
#Primary keys are not stored in bins(by default), hence only filters corresponding to bins are honored.  

+-----+--------------------+---------+------------+-------+---+-------+------+---+
|__key|            __digest| __expiry|__generation|  __ttl|age|   name|salary|_id|
+-----+--------------------+---------+------------+-------+---+-------+------+---+
| null|[0B 70 8A F2 9F A...|340420991|          14|2591945| 86|name586| 55387|586|
| null|[13 D0 09 FD 8E E...|340420991|          14|2591945| 77|name477| 87042|477|
| null|[17 B0 1E 54 2C 9...|340420991|          14|2591945| 54|name554| 66669|554|
| null|[18 10 E6 C8 15 6...|340420991|          14|2591945| 64|name964| 95080|964|
| null|[1C 30 1B 8B DC E...|340420991|          14|2591945| 80|name880| 70626|880|
| null|[25 B0 10 82 C7 6...|340420991|          14|2591945| 59|name859| 64936|859|
| null|[35 D0 BA A7 35 7...|340420991|          14|2591945| 80|name780| 51597|780|
| null|[3C 80 4A 91 AA 3...|340420991|          14|2591945| 55|name555| 72318|555|
| null|[3D C0 3E 1E 0D C...|340420991|          14|2591945| 85|name185| 53643|185|
| nu

## Query with CDT

In [19]:
#Find people who have had at least 5 jobs in the past
from pyspark.sql.functions import col, size

loadedComplexDFWithSchema \
.withColumn("past_jobs", col("work_history.company_name")) \
.withColumn("num_jobs", size(col("past_jobs")))  \
.where(col("num_jobs") > 4) \
.show(5)

+--------------------+-----------+--------------------+--------------------+--------------------+--------+
|                name|        SSN|        home_address|        work_history|           past_jobs|num_jobs|
+--------------------+-----------+--------------------+--------------------+--------------------+--------+
|[Tami, [[Joseph, ...|001-49-0685|[[23288, [Clark V...|[[Roberts PLC, [4...|[Roberts PLC, Hub...|       5|
|[Chelsea, [[Melis...|465-88-7213|[[49305, [Ward By...|[[Ochoa and Sons,...|[Ochoa and Sons, ...|       5|
|[Jonathan, [[Robe...|526-54-7792|[[71421, [William...|[[Henderson-Shaw,...|[Henderson-Shaw, ...|       5|
|[Gary, [[Cameron,...|825-55-3247|[[66428, [Kim Mil...|[[Bishop, Scott a...|[Bishop, Scott an...|       5|
|[Danielle, [[Mich...|319-30-0983|[[63276, [Bauer C...|[[Powers LLC, [60...|[Powers LLC, Powe...|       5|
+--------------------+-----------+--------------------+--------------------+--------------------+--------+
only showing top 5 rows



## Use Aerospike Spark Connector Configuration properties in the Spark API to improve performance 

  - aerospike.partition.factor: number of logical aerospike partitions [0-15]
  - aerospike.maxthreadcount : maximum number of threads to use for writing data into Aerospike
  - aerospike.compression : compression of java client-server communication
  - aerospike.batchMax : maximum number of records per read request (default 5000)
  - aerospike.recordspersecond : same as java client

#### Other
  - aerospike.keyType : Primary key type hint for schema inference. Always set it properly if primary key type is not string  

See https://www.aerospike.com/docs/connect/processing/spark/reference.html for detailed description of the above properties
