# JDBC API Write Benchmark

Import the required packages and set up the connections

In [None]:
import columnStoreExporter, time
import mysql.connector as mariadb
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.functions import rand, randn, sha1, sha2, md5

# JDBC endpoint and connection properties handed to the Spark JDBC writer.
url = 'jdbc:mysql://columnstore_host_nm:3306'
properties = {
    'user': 'jupiter_user',
    'password': 'jupiter_pass',
    'driver': 'org.mariadb.jdbc.Driver',
}

# One local Spark context and the SQL entry point built on top of it.
sc = SparkContext("local", "MariaDB Spark ColumnStore Benchmark")
sqlContext = SQLContext(sc)

# Size parameters for the sample dataframes (upper bounds of the generated ranges).
asciiRange = 128
randRange = 1000
hashRange = 1000

Prepare the database

In [None]:
# Recreate an empty "benchmark" database so tables left over from an earlier
# run cannot influence this one.
#
# Both handles start as None: if connect() or cursor() raises, the cleanup in
# the finally clause must not touch an unbound name (or, when the cell is
# re-run, a stale handle left behind by a previous execution).
conn = None
cursor = None
try:
    conn = mariadb.connect(user='jupiter_user', password='jupiter_pass', host='columnstore_host_nm')
    cursor = conn.cursor()
    cursor.execute("DROP DATABASE IF EXISTS benchmark")
    cursor.execute("CREATE DATABASE IF NOT EXISTS benchmark")

except mariadb.Error as err:
    # Best effort: report the failure and let the notebook continue.
    print("Error while preparing the database for the benchmark. %s" %(err,))

finally:
    # Release only what was actually acquired.
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

Create the sample dataframes to insert

In [None]:
# ASCII dataframe: one row per integer in [0, asciiRange) together with chr() of it.
asciiDF = sqlContext.createDataFrame(
    sc.parallelize(range(0, asciiRange))
      .map(lambda code: Row(number=code, ascii_representation=chr(code)))
).cache()
# NOTE(review): count() is presumably called to materialise the cached
# dataframe before the timed writes start -- confirm.
asciiDF.count()
asciiDF.printSchema()

# Random dataframe: an id column plus one uniform and one normal random column,
# both generated with fixed seeds.
randDF = (
    sqlContext.range(0, randRange)
    .withColumn('uniform', rand(seed=23))
    .withColumn('normal', randn(seed=42))
    .cache()
)
randDF.count()
randDF.printSchema()

# Hash dataframe: each integer in [0, hashRange) is turned into a string and
# hashed with SHA-1, SHA-256, SHA-512 and MD5.
tmpDF = sqlContext.createDataFrame(
    sc.parallelize(range(0, hashRange))
      .map(lambda value: Row(number=value, string=str(value)))
)
hashDF = tmpDF.select(
    tmpDF.number,
    sha1(tmpDF.string).alias("sha1"),
    sha2(tmpDF.string, 256).alias("sha256"),
    sha2(tmpDF.string, 512).alias("sha512"),
    md5(tmpDF.string).alias("md5"),
).cache()
hashDF.count()
hashDF.printSchema()

Benchmark the insertion

In [None]:
def createColumnStoreAPITable(name, schema):
    """Create the ColumnStore table ``name`` in the ``benchmark`` database.

    Args:
        name: Table name. It is interpolated verbatim into the SQL statement,
            so it must come from trusted code, never from user input.
        schema: Comma-separated column definitions, e.g.
            ``"id BIGINT, uniform DOUBLE"``. Also interpolated verbatim.

    Returns:
        None. A ``mariadb.Error`` raised while connecting or executing the
        statement is printed, not propagated.
    """
    # Pre-bind both handles so the cleanup below never hits an unbound local
    # when connect() or cursor() raises before the assignment happens.
    conn = None
    cursor = None
    try:
        conn = mariadb.connect(user='jupiter_user', password='jupiter_pass', host='columnstore_host_nm', database='benchmark')
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS %s (%s) engine=columnstore" %(name,schema))

    except mariadb.Error as err:
        print("Error while creating the columnstore database %s for the benchmark. %s" %(name,err,))

    finally:
        # Release only what was actually acquired.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

def benchmark(name, dataframe, schema):
    """Time three ways of writing ``dataframe`` into the ``benchmark`` database.

    The steps run in this order, each measured with wall-clock ``time.time()``:

    1. Spark JDBC write into the InnoDB table ``jdbc_innodb_<name>``.
    2. Spark JDBC write into the ColumnStore table ``jdbc_columnstore_<name>``,
       with the ``numPartitions`` option set to 1.
    3. Creation of the table ``api_columnstore_<name>`` followed by
       ``columnStoreExporter.export`` of the dataframe into it.

    Args:
        name: Suffix appended to the three target table names.
        dataframe: Spark dataframe to write.
        schema: Column definitions, used both as the JDBC
            ``createTableColumnTypes`` option and for the API table creation.

    Returns:
        Tuple ``(jdbc_innodb_time, jdbc_columnstore_time, api_columnstore_time)``
        with the elapsed seconds of each step.
    """
    def elapsed(action):
        # Run ``action`` and return how many seconds it took.
        started = time.time()
        action()
        return time.time() - started

    def write_jdbc_innodb():
        writer = dataframe.write.option("createTableOptions", "ENGINE=innodb")
        writer = writer.option("createTableColumnTypes", schema)
        writer.jdbc(url, "benchmark.jdbc_innodb_%s" % (name,), properties=properties)

    def write_jdbc_columnstore():
        writer = dataframe.write.option("numPartitions", 1)
        writer = writer.option("createTableOptions", "ENGINE=columnstore")
        writer = writer.option("createTableColumnTypes", schema)
        writer.jdbc(url, "benchmark.jdbc_columnstore_%s" % (name,), properties=properties)

    def write_api_columnstore():
        # Table creation is part of the measured time for this step.
        table = "api_columnstore_%s" % (name,)
        createColumnStoreAPITable(table, schema)
        columnStoreExporter.export("benchmark", table, dataframe)

    # Tuple elements are evaluated left to right, so the steps keep their order.
    return (
        elapsed(write_jdbc_innodb),
        elapsed(write_jdbc_columnstore),
        elapsed(write_api_columnstore),
    )

# Run the benchmark once per sample dataframe; each result is the tuple
# (jdbc_innodb_time, jdbc_columnstore_time, api_columnstore_time).
ascii_benchmark = benchmark(
    "ascii",
    asciiDF,
    "ascii_representation CHAR(1), number INT",
)
rand_benchmark = benchmark(
    "rand",
    randDF,
    "id BIGINT, uniform DOUBLE, normal DOUBLE",
)
hash_benchmark = benchmark(
    "hash",
    hashDF,
    "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), "
    "sha512 VARCHAR(128), md5 VARCHAR(32)",
)

Show the comparison in numbers

In [None]:
# Result table: a header line, then one line per sample dataframe with the
# three measured times, the row count and the item count (rows * columns).
print("jdbc_innodb\tjdbc_columnstore\tapi_columnstore\t\trows\t\titems")
row_format = "%.3fs\t\t%.3fs\t\t\t%.3fs\t\t\t%i\t\t%i"
for timings, frame in (
    (ascii_benchmark, asciiDF),
    (rand_benchmark, randDF),
    (hash_benchmark, hashDF),
):
    print(row_format % (timings[0], timings[1], timings[2], frame.count(), frame.count() * len(frame.columns)))