# JDBC API Write Benchmark

Import the required packages and set up the connection properties

In [None]:
import com.mariadb.columnstore.api.connector.ColumnStoreExporter
import java.util.Properties
import org.apache.spark.sql.{SQLContext,DataFrame}
import org.apache.spark.sql.functions.{rand, randn, sha1, sha2, md5}
import java.sql.{DriverManager,Connection,Date,Timestamp,PreparedStatement,ResultSet,SQLException}

// JDBC endpoint of the database server; no default database is selected in the URL.
val url = "jdbc:mysql://columnstore_host_nm:3306"

// Connection settings shared by the DriverManager calls and Spark's JDBC writer.
// NOTE(review): credentials are hard-coded here; consider sourcing them from the
// environment before sharing this notebook.
// setProperty is used instead of the inherited Hashtable.put, which the JDK
// documentation discourages because it allows non-String keys and values.
val connectionProperties = new Properties()
connectionProperties.setProperty("user", "jupiter_user")
connectionProperties.setProperty("password", "jupiter_pass")
connectionProperties.setProperty("driver", "org.mariadb.jdbc.Driver")

// SQLContext built on the notebook-provided SparkContext `sc`; its implicits
// enable the `.toDF(...)` conversions used when building the sample data.
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

// Sample dataframe size parameters (number of rows generated per dataframe):
val asciiRange = 128
val randRange = 1000
val hashRange = 1000

Prepare the database

In [None]:
// Recreate the `benchmark` database so that every run starts from an empty schema.
// Failures are reported with printStackTrace and do not abort the notebook
// (same best-effort behaviour as before).
var connection: Connection = null
try {
  connection = DriverManager.getConnection(url, connectionProperties)
  val statement = connection.createStatement
  try {
    // DDL produces no result set; executeUpdate is the JDBC call intended for it
    // (executeQuery is specified for statements that return a ResultSet).
    statement.executeUpdate("""DROP DATABASE IF EXISTS benchmark""")
    statement.executeUpdate("""CREATE DATABASE IF NOT EXISTS benchmark""")
  } finally {
    statement.close()
  }
} catch {
  case e: Exception => e.printStackTrace()
} finally {
  // getConnection may have thrown, leaving `connection` null. Closing it
  // unconditionally would raise a NullPointerException that hides the real error.
  if (connection != null) connection.close()
}

Create the sample dataframes to insert

In [None]:
// Each sample dataframe is cached and then counted, which forces Spark to
// materialise it here — presumably so the later benchmark measures the write
// path rather than data generation.

// ASCII sample: one row per code in [0, asciiRange) as (one-character string, code).
val asciiDF = sc.makeRDD(0 until asciiRange).map(i => (i.toChar.toString, i)).toDF("ascii_representation", "number").cache()
asciiDF.count()
asciiDF.printSchema()

// Random sample: ids in [0, randRange) plus a uniform and a normal random column
// generated with fixed seeds.
val randDF = sqlContext.range(0, randRange).withColumn("uniform", rand(seed=23)).withColumn("normal", randn(seed=42)).cache()
randDF.count()
randDF.printSchema()

// Hash sample: SHA-1, SHA-256, SHA-512 and MD5 of the decimal string of each
// number in [0, hashRange), computed through Spark SQL on a temporary view.
val tmpDF = sc.makeRDD(0 until hashRange).map(i => (i, i.toString)).toDF("number", "string")
// createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
tmpDF.createOrReplaceTempView("tempDF")
val hashDF = sqlContext.sql("SELECT number, sha1(string) AS sha1, sha2(string,256) AS sha256, sha2(string,512) AS sha512, md5(string) AS md5 FROM tempDF").cache()
hashDF.count()
hashDF.printSchema()

Benchmark the insertion

In [None]:
/**
 * Creates a ColumnStore-engine table in the `benchmark` database if it does not
 * exist yet.
 *
 * Errors are printed with `printStackTrace` and not rethrown, so the caller
 * continues even when the table could not be created.
 *
 * @param name   table name; concatenated into the DDL unescaped, so pass trusted
 *               identifiers only
 * @param schema comma-separated column definitions placed between the parentheses
 *               of the CREATE TABLE statement
 */
def createColumnStoreAPITable(name: String, schema: String) : Unit = {
  var connection: Connection = null
  try {
      // NOTE(review): host and port are duplicated from the notebook-level `url`;
      // keep the two in sync when changing the target server.
      connection = DriverManager.getConnection("jdbc:mysql://columnstore_host_nm:3306/benchmark", connectionProperties)
      val statement = connection.createStatement
      try {
          // DDL produces no result set, so executeUpdate is the appropriate JDBC call.
          statement.executeUpdate("CREATE TABLE IF NOT EXISTS " + name + " (" + schema + ") engine=columnstore")
      } finally {
          statement.close()
      }
  } catch {
      case e: Exception => e.printStackTrace()
  } finally {
      // getConnection may have failed and left `connection` null; guard the close
      // so a NullPointerException does not escape from this method.
      if (connection != null) connection.close()
  }
}

/**
 * Writes `dataframe` into the `benchmark` database through three paths and
 * measures the wall-clock time of each:
 *
 *  1. Spark JDBC writer into an InnoDB table `jdbc_innodb_<name>`
 *  2. Spark JDBC writer (numPartitions = 1) into a ColumnStore table
 *     `jdbc_columnstore_<name>`
 *  3. table creation plus `ColumnStoreExporter.export` into
 *     `api_columnstore_<name>`
 *
 * @param name      suffix appended to the three target table names
 * @param dataframe data to write
 * @param schema    column definitions, used both as `createTableColumnTypes`
 *                  and for the explicitly created API table
 * @return elapsed nanoseconds as (jdbc_innodb, jdbc_columnstore, api_columnstore)
 */
def benchmark(name: String, dataframe: DataFrame, schema: String) = {
    // Runs `action` once and yields the elapsed System.nanoTime difference.
    def elapsedNanos(action: => Unit): Long = {
        val startedAt = System.nanoTime()
        action
        System.nanoTime() - startedAt
    }

    val innodbNanos = elapsedNanos {
        val innodbWriter = dataframe.write.option("createTableOptions", "ENGINE=innodb")
        innodbWriter.option("createTableColumnTypes", schema).jdbc(url, "benchmark.jdbc_innodb_"+name, connectionProperties)
    }

    val columnstoreNanos = elapsedNanos {
        val columnstoreWriter = dataframe.write.option("numPartitions", 1).option("createTableOptions", "ENGINE=columnstore")
        columnstoreWriter.option("createTableColumnTypes", schema).jdbc(url, "benchmark.jdbc_columnstore_"+name, connectionProperties)
    }

    // The API measurement includes creating the target table, as the exporter
    // is handed an already existing table.
    val apiNanos = elapsedNanos {
        val apiTable = "api_columnstore_"+name
        createColumnStoreAPITable(apiTable, schema)
        ColumnStoreExporter.export("benchmark", apiTable, dataframe)
    }

    (innodbNanos, columnstoreNanos, apiNanos)
}

// Run the three-way write benchmark once per sample dataframe. Each result is a
// tuple of elapsed nanoseconds: (jdbc_innodb, jdbc_columnstore, api_columnstore).
val ascii_benchmark = benchmark(
  name = "ascii",
  dataframe = asciiDF,
  schema = "ascii_representation CHAR(1), number INT"
)
val rand_benchmark = benchmark(
  name = "rand",
  dataframe = randDF,
  schema = "id BIGINT, uniform DOUBLE, normal DOUBLE"
)
val hash_benchmark = benchmark(
  name = "hash",
  dataframe = hashDF,
  schema = "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)"
)

Show the comparison in numbers

In [None]:
// Prints one result line: the three timings converted from nanoseconds to
// seconds, followed by the row count and the item count (rows * columns).
// The row count is computed once per dataframe instead of twice, and the
// formatting lives in one place instead of three copy-pasted expressions.
def printBenchmarkRow(times: (Long, Long, Long), df: DataFrame): Unit = {
  val nanosPerSecond = 1000000000.toDouble
  val (innodbNanos, columnstoreNanos, apiNanos) = times
  val rows = df.count
  val items = rows * df.columns.size
  println(s"${innodbNanos / nanosPerSecond}s\t${columnstoreNanos / nanosPerSecond}s\t\t${apiNanos / nanosPerSecond}s\t\t${rows}\t\t${items}")
}

println("jdbc_innodb\tjdbc_columnstore\tapi_columnstore\t\trows\t\titems")
printBenchmarkRow(ascii_benchmark, asciiDF)
printBenchmarkRow(rand_benchmark, randDF)
printBenchmarkRow(hash_benchmark, hashDF)