# Python example
## Insert through JDBC
Variables to connect to MariaDB ColumnStore through JDBC are set and a SparkContext is initialized.

In [None]:
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext

# JDBC endpoint and connection properties that the JDBC write cell passes to
# DataFrameWriter.jdbc(); host name and credentials are example values.
url = 'jdbc:mysql://columnstore_host_nm:3306'
properties = {
    'user': 'jupiter_user',
    'password': 'jupiter_pass',
    'driver': 'org.mariadb.jdbc.Driver',
}

# Spark entry points shared by the remaining cells of this notebook.
sc = SparkContext(master="local", appName="MariaDB Spark ColumnStore example")
sqlContext = SQLContext(sc)

A sample DataFrame is created that shows numbers and their ASCII representation,

In [None]:
# Build a 128-row DataFrame: one row per number 0..127 together with chr(number).
sampleDF = sqlContext.createDataFrame(
    sc.parallelize(range(0, 128))
      .map(lambda i: Row(number=i, ASCII_representation=chr(i)))
)
sampleDF.printSchema()

# Expose the DataFrame to Spark SQL under the name "df".
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0.
sampleDF.createOrReplaceTempView("df")
sqlContext.sql("SELECT number, ASCII_representation from df WHERE number > 64 LIMIT 10").show()

 and is inserted into MariaDB ColumnStore through JDBC in database "test" table "pythonexample".

In [None]:
# Write sampleDF to test.pythonexample over JDBC, replacing any existing table.
# The create-table options make Spark create it as a ColumnStore table with
# explicit column types.
(
    sampleDF.write
    .mode("overwrite")
    .option("numPartitions", 1)
    .option("createTableOptions", "ENGINE=columnstore")
    .option("createTableColumnTypes", "number INT, ASCII_representation CHAR(1)")
    .jdbc(url, "test.pythonexample", properties=properties)
)

## Insert through native ColumnStore API (bulk insert)
Variables to connect to MariaDB ColumnStore through the native ColumnStore API are already set in /usr/local/mariadb/columnstore/etc/Columnstore.xml.

The necessary library is loaded, the ColumnStoreDriver is instantiated, and table "pythonexample" in database "test" is set for the bulk insert.

In [None]:
import pymcsapi

# Instantiate the driver without arguments; per the notebook text the
# connection settings come from the installed Columnstore.xml.
driver = pymcsapi.ColumnStoreDriver()
# Open a bulk insert into table "pythonexample" of database "test".
# NOTE(review): the two trailing zeros are presumably the mode / pm arguments
# of createBulkInsert -- confirm against the mcsapi documentation.
bulk = driver.createBulkInsert('test', 'pythonexample', 0, 0)

Data is inserted into ColumnStore.

In [None]:
# Write every row of sampleDF through the bulk insert opened above.
#
# Values are written in the Row's own field order instead of hard-coded
# positions. On Spark < 3.0, Row sorts keyword fields alphabetically
# (ASCII_representation, number); Spark 3.0+ keeps the order as entered
# (number, ASCII_representation). Hard-coded positions 0/1 therefore only
# match the table on older Spark versions.
# Assumes test.pythonexample was created from sampleDF by the JDBC cell above,
# so that the table's column order equals the DataFrame's column order.
try:
    for row in sampleDF.collect():
        for columnId, value in enumerate(row):
            bulk.setColumn(columnId, value)
        bulk.writeRow()

    bulk.commit()
except Exception:
    # Do not leave a half-finished bulk insert open; undo it and surface the error.
    bulk.rollback()
    raise

Finally, a summary of the insert process is shown.

In [None]:
# Print the statistics collected for the bulk insert, one line per metric.
summary = bulk.getSummary()
summary_report = (
    ("Execution time: %s", summary.getExecutionTime),
    ("Rows inserted: %s", summary.getRowsInsertedCount),
    ("Truncation count: %s", summary.getTruncationCount),
    ("Saturated count: %s", summary.getSaturatedCount),
    ("Invalid count: %s", summary.getInvalidCount),
)
for template, getter in summary_report:
    # Each getter is called right before its line is printed, as before.
    print(template % (getter(),))

## Insert through custom function and ColumnStore API (automatic type detection)
Requires an existing table with a corresponding schema.

In [None]:
import sys, pymcsapi, decimal, datetime

# True only on Python 2, where `long` and `unicode` exist as builtins.
_PYTHON2 = sys.version_info[0] < 3

# Column type codes for which a Decimal is written as a ColumnStoreDecimal:
# DATA_TYPE_DECIMAL (4), DATA_TYPE_UDECIMAL (18), DATA_TYPE_FLOAT (7),
# DATA_TYPE_UFLOAT (21), DATA_TYPE_DOUBLE (10), DATA_TYPE_UDOUBLE (23)
_FRACTIONAL_COLUMN_TYPES = frozenset((4, 18, 7, 21, 10, 23))


def _set_column_value(bulkInsert, dbTable, columnId, value):
    """Write one Python value into column *columnId* of the pending row.

    The conversion is chosen from the Python type of *value*; for Decimal
    values the type of the target column decides between a decimal and an
    integer write.
    """
    if value is None:
        # setColumn has no conversion for None; write an explicit NULL instead
        bulkInsert.setNull(columnId)

    # bool is tested before everything else so True/False become 1/0
    elif isinstance(value, bool):
        bulkInsert.setColumn(columnId, 1 if value else 0)

    # datetime.datetime is a subclass of datetime.date, so both end up here
    elif isinstance(value, datetime.date):
        bulkInsert.setColumn(columnId, value.strftime('%Y-%m-%d %H:%M:%S'))

    elif isinstance(value, decimal.Decimal):
        if dbTable.getColumn(columnId).getType() in _FRACTIONAL_COLUMN_TYPES:
            # fixed-point text form (no exponent) for the ColumnStoreDecimal constructor
            bulkInsert.setColumn(columnId, pymcsapi.ColumnStoreDecimal('{0:f}'.format(value)))
        else:
            # any other column type receives the integer part of the Decimal
            bulkInsert.setColumn(columnId, long(value) if _PYTHON2 else int(value))

    # handle python2 unicode strings
    elif _PYTHON2 and isinstance(value, unicode):
        bulkInsert.setColumn(columnId, value.encode('utf-8'))

    # any other datatype is inserted without parsing
    else:
        bulkInsert.setColumn(columnId, value)


def export(database, table, df):
    """Bulk-insert all rows of *df* into the ColumnStore table database.table.

    Args:
        database: name of the target database.
        table: name of the target table; it must already exist.
        df: object whose collect() returns the rows to insert (a Spark
            DataFrame in this notebook). Row values are matched to table
            columns by position; values beyond the table's column count
            are ignored.

    Any exception raised while writing or committing triggers a rollback and
    is printed rather than re-raised. A summary of the bulk insert is printed
    in every case. Returns None.
    """
    rows = df.collect()
    driver = pymcsapi.ColumnStoreDriver()
    bulkInsert = driver.createBulkInsert(database, table, 0, 0)

    # get the column count of table
    dbTable = driver.getSystemCatalog().getTable(database, table)
    dbTableColumnCount = dbTable.getColumnCount()

    # Track the value being converted so the error report never touches an
    # unbound name (e.g. commit failing for an empty DataFrame) and never
    # blames a value that was already written successfully.
    value = None
    converting_value = False

    # insert row by row into table
    try:
        for row in rows:
            for columnId in range(min(len(row), dbTableColumnCount)):
                value = row[columnId]
                converting_value = True
                _set_column_value(bulkInsert, dbTable, columnId, value)
                converting_value = False
            bulkInsert.writeRow()
        bulkInsert.commit()
    except Exception as e:
        bulkInsert.rollback()
        if converting_value:
            print(value, type(value))
        print(type(e))
        print(e)

    #print a short summary of the insertion process
    summary = bulkInsert.getSummary()
    print("Execution time: %s" % (summary.getExecutionTime(),))
    print("Rows inserted: %s" % (summary.getRowsInsertedCount(),))
    print("Truncation count: %s" %(summary.getTruncationCount(),))
    print("Saturated count: %s" %(summary.getSaturatedCount(),))
    print("Invalid count: %s" %(summary.getInvalidCount(),))

In [None]:
# Export the sample DataFrame into test.pythonexample via the function above.
export("test", "pythonexample", sampleDF)