# Setup

In [None]:
%%bash
# Fetch (or refresh) the project-tycho-utilities checkout, then run its
# `make all` with DBNAME pointing at ../tycho.db, i.e. a tycho.db file in
# the directory this cell is run from.
repo_dir=project-tycho-utilities
if [[ -d "$repo_dir" ]]; then
  # Abort rather than run git/make in the wrong directory.
  cd "$repo_dir" || exit 1
  # A failed pull (e.g. no network) is not fatal: the build proceeds with
  # the checkout that is already there.
  git pull || echo "warning: git pull failed; using the existing checkout" >&2
else
  # Without a checkout there is nothing to build, so a failed clone is fatal.
  git clone https://github.com/lgautier/project-tycho-utilities.git "$repo_dir" || exit 1
  cd "$repo_dir" || exit 1
fi
DBNAME=../tycho.db make all

<!-- label:spark_setup -->
Spark can be started from regular Python code.

In [None]:
# findspark has to be initialised before pyspark is imported: it locates the
# Spark installation and makes the pyspark package importable.
import findspark
findspark.init()

import pyspark

# Local master with two worker threads; the setters return the SparkConf
# itself, so the whole chain can be assigned directly.
conf = (pyspark.conf.SparkConf()
        .setMaster('local[2]')
        .setAppName('ipython-notebook')
        .set("spark.executor.memory", "2g"))

# getOrCreate() returns the already-running context when this cell is
# re-executed in a live kernel, instead of raising because a second
# SparkContext cannot be created in the same process. Note that `conf` is
# only applied when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

<!-- label:spark_sql_create -->

In [None]:
import os
import sqlite3

# Database file written by the setup cell (DBNAME=../tycho.db, relative to
# the project-tycho-utilities checkout).
dbfilename = "tycho.db"

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would only surface later as a "no such table" error.
# Fail here instead, with a message that points at the actual cause.
if not os.path.isfile(dbfilename):
    raise IOError("SQLite database %r not found; run the setup cell first."
                  % dbfilename)

# Connection and cursor shared by the cells that load tables into Spark.
dbcon = sqlite3.connect(dbfilename)
cursor = dbcon.cursor()

<!-- label:spark_dataframe -->

In [None]:
from pyspark.sql import SQLContext, Row

sqlcontext = SQLContext(sc)


def _sqlite_to_spark(query, table_name):
    """Run `query` on the SQLite cursor and expose its rows to Spark SQL.

    The column names reported by the cursor become the schema of the Spark
    DataFrame, which is registered as the temporary table `table_name`.
    Returns the DataFrame.
    """
    cursor.execute(query)
    column_names = tuple(column[0] for column in cursor.description)
    frame = sqlcontext.createDataFrame(cursor, column_names)
    frame.registerTempTable(table_name)
    return frame


# The whole `location` table.
location = _sqlite_to_spark("SELECT * FROM location", "location")

# Case counts restricted to the disease named 'PNEUMONIA'.
sql = """
SELECT * 
FROM (SELECT * FROM disease WHERE name='PNEUMONIA') AS disease
INNER JOIN casecount
ON disease.id=casecount.disease_id"""

casecount = _sqlite_to_spark(sql, "casecount")

- The data are streamed out of SQLite and onto the Spark cluster.
- SQL queries can then be run on the data in Spark.
- This is regular old SQL, which is:
    - translated by Spark,
    - optimized,
    - sent to the JVM as bytecode.

---


<!-- label:spark_query -->
SQL can be used to query the data.

In [None]:
# Number of (non-null) city entries per state in the `location` temporary
# table, largest count first.
sql = """
SELECT state, count(city) AS ct
FROM location
GROUP BY state
ORDER BY ct DESC
"""
# Only declares the query against the Spark SQL context; the rows are
# fetched in a later cell.
counts = sqlcontext.sql(sql)

<!-- label:spark_query_collect -->
The evaluation is only performed when the results are needed.

In [None]:
# collect() is what triggers the evaluation of the query and brings the
# resulting rows back into the Python session.
res = counts.collect()
# Display only the first three rows of the result.
res[:3]

<!-- label:spark_mapreduce -->

Spark is particularly comfortable with map-reduce tasks.
The input data can be our table (stored in an RDBMS).
Here we count how often each suffix (the last five characters of each word) occurs in city names:

In [None]:
def _city_suffix_pairs(rec):
    """Return one (suffix, 1) pair per word of the record's city name.

    The suffix is the lower-cased last five characters of the word (the
    whole word when it is shorter than that).
    """
    return [(word[-5:].lower(), 1) for word in rec.city.split()]


# Records without a city name are dropped before the map step.
named_cities = location.filter(location.city.isNotNull())
names = (named_cities
         .rdd
         .flatMap(_city_suffix_pairs)
         .reduceByKey(lambda left, right: left + right))
# Ten most frequent suffixes: negating the count sorts in descending order.
names.takeOrdered(10, key=lambda pair: -pair[1])

---

<!-- label:spark_sqlmapreduce -->

We can also seamlessly use the result table obtained from a SQL query
to perform map/reduce tasks:

In [None]:
# Distinct cities having at least one `casecount` row whose epiweek starts
# with '1912' (presumably the weeks of year 1912 -- confirm the format).
sql = """
SELECT city
FROM (SELECT * FROM casecount WHERE epiweek LIKE '1912%') AS sub
INNER JOIN location
ON location.id=sub.location_id
GROUP BY city
"""
y_1912 = sqlcontext.sql(sql)

# Same suffix count as for the full `location` table, spelled out one
# transformation at a time.
cities_1912 = y_1912.filter(y_1912.city.isNotNull()).rdd
suffixes = cities_1912.flatMap(
    lambda row: [word[-5:] for word in row.city.split()])
suffix_ones = suffixes.map(lambda suffix: (suffix.lower(), 1))
names = suffix_ones.reduceByKey(lambda total, extra: total + extra)
# Five most frequent suffixes (negated count => descending order).
names.takeOrdered(5, key=lambda item: -item[1])

---

<!-- label:spark_sqlmapreduceggplot_1_2 -->

# SQL query

In [None]:
## --- SQL ---
# Case counts joined with their location, restricted to epiweeks starting
# with '1912'.
sql = """
SELECT state, city, date_from, event, count AS ct
FROM (SELECT * FROM casecount WHERE epiweek LIKE '1912%') AS sub
INNER JOIN location
ON location.id=sub.location_id
"""

y_1912 = sqlcontext.sql(sql)


## --- Spark ---
def _state_month_event(rec):
    """Map a record to ((state, month, event), count).

    The month is the second '-'-separated field of `date_from`, converted
    to an integer (presumably dates look like YYYY-MM-DD -- confirm).
    """
    month = int(rec.date_from.split('-')[1])
    return ((rec.state, month, rec.event), rec.ct)


# Sum the counts sharing the same (state, month, event) key, then bring the
# result back into the Python session.
summed = y_1912.rdd.map(_state_month_event).reduceByKey(lambda x, y: x + y)
cases = summed.collect()