# Batch processing with Spark and Cassandra

## Import Spark Libraries

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import StorageLevel
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import *

## Connect to Cassandra and HDFS

In [2]:
from cassandra.cluster import Cluster
cluster = Cluster()

ec2_host = "ec2-52-35-74-206.us-west-2.compute.amazonaws.com:9000/"
hdfs_dir = "camus/topics/smw_low_freq2/hourly/2016/01/21/00"

conf = SparkConf().setAppName("Smart Meter Watchdog")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("hdfs://" + ec2_host + hdfs_dir)

## Take some samples to make sure it works!

In [3]:
print df.take(3)

[Row(houseId=622, label=u'electric_heat', meterId=12, power=u'0.00', timestamp=u'1303100654', zip=u'61562'), Row(houseId=9063, label=u'lighting', meterId=23, power=u'67.50', timestamp=u'1303100651', zip=u'45688'), Row(houseId=6766, label=u'outlets_unknown', meterId=10, power=u'0.00', timestamp=u'1306006763', zip=u'83539')]


In [4]:
df.printSchema()

root
 |-- houseId: long (nullable = true)
 |-- label: string (nullable = true)
 |-- meterId: long (nullable = true)
 |-- power: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- zip: string (nullable = true)



In [5]:
import time
time.strftime("%D", time.localtime(int("1306006763")))

'05/21/11'

In [6]:
import time

def ts2date(curTime):
    return time.strftime("%D", time.localtime(int(curTime)))

ts2date('1306006763')

'05/21/11'

### Add one more column and convert timestamp to date

In [7]:
df_date = SQLContext.createDataFrame(sqlContext, df.map(lambda row: Row(**dict(row.asDict(), date=ts2date(row.timestamp)))))

In [8]:
df_date.take(1)

[Row(date=u'04/18/11', houseId=622, label=u'electric_heat', meterId=12, power=u'0.00', timestamp=u'1303100654', zip=u'61562')]

In [9]:
print type(df_date)
print type(df)

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>


In [10]:
df_house_power = df_date.select(df_date['houseId'], df_date['date'], df_date['zip'], df_date['power'])

In [11]:
df_house_power.take(1)

[Row(houseId=622, date=u'04/18/11', zip=u'61562', power=u'0.00')]

In [12]:
df_house_power_aggr = df_house_power.map(lambda x: ((x.houseId, x.date, x.zip), x.power)).reduceByKey(lambda x, y: float(x)+float(y))

In [13]:
df_house_power_aggr.take(1)

[((4024, u'04/16/11', u'22731'), 13820.779999999999)]

In [15]:
df_house_power_aggr.take(10)

[((4024, u'04/16/11', u'22731'), 13820.779999999999),
 ((4993, u'04/16/11', u'67905'), 12132.66),
 ((5524, u'04/18/11', u'04474'), 7736.469999999999),
 ((196, u'04/18/11', u'56397'), 4399.08),
 ((605, u'04/16/11', u'28649'), 13230.08),
 ((2452, u'05/21/11', u'46939'), u'3.00'),
 ((2149, u'04/17/11', u'74901'), u'15.00'),
 ((8386, u'05/21/11', u'06071'), u'108.00'),
 ((7250, u'04/18/11', u'26060'), u'1.00'),
 ((1561, u'05/21/11', u'12160'), 5.0)]

In [27]:
df_house_clean = df_house_power_aggr.map(lambda x: {
        "houseId": x[0][0],
        "date": x[0][1],
        "zip": x[0][2],
        "power": x[1]
    })

In [28]:
df_house_clean.take(1)

[{'date': u'04/16/11',
  'houseId': 4024,
  'power': 13820.779999999999,
  'zip': u'22731'}]

In [29]:
def aggToCassandraPart(agg):
    if agg:
        cascluster = Cluster(['52.89.47.199', '52.89.59.188', '52.88.228.95', '52.35.74.206'])
        casSession = cascluster.connect('playground')
        for aggItem in agg:
            casSession.execute('INSERT INTO power_aggr1 (houseId, date, zip, power) VALUES (%s, %s, %s, %s)', (aggItem[0], aggItem[1], aggItem[2], aggItem[3]))
        casSession.shutdown()
        cascluster.shutdown()

In [30]:
df_house_clean.foreachPartition(aggToCassandraPart)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 18.0 failed 1 times, most recent failure: Lost task 0.0 in stage 18.0 (TID 24, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 317, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 761, in func
    r = f(it)
  File "<ipython-input-29-b32f950cc959>", line 7, in aggToCassandraPart
KeyError: 0

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:300)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:88)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1850)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1921)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:909)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:908)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:207)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2355, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 317, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 761, in func
    r = f(it)
  File "<ipython-input-29-b32f950cc959>", line 7, in aggToCassandraPart
KeyError: 0

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:300)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:88)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	... 1 more


In [1]:
import pyspark_cassandra

In [18]:
df_house_clean.saveToCassandra("playground", "power_aggr1")

Py4JJavaError: An error occurred while calling o84.loadClass.
: java.lang.ClassNotFoundException: pyspark_cassandra.PythonHelper
	at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
	at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:207)
	at java.lang.Thread.run(Thread.java:745)


## Simple operation on the Cassandra database to make sure it works :-)

In [22]:
session = cluster.connect('playground')

result = session.execute("select * from email")
for x in result: print x

Row(id=u'austin@insightdataengineering.com', date=u'2015-09-02', time=datetime.datetime(2015, 9, 2, 9, 46), fname=u'Austin', lname=u'Ouyang', message=u'We have happy hours on every Friday!')
Row(id=u'david@insightdataengineering.com', date=u'2015-09-01', time=datetime.datetime(2015, 9, 1, 15, 33), fname=u'David', lname=u'Drummond', message=u'Bounce off project ideas off of us or fellows. Brainstorming is really helpful.')
Row(id=u'david@insightdataengineering.com', date=u'2015-09-01', time=datetime.datetime(2015, 9, 1, 10, 15), fname=u'David', lname=u'Drummond', message=u'Alumni is a great resource for you to get help from.')
Row(id=u'ronak@insightdataengineering.com', date=u'2015-09-01', time=datetime.datetime(2015, 9, 1, 12, 10), fname=u'Ronak', lname=u'Nathani', message=u'We are sure you will build great things at Insight!')
Row(id=u'ronak@insightdataengineering.com', date=u'2015-09-01', time=datetime.datetime(2015, 9, 1, 10, 3), fname=u'Ronak', lname=u'Nathani', message=u'Welcome t