# Batch processing with Spark and Cassandra

## Import Spark Libraries

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import StorageLevel
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import *

## Connect to Cassandra and HDFS

In [2]:
from cassandra.cluster import Cluster
cluster = Cluster()

ec2_host = "ec2-52-35-74-206.us-west-2.compute.amazonaws.com:9000/"
hdfs_dir = "camus/topics/smw_low_freq2/hourly/2016/01/21/00"

conf = SparkConf().setAppName("Smart Meter Watchdog")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("hdfs://" + ec2_host + hdfs_dir)

## Take some samples to make sure it works!

In [3]:
print df.take(3)

[Row(houseId=622, label=u'electric_heat', meterId=12, power=u'0.00', timestamp=u'1303100654', zip=u'61562'), Row(houseId=9063, label=u'lighting', meterId=23, power=u'67.50', timestamp=u'1303100651', zip=u'45688'), Row(houseId=6766, label=u'outlets_unknown', meterId=10, power=u'0.00', timestamp=u'1306006763', zip=u'83539')]


In [4]:
df.printSchema()

root
 |-- houseId: long (nullable = true)
 |-- label: string (nullable = true)
 |-- meterId: long (nullable = true)
 |-- power: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- zip: string (nullable = true)



In [5]:
import time
time.strftime("%D", time.localtime(int("1306006763")))

'05/21/11'

In [6]:
import time

def ts2date(curTime):
    return time.strftime("%D", time.localtime(int(curTime)))

ts2date('1306006763')

'05/21/11'

### Add one more column and convert timestamp to date

In [7]:
df_date = SQLContext.createDataFrame(sqlContext, df.map(lambda row: Row(**dict(row.asDict(), date=ts2date(row.timestamp)))))

In [8]:
df_date.take(1)

[Row(date=u'04/18/11', houseId=622, label=u'electric_heat', meterId=12, power=u'0.00', timestamp=u'1303100654', zip=u'61562')]

In [9]:
print type(df_date)
print type(df)

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>


In [10]:
df_house_power = df_date.select(df_date['houseId'], df_date['date'], df_date['zip'], df_date['power'])

In [11]:
df_house_power.take(1)

[Row(houseId=622, date=u'04/18/11', zip=u'61562', power=u'0.00')]

In [12]:
df_house_power_aggr = df_house_power.map(lambda x: ((x.houseId, x.date, x.zip), x.power)).reduceByKey(lambda x, y: float(x)+float(y))

In [13]:
df_house_power_aggr.take(1)

[((4024, u'04/16/11', u'22731'), 13820.779999999999)]

In [14]:
df_house_power_aggr.take(10)

[((4024, u'04/16/11', u'22731'), 13820.779999999999),
 ((4993, u'04/16/11', u'67905'), 12132.66),
 ((5524, u'04/18/11', u'04474'), 7736.469999999999),
 ((196, u'04/18/11', u'56397'), 4399.08),
 ((605, u'04/16/11', u'28649'), 13230.08),
 ((2452, u'05/21/11', u'46939'), u'3.00'),
 ((2149, u'04/17/11', u'74901'), u'15.00'),
 ((8386, u'05/21/11', u'06071'), u'108.00'),
 ((7250, u'04/18/11', u'26060'), u'1.00'),
 ((1561, u'05/21/11', u'12160'), 5.0)]

In [15]:
df_house_clean = df_house_power_aggr.map(lambda x: {
        "houseId": x[0][0],
        "date": x[0][1],
        "zip": x[0][2],
        "power": x[1]
    })

In [16]:
type(df_house_clean)

pyspark.rdd.PipelinedRDD

In [17]:
df_house_clean.count()

19118

In [20]:
def aggToCassandraPart(agg):
    if agg:
        cascluster = Cluster(['52.89.47.199', '52.89.59.188', '52.88.228.95', '52.35.74.206'])
        casSession = cascluster.connect('playground')
        for rec in agg:
            casSession.execute('INSERT INTO power_aggr1 (houseId, date, zip, power) VALUES (%s, %s, %s, %s)', (str(rec['houseId']), rec['date'], rec['zip'], str(rec['power'])))
        casSession.shutdown()
        cascluster.shutdown()

In [None]:
def saveCassandra(rec):
#    cascluster = Cluster(['52.89.47.199', '52.89.59.188', '52.88.228.95', '52.35.74.206'])
    cascluster = Cluster()
    casSession = cascluster.connect('playground')
    casSession.execute('INSERT INTO power_aggr1 (houseId, date, zip, power) VALUES (%s, %s, %s, %s)', (str(rec['houseId']), rec['date'], rec['zip'], str(rec['power'])))
    casSession.shutdown()
    cascluster.shutdown()

In [None]:
for rec in df_house_clean.collect():
    saveCassandra(rec)

In [21]:
df_house_clean.foreachPartition(aggToCassandraPart)

In [None]:
import pyspark_cassandra

In [None]:
df_house_clean.saveToCassandra("playground", "power_aggr1")

## Simple operation on the Cassandra database to make sure it works :-)

In [None]:
session = cluster.connect('playground')

result = session.execute("select * from email")
for x in result: print x