# Batch processing with Spark and Cassandra

## Import Spark Libraries

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import StorageLevel
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import *

## Connect to Cassandra and HDFS

In [2]:
from cassandra.cluster import Cluster
# Cassandra driver handle; no contact points are given, so the driver's
# defaults are used.
cluster = Cluster()

# HDFS authority (host:port plus trailing slash) and the directory to read.
ec2_host = "ec2-52-35-74-206.us-west-2.compute.amazonaws.com:9000/"
hdfs_dir = "camus/topics/smw_low_freq6/hourly/2016/01/23"

# Spark entry points for this application.
conf = SparkConf()
conf.setAppName("Smart Meter Watchdog")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Read the JSON data found at hdfs://<ec2_host><hdfs_dir> into a DataFrame.
df = sqlContext.read.json("".join(["hdfs://", ec2_host, hdfs_dir]))

In [8]:
# Re-read only the "/10" sub-path of the directory, replacing the earlier df.
df = sqlContext.read.json("".join(["hdfs://", ec2_host, hdfs_dir, "/10"]))

## Take some samples to make sure it works!

In [9]:
# Sanity check: print a single row of the loaded data.
print(df.take(1))

[Row(houseId=7155, label=u'disposal', meterId=11, power=u'0.00', timestamp=u'1303106503', zip=u'11710')]


In [10]:
# Sanity check: print how many rows were loaded.
print(df.count())

12523786


In [7]:
# Show the column names and types Spark inferred for the JSON input.
df.printSchema()

root
 |-- houseId: long (nullable = true)
 |-- label: string (nullable = true)
 |-- meterId: long (nullable = true)
 |-- power: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- zip: string (nullable = true)



In [None]:
import time
# Scratch check: format a sample epoch-seconds string as a %D date in local time.
time.strftime("%D", time.localtime(int("1306006763")))

In [None]:
import time

def ts2date(curTime):
    """Format an epoch-seconds value as a ``%D`` date string in local time.

    ``curTime`` is passed through ``int()``, so both integers and numeric
    strings such as ``'1306006763'`` are accepted.
    """
    epoch_seconds = int(curTime)
    local_struct = time.localtime(epoch_seconds)
    return time.strftime("%D", local_struct)

# Quick check of ts2date on a sample epoch-seconds string.
ts2date('1306006763')

### Add one more column and convert timestamp to date

In [None]:
def _add_date_column(row):
    """Return a new Row holding ``row``'s fields plus a ``date`` field.

    The ``date`` value is ``ts2date`` applied to the row's ``timestamp``.
    """
    fields = row.asDict()
    fields["date"] = ts2date(row.timestamp)
    return Row(**fields)


# Rebuild the DataFrame with the extra date column on every row.
df_date = sqlContext.createDataFrame(df.map(_add_date_column))

In [None]:
# Peek at one row to confirm the new date field is present.
df_date.take(1)

In [None]:
# Print the Python type of the derived DataFrame, then of the original one.
for frame in (df_date, df):
    print(type(frame))

In [None]:
# Keep only the columns needed for the per-house daily aggregation.
df_house_power = df_date.select(
    'houseId',
    'date',
    'zip',
    'power',
)

In [None]:
# Peek at one row of the projected columns.
df_house_power.take(1)

In [None]:
# Sum power per (houseId, date, zip) key.
#
# The schema stores ``power`` as a string, so it is converted to float in the
# map step rather than inside the reducer: reduceByKey does not invoke the
# reducer for a key that has only one record, which would otherwise leave
# that key's value as the raw string while all other keys hold floats.
df_house_power_aggr = (
    df_house_power
    .map(lambda x: ((x.houseId, x.date, x.zip), float(x.power)))
    .reduceByKey(lambda x, y: x + y)
)

In [None]:
# Peek at one aggregated ((houseId, date, zip), power) pair.
df_house_power_aggr.take(1)

In [None]:
# Look at a few more aggregated pairs.
df_house_power_aggr.take(10)

In [None]:
def _to_record(pair):
    """Flatten a ``((houseId, date, zip), power)`` pair into a dict."""
    key, total_power = pair
    house_id, day, zip_code = key[0], key[1], key[2]
    return {
        "houseId": house_id,
        "date": day,
        "zip": zip_code,
        "power": total_power,
    }


# One dict per aggregated key, with named fields instead of nested tuples.
df_house_clean = df_house_power_aggr.map(_to_record)

In [None]:
# Inspect the Python type of the mapped result.
type(df_house_clean)

In [None]:
# Count the aggregated records, one per (houseId, date, zip) key.
df_house_clean.count()

In [None]:
def aggToCassandraPart(agg):
    """Insert one partition of aggregated records into Cassandra.

    Meant to be passed to ``foreachPartition``. A cluster connection is
    opened only when the partition actually yields at least one record, and
    the session and cluster are always shut down again, even if an insert
    raises.

    Args:
        agg: Iterable (typically a partition iterator) of dict-like records
            with ``houseId``, ``date``, ``zip`` and ``power`` keys. ``None``
            and empty iterables are ignored.

    Returns:
        None.
    """
    from itertools import chain

    if not agg:
        return

    # An iterator is truthy even when it has nothing left to yield, so the
    # check above cannot detect an empty partition. Peek at the first record
    # before paying for a cluster connection.
    records = iter(agg)
    try:
        first = next(records)
    except StopIteration:
        return

    insert_cql = 'INSERT INTO power_aggr2 (houseId, date, zip, power) VALUES (%s, %s, %s, %s)'
    cascluster = Cluster(['52.89.47.199', '52.89.59.188', '52.88.228.95', '52.35.74.206'])
    try:
        casSession = cascluster.connect('playground')
        try:
            # Put the peeked record back in front of the remaining ones.
            for rec in chain([first], records):
                casSession.execute(
                    insert_cql,
                    (str(rec['houseId']), rec['date'], rec['zip'], str(rec['power'])),
                )
        finally:
            casSession.shutdown()
    finally:
        # Runs on failure too, so a bad insert does not leak connections.
        cascluster.shutdown()

In [None]:
# Write the aggregated records to Cassandra, one aggToCassandraPart call per partition.
df_house_clean.foreachPartition(aggToCassandraPart)

## Simple operation on the Cassandra database to make sure it works :-)

In [None]:
# Open a session on the 'playground' keyspace using the driver-side cluster handle.
session = cluster.connect('playground')

# Fetch every row of the email table and print the rows one by one.
result = session.execute("select * from email")
for x in result:
    print(x)