In [1]:
import glob
import os
import sys

# Make the PySpark sources and the bundled py4j archive importable, using the
# installation pointed to by the SPARK_HOME environment variable.
SPARK_HOME = os.environ['SPARK_HOME']
sys.path.append(SPARK_HOME + "/python")
sys.path.append(glob.glob(SPARK_HOME + "/python/lib/py4j*.zip")[0])

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Host the Spark Cassandra connector is configured to contact.
cassandra_host = "localhost"

# Assemble the Spark configuration one setting at a time.
spark_conf = SparkConf()
spark_conf.setAppName("BatchJob - Data loader")
spark_conf.setIfMissing("spark.master", "local")
spark_conf.set("spark.cassandra.connection.host", cassandra_host)
spark_conf.set("spark.cassandra.connection.port", 9042)
spark_conf.set("spark.sql.shuffle.partitions", 10)

# Create (or reuse) the Spark session and expose the usual shortcuts.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sql = spark.sql
print(sc.uiWebUrl)

http://10.0.2.15:4040


In [12]:
from pyspark.sql import Row
from pyspark.sql import functions as F

In [3]:
# Build a one-element RDD holding a single Row with fields a=1 and b=4.
rdd = sc.parallelize([Row(a = 1, b = 4)])

In [6]:
# Turn the RDD of Rows into a DataFrame and print its contents.
df = spark.createDataFrame(rdd)
df.show()

+---+---+
|  a|  b|
+---+---+
|  1|  4|
+---+---+



In [7]:
# Collect the DataFrame's rows to the driver and keep the first one.
r = df.rdd.collect()[0]

In [8]:
# Row fields can be read as attributes; display the value of field `a`.
r.a

1

In [22]:
# Location of the generated JSON input files. Defaults to the original local
# path; set the CC_DATA_BASE_PATH environment variable to point elsewhere
# without editing the notebook. The value must end with "/" because file names
# are appended to it directly.
base_path = os.environ.get(
    "CC_DATA_BASE_PATH",
    "file:///home/cloudera/notebooks/RandomDataGenerator-master/target/",
)

In [23]:
# Load the generated customers. No `inferSchema` option is passed: it is not a
# JSON reader option (the JSON reader infers the schema by default), and the
# misspelled `inferSchena` key used before was silently ignored.
customers = spark.read.json(base_path + "customers.json")
customers.show()
customers.printSchema()

# Write every column except `address` to the Cassandra table cc.customer,
# using save mode "overwrite".
(customers
    .drop("address")
    .write
    .mode("overwrite")
    .format("org.apache.spark.sql.cassandra")
    .options(table = "customer", keyspace = "cc")
    .save())

+--------------------+---+----------+--------------------+----------+------+------------+--------------+
|             address|age|       dob|               email|first_name|gender|          id|     last_name|
+--------------------+---+----------+--------------------+----------+------+------------+--------------+
|[Brooklyn,Kings,L...| 19|1999-09-09| iforell@hotmail.com|    Ivania|     M|800000000000|        Forell|
|[Bronx,Bronx,LF00...| 63|1955-06-23|jodena.tetreau@ms...|    Jodena|     F|800000000001|       Tetreau|
|[Merrick,Nassau,L...| 73|1945-02-05|     sshoyko@msn.com|   Shawnic|     F|800000000002|        Shoyko|
|[New York,New Yor...| 67|1950-11-19|jerzei.berardo@gm...|    Jerzei|     F|800000000003|       Berardo|
|[Saratoga Springs...| 28|1990-06-05|einzinger@hotmail...|   Elinora|     M|800000000004|      Inzinger|
|[Rome,Oneida,LF00...| 34|1984-09-28|bschillinglaw@msn...|   Braylyn|     F|800000000005|  Schillinglaw|
|[New York,New Yor...| 61|1957-01-24| hsherwood@gmail.c

In [2]:
def cass_table(table_name, keyspace = "cc"):
    """Return a DataFrame that reads a Cassandra table through the Spark connector.

    Parameters
    ----------
    table_name : str
        Name of the Cassandra table to load.
    keyspace : str, optional
        Keyspace that contains the table. Defaults to "cc", the keyspace used
        throughout this notebook, so existing single-argument calls behave
        exactly as before.

    Returns
    -------
    The result of the reader's ``load()`` call for the requested table.
    """
    return (spark
        .read
        .format("org.apache.spark.sql.cassandra")
        .options(table = table_name, keyspace = keyspace)
        .load())

In [3]:
# Read the customer table from Cassandra through the helper and print its rows.
cass_table("customer").show()

+------------+-------+---+----------------------+----------------------+----------+--------------------+----------+------+-------------+
|          id|address|age|amount_lower_threshold|amount_upper_threshold|       dob|               email|first_name|gender|    last_name|
+------------+-------+---+----------------------+----------------------+----------+--------------------+----------+------+-------------+
|800000002686|   null| 79|    1873.6919950506053|     5750.826738407709|1939-03-15|clea.eckenbrecht@...|      Clea|     M|  Eckenbrecht|
|800000009014|   null| 22|    2036.0152859916593|     5845.483106360582|1996-03-29|deondrick.manopin...| Deondrick|     F|  Manopinives|
|800000003508|   null| 33|    1970.2678858109768|     6031.421696497392|1985-04-26|jchristophides@ms...|  Jazabell|     F|Christophides|
|800000009097|   null| 74|    1860.8318397319022|     5882.868594280499|1944-05-26|kiptyn.chato@hotm...|    Kiptyn|     F|        Chato|
|800000003132|   null| 76|    2023.022367

In [30]:
# Load the generated merchants. No `inferSchema` option is passed: it is not a
# JSON reader option (the JSON reader infers the schema by default), and the
# misspelled `inferSchena` key used before was silently ignored.
merchants = spark.read.json(base_path + "merchants.json")
merchants.show()
merchants.printSchema()

# Write the merchants to the Cassandra table cc.merchant, using save mode
# "overwrite".
(merchants
    .write
    .mode("overwrite")
    .format("org.apache.spark.sql.cassandra")
    .options(table = "merchant", keyspace = "cc")
    .save())

+------------+--------------------+
|          id|                name|
+------------+--------------------+
|DZ0000000000|iShares 7-10 Year...|
|DZ0000000001|National American...|
|DZ0000000002|Jensyn Acquistion...|
|DZ0000000003|     Interface, Inc.|
|DZ0000000004| FTD Companies, Inc.|
|DZ0000000005|NextDecade Corpor...|
|DZ0000000006|  MakeMyTrip Limited|
|DZ0000000007|Dynavax Technolog...|
|DZ0000000008|        HyreCar Inc.|
|DZ0000000009|Highland/iBoxx Se...|
|DZ0000000010|  Liberty Global plc|
|DZ0000000011|Green Plains Part...|
|DZ0000000012|  Vertex Energy, Inc|
|DZ0000000013|     Fuel Tech, Inc.|
|DZ0000000014|    TiVo Corporation|
|DZ0000000015|        Cerecor Inc.|
|DZ0000000016|    SMTC Corporation|
|DZ0000000017|Magellan Health, ...|
|DZ0000000018|      Check-Cap Ltd.|
|DZ0000000019|Village Bank and ...|
+------------+--------------------+
only showing top 20 rows

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [26]:
# Read the merchant table back from Cassandra via the shared helper and print it.
cass_table("merchant").show()

+------------+-------+--------------------+------+
|          id|address|                name|ticker|
+------------+-------+--------------------+------+
|DZ0000000006|   null|  MakeMyTrip Limited|  null|
|DZ0000000088|   null|Energy Recovery, ...|  null|
|DZ0000000032|   null|Brooks Automation...|  null|
|DZ0000000087|   null|Pluristem Therape...|  null|
|DZ0000000035|   null|    The Organics ETF|  null|
|DZ0000000092|   null|ETF Series Soluti...|  null|
|DZ0000000017|   null|Magellan Health, ...|  null|
|DZ0000000053|   null|  Zion Oil & Gas Inc|  null|
|DZ0000000073|   null|  Blink Charging Co.|  null|
|DZ0000000099|   null|Primo Water Corpo...|  null|
|DZ0000000049|   null|Constellation Alp...|  null|
|DZ0000000056|   null|AgroFresh Solutio...|  null|
|DZ0000000064|   null|WisdomTree Middle...|  null|
|DZ0000000002|   null|Jensyn Acquistion...|  null|
|DZ0000000098|   null|RCI Hospitality H...|  null|
|DZ0000000063|   null|  Spark Energy, Inc.|  null|
|DZ0000000050|   null|Coffee Ho

In [53]:
# Load the generated transactions. No `inferSchema` option is passed: it is not
# a JSON reader option (the JSON reader infers the schema by default), and the
# misspelled `inferSchena` key used before was silently ignored.
# The raw `timestamp` value is divided by 10^9, truncated to a bigint and
# formatted with from_unixtime, replacing the original column.
transactions = (spark
                .read
                .json(base_path + "transactions.json")
                .withColumn("timestamp", F.expr("from_unixtime(cast(timestamp/pow(10, 9) as bigint))"))
               )
transactions.show()
transactions.printSchema()

+------------------+--------+------------+------------+------------+-------------------+
|            amount|category| customer_id|          id| merchant_id|          timestamp|
+------------------+--------+------------+------------+------------+-------------------+
| 802.7103986581193|     atm|800000002081|690000000000|DZ0000000085|2018-08-31 16:45:00|
|507.74628999676526|     web|800000005031|690000000001|DZ0000000058|2018-07-16 01:53:31|
| 606.4194934089878|     pos|800000003872|690000000002|DZ0000000063|2018-05-17 14:57:21|
| 319.4347823556573|     web|800000004311|690000000003|DZ0000000065|2018-08-07 19:28:49|
| 6.499356159099801|  mobile|800000001603|690000000004|DZ0000000061|2018-07-05 13:25:49|
|1202.5906039488204|     web|800000008644|690000000005|DZ0000000045|2018-09-24 08:59:37|
| 334.1742337330727|  mobile|800000009101|690000000006|DZ0000000023|2018-05-27 18:44:58|
|1276.2935468125615|     atm|800000003658|690000000007|DZ0000000003|2018-08-18 22:33:10|
|1014.0822024764368| 

In [54]:
# Store the parsed transactions in Cassandra: keyspace "cc", table "transactions",
# save mode "overwrite".
(transactions.write
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "cc")
    .option("table", "transactions")
    .mode("overwrite")
    .save())

In [56]:
# Re-bind `transactions` to the Cassandra-backed table (read through the shared
# helper) and preview the first ten rows as a pandas DataFrame.
transactions = cass_table("transactions")
transactions.limit(10).toPandas()

Unnamed: 0,customer_id,timestamp,id,amount,category,flag_ml,flag_threshold,location,location_id,merchant_id,overruled,overruled_comment,overruled_date,score
0,800000001153,2018-09-30 12:15:45,690000427644,3425.031316,web,,,,,DZ0000000064,,,,
1,800000001153,2018-09-29 02:19:26,690000687444,4336.292096,pos,,,,,DZ0000000005,,,,
2,800000001153,2018-09-28 19:20:54,690000408436,3677.334347,mobile,,,,,DZ0000000062,,,,
3,800000001153,2018-09-26 10:59:57,690000260053,2112.259579,web,,,,,DZ0000000032,,,,
4,800000001153,2018-09-26 10:05:18,690000816774,5649.595428,mobile,,,,,DZ0000000009,,,,
5,800000001153,2018-09-24 02:56:26,690000323967,3665.461645,mobile,,,,,DZ0000000049,,,,
6,800000001153,2018-09-23 07:23:32,690000291467,2979.588375,web,,,,,DZ0000000065,,,,
7,800000001153,2018-09-20 03:06:53,690000151729,1464.626877,web,,,,,DZ0000000068,,,,
8,800000001153,2018-09-17 20:26:13,690000192725,1434.020856,atm,,,,,DZ0000000020,,,,
9,800000001153,2018-09-17 13:55:24,690000527597,3245.769799,atm,,,,,DZ0000000038,,,,


In [62]:
# Per-customer spending statistics: mean and standard deviation of `amount`,
# plus thresholds one standard deviation above and below the mean.
agg = (
    transactions
    .groupBy("customer_id")
    .agg(
        F.avg("amount").alias("amount_avg"),
        F.stddev("amount").alias("amount_std"),
    )
    .withColumn("amount_upper_threshold", F.col("amount_avg") + F.col("amount_std"))
    .withColumn("amount_lower_threshold", F.col("amount_avg") - F.col("amount_std"))
)

agg.show()

+------------+------------------+------------------+----------------------+----------------------+
| customer_id|        amount_avg|        amount_std|amount_upper_threshold|amount_lower_threshold|
+------------+------------------+------------------+----------------------+----------------------+
|800000001787| 3983.031096243898| 2060.086462252426|     6043.117558496324|     1922.944633991472|
|800000001931|3974.1879014627393| 1946.648429706895|     5920.836331169634|    2027.5394717558443|
|800000004812| 4423.359909963747|1988.9363026886174|     6412.296212652364|     2434.423607275129|
|800000005529| 4159.854916014828| 1900.305779842582|      6060.16069585741|    2259.5491361722457|
|800000002590|4004.3542519619828|1819.3895657008268|     5823.743817662809|     2184.964686261156|
|800000009699| 3977.281326918583|1642.0694171450184|     5619.350744063601|     2335.211909773565|
|800000007241|4046.4066313541302|2207.9199344858944|     6254.326565840025|    1838.4866968682359|
|800000000

In [64]:
# Append the per-customer thresholds to cc.customer, exposing `customer_id`
# under the column name `id` used by that table.
(agg
    .select(
        F.col("customer_id").alias("id"),
        "amount_upper_threshold",
        "amount_lower_threshold")
    .write
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "cc")
    .option("table", "customer")
    .mode("append")
    .save())

In [69]:
# Print the customer table again to inspect the appended threshold columns.
cass_table("customer").show()

+------------+-------+---+----------------------+----------------------+----------+--------------------+----------+------+--------------+
|          id|address|age|amount_lower_threshold|amount_upper_threshold|       dob|               email|first_name|gender|     last_name|
+------------+-------+---+----------------------+----------------------+----------+--------------------+----------+------+--------------+
|800000001153|   null| 47|    2351.3492719376018|     5921.054209719426|1970-10-23|alexxis.wolligand...|   Alexxis|     M|    Wolligandt|
|800000005616|   null| 31|    2068.9989358302373|     5829.592572147718|1987-08-18|     dkindem@msn.com|    Dijion|     F|        Kindem|
|800000009943|   null| 73|    2149.6996574971977|     6070.356958454686|1945-07-10|jocie.briatte@hot...|     Jocie|     M|       Briatte|
|800000008957|   null| 31|    1857.9226551039055|     5960.910822192255|1987-07-11|  qharback@gmail.com|   Quanzie|     F|       Harback|
|800000000422|   null| 20|     221

In [4]:
# Imported here so this cell runs on a fresh kernel; the only other import of
# Cluster sits in a later cell.
from cassandra.cluster import Cluster

# Open a driver-side session on keyspace "cc". It is deliberately left open
# because later cells query through `cass`; call cass.shutdown() once the
# session is no longer needed.
cass = Cluster(["localhost"]).connect("cc")
# Display the session object. The bare `cass.shutdown` that stood here had no
# call parentheses, so it only evaluated to a bound method and did nothing.
cass

<cassandra.cluster.Session at 0x7f15ff2b9160>

In [13]:
# Fetch three sample customer rows through the `cass` session opened above.
# The name `cassandra_session` used before is not defined anywhere in this
# notebook, so it would raise NameError on a fresh kernel.
list(cass.execute("select * from customer limit 3"))

[Row(id='800000009260', address=None, age=38, amount_lower_threshold=2406.1891696538737, amount_upper_threshold=6044.088365042324, dob=Date(3790), email='kamaree.rodenhuser@msn.com', first_name='Kamaree', gender='M', last_name='Rodenhäuser'),
 Row(id='800000001652', address=None, age=88, amount_lower_threshold=1999.5693000247313, amount_upper_threshold=5770.614137833819, dob=Date(-14514), email='riser.chelliah@msn.com', first_name='Riser', gender='F', last_name='Chelliah'),
 Row(id='800000005063', address=None, age=45, amount_lower_threshold=2129.9385858059727, amount_upper_threshold=6063.82829296996, dob=Date(1064), email='vhausting@gmail.com', first_name='Vyom', gender='M', last_name='Hausting')]

In [25]:
# Parameterised lookup of one customer's lower threshold; `.one()` takes a
# single row from the result.
cass.execute("select amount_lower_threshold from customer where id = %s", ('800000009260',)).one()

Row(amount_lower_threshold=2406.1891696538737)

In [6]:
from cassandra.cluster import Cluster

# Re-bind `rdd` to an RDD of three customer ids used to exercise the lookup below.
rdd = sc.parallelize(['800000009260', '800000001652', '800000005063'])

def detect_anomalies(tnx):
    """Look up the lower amount threshold for each customer id in one partition.

    Intended for ``rdd.mapPartitions``: one Cassandra connection is opened per
    call and reused for every id in ``tnx``.

    Parameters
    ----------
    tnx : iterable of str
        Customer ids to look up in the ``customer`` table of keyspace "cc".

    Returns
    -------
    list of (id, amount_lower_threshold) tuples, in input order. Ids with no
    matching row are skipped.
    """
    cluster = Cluster(["localhost"])
    try:
        cass = cluster.connect("cc")
        result = []
        for r in tnx:
            rec = cass.execute("select id, amount_lower_threshold from customer where id = %s", (r,)).one()
            if rec is None:
                # Unknown customer id: nothing to report (previously this
                # crashed with AttributeError on `rec.id`).
                continue
            result.append((rec.id, rec.amount_lower_threshold))
        return result
    finally:
        # Always release the connections, even when a query raises; shutting
        # down the cluster also closes the sessions created from it.
        cluster.shutdown()

# Apply detect_anomalies to each partition and gather the resulting tuples on the driver.
rdd.mapPartitions(detect_anomalies).collect()

[('800000009260', 2406.1891696538737),
 ('800000001652', 1999.5693000247313),
 ('800000005063', 2129.9385858059727)]