# Collaborative Filtering and Comedy! 
------
<img src="images/seinfeld.jpg" width="400" height="400">

## A demo using DataStax Enterprise Analytics, Apache Cassandra, Apache Spark, Python, Jupyter Notebooks, Spark MLlib, and ALS (Alternating Least Squares)

#### Real Dataset: http://eigentaste.berkeley.edu/dataset/

## Import python packages -- all are required
* Need to tell Jupyter to display with %matplotlib otherwise you will generate the plot but not display it

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import os
# PySpark reads PYSPARK_SUBMIT_ARGS when it launches its JVM, so this has to be set
# before the SparkSession is created further down. `--packages` requests the DataStax
# Spark-Cassandra connector (Scala 2.11 build, version 2.3.2), which supplies the
# "org.apache.spark.sql.cassandra" data source used to read the ratings table.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.2 pyspark-shell'

In [3]:
import csv
import os
import re

import cassandra
import matplotlib.pyplot as plt
import pandas
import pyspark
from IPython.display import IFrame
from IPython.display import display, Markdown
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

#### Helper function to have nicer formatting of Spark DataFrames

In [4]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows=10, truncate=False):
    """Render the first rows of a Spark DataFrame as a pandas table.

    Args:
        df: Spark DataFrame (anything exposing ``limit(n).toPandas()``).
        limitRows: Maximum number of rows to fetch from Spark and to display.
        truncate: When True, cell text is cut off at 100 characters; when
            False, cells are shown at full width.

    The pandas display options are changed only for the duration of the call
    and are restored afterwards, so the helper does not leak settings into
    later cells.
    """
    # None means "no limit". The former sentinel -1 is deprecated since
    # pandas 1.0 and rejected by current pandas releases.
    max_colwidth = 100 if truncate else None
    with pandas.option_context('display.max_colwidth', max_colwidth,
                               'display.max_rows', limitRows):
        # limit() is applied on the Spark side so only limitRows rows are
        # collected to the driver.
        display(df.limit(limitRows).toPandas())

# Apache Cassandra 
<img src="images/cassandralogo.png" width="200" height="200">

## Creating Tables and Loading Tables

## Connect to Apache Cassandra Local Instance

In [5]:
from cassandra.cluster import Cluster

# Contact point of the local single-node Cassandra instance.
# (The previous literal '127.0.01' was a malformed IPv4 address.)
cluster = Cluster(['127.0.0.1'])
# `session` is the handle every later CQL statement in this notebook goes through.
session = cluster.connect()

### Create Demo Keyspace 

In [6]:
# CQL for the demo keyspace: created only if missing, single replica (SimpleStrategy).
create_keyspace_cql = """
    CREATE KEYSPACE IF NOT EXISTS jokes 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""

session.execute(create_keyspace_cql)

<cassandra.cluster.ResultSet at 0x117ecbe10>

### Set keyspace 

In [7]:
# Make 'jokes' the default keyspace for this session so the statements below
# can refer to the table without a keyspace prefix.
session.set_keyspace('jokes')

### Create a table called `jokes_table1`. The PRIMARY KEY is (`userid`, `jokeid`): `userid` is the partition key, so all ratings from one user are stored together, and `jokeid` is the clustering column that makes each row unique. We will have to use the partition key (`userid`) in the WHERE clause of our CQL queries.

In [8]:
# Ratings table: one row per (userid, jokeid) pair with a float rating.
# In PRIMARY KEY (userid, jokeid) the first column is the partition key and the
# second is a clustering column, so a user's ratings live in one partition.
# IF NOT EXISTS keeps the cell safe to re-run.
query = "CREATE TABLE IF NOT EXISTS jokes_table1 \
                                    (userid int, jokeid int, rating float, \
                                     PRIMARY KEY (userid, jokeid))"
session.execute(query)

<cassandra.cluster.ResultSet at 0x117c1f0b8>

### Load Joke Dataset
<img src="images/laughing.gif" width="300" height="300">

### Load Jokes dataset from CSV file (`data/jester_ratings3.csv`)
* No clean up was required! How nice :)

#### Insert all the Joke Rating Data into the Apache Cassandra table `jokes_table1`

In [9]:
# Ratings file; every line is read as `userid,jokeid,rating` (no header row is skipped).
fileName = 'data/jester_ratings3.csv'

# Prepare the INSERT once instead of rebuilding the query text for every row;
# the driver then only ships the bound values on each execute().
insert_rating = session.prepare(
    "INSERT INTO jokes_table1 (userid, jokeid, rating) VALUES (?, ?, ?)"
)

# `with` guarantees the file handle is closed even if a row fails to parse.
with open(fileName, 'r', newline='') as input_file:
    for jokeRow in csv.reader(input_file):
        if not jokeRow:
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        userid, jokeid, rating = jokeRow[:3]
        session.execute(insert_rating, (int(userid), int(jokeid), float(rating)))

#### Do a `SELECT *` on `jokes_table1` WHERE userid = x to verify that data was loaded into the table

In [10]:
# Spot-check the load: fetch every rating stored for user 100 and print one per line.
user_ratings = session.execute('SELECT * FROM jokes_table1 WHERE userid = 100')
for rating_row in user_ratings:
    print(rating_row.userid, rating_row.jokeid, rating_row.rating)

100 5 -0.875
100 7 9.906000137329102
100 8 -0.843999981880188
100 13 8.937999725341797
100 15 -0.968999981880188
100 16 -9.75
100 17 9.593999862670898


## Machine Learning with Apache Spark
<img src="images/sparklogo.png" width="150" height="200">

### Finally time for Apache Spark! 

#### Create a Spark session that is connected to Cassandra. From there, load the ratings table into a Spark DataFrame and take a count of its rows.

In [11]:
# Start (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .appName('demo')
    .master("local")
    .getOrCreate()
)

# Read the Cassandra table through the connector's data source.
table_location = {"table": "jokes_table1", "keyspace": "jokes"}
jokeTable = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(**table_location)
    .load()
)

# Sanity check: the count should match the number of rows inserted above.
print("Table Row Count: ")
print(jokeTable.count())

Table Row Count: 
10000


In [12]:
# Same rows twice: first with the stored float ratings, then with the rating
# column cast to int (the cast drops the fractional part, e.g. 9.531 -> 9, -0.594 -> 0).
joke_df = jokeTable.withColumn("rating", jokeTable["rating"].cast('int'))
for frame in (jokeTable, joke_df):
    showDF(frame)

Unnamed: 0,userid,jokeid,rating
0,218,5,9.531
1,218,7,-0.594
2,218,8,9.875
3,4,5,-5.812
4,4,7,-4.5
5,4,8,-4.906
6,18,5,-0.438
7,18,7,-7.344
8,18,8,2.375
9,18,13,-2.281


Unnamed: 0,userid,jokeid,rating
0,218,5,9
1,218,7,0
2,218,8,9
3,4,5,-5
4,4,7,-4
5,4,8,-4
6,18,5,0
7,18,7,-7
8,18,8,2
9,18,13,-2


In [13]:
# 80/20 train/test split. A fixed seed makes the split (and therefore the RMSE
# reported below) reproducible across "Restart & Run All".
(training, test) = jokeTable.randomSplit([0.8, 0.2], seed=42)

# Ratings are cast to int before modelling.
# NOTE(review): the cast truncates the fractional part of each rating; ALS also
# accepts float ratings, so confirm the truncation is intended.
training_df = training.withColumn("rating", training.rating.cast('int'))
testing_df = test.withColumn("rating", test.rating.cast('int'))

showDF(training_df)

Unnamed: 0,userid,jokeid,rating
0,4,5,-5
1,4,7,-4
2,4,8,-4
3,18,5,0
4,18,7,-7
5,18,8,2
6,18,13,-2
7,18,16,-4
8,18,17,-5
9,18,18,-2


In [14]:
# Collaborative-filtering model: users are `userid`, items are `jokeid`.
# coldStartStrategy="drop" removes predictions for users/jokes unseen during
# training so the evaluation metric is not NaN.
# The seed fixes the random factor initialisation, making the fitted model and
# the recommendations reproducible between runs.
als = ALS(maxIter=5, regParam=0.01, userCol="userid", itemCol="jokeid", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

model = als.fit(training_df)

In [15]:
# Score the held-out ratings and report the root-mean-square error.
predictions = model.transform(testing_df)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Top 10 jokes for every user, and top 10 users for every joke.
userRecs = model.recommendForAllUsers(10)
jokeRecs = model.recommendForAllItems(10)

# Display both recommendation tables, users first.
for recs in (userRecs, jokeRecs):
    showDF(recs)


Root-mean-square error = 6.828945359706267


Unnamed: 0,userid,recommendations
0,148,"[(40, 27.385730743408203), (28, 24.54682159423828), (90, 24.088212966918945), (87, 22.36663818359375), (146, 20.649139404296875), (109, 20.540903091430664), (116, 20.276275634765625), (58, 18.22146987915039), (88, 17.108795166015625), (64, 16.95281219482422)]"
1,243,"[(141, 33.20899200439453), (33, 23.810840606689453), (112, 20.11299705505371), (16, 15.617582321166992), (90, 15.261026382446289), (120, 14.408105850219727), (116, 14.169095993041992), (60, 13.992173194885254), (138, 13.043665885925293), (110, 12.933907508850098)]"
2,251,"[(138, 8.331933975219727), (146, 7.703481674194336), (139, 4.0359015464782715), (41, 3.621737003326416), (141, 3.049499273300171), (18, 0.36673831939697266), (56, -0.03189730644226074), (122, -1.328823447227478), (50, -1.8599724769592285), (16, -2.2206239700317383)]"
3,85,"[(124, 51.7579345703125), (58, 49.576534271240234), (55, 39.49469757080078), (52, 30.88326072692871), (141, 29.356218338012695), (90, 28.08550262451172), (122, 24.83456802368164), (64, 23.766693115234375), (28, 23.167940139770508), (138, 19.826740264892578)]"
4,137,"[(71, 13.842877388000488), (145, 12.839102745056152), (44, 12.686053276062012), (82, 12.268667221069336), (100, 12.248852729797363), (75, 11.92045783996582), (51, 11.273324966430664), (58, 10.890685081481934), (124, 10.753368377685547), (94, 9.016607284545898)]"
5,65,"[(145, 14.56903076171875), (43, 11.844061851501465), (79, 11.283380508422852), (80, 11.234314918518066), (94, 11.042588233947754), (73, 10.83102035522461), (137, 10.767558097839355), (38, 10.715609550476074), (82, 10.636094093322754), (40, 10.619543075561523)]"
6,53,"[(41, 14.527985572814941), (40, 14.371387481689453), (146, 11.115885734558105), (22, 10.421339988708496), (119, 9.233083724975586), (113, 8.829170227050781), (44, 8.780244827270508), (134, 8.295533180236816), (93, 7.711897850036621), (21, 7.61931037902832)]"
7,133,"[(75, 15.029008865356445), (83, 13.779787063598633), (21, 11.41662883758545), (40, 10.257682800292969), (22, 10.067930221557617), (66, 9.839597702026367), (69, 9.485353469848633), (42, 9.052323341369629), (34, 9.001533508300781), (80, 8.929502487182617)]"
8,155,"[(34, 65.23779296875), (60, 52.6846923828125), (130, 52.02973175048828), (140, 51.4713020324707), (95, 49.10539245605469), (22, 48.443782806396484), (24, 47.93186950683594), (135, 47.30842971801758), (129, 46.749114990234375), (67, 46.025169372558594)]"
9,108,"[(46, 24.919315338134766), (130, 23.839946746826172), (25, 21.68096160888672), (108, 21.43865203857422), (21, 21.034143447875977), (41, 20.960330963134766), (119, 19.911073684692383), (53, 17.531200408935547), (121, 17.505170822143555), (49, 16.961029052734375)]"


Unnamed: 0,jokeid,recommendations
0,148,"[(84, 31.883636474609375), (203, 28.840511322021484), (155, 23.952516555786133), (121, 20.91714859008789), (250, 18.80213165283203), (136, 18.156707763671875), (185, 18.147274017333984), (258, 17.760921478271484), (248, 15.642515182495117), (38, 15.021276473999023)]"
1,31,"[(207, 34.74746322631836), (203, 29.515464782714844), (185, 24.498666763305664), (84, 24.001625061035156), (175, 23.372262954711914), (107, 21.145418167114258), (166, 16.814014434814453), (248, 16.341184616088867), (253, 15.40349006652832), (205, 15.32844352722168)]"
2,85,"[(84, 42.55365753173828), (77, 24.728513717651367), (136, 23.978504180908203), (19, 23.709383010864258), (185, 23.596229553222656), (258, 23.30413055419922), (203, 23.22675323486328), (209, 22.25408935546875), (205, 20.89915657043457), (26, 20.5033016204834)]"
3,137,"[(155, 41.311981201171875), (258, 22.218164443969727), (136, 20.006208419799805), (178, 19.818944931030273), (195, 16.223234176635742), (121, 14.259716987609863), (250, 13.503952026367188), (38, 13.132500648498535), (84, 12.610593795776367), (19, 12.205896377563477)]"
4,65,"[(203, 41.614173889160156), (185, 32.86777114868164), (175, 32.07512664794922), (84, 31.291915893554688), (155, 26.088815689086914), (207, 24.46644401550293), (258, 24.45218849182129), (205, 22.0623836517334), (260, 21.331689834594727), (253, 19.051687240600586)]"
5,53,"[(207, 45.243717193603516), (203, 39.237579345703125), (107, 34.3674201965332), (185, 32.20622634887695), (84, 31.544376373291016), (175, 24.932315826416016), (166, 23.75750732421875), (205, 22.350461959838867), (248, 20.43204689025879), (155, 19.034744262695312)]"
6,133,"[(155, 39.132598876953125), (258, 31.94054412841797), (256, 19.752967834472656), (61, 19.264009475708008), (19, 18.96949577331543), (100, 16.08561897277832), (136, 15.490361213684082), (12, 14.685985565185547), (178, 14.305537223815918), (177, 13.406991004943848)]"
7,78,"[(207, 43.34546661376953), (61, 30.392776489257812), (258, 28.652931213378906), (166, 27.312915802001953), (155, 24.699382781982422), (136, 21.705585479736328), (203, 20.940933227539062), (248, 20.515445709228516), (177, 19.257081985473633), (100, 17.114490509033203)]"
8,108,"[(258, 47.372093200683594), (155, 45.57164764404297), (203, 40.554622650146484), (177, 33.66419219970703), (248, 29.076467514038086), (216, 22.26336097717285), (108, 21.43865203857422), (207, 21.126998901367188), (38, 21.101825714111328), (253, 20.68408203125)]"
9,34,"[(155, 65.23779296875), (258, 46.484519958496094), (136, 30.652894973754883), (177, 30.028345108032227), (178, 28.22173500061035), (75, 26.208759307861328), (256, 26.199298858642578), (19, 22.53771209716797), (150, 21.42724609375), (100, 20.803150177001953)]"


In [16]:
# Look at the recommendations produced for a single user (userid 65).
user_65_recs = userRecs.where(userRecs["userid"] == 65)
showDF(user_65_recs)

Unnamed: 0,userid,recommendations
0,65,"[(145, 14.56903076171875), (43, 11.844061851501465), (79, 11.283380508422852), (80, 11.234314918518066), (94, 11.042588233947754), (73, 10.83102035522461), (137, 10.767558097839355), (38, 10.715609550476074), (82, 10.636094093322754), (40, 10.619543075561523)]"


In [17]:
# Embed a local HTML page in the notebook output.
# NOTE(review): presumably the text of joke 94, one of the jokes recommended to user 65 above; confirm.
IFrame(src='images/init94.html', width=700, height=200)

In [18]:
# Embed a local HTML page in the notebook output.
# NOTE(review): presumably the text of joke 43, another joke recommended to user 65 above; confirm.
IFrame(src='images/init43.html', width=700, height=200)

In [19]:
# Clean up the demo table. IF EXISTS keeps the cell idempotent, so running it
# twice (or after a partial run) does not raise an error.
session.execute("""DROP TABLE IF EXISTS jokes_table1""")

<cassandra.cluster.ResultSet at 0x1173c1eb8>