# W205 - Spring 2020 - Project 3: Understanding User Behavior

### By: Ali Asadi Nikooyan

#### Check PySpark is running

In [1]:
spark

In [2]:
sc

#### Import required packages

In [3]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf

#### User defined functions: 

> Returns True only for _purchase a sword_ events

In [4]:
@udf('boolean')
def is_purchase_sword(event_as_json):
    """Spark UDF: report whether a raw JSON event is a sword purchase.

    Args:
        event_as_json: The event payload as a JSON-encoded string. May be
            None when the source row carries no value.

    Returns:
        True only when the payload decodes to a JSON object whose
        'event_type' equals 'purchase_sword'. False for every other input,
        including null, malformed, or non-object payloads.
    """
    try:
        event = json.loads(event_as_json)
    except (TypeError, ValueError):
        # TypeError: null payload; ValueError: malformed JSON. A bad record
        # simply does not match, instead of failing the whole Spark job.
        return False
    # .get() tolerates events that carry no 'event_type' key at all; the
    # isinstance() guard covers payloads that are valid JSON but not objects.
    return isinstance(event, dict) and event.get('event_type') == 'purchase_sword'

> Returns True only for _purchase a knife_ events

In [5]:
@udf('boolean')
def is_purchase_knife(event_as_json):
    """Spark UDF: report whether a raw JSON event is a knife purchase.

    Args:
        event_as_json: The event payload as a JSON-encoded string. May be
            None when the source row carries no value.

    Returns:
        True only when the payload decodes to a JSON object whose
        'event_type' equals 'purchase_knife'. False for every other input,
        including null, malformed, or non-object payloads.
    """
    try:
        event = json.loads(event_as_json)
    except (TypeError, ValueError):
        # TypeError: null payload; ValueError: malformed JSON. A bad record
        # simply does not match, instead of failing the whole Spark job.
        return False
    # .get() tolerates events that carry no 'event_type' key at all; the
    # isinstance() guard covers payloads that are valid JSON but not objects.
    return isinstance(event, dict) and event.get('event_type') == 'purchase_knife'

#### Reading the raw events from Kafka into Spark and printing them

In [6]:
# Batch read of the "events" topic, bounded by the "earliest" and "latest"
# offsets so the result is a finite DataFrame.
raw_events = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "events")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [7]:
raw_events

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [8]:
raw_events.show()

+----+--------------------+------+---------+------+--------------------+-------------+
| key|               value| topic|partition|offset|           timestamp|timestampType|
+----+--------------------+------+---------+------+--------------------+-------------+
|null|[7B 22 48 6F 73 7...|events|        0|     0|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     1|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     2|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     3|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     4|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     5|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     6|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     7|2020-04-14 05:26:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0

#### Filtering the raw events for a specific event type using the user-defined functions

> **Purchase a sword:**

In [9]:
# Keep the payload (cast to a string, aliased 'raw') and the timestamp (cast
# to a string), then retain only the rows flagged by is_purchase_sword.
purchase_sword_events = (
    raw_events
    .select(
        raw_events['value'].cast('string').alias('raw'),
        raw_events['timestamp'].cast('string'),
    )
    .filter(is_purchase_sword('raw'))
)

In [10]:
purchase_sword_events

DataFrame[raw: string, timestamp: string]

In [11]:
purchase_sword_events.show()

+--------------------+--------------------+
|                 raw|           timestamp|
+--------------------+--------------------+
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user1.c...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14 05:26:...|
|{"Host": "user2.a...|2020-04-14

In [12]:
purchase_sword_events.show(truncate = False)

+---------------------------------------------------------------------------------------------------------------+-----------------------+
|raw                                                                                                            |timestamp              |
+---------------------------------------------------------------------------------------------------------------+-----------------------+
|{"Host": "user1.comcast.com", "event_type": "purchase_sword", "Accept": "*/*", "User-Agent": "ApacheBench/2.3"}|2020-04-14 05:26:25.316|
|{"Host": "user1.comcast.com", "event_type": "purchase_sword", "Accept": "*/*", "User-Agent": "ApacheBench/2.3"}|2020-04-14 05:26:25.32 |
|{"Host": "user1.comcast.com", "event_type": "purchase_sword", "Accept": "*/*", "User-Agent": "ApacheBench/2.3"}|2020-04-14 05:26:25.322|
|{"Host": "user1.comcast.com", "event_type": "purchase_sword", "Accept": "*/*", "User-Agent": "ApacheBench/2.3"}|2020-04-14 05:26:25.324|
|{"Host": "user1.comcast.com", "ev

> **Purchase a Knife:**

In [13]:
# Keep the payload (cast to a string, aliased 'raw') and the timestamp (cast
# to a string), then retain only the rows flagged by is_purchase_knife.
purchase_knife_events = (
    raw_events
    .select(
        raw_events['value'].cast('string').alias('raw'),
        raw_events['timestamp'].cast('string'),
    )
    .filter(is_purchase_knife('raw'))
)

In [14]:
purchase_knife_events

DataFrame[raw: string, timestamp: string]

In [15]:
purchase_knife_events.show(truncate = False)

+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|raw                                                                                                                                               |timestamp              |
+--------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|{"Host": "user1.comcast.com", "User-Agent": "ApacheBench/2.3", "event_type": "purchase_knife", "Accept": "*/*", "description": "very sharp knife"}|2020-04-14 05:27:38.916|
|{"Host": "user1.comcast.com", "User-Agent": "ApacheBench/2.3", "event_type": "purchase_knife", "Accept": "*/*", "description": "very sharp knife"}|2020-04-14 05:27:38.92 |
|{"Host": "user1.comcast.com", "User-Agent": "ApacheBench/2.3", "event_type": "purchase_knife", "Accept": "*/*", "description": "very s

#### Defining and using a lambda function to extract the purchase events into a DataFrame

> **Purchase a Sword:**

In [17]:
# Lambda transform: expand each JSON payload into Row fields of its own,
# alongside the timestamp, and convert the resulting RDD to a DataFrame.
extracted_purchase_sword_events = (
    purchase_sword_events.rdd
    .map(lambda record: Row(timestamp=record.timestamp, **json.loads(record.raw)))
    .toDF()
)

In [18]:
extracted_purchase_sword_events.printSchema()

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [19]:
extracted_purchase_sword_events.show()

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_s

> **Purchase a Knife:**

In [20]:
# Lambda transform: expand each JSON payload into Row fields of its own,
# alongside the timestamp, and convert the resulting RDD to a DataFrame.
extracted_purchase_knife_events = (
    purchase_knife_events.rdd
    .map(lambda record: Row(timestamp=record.timestamp, **json.loads(record.raw)))
    .toDF()
)

In [21]:
extracted_purchase_knife_events.printSchema()

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- description: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [24]:
extracted_purchase_knife_events.show()

+------+-----------------+---------------+----------------+--------------+--------------------+
|Accept|             Host|     User-Agent|     description|    event_type|           timestamp|
+------+-----------------+---------------+----------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.

#### Writing the extracted events to Parquet files

In [25]:
# Save the extracted sword purchases as Parquet; 'overwrite' mode replaces
# whatever is already at the target path.
(
    extracted_purchase_sword_events.write
    .mode('overwrite')
    .parquet('/tmp/purchases_sword')
)

In [26]:
# Save the extracted knife purchases as Parquet; 'overwrite' mode replaces
# whatever is already at the target path.
(
    extracted_purchase_knife_events.write
    .mode('overwrite')
    .parquet('/tmp/purchases_knife')
)

#### Reading back from the saved Parquet file directly into Spark

> **Purchase a Sword:**

In [27]:
# Load the sword purchases back from the Parquet path written earlier.
purchases_sword = spark.read.parquet('/tmp/purchases_sword')

In [28]:
purchases_sword.show()

+------+-----------------+---------------+--------------+--------------------+
|Accept|             Host|     User-Agent|    event_type|           timestamp|
+------+-----------------+---------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|2020-04-14 05:26:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_s

> **Purchase a Knife:**

In [29]:
# Load the knife purchases back from the Parquet path written earlier.
purchases_knife = spark.read.parquet('/tmp/purchases_knife')

In [30]:
purchases_knife.show()

+------+-----------------+---------------+----------------+--------------+--------------------+
|Accept|             Host|     User-Agent|     description|    event_type|           timestamp|
+------+-----------------+---------------+----------------+--------------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.3|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|   */*|user1.comcast.com|ApacheBench/2.

### Queries

#### Register as temporary tables:

In [31]:
# registerTempTable() has been deprecated since Spark 2.0;
# createOrReplaceTempView() is its direct replacement and makes each
# DataFrame queryable from spark.sql() under the given name.
purchases_sword.createOrReplaceTempView('purchases_sword')
purchases_knife.createOrReplaceTempView('purchases_knife')

**Query 1:**
> Select only knife purchases by User1

In [51]:
# Knife purchases whose Host is exactly 'user1.comcast.com'; the query keeps
# only the Host, description, event_type and timestamp columns.
purch_knife_u1 = spark.sql("select Host, description, event_type, timestamp from purchases_knife \
                            where Host = 'user1.comcast.com'")

In [52]:
purch_knife_u1.show()

+-----------------+----------------+--------------+--------------------+
|             Host|     description|    event_type|           timestamp|
+-----------------+----------------+--------------+--------------------+
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|2020-04-14 05:27:...|
|user1.comcast.com|very sharp knife|purchase_knife|

In [53]:
# Convert the Spark query result into a pandas DataFrame so it can be
# inspected with pandas methods in the following cells.
df = purch_knife_u1.toPandas()

In [54]:
df.head()

Unnamed: 0,Host,description,event_type,timestamp
0,user1.comcast.com,very sharp knife,purchase_knife,2020-04-14 05:27:38.916
1,user1.comcast.com,very sharp knife,purchase_knife,2020-04-14 05:27:38.92
2,user1.comcast.com,very sharp knife,purchase_knife,2020-04-14 05:27:38.924
3,user1.comcast.com,very sharp knife,purchase_knife,2020-04-14 05:27:38.927
4,user1.comcast.com,very sharp knife,purchase_knife,2020-04-14 05:27:38.932


In [55]:
df.describe()

Unnamed: 0,Host,description,event_type,timestamp
count,20,20,20,20
unique,1,1,1,20
top,user1.comcast.com,very sharp knife,purchase_knife,2020-04-14 05:33:51.375
freq,20,20,20,1


**Query 2:**
> Count the total number of swords and knives purchased

In [46]:
# Two scalar subqueries, one per table, return a single row holding both
# purchase counts side by side.
total_sword_knives = spark.sql("select (select count(*) from purchases_sword) as total_purchased_swords,\
(select count(*) from purchases_knife) as total_purchased_knives")

In [47]:
total_sword_knives.show()

+----------------------+----------------------+
|total_purchased_swords|total_purchased_knives|
+----------------------+----------------------+
|                    50|                    40|
+----------------------+----------------------+



**Query 3:**
> Count the number of swords and knives purchased by user 2

In [49]:
# Two scalar subqueries, one per table, each counting only the rows whose
# Host starts with 'user2'.
# NOTE(review): the prefix match would also count hosts such as 'user20...';
# confirm that no such hosts exist or switch to an exact match.
swords_knives_u2 = spark.sql("select (select count(*) from purchases_sword where Host like 'user2%') as user2_purchased_swords,\
(select count(*) from purchases_knife where Host like 'user2%') as user2_purchased_knives")

In [50]:
swords_knives_u2.show()

+----------------------+----------------------+
|user2_purchased_swords|user2_purchased_knives|
+----------------------+----------------------+
|                    30|                    20|
+----------------------+----------------------+

