# Spark RDD User Defined Functions & Yelp Dataset

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import udf 
from pyspark.sql.types import TimestampType 
from pyspark.sql.types import MapType 
from pyspark.sql.types import StringType 
from pyspark.sql.types import IntegerType 
from pyspark.sql.types import ArrayType 
from pyspark.sql.functions import hour 
from pyspark.sql.functions import col
import re

# start the park session & context
spark = SparkSession.builder.master("local[*]").appName('Sparky').getOrCreate()

sc = spark.sparkContext

In [2]:
# read in some data

business = spark.read.json('../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_business.json.gz')
checkin = spark.read.json('../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_checkin.json.gz')
review = spark.read.json('../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_review.json.gz')
tip = spark.read.json('../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_tip.json.gz')
user = spark.read.json('../non_auto_assignments/data/yelp_academic/yelp_academic_dataset_user.json.gz')

### How many users have received more than 5000 cool votes?

In [3]:
user.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [4]:
result = user.filter(user['compliment_cool'] > 5000).count()
print(result)

79


### What are the top 10 most useful positive reviews?
<p> I considered a 5 star review as a positive review. I then took the top ten useful 5 star reviews by sort order and did not consider ties. The top ten 5 star reivews had useful cvotes of between 358 and 203.</p>

In [5]:
pos_review = review['stars','text','useful'].filter(review['stars'] == 5)

In [6]:
top_ten = pos_review.orderBy(pos_review['useful'].desc()).head(10)

In [7]:
top_ten

[Row(stars=5.0, text='The Wynn Hotel. One word. Opulent. \n\nWynn, is a luxury resort and casino located on the Las Vegas Strip. Named after the casino developer Steve Wynn, this $2.7 billion resort together with Encore is the world\'s seventh-largest hotels and one of the finest in the world. \n\nHolding the title for the highest rated resort company in the world with 16 Forbes Travel Guide Five-Star Awards, AAA five diamond, Mobil five-star, Michelin five star, and Zagat Survey Top U.S. Hotel ratings, as well as one Michelin star for its restaurant Wing Lei, the first Chinese restaurant to receive a Michelin Star in the country. \n\nThe Wynn is a genre almost entirely their own. Talk about a blend of opulent creativity and seductive comfort. Wynn Hotel designer Roger Thomas calls it "Evocatect." The "architecture that evokes wonderful moments in people\'s lives." \n\nShare a moment alone or with someone special and walk through the sunlit Atrium, gaze at the immense beauty of the Par

### During what hour of the day do most checkins occur?

In [8]:
checkin.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)



<h3> First we try with pyspark functions</h3>

In [9]:
split_list_udf = udf(lambda x: x.split(','), ArrayType(StringType())) # turn Date string into array for exploding

In [10]:
checkin_list = checkin.withColumn("date_exploded", f.explode(split_list_udf(col("date"))))

In [11]:
hours_df = checkin_list.withColumn('hour', hour(col("date_exploded")))

In [12]:
check_hour = hours_df.groupby(['hour']).count().sort('count', ascending=False)

In [13]:
check_hour.show(24)

+----+--------+
|hour|   count|
+----+--------+
|null|18927198|
|  19|   13481|
|  23|   13207|
|  22|   13191|
|  18|   13177|
|  21|   12960|
|  20|   12553|
|  17|   12304|
|   0|   11577|
|  16|   10416|
|   1|    9803|
|   2|    7258|
|  15|    7000|
|   3|    5225|
|  14|    4340|
|   4|    3547|
|  13|    2910|
|   5|    2348|
|  12|    1597|
|   6|    1450|
|   7|    1020|
|  11|     824|
|   8|     809|
|  10|     483|
+----+--------+
only showing top 24 rows



<h3> Next we try with User Defined Functions</h3>
<p>Interestingly, pyspark function creates a bunch of null values because it is sensitive to spacing issues. To get around this I converted the string to an arary and indexed the array which allowed me to then group, sort and show when checkins are.</p>

In [14]:
hour_udf = udf(lambda x: int(x.split()[1].split(':')[0]), IntegerType())

In [15]:
string_hour = checkin_list.withColumn("hour", hour_udf(col("date_exploded")))

In [16]:
string_check_hour = string_hour.groupby(['hour']).count().sort('hour', ascending=False)

In [17]:
string_check_hour.show(24)

+----+-------+
|hour|  count|
+----+-------+
|  23|1344117|
|  22|1257437|
|  21|1238808|
|  20|1350195|
|  19|1502271|
|  18|1272108|
|  17|1006102|
|  16| 852076|
|  15| 617830|
|  14| 418340|
|  13| 270145|
|  12| 178910|
|  11| 111769|
|  10|  88486|
|   9| 100568|
|   8| 151065|
|   7| 231417|
|   6| 321764|
|   5| 485129|
|   4| 747453|
|   3|1078939|
|   2|1411255|
|   1|1561788|
|   0|1491176|
+----+-------+



### Sentiment analysis
<p>Aknowledging the fact that people often use slang words, emphatic spelling, and mispell in life and in reviews, I let the expressions intersect themselves. I did not want to force a perspective on how people expression themselves in the written form. I considered a positive review as a review with 5 stars and a negative review as a review with 1 or less stars. My thinking is that 2-4 stars is an average review. Once I filtered the datasets based on these conditions, I then created a list of words used in the review that were not "stop words". After I found the intersection of the words used in positive and negative reviews I then filtered on the inverse of the intersection list. This allowed me to show the top 50 unique "words" used in positive and negative reviews. I was quite surprised by the slang words, acronyms and emphatic spelling that was in the resulting sets. </p>

In [18]:
def stop(text):
    STOPWORDS = {'i', 'we', 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than'}
    WORD_RE = re.compile(r"[\w']+") # any whitespace or apostrophe, used to split lines below

    r_list = []
    for word in WORD_RE.findall(text.lower()):
        if word not in STOPWORDS:
            r_list.append(word)
            
    return r_list

stop_udf = udf(lambda x: stop(x), ArrayType(StringType())) # mapping words to an array before exploding

In [19]:
col_reduced = pos_review['stars','text'] # minimize information carried
neg_review = review['stars','text'].filter(review['stars'] <= 1) # minimize information carried

### Map and Reduce the words into the respective positive and negative frames

In [20]:
df_positive_count = col_reduced.withColumn('word',  f.explode(stop_udf(col('text'))))\
    .groupBy('word')\
    .count()\
    .alias("df_positive_count")

df_negative_count = neg_review.withColumn('word',  f.explode(stop_udf(col('text'))))\
    .groupBy('word')\
    .count()\
    .alias("df_negative_count")

In [21]:
# create a list of the intersection words to reduce the expense of replicating the process.
overlap_words = df_positive_count.select('word').intersect(df_negative_count.select('word'))\
                .rdd.flatMap(lambda x:x).collect()

In [22]:
# show top 50 positive words
flt_pos_unique = df_positive_count.filter(~col('word').isin(overlap_words)).sort('count', ascending=False).show(50)

+--------------+-----+
|          word|count|
+--------------+-----+
|      foodgasm|  328|
|         eloff|  291|
|         gluch|  248|
|          jabz|  196|
|incontournable|  188|
|   annihilator|  173|
|    onolicious|  170|
|       yummmmy|  165|
|     deliciosa|  159|
|  satisfyingly|  159|
|        fixler|  157|
|       hojicha|  157|
|         wigle|  155|
|           ceg|  139|
|   amaaaaazing|  136|
|   amazinggggg|  136|
|     heartwood|  135|
|   ahhhhmazing|  129|
|          emme|  126|
|        boothe|  125|
|        boyger|  125|
|         artur|  122|
|       mckenzi|  118|
|         burek|  118|
|        alodia|  117|
|         safak|  114|
|     bavette's|  113|
|           f45|  111|
|     mouthgasm|  110|
|        sidell|  109|
|         régal|  108|
|           nui|  107|
|  shutterbooth|  106|
|      burgushi|  103|
|      injeolmi|  102|
|      deeelish|   99|
|        cajeta|   98|
|      yummmmmy|   98|
|   deeelicious|   98|
|           imr|   98|
|          

In [23]:
# show top 50 negative words
flt_neg_unique = df_negative_count.filter(~col('word').isin(overlap_words)).sort('count', ascending=False).show(50)

+---------------+-----+
|           word|count|
+---------------+-----+
| unproffesional|  105|
|      ineptness|   87|
|       condones|   79|
|   ripoffreport|   71|
|       snarkily|   62|
|   incompetents|   61|
|    undelivered|   60|
|         acanac|   58|
|          wrost|   57|
|unproffessional|   54|
|     falsifying|   50|
|       snottily|   48|
|     fraudsters|   47|
|   unsubscribed|   44|
|     homelovers|   42|
|     impolitely|   41|
|consumeraffairs|   41|
|     defrauding|   40|
|        barlows|   39|
|       ungloved|   35|
|unexceptionable|   35|
|    unexcusable|   35|
|  unfriendliest|   34|
|     creditcard|   31|
|   'supervisor'|   31|
|unprofessionals|   31|
|   unempathetic|   31|
|         voiers|   30|
|     dealticker|   30|
|       soggiest|   30|
|         pesimo|   30|
|           fahw|   30|
|    negligently|   29|
|       'manager|   29|
|         worest|   29|
|      discusted|   28|
|     unlawfully|   28|
|      snottiest|   28|
|            rmi