In [1]:
import json
import os
import sys
import time

from pyspark import SparkContext, SparkConf

In [2]:
# Application name reported to Spark and the master URL to connect to.
appName = "mining-yelp-dataset-with-spark"
master = "local[*]"

conf = SparkConf().setAppName(appName).setMaster(master)
# getOrCreate() reuses the SparkContext that is already running in this kernel,
# so re-executing the cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# Keep Spark's console logging down to warnings and errors.
sc.setLogLevel("WARN")

In [3]:
# Directory holding the Yelp dataset JSON files. Set the YELP_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the location the notebook was originally written against.
YELP_DATA_DIR = os.environ.get("YELP_DATA_DIR", "/Users/ZhengYang/Documents/yelp_dataset")

review_file_path = os.path.join(YELP_DATA_DIR, "review.json")
business_file_path = os.path.join(YELP_DATA_DIR, "business.json")

## review.json

In [5]:
# review.json: one JSON-encoded review per line, carrying the full review text
# together with the user_id of the author and the business_id being reviewed.
# The file is read as raw text lines, split into at least 240 partitions.
review_rdd = sc.textFile(name=review_file_path, minPartitions=240)
# Peek at the first raw line to see the record layout.
review_rdd.take(num=1)

['{"review_id":"Q1sbwvVQXV2734tPgoKj4Q","user_id":"hG7b0MtEbXx5QzbzE6C_VA","business_id":"ujmEBvifdJM6h6RLv4wQIg","stars":1.0,"useful":6,"funny":1,"cool":0,"text":"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.","date":"2013-05-07 04:34:36"}']

In [12]:
# The total number of reviews (one input line per review).
# count() is the built-in action for this and, unlike map(...).reduce(...),
# returns 0 instead of raising ValueError when the RDD is empty.
n_review = review_rdd.count()


# (user_id, number of reviews written) for every user found in the reviews.
# Cached because it feeds both the distinct-user count and the top-10 ranking.
user_count = review_rdd \
        .map(lambda r: (json.loads(r)['user_id'], 1)) \
        .reduceByKey(lambda a, b: a + b).cache()

# The number of distinct users who wrote reviews: reduceByKey leaves exactly
# one pair per user_id, so counting the pairs counts the users.
n_user = user_count.count()


# The top 10 users who wrote the largest numbers of reviews and the number of reviews they wrote.
# Ordered by descending count, ties broken by ascending user_id. takeOrdered
# returns the same rows as sortBy(...).take(10) without sorting the whole RDD.
top10_user = user_count.takeOrdered(10, key=lambda r: (-r[1], r[0]))

In [13]:
# (business_id, number of reviews received) for every reviewed business.
# The second argument to reduceByKey is numPartitions: the reduced pairs are
# placed in a single partition. Cached because it is used twice below.
business_count = review_rdd \
    .map(lambda r: (json.loads(r)['business_id'], 1)) \
    .reduceByKey(lambda a, b: a + b, 1).cache()

# The number of distinct businesses that have been reviewed: one pair per
# business_id remains after reduceByKey, so counting the pairs gives the answer.
n_business = business_count.count()

# The top 10 businesses that had the largest numbers of reviews and the number of reviews they had.
# Ordered by descending count, ties broken by ascending business_id. takeOrdered
# returns the same rows as sortBy(...).take(10) without sorting the whole RDD.
top10_business = business_count.takeOrdered(10, key=lambda r: (-r[1], r[0]))

In [38]:
# Headline counts first; the trailing "\n" in each message leaves a blank line
# after it in the output.
print("number of reviews: %d \n" % (n_review,))
print("number of users who has written at least one review on yelp: %d \n" % (n_user,))

# Each ranking is printed as one (id, count) tuple per line.
print("top 10 active users (user_id, count): ")
print("\n".join(str(pair) for pair in top10_user))
print("\ntop 10 popular businesses (business_id, count): ")
print("\n".join(str(pair) for pair in top10_business))

number of reviews: 6685900 

number of users who has written at least one review on yelp: 1637138 

top 10 active users (user_id, count): 
('CxDOIDnH8gp9KXzpBHJYXw', 4129)
('bLbSNkLggFnqwNNzzq-Ijw', 2354)
('PKEzKWv_FktMm2mGPjwd0Q', 1822)
('ELcQDlf69kb-ihJfxZyL0A', 1764)
('DK57YibC5ShBmqQl97CKog', 1727)
('U4INQZOPSUaj8hMjLlZ3KA', 1559)
('QJI9OSEn6ujRCtrX06vs1w', 1496)
('d_TBs6J3twMy9GChqUEXkg', 1360)
('hWDybu_KvYLSdEFzGrniTw', 1355)
('cMEtAiW60I5wE_vLfTxoJQ', 1255)

top 10 popular businesses (business_id, count): 
('4JNXUYY8wbaaDmk3BPzlWw', 8570)
('RESDUcs7fIiihp38-d6_6g', 8568)
('K7lWdNUhCbcnEvI0NhGewg', 6887)
('f4x1YBxkLrZg652xt2KR5g', 5847)
('cYwJA2A6I12KNkm2rtXd5g', 5575)
('DkYS3arLOhA8si5uUEmHOw', 5206)
('2weQS-RnoOBhb1KsHKyoSQ', 4534)
('5LNZ67Yw9RD6nf4_UhXOjw', 4522)
('iCQpiavjjPzJ5_3gPD5Ebg', 4351)
('SMPbvZLSMMb7KU76YNYMGg', 4350)


In [39]:
"""
TODO:

+ Map business_id ---> business name using business.json (for the top 10 popular businesses)

++ Visualization: bar chart etc.

- What categories are the top 10 businesses in?

- How do the rankings differ across states?

- What are the corresponding average stars of the top 10 businesses?


"""

'\nTODO: \n\n+ business_id ---> business_name using business.json (find top 10 popular businesses)\n\n- What categories are they in? \n\n- What about in different states?\n\n- their corresponding avg stars?\n\n\n'