In [1]:
from pyspark import SparkContext
import json
import datetime

In [2]:
from operator import add

In [3]:
sc = SparkContext("local", "practice_app")
sc

In [4]:
stopwords = set()
with open('datasets/stopwords', "r") as fp:
    for word in fp.readlines():
        stopwords.add(word.strip().lower())
stopwords.add("")
len(stopwords)

128

In [5]:
puncts = ("(", "[", ",", ".", "!", "?", ":", ";", "]", ")")

# Task 1

In [6]:
reviews = sc.textFile('datasets/review.json')
def preprocess(json_txt):
    json_dict = json.loads(json_txt)
    json_dict['date'] = datetime.datetime.strptime(json_dict['date'], "%Y-%m-%d %H:%M:%S")
    text = []
    for c in list(json_dict['text'].lower()):#.replace("\n","")):
        if c not in puncts:
            text.append(c)
    text = "".join(text).split(" ")
    json_dict['text'] = [ word for word in text if word not in stopwords ]
    return json_dict

reviews_json = reviews.map( lambda x : preprocess(x) )

### Ans : The total number of reviews

In [7]:
ans1 = reviews.count()
ans1

1151625

### Ans : The number of reviews in a given year, y 

In [8]:
y = 2017
ans2 = reviews_json.filter( lambda x : x['date'].year == y  ).count()
ans2

209995

### Ans : The number of distinct users who have written the reviews 

In [9]:
ans3 = reviews_json.map( lambda x: (x['user_id'],1) ).reduceByKey(add).count()
ans3

566269

### Ans : Top m users who have the largest number of reviews and its count

In [10]:
m = 10
ans4 = reviews_json \
            .map( lambda x: (x['user_id'],1) ) \
            .reduceByKey(add) \
            .sortBy( lambda x: -1*x[1] ) \
            .take(m)
ans4

[('CxDOIDnH8gp9KXzpBHJYXw', 715),
 ('bLbSNkLggFnqwNNzzq-Ijw', 424),
 ('PKEzKWv_FktMm2mGPjwd0Q', 322),
 ('DK57YibC5ShBmqQl97CKog', 291),
 ('ELcQDlf69kb-ihJfxZyL0A', 288),
 ('U4INQZOPSUaj8hMjLlZ3KA', 276),
 ('QJI9OSEn6ujRCtrX06vs1w', 258),
 ('d_TBs6J3twMy9GChqUEXkg', 253),
 ('hWDybu_KvYLSdEFzGrniTw', 239),
 ('dIIKEfOgo0KqUfGQvGikPg', 216)]

### Top n frequent words in the review text. The words should be in lower cases. The following punctuations “(”, “[”, “,”, “.”, “!”, “?”, “:”, “;”, “]”, “)” and the given stopwords are excluded (1pts)

In [11]:
n = 10
ans5 = reviews_json \
            .flatMap( lambda x: [ (word, 1) for word in x['text'] ] ) \
            .reduceByKey(add) \
            .sortBy( lambda x: -1*x[1] ) \
            .take(n)
ans5

[('food', 556438),
 ('place', 552730),
 ('good', 541824),
 ('great', 485467),
 ('like', 402121),
 ('service', 394193),
 ('time', 390830),
 ('get', 382805),
 ('one', 367582),
 ('would', 349574)]

# Task 2

In [106]:
business_txt = sc.textFile('datasets/business.json')
# def preprocess_business(json_txt):
#     json_dict = json_txt)
#     return json_dict

business_json = business_txt.map( json.loads )
business_json.take(1)

[{'business_id': '1SWheh84yJXfytovILXOAQ',
  'name': 'Arizona Biltmore Golf Club',
  'address': '2818 E Camino Acequia Drive',
  'city': 'Phoenix',
  'state': 'AZ',
  'postal_code': '85016',
  'latitude': 33.5221425,
  'longitude': -112.0184807,
  'stars': 3.0,
  'review_count': 5,
  'is_open': 0,
  'attributes': {'GoodForKids': 'False'},
  'categories': 'Golf, Active Life',
  'hours': None}]

In [124]:
temp = reviews_json.map( lambda x: (x['business_id'], (x['stars'], 1)) ).reduceByKey( lambda a,b: ( a[0]+b[0], a[1]+b[1] ) ).map( lambda x: (x[0], x[1][0]/x[1][1]) ).take(5)
temp

[('abaIvBrlg3QI4FUG1v3bdA', 4.0),
 ('M8a5DRdXl8KMu4bMFLPgQg', 4.25),
 ('1Df5WnLX3DqN6ymlhqznaQ', 4.589041095890411),
 ('DVfCbJhJUDWRlUfrKzaKOA', 1.9056603773584906),
 ('E7eOOXwMC7wwCRywsvNzUA', 3.4814814814814814)]

In [169]:
review_json1 = reviews_json.map( lambda x: (x['business_id'], x['stars']) )
def business_process(js):
    if js['categories'] is None: return []
    return [ (js['business_id'], cat.strip()) for cat in js.get('categories', "").split(',') ]

business_json1 = business_json.flatMap(business_process)

In [170]:
review_json1.take(5)

[('mRUVMJkUGxrByzMQ2MuOpA', 1.0),
 ('LUN6swQYa4xJKaM_UEUOEw', 4.0),
 ('NyLYY8q1-H3hfsTwuwLPCg', 4.0),
 ('6lj2BJ4tJeu7db5asGHQ4w', 5.0),
 ('Mem13A3C202RzT53npn4NA', 5.0)]

In [171]:
business_json1.take(5)

[('1SWheh84yJXfytovILXOAQ', 'Golf'),
 ('1SWheh84yJXfytovILXOAQ', 'Active Life'),
 ('QXAEGFB4oINsVuTFxEYKFQ', 'Specialty Food'),
 ('QXAEGFB4oINsVuTFxEYKFQ', 'Restaurants'),
 ('QXAEGFB4oINsVuTFxEYKFQ', 'Dim Sum')]

In [172]:
business_cat_stars = review_json1.join(business_json1)
business_cat_stars.take(5)

[('8WBHKj2davW6hhZWlUQ7DA', (5.0, 'Performing Arts')),
 ('8WBHKj2davW6hhZWlUQ7DA', (5.0, 'Nightlife')),
 ('8WBHKj2davW6hhZWlUQ7DA', (5.0, 'Music Venues')),
 ('8WBHKj2davW6hhZWlUQ7DA', (5.0, 'Stadiums & Arenas')),
 ('8WBHKj2davW6hhZWlUQ7DA', (5.0, 'Arts & Entertainment'))]

In [177]:
n = 10
business_cat_stars.map(lambda x: (x[1][1], (x[1][0], 1))) \
        .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1])) \
        .map( lambda x: (x[0], x[1][0]/x[1][1]) ) \
        .sortBy( lambda x: -1*x[1] ) \
        .take(10)

[('Calabrian', 5.0),
 ('Storefront Clinics', 5.0),
 ('Safety Equipment', 5.0),
 ('Christmas Markets', 5.0),
 ('Vocal Coach', 5.0),
 ('Bocce Ball', 5.0),
 ('Hearing Aids', 5.0),
 ('Astrologers', 5.0),
 ('Registry Office', 5.0),
 ('Makerspaces', 5.0)]

# Task 3

In [4]:
sc = SparkContext.getOrCreate()

reviews_txt = sc.textFile('datasets/review.json')
reviews_json = reviews_txt.map( json.loads )

In [5]:
reviews_json.take(1)

[{'review_id': '-I5umRTkhw15RqpKMl_o1Q',
  'user_id': '-mA3-1mN4JIEkqOtdbNXCQ',
  'business_id': 'mRUVMJkUGxrByzMQ2MuOpA',
  'stars': 1.0,
  'text': "Walked in around 4 on a Friday afternoon, we sat at a table just off the bar and walked out after 5 min or so. Don't even think they realized we walked in. However everyone at the bar noticed we walked in!!! Service was non existent at best. Not a good way for a new business to start out. Oh well, the location they are at has been about 5 different things over the past several years, so they will just be added to the list. SMDH!!!",
  'date': '2017-12-15 23:27:08'}]

[('M8a5DRdXl8KMu4bMFLPgQg', 20),
 ('1Df5WnLX3DqN6ymlhqznaQ', 73),
 ('DVfCbJhJUDWRlUfrKzaKOA', 106),
 ('E7eOOXwMC7wwCRywsvNzUA', 27),
 ('VoXUt6TIJWyIf_lihbN5xQ', 51),
 ('6eRSdODyj9GS0w6zlhrWkg', 23),
 ('qx6WhZ42eDKmBchZDax4dQ', 173),
 ('WOj8nPos3R9IIf0Pyttd8g', 34),
 ('gc_GGat7TgNCmrgc4nVOgw', 46),
 ('zra20XPGVL9P3i5hMoKjig', 26),
 ('eoyvbnRYQe-z85e8Rc6vAg', 78),
 ('vUfFS5BR8WXsx9r72QMCvg', 14),
 ('vyDW7_CHhDUJshMqaSjj0g', 77),
 ('GyAjuDFTsSLLE1v8NdmZ7g', 23),
 ('OVTZNSkSfbl3gVB9XQIJfw', 246),
 ('CAy7twD5McMiNrkFWOUn7g', 14),
 ('xYy54Y2VTBAG5k7WJcaH4A', 14),
 ('HjJp2tI4Xzs7MKyL31E0DA', 12),
 ('Ls1NqcogI-szYoQ7tLh4Og', 18),
 ('wSBNzVkRzTVlnR95AFvYMg', 24),
 ('kPfhSK0kcUmOTmC5DEjaDw', 42),
 ('EdY6q0CO2mlATpoTFkvHAw', 71),
 ('4v7dr2o5qXsHVGh9tM1KLA', 22),
 ('biSsdqExmdvIKiFuowOadg', 61),
 ('0e2oJ82g4ZfvQzIc8SE9Xw', 26),
 ('83cVpTJNyzv9lJ20vzBgdg', 17),
 ('WnUttoJffplgWaQGR2J2Xw', 22),
 ('yAXWKWRTs4wNy-zrJgoeXQ', 52),
 ('7EZ4Eu7YJ1ltRCC5jXFJrQ', 138),
 ('l9vx-OTdDPU-oHZ17aVxdg', 35),
 ('ES6