# Analytics on Glassdoor Reviews and Yelp Category Data

### Config

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("review_and_category_analytics") \
    .config("spark.executor.memory", '8g') \
    .config('spark.executor.cores', '4') \
    .config('spark.cores.max', '4') \
    .config("spark.driver.memory",'8g') \
    .getOrCreate()

sc = spark.sparkContext

### Read in the dataset

In [2]:
df = sc.textFile("reviews_and_categories.csv")

### Set header as the first row of values in the rdd

In [3]:
header = df.first()

In [4]:
header

'index,review_emp_txt,categories'

### Look at some data

In [5]:
df.take(5)

['index,review_emp_txt,categories',
 '0,,"[\'point of interest\', \'mexican\', \'establishment\', \'food\', \'restaurant\']"',
 '1,,[]',
 '2,,"[\'other\', \'food & beverages\']"',
 '3,"Some franchise owners dock hours. Pros Good discounts on the food. Cons The location where I was working, in North Fresno near Riverpark Mall, was ran by the owner s father who treated the female staff with contempt and derision. Would yell at the staff, in front of the guests, if they didn t exactly follow his formula for making the sandwiches (even when the staff were trying to fulfill the special requests of the guests). He would clock out the closers (with or without their knowledge) before they were done with their tasks, and ask employees to stay an hour or two past the end of their shift, but would not pay them for their time.","[\'lunch\', \'best sandwich\', \'entertainment\', \'restaurants\', \'sub\', \'arizona\', \'quick\', \'social networks\', \'washington\', \'catering reno\', \'establishment

### Look at non-header records

#### The categories are in a list, so I split at the beginning of the list to separate cols (1,2) from col (3), then split again to obtain cols (1,2,3)

In [6]:
data = df.filter(lambda r: r != header) \
        .map(lambda row: (row.split('[')[0], row.split('[')[1:])) \
        .map(lambda x: (x[0].split(',')[0], ''.join(x[0].split(',')[1:]), x[1]))

In [7]:
data.take(5)

[('0',
  '"',
  ['\'point of interest\', \'mexican\', \'establishment\', \'food\', \'restaurant\']"']),
 ('1', '', [']']),
 ('2', '"', ['\'other\', \'food & beverages\']"']),
 ('3',
  '"Some franchise owners dock hours. Pros Good discounts on the food. Cons The location where I was working in North Fresno near Riverpark Mall was ran by the owner s father who treated the female staff with contempt and derision. Would yell at the staff in front of the guests if they didn t exactly follow his formula for making the sandwiches (even when the staff were trying to fulfill the special requests of the guests). He would clock out the closers (with or without their knowledge) before they were done with their tasks and ask employees to stay an hour or two past the end of their shift but would not pay them for their time.""',
  ['\'lunch\', \'best sandwich\', \'entertainment\', \'restaurants\', \'sub\', \'arizona\', \'quick\', \'social networks\', \'washington\', \'catering reno\', \'establishment

In [8]:
data.map(lambda x: x[1]).take(5)

['"',
 '',
 '"',
 '"Some franchise owners dock hours. Pros Good discounts on the food. Cons The location where I was working in North Fresno near Riverpark Mall was ran by the owner s father who treated the female staff with contempt and derision. Would yell at the staff in front of the guests if they didn t exactly follow his formula for making the sandwiches (even when the staff were trying to fulfill the special requests of the guests). He would clock out the closers (with or without their knowledge) before they were done with their tasks and ask employees to stay an hour or two past the end of their shift but would not pay them for their time.""',
 '"']

In [9]:
data.map(lambda x: x[2]).take(5)

[['\'point of interest\', \'mexican\', \'establishment\', \'food\', \'restaurant\']"'],
 [']'],
 ['\'other\', \'food & beverages\']"'],
 ['\'lunch\', \'best sandwich\', \'entertainment\', \'restaurants\', \'sub\', \'arizona\', \'quick\', \'social networks\', \'washington\', \'catering reno\', \'establishment\', \'nevada\', \'restaurant\', \'wraps\', \'qsr\', \'small business\', \'meal takeaway\', \'hospitality\', \'sandwich\', \'franchise\', \'seminars\', \'deli\', \'point of interest\', \'sandwiches\', \'port\', \'other\', \'food\', \'party trays reno\', \'service\', \'entrepeneur\', \'franchises\', \'fast food\', \'grillers\', \'griller\', \'salad\', \'management\', \'businesses\', \'self employed\', \'wrap\', \'submarine\', \'delis\', \'lake tahoe\', \'boss\', \'salads\', \'trade shows\', \'eating places\', \'franchising\', \'reno\', \'subs\', \'phoenix\']"'],
 ['\'french\', \'event space\', \'wine, full bar & cocktails\', \'credit cards\', \'drinks\', \'price\', \'outdoor seating\'

### Return a count of the total number of records in this data

In [10]:
data.count()

1305

### Store only the records with non-empty *review_emp_txt*

In [11]:
# filter out records with '"' (from rows w category data) and '' (from rows w null category data)
data_has_review = data.filter(lambda r: r[1] != '"') \
                      .filter(lambda r: r[1] != '')

### Return a count of these non-empty records

In [12]:
data_has_review.count()

305

In [13]:
data_has_review.take(3)

[('3',
  '"Some franchise owners dock hours. Pros Good discounts on the food. Cons The location where I was working in North Fresno near Riverpark Mall was ran by the owner s father who treated the female staff with contempt and derision. Would yell at the staff in front of the guests if they didn t exactly follow his formula for making the sandwiches (even when the staff were trying to fulfill the special requests of the guests). He would clock out the closers (with or without their knowledge) before they were done with their tasks and ask employees to stay an hour or two past the end of their shift but would not pay them for their time.""',
  ['\'lunch\', \'best sandwich\', \'entertainment\', \'restaurants\', \'sub\', \'arizona\', \'quick\', \'social networks\', \'washington\', \'catering reno\', \'establishment\', \'nevada\', \'restaurant\', \'wraps\', \'qsr\', \'small business\', \'meal takeaway\', \'hospitality\', \'sandwich\', \'franchise\', \'seminars\', \'deli\', \'point of int

### Return how many reviews contain the word "awesome"

In [14]:
awesome_records = data_has_review.filter(lambda r: 'awesome' in r[1])

In [15]:
awesome_records.count()

10

### Take a look at those reviews

In [16]:
awesome_records.collect()

[('280',
  '"Manager Pros Great environment awesome owners! I was happy to come to work every day and face any challenges presented. It was a pleasant environment with a lot of opportunity to advance if you worked hard. Cons I had performed the role of a manager long before I was given the raise to match. There is a high turn over rate and it s hard to find good team members who want to work hard.""',
  ['\'entertainment\', \'credit cards\', \'restaurants\', \'green smoothie\', \'colleges and universities\', \'smoothies and juice bars\', \'menus\', \'food, beverages & tobacco\', \'1\', \'juice bar\', \'establishment\', \'las vegas\', \'meal takeaway\', \'price\', \'juice bars & smoothies\', \'hospitality\', \'tallahassee\', \'point of interest\', \'shopping\', \'sandwiches\', \'health foods\', \'wheat grass\', \'other\', \'food\', \'smoothies\', \'hampton university\', \'franchises\', \'$$\', \'restaurant chains\', \'blimey limey\', \'food and beverages\', \'dinner, lunch & more\', \'e

### To eliminate case sensitivity, I've lowercased the reviews and run the same filter to compare results.

In [17]:
awesome_records_lower = data_has_review.map(lambda r: (r[0], r[1].lower(), r[2])) \
                .filter(lambda r: 'awesome' in r[1]) \

In [18]:
awesome_records_lower.collect()

[('280',
  '"manager pros great environment awesome owners! i was happy to come to work every day and face any challenges presented. it was a pleasant environment with a lot of opportunity to advance if you worked hard. cons i had performed the role of a manager long before i was given the raise to match. there is a high turn over rate and it s hard to find good team members who want to work hard.""',
  ['\'entertainment\', \'credit cards\', \'restaurants\', \'green smoothie\', \'colleges and universities\', \'smoothies and juice bars\', \'menus\', \'food, beverages & tobacco\', \'1\', \'juice bar\', \'establishment\', \'las vegas\', \'meal takeaway\', \'price\', \'juice bars & smoothies\', \'hospitality\', \'tallahassee\', \'point of interest\', \'shopping\', \'sandwiches\', \'health foods\', \'wheat grass\', \'other\', \'food\', \'smoothies\', \'hampton university\', \'franchises\', \'$$\', \'restaurant chains\', \'blimey limey\', \'food and beverages\', \'dinner, lunch & more\', \'e

### There are two additional reviews that contain the word awesome after changing to lowercase

In [19]:
awesome_records_lower.count()

12

### Preprocess the categories in order to return frequencies.

#### Characters [] ' " are stripped, spaces before and after words are trimmed, lowercase, blank categories are removed.

In [20]:
cats=data.map(lambda r: r[2]) 

In [21]:
cats.take(3)

[['\'point of interest\', \'mexican\', \'establishment\', \'food\', \'restaurant\']"'],
 [']'],
 ['\'other\', \'food & beverages\']"']]

In [22]:
# Characters [] ' " are stripped, 
# spaces before and after words are trimmed, 
# lowercase, blank categories are removed.
cats_flat = cats.map(lambda row: [token.replace('[','') \
                                .replace(']','') \
                                .replace('\\','') \
                                .replace("'",'') \
                                .replace('"','') \
                                .strip() \
                                .lower() for token in row]) \
                                .flatMap(lambda x: x) \
                                .filter(lambda x: x != '') \
                                .flatMap(lambda x: x.split(',')) \
                                .map(lambda x: (x,1)) \
                                .reduceByKey(lambda x,y:x+y) \
                                .map(lambda x:(x[1],x[0])) \
                                .sortByKey(False) 

### Look at some of the categories

In [23]:
cats_flat.take(10)

[(705, ' establishment'),
 (701, ' food'),
 (671, ' point of interest'),
 (643, ' restaurant'),
 (473, ' price'),
 (471, ' other'),
 (311, ' menus'),
 (254, ' eating places'),
 (253, ' dining options'),
 (232, ' credit cards')]