# YELP Dataset analysis

In [1]:
# imports
from collections import Counter
import reverse_geocoder
import itertools
import json
import nltk
import os
from tqdm import tqdm

In [2]:
# DEFAULTS

# Folder layout: <cwd>/data for the datasets, <cwd>/data/intermediate for derived files
base_path = os.getcwd()
data_path = base_path + '/data'
intermediate_data_path = data_path + '/intermediate'

# The parent folder comes first because os.mkdir does not create missing parents.
for folder in (data_path, intermediate_data_path):
    try:
        os.mkdir(folder)
    except FileExistsError:
        print("Folder already exists.")
    else:
        print("Folder created.")

Folder already exists.
Folder already exists.


In [3]:
# DATA MODELS


class Business:
    """A business record whose attributes are the keys of the parsed JSON object."""

    def __init__(self, json):
        # The mapping itself (not a copy) becomes the attribute namespace, so
        # every JSON key is readable as an attribute of the instance.
        # Note: inside this method the parameter hides the json module.
        self.__dict__ = json


# Registry filled by the import cell: business_id -> Business
businesses = {}


class Review:
    """A review record whose attributes are the keys of the parsed JSON object."""

    def __init__(self, json):
        # Same approach as Business: adopt the parsed mapping as instance state.
        self.__dict__ = json


# Registry filled by the review loading cell: review_id -> Review
reviews = {}

In [4]:
# import businesses
# The dataset is in JSON-lines format: every line is one business object.
businesses_source = data_path + '/yelp/yelp_academic_dataset_business.json'
with open(businesses_source, encoding="utf8") as businesses_file:
    for line in tqdm(businesses_file.readlines()):
        record = Business(json.loads(line))
        # Index by id so that reviews can be matched to their business later on
        businesses[record.business_id] = record

print(f"# of Businesses: {len(businesses)}")

100%|██████████| 192609/192609 [00:04<00:00, 47892.39it/s]# of Businesses: 192609



In [5]:
# Filter businesses by state.

# Choose the state(s) to filter
STATE_TO_FILTER = 'Illinois'

# state name -> set of the businesses assigned to that state
states = {}


def add_or_update(state, business):
    """Add ``business`` to the set stored for ``state``, creating the set on first use."""
    states.setdefault(state, set()).add(business)

business_list = list(businesses.values())

# Find coordinates by using the reverse_geocoder
coordinates = [(c.latitude, c.longitude) for c in business_list]
res = reverse_geocoder.search(coordinates)

# Results are matched to businesses by position, i.e. this relies on search()
# returning one entry per coordinate in input order.
for business, r in zip(business_list, res):
    state = r['admin1']
    if state == STATE_TO_FILTER:
        add_or_update(state, business)

for s, members in states.items():
    print(s + ": " + str(len(members)))

Loading formatted geocoded file...
Illinois: 1930


In [6]:
# List all businesses of the given states
# .get() keeps the cell runnable when no business matched STATE_TO_FILTER
# (states only gets a key once a matching business has been found).
business_ids = {b.business_id for b in states.get(STATE_TO_FILTER, ())}

# Get the number of businesses to look for reviews for
print("# Businesses to be reviewed: " + str(len(business_ids)))

# Start from an empty collection: without this, re-running the cell after
# changing STATE_TO_FILTER would keep the reviews of the previous selection.
reviews.clear()

# Load all reviews with respect to the given businesses
with open(data_path + '/yelp/yelp_academic_dataset_review.json', encoding="utf8") as reviews_file:
    # Iterate over the file object instead of readlines() so the review file is
    # streamed line by line rather than held in memory as one list. The
    # progress bar then shows a running count instead of a percentage.
    for l in tqdm(reviews_file):
        r = Review(json.loads(l))
        if r.business_id in business_ids:
            reviews[r.review_id] = r

print("# Reviews loaded: " + str(len(reviews)))

# Businesses to be reviewed: 1930
100%|██████████| 6685900/6685900 [01:01<00:00, 108530.29it/s]
# Reviews loaded: 42316


In [7]:
# Persist the filtered data as JSON lines (one object per line).

# Only businesses selected for STATE_TO_FILTER are written: the file is named
# after the state, and the reviews stored next to it are filtered the same way.
# Iterating over `businesses` keeps the order of the source file.
with open(intermediate_data_path + '/' + STATE_TO_FILTER + '_businesses.json', 'w') as businesses_intermediate_file:
    for business_id, business in businesses.items():
        if business_id in business_ids:
            json.dump(business.__dict__, businesses_intermediate_file)
            businesses_intermediate_file.write("\n")

# `with` closes the files even if serialisation fails half-way through.
with open(intermediate_data_path + '/' + STATE_TO_FILTER + '_reviews.json', 'w') as reviews_intermediate_file:
    for review in reviews.values():
        json.dump(review.__dict__, reviews_intermediate_file)
        reviews_intermediate_file.write("\n")

## Collocations:

* down hill -
* great price +
* high quality +
* poor quality -
* huge fan +

## Indicators for negative reviews:
* avoid

In [8]:
def compute_zipf_table(WORDS, sort_parameters=("rank", "ascending"), num_rows=10):
    '''
    Build a Zipf table for WORDS and print its first rows.

    WORDS = list of words (any hashable tokens); ranks start at 1 and are
        assigned by descending frequency
    sort_parameters is 2 tuple: (("rank" | "frequency" | "frequency_times_rank"), ("ascending" | "descending"))
    num_rows: number of rows displayed in table

    Returns the complete list of (word, frequency, rank, frequency*rank)
    tuples, sorted as requested. With invalid sort_parameters only a message
    is printed and the list is returned in rank order.
    '''
    zipf_values = [
        (wort, frequ, rank, frequ * rank)
        for rank, (wort, frequ) in enumerate(Counter(WORDS).most_common(), 1)
    ]

    # Position inside each tuple of the column a sort key refers to
    sort_columns = {"rank": 2, "frequency": 1, "frequency_times_rank": 3}
    sort_key, sort_direction = sort_parameters[0], sort_parameters[1]
    if sort_key not in sort_columns or sort_direction not in ("ascending", "descending"):
        print("Invalid sorting parameter(s)!")
        return zipf_values

    column = sort_columns[sort_key]
    zipf_values.sort(key=lambda values: values[column], reverse=(sort_direction == "descending"))

    print("\n\nZipf’s law\n")
    print("word" + 12 * " " + "frequency" + 5 * " " + "rank" + 9 * " " + "f*r")
    print("-----------------------------------------------")
    for shown, (wort, f, r, f_r) in enumerate(zipf_values):
        if shown >= num_rows:
            break
        # ljust pads short cells and leaves long ones as they are, so every
        # cell is always defined for the current row; str() makes non-string
        # tokens printable as well.
        print(str(wort).ljust(15), str(f).ljust(12), str(r).ljust(12), str(f_r).ljust(12), "\n")
    return zipf_values

def get_words(review_dict):
    '''
    Tokenize the text of every review and return all tokens in one flat list.

    review_dict: mapping whose values have a ``text`` attribute
    Tokens come from nltk.word_tokenize; tqdm reports the progress per review.
    '''
    tokens = []
    for review in tqdm(review_dict.values()):
        tokens.extend(nltk.word_tokenize(review.text))
    return tokens



In [9]:
# Sample for the Zipf analysis: the first 10000 reviews in insertion order
# (the variable name is misspelt and says "thousand", but later cells refer to it).
thounsand_reviews = dict(itertools.islice(reviews.items(), 10000))
        



In [10]:

# Tokenize the sample, then print the 100 rows with the highest rank numbers.
sample_tokens = get_words(thounsand_reviews)
# WORDS holds the full Zipf table (one tuple per distinct token), not the raw tokens
WORDS = compute_zipf_table(sample_tokens, sort_parameters=("rank", "descending"), num_rows=100)

print(len(WORDS))

100%|██████████| 10000/10000 [00:10<00:00, 974.01it/s]


Zipf’s law

word            frequency     rank         f*runtitled:Untitled-1
-----------------------------------------------
.trust          1            34123        34123        

shift/slide     1            34122        34122        

tectonic        1            34121        34121        

'in-between     1            34120        34120        

hardness/thickness/denseness 1            34119        34119        

it.m            1            34118        34118        

ENOUGH          1            34117        34117        

chalkenge       1            34116        34116        

hockey-puck     1            34115        34115        

3300            1            34114        34114        

Medicare        1            34113        34113        

.charge         1            34112        34112        

Garfield        1            34111        34111        

Varies          1            34110        34110        

must-s