# YELP Dataset analysis

In [1]:
# imports
from collections import Counter
import itertools
import json
import nltk
import os
from tqdm import tqdm

In [2]:
# DEFAULTS

# define folder structure
base_path = os.getcwd()
data_path = base_path + '/data'
intermediate_data_path = data_path + '/intermediate'

# Create each folder in turn (parent before child) and report whether it
# had to be created or was already present.
for folder in (data_path, intermediate_data_path):
    try:
        os.mkdir(folder)
    except FileExistsError:
        print("Folder already exists.")
    else:
        print("Folder created.")

Folder already exists.
Folder already exists.


In [3]:
# DATA MODELS

class _JsonRecord:
    """Shared base for the dataset record types.

    The decoded JSON object becomes the instance's attribute dictionary, so
    every JSON key is readable as an attribute (e.g. ``record.business_id``).
    The dict is adopted as-is, not copied: later changes to the dict passed
    in are visible on the instance and vice versa.
    """

    def __init__(self, json):
        # NOTE: the parameter name shadows the `json` module inside this
        # method only; it is kept so keyword callers keep working.
        self.__dict__ = json

    def __repr__(self):
        # List only the field names: values such as review texts can be
        # very long and would flood the notebook output.
        return "{}({})".format(type(self).__name__, ", ".join(map(str, self.__dict__)))


# Business
class Business(_JsonRecord):
    """One record of the business dataset file."""

businesses = dict()  # filled by the loading cell, keyed by business_id

# Checkin
class Checkin(_JsonRecord):
    """One record of the checkin dataset file."""

checkins = []  # filled by the loading cell, in file order

# Review
class Review(_JsonRecord):
    """One record of the review dataset file."""

reviews = dict()  # filled by the loading cell, keyed by review_id

# Tip
class Tip(_JsonRecord):
    """One record of the tip dataset file."""

tips = []  # filled by the loading cell, in file order

# User
class User(_JsonRecord):
    """One record of the user dataset file."""

users = dict()  # filled by the loading cell, keyed by user_id

In [4]:
# import files
def _iter_json_records(filename):
    """Yield the parsed JSON object from each non-blank line of a dataset file.

    `filename` is resolved inside the `yelp` folder below `data_path`.
    The file is streamed line by line instead of being read with
    `readlines()`, so only one line is held in memory at a time. Because
    the number of lines is not known up front, the progress bar shows a
    running count rather than a percentage.
    """
    with open(data_path + '/yelp/' + filename, encoding="utf8") as dataset_file:
        for line in tqdm(dataset_file, desc=filename):
            if line.strip():  # skip blank lines, which json.loads would reject
                yield json.loads(line)


for record in _iter_json_records('yelp_academic_dataset_business.json'):
    business = Business(record)
    businesses[business.business_id] = business

# The list containers are emptied first so that re-running this cell does
# not append a second copy of every record.
checkins.clear()
for record in _iter_json_records('yelp_academic_dataset_checkin.json'):
    checkins.append(Checkin(record))

for record in _iter_json_records('yelp_academic_dataset_review.json'):
    review = Review(record)
    reviews[review.review_id] = review

tips.clear()
for record in _iter_json_records('yelp_academic_dataset_tip.json'):
    tips.append(Tip(record))

for record in _iter_json_records('yelp_academic_dataset_user.json'):
    user = User(record)
    users[user.user_id] = user

100%|████████████████████████████████████████████████████████████████████| 192609/192609 [00:02<00:00, 68128.90it/s]
100%|███████████████████████████████████████████████████████████████████| 161950/161950 [00:01<00:00, 127319.34it/s]


KeyboardInterrupt: 

In [None]:
# Report how many records of each kind were loaded, one line per collection.
for label, collection in (
    ("# of Businesses: ", businesses),
    ("# of Checkins: ", checkins),
    ("# of Reviews: ", reviews),
    ("# of Tips: ", tips),
    ("# of Users: ", users),
):
    print(label + str(len(collection)))

## Collocations (`+` = positive sentiment, `-` = negative sentiment):

* down hill -
* great price +
* high quality +
* poor quality -
* huge fan +

## Indicators for negative reviews:
* avoid

In [None]:
def compute_zipf_table(WORDS, sort_parameters=("rank", "ascending"), num_rows=10):
    '''
    Build a Zipf's-law table for a list of tokens and print its first rows.

    WORDS: list of tokens (any hashable values; they are shown via str()).
    sort_parameters: 2-tuple
        (("rank" | "frequency" | "frequency_times_rank"), ("ascending" | "descending"))
        selecting the column to sort by and the direction.
    num_rows: number of rows displayed in the printed table.

    Returns a list with one (token, frequency, rank, frequency*rank) tuple per
    distinct token, where rank 1 is the most frequent token. The list is
    sorted as requested. If sort_parameters is invalid, a message is printed
    instead of the table and the list is returned in rank order.
    '''
    # Counter.most_common() orders by descending count, so enumerating from 1
    # yields the rank directly.
    zipf_values = [
        (word, frequency, rank, frequency * rank)
        for rank, (word, frequency) in enumerate(Counter(WORDS).most_common(), 1)
    ]

    # Index of each sortable column inside a table row.
    sort_columns = {"rank": 2, "frequency": 1, "frequency_times_rank": 3}
    sort_field, sort_direction = sort_parameters[0], sort_parameters[1]

    # Tuple membership compares with ==, so unhashable values are rejected
    # cleanly instead of raising.
    if sort_field not in tuple(sort_columns) or sort_direction not in ("ascending", "descending"):
        print("Invalid sorting parameter(s)!")
        return zipf_values

    column = sort_columns[sort_field]
    zipf_values.sort(key=lambda row: row[column], reverse=(sort_direction == "descending"))

    print("\n\nZipf’s law\n")
    print("word" + 12 * " " + "frequency" + 5 * " " + "rank" + 9 * " " + "f*r")
    print("-----------------------------------------------")
    for shown, (word, frequency, rank, product) in enumerate(zipf_values):
        if shown >= num_rows:
            break
        # ljust pads short values and leaves long ones untouched, so every
        # column is always defined regardless of its width.
        print(
            str(word).ljust(15),
            str(frequency).ljust(12),
            str(rank).ljust(12),
            str(product).ljust(12),
            "\n",
        )
    return zipf_values

def get_words(review_dict):
    """Tokenize the text of every review and return all tokens in one flat list.

    review_dict: mapping whose values expose a `text` attribute.
    Tokens appear in the mapping's iteration order; a progress bar tracks
    the reviews processed.
    """
    tokens = []
    for review in tqdm(review_dict.values()):
        tokens.extend(nltk.word_tokenize(review.text))
    return tokens



In [None]:
# Subset of the first 10,000 reviews in insertion order. islice stops after
# 10,000 items instead of enumerating every review in the dataset.
# The (misspelled) variable name is kept because a later cell reads it.
thounsand_reviews = dict(itertools.islice(reviews.items(), 10000))
        



In [None]:

# Build the Zipf table for the review subset, sorted with the highest rank
# number (least frequent tokens) first, and print its first 100 rows.
WORDS = compute_zipf_table(
    get_words(thounsand_reviews),
    sort_parameters=("rank", "descending"),
    num_rows=100,
)

# The table has one row per distinct token.
print(len(WORDS))