In [62]:
import pandas as pd
import numpy as np
from collections import Counter 
from operator import itemgetter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [2]:
# Raw Yelp tables, read from paths relative to the notebook's working directory.
# `business` is read here with one row per business, `review` with one row per review
# (both are later joined on their `business_id` column).
business = pd.read_csv('yelp-dataset/yelp-dataset/yelp_business.csv')
review = pd.read_csv('yelp-dataset/yelp-dataset/yelp_review.csv')

In [3]:
# Keep only the businesses whose `categories` string mentions "restaurants"
# (case-insensitive, plain substring match).
# `na=False` counts a missing `categories` value as "not a restaurant"; without it the
# mask contains NaN and boolean indexing fails. `regex=False` avoids compiling a pattern
# for what is a literal substring.
is_restaurant = (
    business['categories']
    .str.lower()
    .str.contains('restaurants', regex=False, na=False)
)
business_rest = business[is_restaurant]

In [4]:
% time
category = [business_rest.iloc[i,:]['categories'].lower() for i in range(business_rest.shape[0])]

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [117]:
# Business IDs of all restaurants.
bid = business_rest['business_id'].values
# Drop every review that was not written about a restaurant.
# `reset_index()` renumbers the rows from 0 and keeps the old row labels in an
# `index` column.
is_restaurant_review = review['business_id'].isin(bid)
review_rest = review.loc[is_restaurant_review].reset_index()

In [10]:
% time
categories = []
removelist = ['restaurant','restaurants']
for cat in category:
    temp = cat.split(';')
    temp = [t for t in temp if t not in removelist]
    categories.append(temp)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.68 µs


In [13]:
# A business may carry several category labels, so map each business ID to its
# full list of labels (`categories` is aligned row-for-row with `business_rest`).
biz_cat_map = dict(zip(business_rest['business_id'], categories))

# Flatten all label lists into one list and count how often each label occurs.
cot = []
for labels in categories:
    cot.extend(labels)
cot_ = Counter(cot)

# The ten most frequent labels as (label, count) pairs, most frequent first.
cat_top10 = sorted(cot_.items(), key=lambda pair: pair[1], reverse=True)[:10]

In [122]:
# Category labels for every row of `review_rest`, in row order.
# A review whose business is missing from `biz_cat_map`, or whose label list is empty,
# gets the sentinel ['NotFound'] so it can be filtered out afterwards.
#
# Notes on the implementation:
# - The loop variable is deliberately not called `category`: that name holds the list
#   of category strings used to build `categories`, and rebinding it here would break
#   a re-run of that cell.
# - Iterating the `business_id` column directly avoids a per-row `.loc` lookup.
# - The result is built as an explicit 1-D object array (one Python list per element).
#   Calling `np.array` on ragged lists raises on recent NumPy versions and would
#   silently produce a 2-D array if all lists happened to have the same length.
review_category_map = pd.Series(
    [biz_cat_map.get(biz_id) or ['NotFound'] for biz_id in review_rest['business_id']],
    dtype=object,
).values

In [124]:
# Positions (within `review_category_map`) of the reviews that carry the
# 'NotFound' sentinel, i.e. reviews without any usable category label.
nocat = [
    pos
    for pos, labels in enumerate(review_category_map)
    if 'NotFound' in labels
]

In [126]:
# Boolean mask over the rows of `review_category_map`: True where the review has
# real category labels, False at the positions listed in `nocat`.
# The integer dtype is forced because an empty `nocat` would otherwise become a
# float64 array, which cannot be used as an index.
nocat = np.array(nocat, dtype=np.intp)
withcate = np.ones(len(review_category_map), dtype=bool)
withcate[nocat] = False
# NOTE: this overwrites `review_category_map` with its filtered version, so the cell
# is not idempotent; rebuild `review_category_map` and `nocat` before re-running it.
review_category_map = review_category_map[withcate] # len: 3219106

In [127]:
# Inspect the filtered label lists: one list of category labels per remaining review.
review_category_map

array([['diners', 'food', 'delis'], ['canadian (new)', 'italian'],
       ['specialty food', 'food', 'sandwiches', 'burgers', 'delis'], ...,
       ['chinese'], ['korean'],
       ['seafood', 'nightlife', 'steakhouses', 'bars']], dtype=object)

In [128]:
# Text of every review that kept its category labels.
# `withcate` was built row-for-row against `review_rest` (the restaurant-only reviews),
# so the mask must be applied to `review_rest`; applying it to the unfiltered `review`
# table selects the wrong rows or fails on a length mismatch.
assert len(withcate) == len(review_rest), "mask and review_rest are out of sync"
review_text = review_rest.loc[withcate, 'text'] # shape = (3219106,)

In [129]:
# Turn the per-review label lists into a binary indicator matrix:
# one row per review, one column per distinct label.
mlb = MultiLabelBinarizer()
label_lists = np.array(review_category_map)
multilabel = mlb.fit_transform(label_lists) # shape = (3219106, 697)

In [130]:
# Label names learned by the fitted binarizer `mlb`.
# NOTE(review): presumably in the same order as the columns of `multilabel` — confirm
# against the MultiLabelBinarizer documentation.
label_name = mlb.classes_

In [132]:
# Inspect the label vocabulary learned by the binarizer.
label_name

array(['& probates', 'acai bowls', 'accessories', 'accountants',
       'active life', 'acupuncture', 'adult', 'adult education',
       'adult entertainment', 'advertising', 'afghan', 'african',
       'air duct cleaning', 'aircraft repairs', 'airport lounges',
       'airport shuttles', 'airport terminals', 'airports', 'airsoft',
       'alsatian', 'amateur sports teams', 'american (new)',
       'american (traditional)', 'amusement parks', 'animal shelters',
       'antiques', 'apartments', 'appliances', 'appliances & repair',
       'aquarium services', 'aquariums', 'arabian', 'arcades', 'argentine',
       'armenian', 'art classes', 'art galleries', 'art schools',
       'arts & crafts', 'arts & entertainment', 'asian fusion',
       'australian', 'austrian', 'auto customization', 'auto detailing',
       'auto glass services', 'auto insurance', 'auto parts & supplies',
       'auto repair', 'auto upholstery', 'automotive',
       'baby gear & furniture', 'baden', 'bagels', 'bague