In [40]:
import pandas as pd
import numpy as np
from collections import Counter 
from operator import itemgetter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [4]:
review = pd.read_csv('yelp-dataset/yelp-dataset/yelp_review.csv')
business = pd.read_csv('yelp-dataset/yelp-dataset/yelp_business.csv')

In [5]:
# extract restaurant's business id 
business_rest = business[business['categories'].str.lower().str.contains('restaurants')]
bid = np.array(business_rest.business_id)

In [6]:
# keep reviews for restaurants only
review_rest = review[review['business_id'].isin(bid)].reset_index(drop=True)

In [24]:
# subsample
random_indx = np.random.permutation(review_rest.shape[0])[:100000]
review_rest = review_rest.iloc[random_indx,:]

In [25]:
category_rest = {business_rest.iloc[i,:]['business_id']: business_rest.iloc[i,:]['categories'].lower() for i in range(business_rest.shape[0])}

In [26]:
# remove restaurant and restaurants in category
categories = dict()
removelist = ['restaurant','restaurants']
for key,value in category_rest.items():
    value_new = value.split(';')
    value_new = [''.join(t) for t in value_new if t not in removelist]
    categories[key] = value_new

In [27]:
rid = review_rest.review_id
bid = review_rest.business_id

In [28]:
print(len(rid))
print(len(bid))

100000
100000


In [29]:
review_categories = []
for b in bid:
    cat = categories.get(b)
    if cat is not None:
        review_categories.append(cat)
    else:
        review_categories.append(['NotFound'])

In [30]:
nocatecory_idx = []
for idx,r in enumerate(review_categories):
    if 'NotFound' in r:
        nocatecory_idx.append(idx)

In [31]:
print(len(review_categories))
print(len(rid))

100000
100000


In [32]:
mlb = MultiLabelBinarizer()
multilabel = mlb.fit_transform(np.array(review_categories)) # shape = (3219106, 697)

In [33]:
label_name = mlb.classes_

In [34]:
multilabel.shape

(100000, 552)

### Review - tfidf

In [35]:
review_rest_ = review_rest['text']

In [43]:
tfidf = TfidfVectorizer(stop_words='english',max_features=1000)

In [44]:
review_tfidf = tfidf.fit_transform(review_rest_)

In [42]:
review_tfidf.shape

(100000, 38)

In [39]:
review_tfidf.toarray()

array([[ 0.        ,  0.        ,  0.        , ...,  0.33522609,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.71976863],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.78145646,  0.        ,  0.        , ...,  0.35588582,
         0.        ,  0.38672422],
       [ 0.        ,  0.        ,  0.29505608, ...,  0.        ,
         0.26291976,  0.        ],
       [ 0.        ,  0.38511733,  0.        , ...,  0.39575974,
         0.        ,  0.        ]])