In [1]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

def process_reviews():

    # read file in batches with pandas
    batch_size = 100000
    total_minibatches = 60
    
    # sets to hold the users and businesses
    users = set()
    businesses = set()
    user_review_pairs = set()

    # list to hold different ratings
    ratings = [0,0,0,0,0]

    # defaultdict to store ratings by year
    ratings_by_year = defaultdict(list)
    ratings_by_month = defaultdict(list)
    
    count=0
    # process dataset in minibatches
    for batch in pd.read_json('yelp_academic_dataset_review.json', lines=True, chunksize=batch_size):

        # progress
        print("Processing minibatch ", count+1, "/", total_minibatches)

        # iterate through current minibatch to collect data
        for index, row in batch.iterrows():

            # get current user and business for review
            curr_user = row['user_id']
            curr_business = row['business_id']
            curr_rating = row['stars']

            # split date into list of year, month, date and add rating to current year ratings
            review_date = str(row['date']).split('-')
            ratings_by_year[review_date[0]].append(curr_rating) 

            # process months < 10 to be single digit
            if review_date[1][0] == '1':
                review_month = review_date[1]
            else:
                review_month = review_date[1][1]

            # append to ratings for this month
            ratings_by_month[review_month].append(curr_rating)

            # add current user and business to sets
            users.add(curr_user)
            businesses.add(curr_business)

            # add to pairs of user - business review
            user_review_pairs.add((curr_user, curr_business))

            # increment number of ratings for these stars
            ratings[curr_rating - 1] += 1

        count+=1

    # Print statistics
    print("\nTotal users: ", len(users))
    print("Total businesses: ", len(businesses))
    print("Total reviews: ", len(user_review_pairs), "\n")
    
    # plots -- uncomment for statistics and plots
    #plot_ratings(ratings)
    #plot_ratings_by_year(ratings_by_year)
    #plot_ratings_by_month(ratings_by_month)

    return users, businesses, user_review_pairs


In [2]:
users, businesses, user_review_pairs = process_reviews()

Processing minibatch  1 / 60
Processing minibatch  2 / 60
Processing minibatch  3 / 60
Processing minibatch  4 / 60
Processing minibatch  5 / 60
Processing minibatch  6 / 60
Processing minibatch  7 / 60
Processing minibatch  8 / 60
Processing minibatch  9 / 60
Processing minibatch  10 / 60
Processing minibatch  11 / 60
Processing minibatch  12 / 60
Processing minibatch  13 / 60
Processing minibatch  14 / 60
Processing minibatch  15 / 60
Processing minibatch  16 / 60
Processing minibatch  17 / 60
Processing minibatch  18 / 60
Processing minibatch  19 / 60
Processing minibatch  20 / 60
Processing minibatch  21 / 60
Processing minibatch  22 / 60
Processing minibatch  23 / 60
Processing minibatch  24 / 60
Processing minibatch  25 / 60
Processing minibatch  26 / 60
Processing minibatch  27 / 60
Processing minibatch  28 / 60
Processing minibatch  29 / 60
Processing minibatch  30 / 60
Processing minibatch  31 / 60
Processing minibatch  32 / 60
Processing minibatch  33 / 60
Processing minibatc

In [4]:
def plot_ratings_by_month(ratings):

    # y-axis for plot to hold average rating per month
    y = []

    # iterate for all months 1 - 12
    for i in range(1, 13):

        # get count for reviews for current month
        count = len(ratings[str(i)])

        # calculate average rating for this month
        avg_rating = sum(ratings[str(i)]) / count

        # append to list
        y.append(avg_rating)

        # print statistic
        print("Number of reviews for month %d: %d" % (i, count))

    # arrange axes for graph
    x = np.arange(len(y))

    # plot graph and labels
    plt.plot(x, y)
    plt.xticks(x, ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'))
    plt.ylim(1, 5)
    plt.xlabel('Month')
    plt.ylabel('Average rating')
    plt.title('Average rating per month')
    plt.show()

In [5]:
def plot_ratings_by_year(ratings):

    # y-axis for plot to hold average rating per year
    y = []

    # iterate for all years from 2004 - 2018
    for i in range(2004, 2019):

        # get count of reviews for current year
        count = len(ratings[str(i)])

        # calculate average rating for this year
        avg_rating = sum(ratings[str(i)]) / count

        # append to list
        y.append(avg_rating)

        # print statistic
        print("Number of reviews for %d: %d" % (i, count))

    # arrange axes for graph
    x = np.arange(len(y))

    # plot graph and labels
    plt.plot(x, y)
    plt.xticks(x, list(range(2004,2019)))
    plt.ylim(1, 5)
    plt.xlabel('Year')
    plt.ylabel('Average rating')
    plt.title('Average rating per year')
    plt.show()


In [6]:
def plot_ratings(ratings):

    # print statistics
    for i in range(len(ratings)):
        print("Number of %d star ratings: %d" % (i+1, ratings[i]))

    # arrange axes for bar plot
    x = np.arange(5)

    # plot bar and label
    plt.bar(x, ratings, width=0.35)
    plt.xticks(x, ('1.0', '2.0', '3.0', '4.0', '5.0'))
    plt.xlabel('Ratings')
    plt.ylabel('Count of each rating')
    plt.title('Number of star ratings for all reviews')

    # Display plot of ratings
    plt.show()


In [3]:
len(users)

1518169

In [21]:
batch_size = 100000
count = 0
total_minibatches = 16

for batch in pd.read_json('yelp_academic_dataset_user.json', lines=True, chunksize=batch_size):
    # progress
    print("Processing minibatch ", count+1, "/", total_minibatches)
    userReviewCount = defaultdict(int)
    
    # iterate through current minibatch to collect data
    for index, row in batch.iterrows():
        curr_user = row['user_id']
        review_count = row['review_count']
        userReviewCount[curr_user] = review_count
    count += 1
            
            

            
            

Processing minibatch  1 / 16
Processing minibatch  2 / 16
Processing minibatch  3 / 16
Processing minibatch  4 / 16
Processing minibatch  5 / 16
Processing minibatch  6 / 16
Processing minibatch  7 / 16
Processing minibatch  8 / 16
Processing minibatch  9 / 16
Processing minibatch  10 / 16
Processing minibatch  11 / 16
Processing minibatch  12 / 16
Processing minibatch  13 / 16
Processing minibatch  14 / 16
Processing minibatch  15 / 16
Processing minibatch  16 / 16


In [9]:
len(userReviewCount)

18169

In [22]:
count = [(userReviewCount[d],d) for d in userReviewCount]

In [23]:
count.sort()
count.reverse()
print(count)

[(4175, 'wZPizeBxMAyOSl0M0zuCjg'), (2462, 'eSlOI3GhroEtcbaD_nFXJQ'), (1304, 'lMY8NBPyzlPbbu-KBYfD9A'), (1236, 'Kbl3Pycg1SkH9pjch531UQ'), (1101, 'hwSSM8Jn29dXRW4ym-roDQ'), (1065, 'wkrFqNxwTEEkG3mJV88Izw'), (1057, 'msOVLjTebf51njN6n22CiQ'), (1031, '_eJ9pXiPKoObEWpI2MmlqA'), (1029, 'WfePuaS_iXGNGU6bfDe3JQ'), (1015, '5DgFmyjW6hkBtXtTMKl4tA'), (923, 'eSU_HjKOA3Ppt8oS07oYkw'), (861, '03RJcIDcNwH4WQmqjIvwIw'), (851, '2-XLTj0Cfd81_XlmHo02Aw'), (833, 'KFZISl5mOUJ6bu3qCUrO-g'), (778, 'qy8DP0GZ4KIsNdnepjSLZQ'), (760, 's_rK7Dda29Gy6clJ1TFNmQ'), (709, 'e8ctA42_X95wLlrf09YZQg'), (685, 'PadmV2GEoA6mWpQUpPh7Ig'), (669, 'WJOjqOmrM2jJxwrlqHIK9w'), (643, 'dByAcisOdQIDEYufsQwKlg'), (634, 'vWZbYMFteqB85HlMCA5GMw'), (616, 'BxNBl3VrzLBAkwpJ3EUgXg'), (608, 'pFRE2mNCQvx9DUU542c6Dw'), (597, 'RYjt1X8Plwayx0cf6aLobA'), (582, 'Y1k3ofmktp644vah_TTs3g'), (572, 'IeV1IcxMzPWji8ZYXDzmUw'), (547, 'cDz4PbqLBkn2FLqMtNaXAg'), (546, 'uIB8qFzG5E3PUGfiCiUmqg'), (528, 'gbjJTvt9-vT9Fdpg00lnRg'), (515, 'IMD57wrCtAPbvBSAlI2nkA'),

In [45]:
USZIP = set()
for i in range(0,100000):
    USZIP.add(str(i))

In [146]:
batch_size = 100000
count = 0
total_minibatches = 2

for batch in pd.read_json('yelp_academic_dataset_business.json', lines=True, chunksize=batch_size):
    # progress
    print("Processing minibatch ", count+1, "/", total_minibatches)
    USBUS = []
    
    # iterate through current minibatch to collect data
    for index, row in batch.iterrows():
        curr_business = row['business_id']
        postal_code = row['postal_code']
        review_count = row['review_count']
        categories = row['categories']
        if postal_code in USZIP:
            USBUS.append(row)
    count += 1

Processing minibatch 


KeyboardInterrupt



In [75]:
USFOOD = []
for d in USBUS:
    if d[3]: 
        for w in d[3].lower().split():
            if 'restaurants' in w or 'food' in w:
                USFOOD.append(d)
                break
print(len(USFOOD))

21294


In [76]:
USBUSID = set()
for d in USFOOD:
    USBUSID.add(d[0])
len(USBUSID)

21294

In [148]:
print(USBUSID)

{'gvCW53iarF1eG030zSKuRA', 'wShT0KXCW0ZAnNKdJr_1pg', 'IZFODW0fifph9urOnl6pGQ', 'brMkNXBaa2aZWMTjDV-9ag', 'eU5Pe5xTC2BFnRDVhrfaog', 'VfH8cF7AOvvQ7hq6hcTj7Q', '5Hrv7KhPD0ALju_9DAuZJg', 'I_aIzz-3iKA3o7G6S4A30w', 'l1UfMEHHQni1ewUY5xe_QA', '-N94ldMGlQw_SHdfm2pi_A', 'FWBWBnM2zjycBI7QrayklA', 'R61ohx9M8sw_gHsPoDCgyw', '6TSpAuUdmo2UEn9emgd8xg', 'LvCt5YHhTKsqH2yI61CNyQ', '8u6wAMmTMdJskMGLYo-5Iw', 'zKeHgMw_yYcTuwxM4NDsbA', 'ZQCu2fgGoatxA6aAm82axQ', 'e9gaoUQEws5tmQROZodZMg', '8mx_TpMhpFLdQi_K5aPdOg', 'cO3m54wXVDJW8ryy_2wpBQ', 'dx034iwBKX3xkilKwqXxpw', '74Q5s1J1xknMX7TAkIPf8w', 'shNT-4EtlH8sHg5tIrfkjA', 'xzepzXckoEc_QwYqlcfDqQ', 'I-5qHCVwT7k_KH67-YOx3A', 'Ez1U0wMD7Wws2BPAYfW3yQ', 'l4x5FFzfiCFrGN0S-uZbBQ', 'FmjU2ZrFQGP7ecy7ZAt3Lw', 'NEZCr0zLIPNYWNF57KsJJQ', 'aq-1jcbOg1yJk98dxntSzQ', 'KzLjbXEDH1e1WrfzrrzZKw', 'iHlgAGn8kL1ZWfSgqxNWVg', 'ZiOvvKr0gwp3qH7Bu3YyTg', 'CaF9FC8GEBfDDbptPgkgBw', 'gWY94C5EgNClUOwqJXrBGg', 'QvC9BTeewBxMgdUadFglKw', 'FirWX-Ep5203TsdiGgShKg', 'nb3v4cQdZAYfbcw6eAD4Dg', 'hVcnr0TMi5

In [77]:
USUSERS = set()
USPAIRS = []
USINDEX = []
batch_size = 100000
count = 0
total_minibatches = 60

for batch in pd.read_json('yelp_academic_dataset_review.json', lines=True, chunksize=batch_size):
    # progress
    print("Processing minibatch ", count+1, "/", total_minibatches)
    
    # iterate through current minibatch to collect data
    for index, row in batch.iterrows():
        curr_user = row['user_id']
        curr_business = row['business_id']
        curr_rating = row['stars']
        
        if curr_business in USBUSID:
            USUSERS.add(curr_user)
            USPAIRS.append((curr_user,curr_business,curr_rating))
            USINDEX.append((count - 1)*batch_size + index)
    count += 1


Processing minibatch  1 / 60
Processing minibatch  2 / 60
Processing minibatch  3 / 60
Processing minibatch  4 / 60
Processing minibatch  5 / 60
Processing minibatch  6 / 60
Processing minibatch  7 / 60
Processing minibatch  8 / 60
Processing minibatch  9 / 60
Processing minibatch  10 / 60
Processing minibatch  11 / 60
Processing minibatch  12 / 60
Processing minibatch  13 / 60
Processing minibatch  14 / 60
Processing minibatch  15 / 60
Processing minibatch  16 / 60
Processing minibatch  17 / 60
Processing minibatch  18 / 60
Processing minibatch  19 / 60
Processing minibatch  20 / 60
Processing minibatch  21 / 60
Processing minibatch  22 / 60
Processing minibatch  23 / 60
Processing minibatch  24 / 60
Processing minibatch  25 / 60
Processing minibatch  26 / 60
Processing minibatch  27 / 60
Processing minibatch  28 / 60
Processing minibatch  29 / 60
Processing minibatch  30 / 60
Processing minibatch  31 / 60
Processing minibatch  32 / 60
Processing minibatch  33 / 60
Processing minibatc

In [82]:
import numpy as np
ratings = np.zeros((len(USUSERS),len(USFOOD)))

MemoryError: 

In [None]:
import numpy as np

class MF():

    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """

        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.P.dot(self.Q.T)

In [83]:
new = user_review_pairs[:100000]

TypeError: 'set' object is not subscriptable

In [103]:
batch_size = 100000
new_pair = set()
new_user = set()
new_business = set()
review_data = []
count = 0
for batch in pd.read_json('yelp_academic_dataset_review.json', lines=True, chunksize=batch_size):
    for index, row in batch.iterrows():
        curr_user = row['user_id']
        curr_business = row['business_id']
        curr_rating = row['stars']
        new_user.add(curr_user)
        new_business.add(curr_business)
        new_pair.add((curr_user,curr_business))
        review_data.append(row)
    break

In [98]:
def extract_user(d):
    user_data = []
    for batch in pd.read_json('yelp_academic_dataset_user.json', lines=True, chunksize=batch_size):
        for index, row in batch.iterrows():
            if row['user_id'] in d:
                user_data.append(row)
    return user_data

In [99]:
def extract_business(d):
    business_data = []
    for batch in pd.read_json('yelp_academic_dataset_business.json', lines=True, chunksize=batch_size):
        for index, row in batch.iterrows():
            if row['business_id'] in d:
                business_data.append(row)
    return business_data

In [100]:
user_data = extract_user(new_user)
business_data = extract_business(new_business)

In [101]:
user_lookup = defaultdict(int)
business_lookup = defaultdict(int)
for idx,d in enumerate(user_data):
    user_lookup[d['user_id']] = idx
for idx,d in enumerate(business_data):
    business_lookup[d['business_id']] = idx

In [142]:
def feature(d):
    user = d['user_id']
    business = d['business_id']
    feat = []
    u = user_data[user_lookup[user]]
    b = business_data[business_lookup[business]]
    feat.append(d['useful'])
    feat.append(d['funny'])
    feat.append(b['is_open'])
    feat.append(b['review_count'])
    feat.append(u['review_count'])
    feat.append(u['fans'])
    feat.append(len(u['elite']))
    feat.append(u['average_stars'])
    feat.append(u['compliment_hot'])
    feat.append(u['compliment_note'])
    feat.append(u['compliment_funny'])
    return feat

In [143]:
train = review_data[:80000]
val = review_data[80000:]
train_feat = []
val_feat = []
train_star = []
val_star = []
for d in train:
    train_feat.append(feature(d))
    train_star.append(d['stars'])
    
for d in val:
    val_feat.append(feature(d))
    val_star.append(d['stars'])

In [107]:
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [136]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def RMSE(actual,prediction):
    return sqrt(mean_squared_error(actual, prediction))
    

In [138]:
from sklearn import linear_model
def classify(tf, tl, vf, vl):
    classifiers = [
    #KNeighborsClassifier(3),
    #svm.LinearSVC(),
    linear_model.Ridge(0.01, fit_intercept=False),
    linear_model.Ridge(0.1, fit_intercept=False),
    linear_model.Ridge(1, fit_intercept=False),
    linear_model.Ridge(10, fit_intercept=False),
    linear_model.Ridge(100, fit_intercept=False),

    #svm.LinearSVC(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    #DecisionTreeClassifier(max_depth=4),
    #RandomForestClassifier(max_depth=4, n_estimators=50, max_features=1),
    #MLPClassifier(alpha=1),
    #AdaBoostClassifier(),
    #aussianNB(),
    #QuadraticDiscriminantAnalysis()
    #LogisticRegression()
    ]
    
    for clf in classifiers:
        clf.fit(tf, tl)
        pred = clf.predict(vf)
        print (RMSE(vl, pred))

In [144]:
classify(train_feat,train_star,val_feat,val_star)

1.1691759250886706
1.1691759295684543
1.1691759743882695
1.169176424783912
1.1691811480967567


In [None]:
theta