In [None]:
import ast

import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import pandas as pd

## Data Preprocessing

In [None]:
yelp_review_train = pd.read_csv("yelp_academic_dataset_review_train.csv")

In [None]:
yelp_review_test = pd.read_csv("yelp_academic_dataset_review_test.csv")

In [None]:
yelp_review_all = yelp_review_train.append(yelp_review_test)

In [None]:
yelp_review_slim = yelp_review_all[["user_id", "business_id", "stars"]]

In [None]:
yelp_review_slim

In [None]:
user_ids, business_ids = yelp_review_slim.user_id.unique(), yelp_review_all.business_id.unique()

In [None]:
yelp_review_mat = np.empty((len(user_ids), len(business_ids))) * np.nan

In [None]:
yelp_review_mat.shape

In [None]:
# Translate every review's ids into matrix coordinates in a single vectorized
# pass. The previous version filtered the whole frame once per business and
# scanned user_ids once per review, which is quadratic in the data size.
row_idx = pd.Index(user_ids).get_indexer(yelp_review_slim.user_id)
col_idx = pd.Index(business_ids).get_indexer(yelp_review_slim.business_id)

# get_indexer reports unknown ids as -1, which numpy would silently interpret
# as "last row/column"; fail loudly instead.
if (row_idx < 0).any() or (col_idx < 0).any():
    raise KeyError("a review refers to an id missing from user_ids/business_ids")

# When a (user, business) pair occurs more than once, the later review wins,
# as it did with the per-business assignment.
yelp_review_mat[row_idx, col_idx] = yelp_review_slim.stars.values
print("DONE")

In [None]:
from scipy.io import savemat

In [None]:
data = {"yelp_review_matrix": yelp_review_mat}

In [None]:
savemat("yelp_review.mat", data)

## Analysis

In [None]:
from scipy.io import loadmat

In [None]:
# Reload the rating matrix written by the preprocessing section.
yelp_review_mat = loadmat("yelp_review.mat")["yelp_review_matrix"]

In [None]:
# Replace NaN ("no review") with 0. From here on a 0 entry means "missing":
# train_test_split and get_mse only look at non-zero entries.
# NOTE(review): after this step the matrix contains no NaN, so the NaN mask
# computed inside collab_filtering is empty for data derived from it.
yelp_review_mat = np.nan_to_num(yelp_review_mat)

In [None]:
### Reference: http://blog.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

In [None]:
# Percentage of matrix cells that hold a non-zero rating (i.e. how densely
# the matrix is filled). count_nonzero avoids materialising the index arrays
# that .nonzero() would build just to take their length.
sparsity = float(np.count_nonzero(yelp_review_mat))
sparsity /= (yelp_review_mat.shape[0] * yelp_review_mat.shape[1])
sparsity *= 100
# print(...) with a single argument behaves the same on Python 2 and 3.
print('Sparsity: {:4.2f}%'.format(sparsity))

In [None]:
def train_test_split(ratings, test_fraction=0.3):
    """Split a rating matrix into disjoint train and test matrices.

    For every row, a random ``test_fraction`` of that row's non-zero entries
    (rounded down) is moved into the test matrix and zeroed in the train
    matrix. Zero entries are treated as "no rating" and never sampled.

    Parameters
    ----------
    ratings : 2-D numpy array
        Rating matrix; it is not modified.
    test_fraction : float, optional
        Share of each row's non-zero entries to hold out, between 0 and 1.
        Defaults to 0.3.

    Returns
    -------
    (train, test) : tuple of arrays with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside [0, 1].

    Sampling uses numpy's global random state; call ``np.random.seed`` first
    for a reproducible split.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError("test_fraction must be between 0 and 1")

    test = np.zeros(ratings.shape)
    train = ratings.copy()
    # range (not the Python 2-only xrange) so this runs on Python 2 and 3.
    for user in range(ratings.shape[0]):
        non_zeros = ratings[user, :].nonzero()[0]
        if len(non_zeros) == 0:
            continue
        # Called even when the sample size rounds down to 0, so the random
        # stream is consumed exactly as before for a given seed.
        test_ratings = np.random.choice(non_zeros,
                                        size=int(len(non_zeros) * test_fraction),
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    # Test and training are truly disjoint
    assert np.all((train * test) == 0)
    return train, test

In [None]:
# Hold out part of every user's ratings as a validation matrix.
# The split is random; no seed is set in the cells shown here, so results vary per run.
train_d, val_d = train_test_split(yelp_review_mat)

In [None]:
def collab_filtering(X, d, lamb=100, max_iter=30, missing_mask=None):
    """Factorise ``X`` as ``W.dot(H.T)`` with regularised alternating least squares.

    Entries flagged as missing are re-imputed with the current prediction
    ``W.dot(H.T)`` at the start of every iteration; then ``H`` and ``W`` are
    each updated by solving a ridge-regularised least-squares problem.

    Parameters
    ----------
    X : 2-D array, shape (n, m)
        Data matrix. It is copied, so the caller's array is left untouched.
    d : int
        Number of latent factors.
    lamb : float, optional
        Ridge penalty added to the diagonal of the normal equations (default 100).
    max_iter : int, optional
        Number of alternating updates (default 30).
    missing_mask : boolean array of shape (n, m), optional
        Extra entries to treat as missing. NaN entries of ``X`` are always
        treated as missing. Entries that are 0 and not in the mask are fitted
        as real values, so pass e.g. ``X == 0`` when 0 encodes "no rating".

    Returns
    -------
    (W, H) : arrays of shape (n, d) and (m, d).

    The factors are initialised from numpy's global random state (W first,
    then H).
    """
    # Work on a private float copy so imputation never leaks into the caller's data.
    X = np.array(X, dtype=float)
    n, m = X.shape

    to_impute = np.isnan(X)
    if missing_mask is not None:
        to_impute = to_impute | np.asarray(missing_mask, dtype=bool)
    has_missing = to_impute.any()

    W = np.random.random((n, d))
    H = np.random.random((m, d))
    ridge = lamb * np.identity(d)

    for _ in range(max_iter):
        if has_missing:
            # Overwrite the missing cells with the current low-rank prediction.
            X_p = W.dot(H.T)
            X[to_impute] = X_p[to_impute]
        # solve(A, B) yields the same result as inv(A).dot(B) without forming
        # the explicit inverse, which is more stable numerically.
        H = la.solve(W.T.dot(W) + ridge, W.T.dot(X)).T
        W = la.solve(H.T.dot(H) + ridge, H.T.dot(X.T)).T
    return W, H

In [None]:
U, V = collab_filtering(np.copy(train_d), 5)

In [None]:
mat = U.dot(V.T)

In [None]:
mat

In [None]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Mean squared error between ``pred`` and ``actual``.

    Only positions where ``actual`` is non-zero are compared; zero entries of
    ``actual`` are left out of the score.
    """
    rated = actual.nonzero()
    pred_at_rated = pred[rated].flatten()
    actual_at_rated = actual[rated].flatten()
    return mean_squared_error(pred_at_rated, actual_at_rated)

In [None]:
# Validation error of the ALS predictions. print(...) with a single argument
# behaves the same on Python 2 and 3.
print('MSE: ' + str(get_mse(mat, val_d)))

In [None]:
import numpy.linalg as la

In [None]:
def k_rank_approximation(k, image):
    """Rebuild ``image`` from the first ``k`` components of its reduced SVD.

    Returns an array with the same shape as ``image``; asking for more
    components than exist simply uses all of them.
    """
    left, singular_values, right_t = la.svd(image, full_matrices=False)
    # Keep only the first k singular values, placed on a diagonal.
    scaling = np.diag(singular_values[:k])
    return left[:, :k].dot(scaling).dot(right_t[:k])

In [None]:
mat = k_rank_approximation(2000, train_d)

In [None]:
print 'MSE: ' + str(get_mse(mat, val_d))

## Test Prediction

In [None]:
yelp_biz_test = pd.read_csv("yelp_academic_dataset_business_test.csv")

In [None]:
test_b_ids = yelp_biz_test.business_id.unique()

In [None]:
test_b_idx = np.array([np.argwhere(business_ids == bid)[0][0] for bid in test_b_ids])

In [None]:
preds = np.mean(mat[:,test_b_idx], axis=0)

In [None]:
import csv

In [None]:
# Write one "business_id,stars" row per test business, after a header row.
with open("biz_pred.csv", "w") as pred_f:
    fieldnames = ["business_id", "stars"]
    writer = csv.DictWriter(pred_f, fieldnames=fieldnames)
    writer.writeheader()
    # test_b_ids and preds are aligned position by position.
    for bid, pred in zip(test_b_ids, preds):
        row = {"business_id": bid, "stars": pred}
        writer.writerow(row)

## Process Attribute Column of business data

In [None]:
def attr_to_dic(attr):
    """Parse the text of one ``attributes`` cell into a dict.

    ``attr`` must be the literal text of a list of strings, each of the form
    ``"key: value"``. An item containing ``{`` is read as ``"key: {...}"`` and
    its value is parsed into a dict; otherwise the value is kept as a
    stripped string, except ``"True"`` / ``"False"`` which become booleans.

    Raises ``ValueError`` or ``SyntaxError`` if ``attr`` (or an embedded dict)
    is not a plain Python literal, and ``IndexError`` if a non-dict item has
    no ``:``.
    """
    # SECURITY: this text comes straight from a CSV file. It used to be passed
    # to eval(), which would execute arbitrary expressions; literal_eval only
    # accepts literals (lists, dicts, strings, numbers, booleans, None).
    items = ast.literal_eval(attr)
    dic = {}
    for item in items:
        if "{" in item:
            # Split at the first brace only, so nested braces stay intact.
            temp = item.split("{", 1)
            val = ast.literal_eval("{" + temp[1])
            key = temp[0].split(":")[0]
            dic[key] = val
        else:
            # Split at the first colon only, so values that themselves contain
            # ':' (e.g. "9:00-17:00") are no longer truncated.
            temp = item.split(":", 1)
            val = temp[1].strip()
            if val == "True" or val == "False":
                val = (val == "True")
            key = temp[0]
            dic[key] = val
    return dic

In [None]:
def add_attr_to_df(df):
    """Expand ``df.attributes`` into one column per attribute, in place.

    Each string cell of the ``attributes`` column is parsed with
    ``attr_to_dic``. A plain value ``k: v`` goes to column ``k``; a dict value
    ``k: {vk: v2}`` goes to column ``k + "_" + vk``. Rows whose ``attributes``
    cell is not a string, or that lack a given attribute, hold NaN in that
    column. New columns are appended in the order their names are first seen.
    Values are stored as parsed (strings / booleans), not coerced to float.

    Rows are addressed by position, so the result is correct even when the
    index has duplicate or non-consecutive labels (e.g. after stacking two
    frames). The previous ``df[k][idx] = v`` wrote by index *label*, which put
    values on the wrong rows for such frames.

    Returns None; ``df`` is modified.
    """
    n_rows = len(df)
    new_columns = {}   # column name -> list with one value per row position
    column_order = []  # names in first-seen order

    def _store(name, row, value):
        # Record `value` for column `name` at positional row `row`.
        if name not in new_columns:
            # Start from the existing column if the name is already taken,
            # otherwise from an all-missing column.
            if name in df:
                new_columns[name] = df[name].tolist()
            else:
                new_columns[name] = [np.nan] * n_rows
            column_order.append(name)
        new_columns[name][row] = value

    for idx, attr in enumerate(df.attributes):
        if (idx + 1) % 100 == 0:
            print("Finished {0} rows".format(idx + 1))
        # Missing cells arrive as NaN (a float), not as text.
        if not isinstance(attr, str):
            continue
        dic = attr_to_dic(attr)
        for k in dic:
            v = dic[k]
            if not isinstance(v, dict):
                _store(k, idx, v)
            else:
                for vk in v:
                    _store(k + "_" + vk, idx, v[vk])

    # Assigning a plain list sets the column by position, independent of labels.
    for name in column_order:
        df[name] = new_columns[name]

In [None]:
yelp_biz_test = pd.read_csv("yelp_academic_dataset_business_test.csv")

In [None]:
yelp_biz_train = pd.read_csv('yelp_academic_dataset_business_train.csv')

In [None]:
yelp_biz_all = yelp_biz_train.append(yelp_biz_test)

In [None]:
add_attr_to_df(yelp_biz_all)

In [None]:
yelp_biz_all.to_csv('yelp_business_all.csv')