In [1]:
import gzip
import json
import random
import math
import time
from collections import defaultdict

import numpy as np
import tensorflow as tf
import pandas as pd

import sklearn
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

2023-12-05 20:03:31.135953: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
############################## 
### Data Setup             ###
##############################

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readJSON(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = json.loads(l)

        yield d

# For REVIEW data
def readJSON_1(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = json.loads(l)
        u = d['user_id']
        n = d['name']
        b = d['gmap_id']  # businessID
        r = d['rating']
        yield u,b,r,n,d

# For BUSINESS META-data
def readJSON_2(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        d = json.loads(l)
        b      = d['gmap_id']  # businessID
        cat    = d['category']
        coords = (d['latitude'], d['longitude'])
        name   = d['name']
        #state = d['state']   #<<< shouldn't be "permanently closed"
        yield b,name,cat,coords
        #yield b,cat,coords

In [4]:
allReviews = []
for l in readJSON("review-District_of_Columbia_10.json.gz"):
    allReviews.append(l)

In [5]:
allMeta = []
for l in readJSON("meta-District_of_Columbia.json.gz"):
    allMeta.append(l)

In [6]:
# Reviews by Business ID
reviewById = defaultdict(list)

for review in allReviews:
    gmap_id = review["gmap_id"]
    reviewById[gmap_id].append(review)


In [7]:
# Reviews by Business ID
metaById = defaultdict(list)

for meta in allMeta:
    gmap_id = meta["gmap_id"]
    metaById[gmap_id].append(meta)


In [8]:
allMetaData = []
for l in readJSON_2("meta-District_of_Columbia.json.gz"):
    allMetaData.append(l)

In [9]:
word_list = ["restaurant", "food", "pizza", "juice", "dessert", "takeout", "sandwich", "diner", "bar", "cocktail", "coffee", "cafe", "deli"]  #EXPAND, NON-MANUALLY?
word_blacklist = ["barber shop", "eyebrow bar"]

restaurant_gmapIDs = set()

count=0
for i in range(len(allMetaData)):
    if(allMetaData[i][2]):
        category_list = [word.lower() for word in allMetaData[i][2]]
    else:
        category_list = []

    contains_word             = any(any(word in s for word in word_list) for s in category_list) # true if any word in word_list is contained in somewhere in category_list
    contains_blacklisted_word = any(any(word in s for word in word_blacklist) for s in category_list)

    if(category_list and contains_word and not contains_blacklisted_word):
        #print(category_list)
        restaurant_gmapIDs.add(allMetaData[i][0])

        count+=1
print(count)

restaurant_gmapIDs = list(restaurant_gmapIDs)

3453


In [10]:
# for every reviewed place that is a restaurant

allRestaurantReviews = []
for review in allReviews:
    u = review["user_id"]
    b = review["gmap_id"]
    r = review["rating"]

    if b in restaurant_gmapIDs:
        allRestaurantReviews.append((u,b,r, review))
        
print(len(allRestaurantReviews))

347232


In [35]:
userIDs = {}
bizIDs = {}
interactions = []

for u,b,r,_ in allRestaurantReviews:
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not b in bizIDs: bizIDs[b] = len(bizIDs)
    interactions.append((u,b,r))

In [36]:
random.shuffle(allRestaurantReviews)

nTrain = int(len(interactions) * 0.8)
restReviewsTrain = allRestaurantReviews[:nTrain]
restReviewsTest = allRestaurantReviews[nTrain:]

In [37]:
def mean_squared_error(labels, predicted):
    # Ensure both actual and predicted have the same length
    if len(labels) != len(predicted):
        raise ValueError("Lengths of actual and predicted arrays should be the same.")

    # Calculate MSE
    differences = [(x-y)**2 for x,y in zip(predicted,labels)]
    return sum(differences) / len(differences)

In [38]:
############################## 
### Linear Regression      ###
##############################

In [39]:
review_length_train = [len(r[3]['text']) if r[3]['text'] is not None else 0 for r in restReviewsTrain]

In [40]:
## Average User Rating Dictionary (using training data ONLY)
reviewsPerUser = defaultdict(list)

for u,b,r,_ in restReviewsTrain:
    reviewsPerUser[u].append(r)

userRatingAvgs = {}
for user in reviewsPerUser:
    if len(reviewsPerUser[user]) != 0:
        userRatingAvgs[user] = sum(reviewsPerUser[user])/len(reviewsPerUser[user])

In [41]:
## Average Business Rating Dictionary (using training data ONLY)
reviewsPerBiz = defaultdict(list)

for u,b,r,_ in restReviewsTrain:
    reviewsPerBiz[b].append(r)

bizRatingAvgs = {}
for biz in reviewsPerBiz:
    if len(reviewsPerBiz[biz]) != 0:
        bizRatingAvgs[biz] = sum(reviewsPerBiz[biz])/len(reviewsPerBiz[biz])

In [65]:
len(bizRatingAvgs)

2767

In [53]:
# Mean rating (using training data ONLY)
mu = sum([r for _,_,r,_ in restReviewsTrain]) / len(restReviewsTrain)

In [54]:
## Above is preprocessing
## Below is combining the features

In [55]:
def feauture_engineering(dataset):
#     users = [d[0] for d in dataset]
#     biz = [d[1] for d in dataset]
#     dataframe = pd.DataFrame({"users": users, "business":biz})
    dataframe = pd.DataFrame()
    
    dataframe["review_length"] = [len(r[3]['text']) if r[3]['text'] is not None else 0 for r in dataset]
    dataframe["avg_user_rating"] = [userRatingAvgs[d[0]] if d[0] in userRatingAvgs.keys() else mu for d in dataset]
    dataframe["avg_biz_rating"] = [bizRatingAvgs[d[1]] if d[1] in bizRatingAvgs.keys() else mu for d in dataset]
    # sentiment analysis library
    # 
    
    dataframe["labels"] = [r[2] for r in dataset]
    return dataframe

In [56]:
## Generating data
data = feauture_engineering(restReviewsTrain)
y = data['labels']
X = data.drop(['labels'], axis=1)

In [57]:
## Standardizing the features and splitting into train and validation
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

In [58]:
## Fitting Model
model = linear_model.LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [59]:
## MSE of training set:
y_train_pred = model.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
mse

0.6340336602745007

In [60]:
## MSE of validation set:
y_valid_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_valid_pred)
mse

0.613035587635915

In [61]:
# Accessing coefficients
coefficients = model.coef_
print("Coefficients:", coefficients)

# Accessing intercept
intercept = model.intercept_
print("Intercept:", intercept)

Coefficients: [-0.08321184  0.49140445  0.21595786]
Intercept: 4.287303624850521


In [66]:
## TESTING
data_test = feauture_engineering(restReviewsTest)
X_test = data.drop(['labels'], axis=1)
X_test_scaled = scaler.transform(X_test)
y_test = data['labels']
y_test_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_test_pred)
mse

0.6319338152151112