In [308]:
%matplotlib inline
import numpy as np
import scipy as sp
from scipy.stats.stats import pearsonr
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

import seaborn as sns
sns.set_style()
sns.set_context("talk")
import warnings
warnings.filterwarnings("ignore")

import itertools

In [309]:
import re
re.compile('<title>(.*)</title>')

re.compile(r'<title>(.*)</title>', re.UNICODE)

In [310]:
# Please INSTALL this package if not already done
#!pip install surprise

In [311]:
import surprise
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

#### 1. This is the rating prediction and similar restaurant recommendation part
#### 2. We will use SVD for factorization, and SGD to find the optimal solution
#### 3. To calculate similarity, we will use cosing similarity
#### 4. Python's surprise package is used

In [312]:
# Import the processed dataset
dataset = pd.read_csv('review_and_business_data_cleaned_withdate.csv')

In [313]:
first_model_df = dataset[['user_id', 'name', 'review_stars', 'categories']]

In [314]:
# Processing the categories column

In [315]:
first_model_df

Unnamed: 0,user_id,name,review_stars,categories
0,IMguz1Z9dp8HG0UfeLEdEg,Green World Cleaners,5.0,"Dry Cleaning & Laundry, Local Services, Laundr..."
1,_TAVpa1Y2_5KZ5wWYeX_6g,Green World Cleaners,1.0,"Dry Cleaning & Laundry, Local Services, Laundr..."
2,45R6BBybzwDuJaL08d1myQ,Green World Cleaners,5.0,"Dry Cleaning & Laundry, Local Services, Laundr..."
3,lYvUtZWr1gGv4vlwNcJXDQ,Green World Cleaners,1.0,"Dry Cleaning & Laundry, Local Services, Laundr..."
4,b0eCjnqua4C3f0OLDhZHxA,Green World Cleaners,1.0,"Dry Cleaning & Laundry, Local Services, Laundr..."
...,...,...,...,...
679945,5RMr1Xn9qSfuGVZKI7g4rQ,Ignite Funding,1.0,"Commercial Real Estate, Mortgage Brokers, Home..."
679946,bjx0elc3ZZgsfsxP6JxROA,Ignite Funding,4.0,"Commercial Real Estate, Mortgage Brokers, Home..."
679947,jbdTkLBXvZ7MW-eQYz4Mew,Ignite Funding,4.0,"Commercial Real Estate, Mortgage Brokers, Home..."
679948,wvbPHizTJNz26s-xMO-vGw,Ignite Funding,1.0,"Commercial Real Estate, Mortgage Brokers, Home..."


In [316]:
# =============== CATEGORY COLUMN CLEANING STARTS==============

In [317]:
first_model_df["categories"] = first_model_df['categories'].astype(str)

In [318]:
category_string = []
dict_cat = {}
for i in range(len(first_model_df["categories"].values)):
        for j in range(len(first_model_df["categories"].values[i].split(","))):
            
            if first_model_df["categories"].values[i].split(",")[j] not in dict_cat:
                dict_cat[first_model_df["categories"].values[i].split(",")[j]] = 1
            else:
                dict_cat[first_model_df["categories"].values[i].split(",")[j]] += 1

In [319]:
# Viewing the top repeated keywords

In [320]:
sorted(dict_cat.items(), key=lambda x: x[1])[-100:]

[(' Massage', 6975),
 (' Fitness & Instruction', 7241),
 ('Italian', 7291),
 ('Pizza', 7302),
 (' Diners', 7317),
 (' Chicken Wings', 7405),
 (' Ice Cream & Frozen Yogurt', 7413),
 ('Automotive', 7549),
 (' Thai', 7623),
 (' Mediterranean', 7630),
 ('Breakfast & Brunch', 7900),
 (' French', 7949),
 ('Japanese', 8070),
 (' Korean', 8183),
 ('Active Life', 8185),
 ('Mexican', 8288),
 (' Pubs', 8398),
 ('Sandwiches', 8442),
 ('Seafood', 8455),
 (' Home & Garden', 9052),
 ('Home Services', 9212),
 ('American (Traditional)', 9315),
 ('Hotels & Travel', 9328),
 (' Real Estate', 9386),
 ('Health & Medical', 9541),
 (' Soup', 9640),
 (' Wine Bars', 9783),
 (' Vegetarian', 9919),
 (' Waxing', 10161),
 (' Nail Salons', 10371),
 (' Auto Repair', 11015),
 (' Juice Bars & Smoothies', 11044),
 ('Bars', 11088),
 (' Local Flavor', 11186),
 (' Music Venues', 11363),
 (' Caterers', 11489),
 (' Professional Services', 11762),
 (' Skin Care', 11773),
 (' Sports Bars', 11966),
 (' Venues & Event Spaces', 1

In [321]:
first_model_df.shape

(679950, 4)

In [322]:
#Check the reduced dataset shape
first_model_df[first_model_df['categories'].str.contains("Italian|Pizza|Diners|Chicken|Yogurt|Thai|Mediterranean|Breakfast|Brunch|French|Japanese|Korean|Mexican|Pubs|Sandwich|Seafood|Soup|Wine|Bars|Vegetarian|Juice|Flavor|Baker|Beer|Vegan|Barbeque|Cafe|Food|Buffet|Chinese|Sushi|Salad|Pizza|Italian|Coffee|Tea|Steak|Dessert|Burger|Sandwich|Restaurant", flags=re.IGNORECASE, regex=True)].shape

(421003, 4)

In [323]:
# Assigning the new dataframe as the main DF
first_model_df = first_model_df[first_model_df['categories'].str.contains("Italian|Pizza|Diners|Chicken|Yogurt|Thai|Mediterranean|Breakfast|Brunch|French|Japanese|Korean|Mexican|Pubs|Sandwich|Seafood|Soup|Wine|Bars|Vegetarian|Juice|Flavor|Baker|Beer|Vegan|Barbeque|Cafe|Food|Buffet|Chinese|Sushi|Salad|Pizza|Italian|Coffee|Tea|Steak|Dessert|Burger|Sandwich|Restaurant", flags=re.IGNORECASE, regex=True)]

In [324]:
first_model_df

Unnamed: 0,user_id,name,review_stars,categories
58,-xDW3gYiYaoeVASXywTPgw,Mama Napoli Pizza,5.0,"Food, Food Trucks, Restaurants, Pizza"
60,OE5MNd5PVORXxcrHEoWPdA,Taco Bell,1.0,"Fast Food, Restaurants"
61,7S2wwOSVSRn4CEZdtQKG_Q,Taco Bell,4.0,"Fast Food, Restaurants"
62,IRMEgNiP4IMRqw8i00ZMug,Taco Bell,5.0,"Fast Food, Restaurants"
63,XUEwSGOGARxW-3gPiGJKUg,Taco Bell,4.0,"Fast Food, Restaurants"
...,...,...,...,...
679933,Wv99UnS1Cpr__vRbo_v9YQ,Habaneros Taco Grill,1.0,"Mexican, Fast Food, Restaurants, Tacos"
679934,rB1mNFQTe-TPDsk8I3PpqA,Zest - Bistro & Bar,4.0,"American (New), Karaoke, Restaurants, Lounges,..."
679935,5LqcnK9OC3UTIicsT0iv8g,Zest - Bistro & Bar,5.0,"American (New), Karaoke, Restaurants, Lounges,..."
679936,jS5ICHrBOyo8LFvyLjtkqw,Zest - Bistro & Bar,5.0,"American (New), Karaoke, Restaurants, Lounges,..."


In [325]:
# =============== CATEGORY COLUMN CLEANING ENDS==============

In [326]:
# Drop nulls
first_model_df.dropna(inplace = True)

In [327]:
# Check what is the rating scale, from below we see it is from 1 to 5
first_model_df.describe()

Unnamed: 0,review_stars
count,421003.0
mean,3.703551
std,1.452722
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [328]:
first_model_df.shape

(421003, 4)

In [329]:
# Convert ratings to int
first_model_df['review_stars'] = first_model_df['review_stars'].astype('int')

In [330]:
# Defining the rating scale for our model
reader = Reader(rating_scale = (1,5))

In [331]:
# Loading the dataset
first_model_df = first_model_df[['user_id', 'name', 'review_stars']]
data = Dataset.load_from_df(first_model_df,reader)

In [332]:
# Train, test split
trainset, testset = train_test_split(data, test_size = 0.25)

In [333]:
# Taking number of latent features as 100; this can be changed
model = SVD(n_factors = 100)

In [334]:
# Fitting the model on the trainset
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a65273b50>

In [335]:
# Inspect the shape of our product matrix
model.qi.shape

(4262, 100)

In [336]:
# In above, we see that for 22240 data rows, we have created 1000 latent features

In [337]:
# Mapping vector back to its restaurant
dict_items_restaurants = trainset._raw2inner_id_items.items()
dict_restaurants = trainset._raw2inner_id_items
print(list(dict_items_restaurants)[:5])

[('WinCo Foods', 0), ('The Buffet at ARIA', 1), ('Paradise Place Jamaican Cuisine', 2), ('White Castle', 3), ('The Cosmopolitan of Las Vegas', 4)]


In [338]:
# Predicting Rating ---> Using MATRIX RECONSTRUCTION

In [339]:
customer_test = '_TAVpa1Y2_5KZ5wWYeX_6g'
restaurant_test = 'MadHouse Coffee'
model.predict(customer_test,restaurant_test)

Prediction(uid='_TAVpa1Y2_5KZ5wWYeX_6g', iid='MadHouse Coffee', r_ui=None, est=3.581543501402021, details={'was_impossible': False})

In [340]:
# RECOMMENDING RESTAURANT ---> Using ITEM SIMILARITY, USING COSINE SIMILARITY
# Similarity is high when cosine is low (close to 0), similarity is less when cosine is high (close to 1)

In [341]:
# Fetch indices
restraurant1_idx = model.trainset._raw2inner_id_items['MadHouse Coffee']
restraurant2_idx = model.trainset._raw2inner_id_items["Roberto's Taco Shop"]

In [342]:
# Get vectors
restraurant1_vector = model.qi[restraurant1_idx]
restraurant2_vector = model.qi[restraurant2_idx]

In [343]:
# Cosine distance between restaurants
sp.spatial.distance.cosine(restraurant1_vector,restraurant2_vector)

1.0462852767347521

In [344]:
def similar_restaurants(input_business):
    reco = {}
    for i in dict_restaurants.keys():
            reco[i] = np.abs(sp.spatial.distance.cosine(model.qi[model.trainset._raw2inner_id_items[input_business]],model.qi[model.trainset._raw2inner_id_items[i]]))
    
    return(sorted(reco.items(), key=lambda x: x[1]))


In [345]:
# First restaurant, will always be itself, as similarity with itself is 0
# TOP 10 Recommendations
similar_restaurants("Roberto's Taco Shop")[:11]

[("Roberto's Taco Shop", 0.0),
 ('Paleteria Y Neveria Mexicana', 0.6613472236191806),
 ('Vegan Meals by Mindy', 0.6821334672285561),
 ('Wreck Room', 0.6863374458761893),
 ('ChinaLatina', 0.6928993260863445),
 ('Palms Race & Sports Book', 0.6934508808314943),
 ('Nigerian Food', 0.6935258838346848),
 ('Pyramid Cafe', 0.6939921263453466),
 ('USO Terminal 1', 0.6985395610486261),
 ('Pho Kim Long', 0.7057687739978599),
 ('IKEA', 0.712755359242305)]