# Google Local Recommenders

#### Agnes Bao

The goal of this project is to show what I learned about the methods and algorithms of recommendation systems, demonstrated using the Google Local data (http://cseweb.ucsd.edu/~jmcauley/datasets.html#google_local)

### Show Me the Data

In [1]:
import warnings
warnings.filterwarnings('ignore')

import gzip
import pprint

def sneakpeek(fname):
    with gzip.open(fname) as f:
        print(fname)
        line = eval(next(f))
        pprint.pprint(line)

* Places

In [2]:
sneakpeek("data/places.clean.json.gz")

data/places.clean.json.gz
{'address': ['2615 Angler Ave', 'Hemet, CA 92545'],
 'closed': False,
 'gPlusPlaceId': '104699454385822125632',
 'gps': [33.703804, -117.003209],
 'hours': [['Monday', [['6:30 am--4:15 pm']]],
           ['Tuesday', [['6:30 am--4:15 pm']]],
           ['Wednesday', [['6:30 am--4:15 pm']], 1],
           ['Thursday', [['6:30 am--4:15 pm']]],
           ['Friday', [['6:30 am--4:15 pm']]],
           ['Saturday', [['6:30 am--4:15 pm']]],
           ['Sunday', [['6:30 am--4:15 pm']]]],
 'name': 'Diamond Valley Lake Marina',
 'phone': '(951) 926-7201',
 'price': None}


* Reviews:

In [3]:
sneakpeek("data/reviews.clean.json.gz")

data/reviews.clean.json.gz
{'categories': ['Giải Trí - Café'],
 'gPlusPlaceId': '108103314380004200232',
 'gPlusUserId': '100000010817154263736',
 'rating': 3.0,
 'reviewText': 'Chất lượng tạm ổn',
 'reviewTime': 'Jul 1, 2013',
 'reviewerName': 'an lam',
 'unixReviewTime': 1372686659}


* Users data is mostly unstructured and is not very useful

In [4]:
sneakpeek("data/users.clean.json.gz")

data/users.clean.json.gz
{'currentPlace': ['Thành phố Hồ Chí Minh, Việt Nam',
                  [[], 108230990, 1066296640, 1]],
 'education': [[[], [], [], [], [], 6],
               [['Đại học Kiến trúc tp Hồ chí minh, Việt nam',
                 'KS.XD',
                 [[], [1, 1, 2013], 1],
                 '',
                 '']]],
 'gPlusUserId': '100000010817154263736',
 'jobs': [['Tổng công ty IDICO',
           'Chuyên viên Kỹ thuật XD',
           [[1, 1, 1998], [1, 1, 2013], 1],
           '',
           '']],
 'previousPlaces': [['tp. Nam Định, Nam Định, Việt Nam',
                     [[], 204200000, 1061683330, 1]]],
 'userName': 'an lam'}


### After making the dataset easier to work with...

In [5]:
import pandas as pd

places = pd.read_csv("data/places_sub.csv", index_col="gPlusPlaceId")
places.head()

Unnamed: 0_level_0,name,price,closed,lat,long
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100327153115986850675,T C's Referee Sports Bar,$$,False,43.529494,-96.792244
106432060150136868000,China Cottage,$$,False,39.692899,-84.136173
105455649021131746775,Byron Center High School,,False,42.810092,-85.702919
100184392614713668281,Smokey Mountain Wings,$$,False,35.98598,-83.610598
110904357941128849034,Juliet Photography,,False,34.998894,-80.797123


In [6]:
reviews = pd.read_csv("data/reviews_sub.csv")
reviews.head()

Unnamed: 0,rating,first_category,gPlusPlaceId,gPlusUserId,unixReviewTime
0,2.0,Restaurant,100073820849130920147,100000053212755369563,1376204000.0
1,2.0,Restaurant,103519165841762621376,100000053212755369563,1376616000.0
2,2.0,American Restaurant,107667540471917464953,100000053212755369563,1376204000.0
3,2.0,Community College,103021148620072345104,100000069918550320216,1366220000.0
4,1.0,Used Car Dealer,109689897713798178848,100000169041345399252,1391464000.0


## 1. Popularity based recommender
I'm a new user searching for restaurants in 70 miles around (42.0480, -87.6843)...

You know nothing about me.

<img src="data/places.png">

In [7]:
# subset a smaller map area
coord = (42.0480, -87.6843)

def subset_places(df, coord, deg):
    return df[(df["lat"] > coord[0] - deg) & 
              (df["lat"] < coord[0] + deg) & 
              (df["long"] > coord[1] - deg) & 
              (df["long"] < coord[1] + deg)]

places = subset_places(places, coord, deg=0.5)
places = places[~places["closed"]]
restaurant_reviews = reviews[reviews["gPlusPlaceId"].isin(places.index) &
                            reviews["first_category"].str.contains("restaurant", case=False, na=False)]
restaurants = places[places.index.isin(restaurant_reviews["gPlusPlaceId"].unique())]

In [8]:
# Size of data set we're working with:
print(f"Number of restaurants: {restaurants.shape[0]}")
print(f"Number of reviews: {restaurant_reviews.shape[0]}")

Number of restaurants: 7922
Number of reviews: 67988


In [9]:
# popular restaurants
# rating count and mean
pop = restaurant_reviews.groupby("gPlusPlaceId")["rating"].agg(["count", "mean", "std"])
# rating count per month
first_review = pd.to_datetime(restaurant_reviews.groupby("gPlusPlaceId")["unixReviewTime"].min(), unit="s")
pop["count_per_month"] = pop["count"]/((pd.to_datetime("2020-01-01")-first_review).dt.days/30)
# popularity
pop["popularity"] = pop["count_per_month"]*pop["mean"]
# sample and rank
rec = pop.sample(frac=0.1).sort_values(by="popularity", ascending=False)
rec.join(restaurants["name"]).head(10)

Unnamed: 0_level_0,count,mean,std,count_per_month,popularity,name
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
116341229473795181891,228,4.298246,0.750093,1.864268,8.013083,Quartino Ristorante
107944093167831776994,141,4.41844,0.71869,1.201363,5.308151,Pequod's Pizza
113323940057430428803,306,4.330065,0.723524,0.86661,3.752478,Lou Malnati's Pizzeria - River North
107327602211513967918,81,4.45679,0.633431,0.662306,2.951758,Irazu
112821476426732624508,66,4.242424,0.823886,0.533405,2.262931,Sultan's Market
103176555101355324253,45,4.044444,0.903417,0.527756,2.13448,Bub City
117989694645594132696,65,3.707692,0.878974,0.533954,1.979737,Chipotle
100252657326603969904,41,4.146341,0.853258,0.410274,1.701134,Roots Handmade Pizza
107879486909228835887,65,4.169231,0.858218,0.393066,1.638783,The Art of Pizza
107145016482912535785,41,4.243902,0.969033,0.383058,1.625662,Three Aces


## 2. Model-free collaborative filtering

1) **Item-item similarity based on user rating profile**

I liked "Lou Malnati's Pizzeria - River North"

#### `You liked this place. You might also like ...`
- recommend the k nearist items to this prefered item
- difficult for new items (information confinement area)
- less personalized

In [10]:
cur_place = "113323940057430428803" # Lou Malnati's Pizzeria - River North
# all reviews of this place
cur_place_rev = restaurant_reviews[restaurant_reviews["gPlusPlaceId"]==cur_place][["gPlusUserId", "gPlusPlaceId", "rating"]]
# all places this current place's reviewers also rated
# and get review counts for each place
review_count = restaurant_reviews[restaurant_reviews["gPlusUserId"].isin(cur_place_rev["gPlusUserId"])].groupby("gPlusPlaceId")["rating"].count()
# utility matrix
util_mat = pd.pivot_table(
    data=restaurant_reviews[
        (restaurant_reviews["gPlusUserId"].isin(cur_place_rev["gPlusUserId"])) &
        (restaurant_reviews["gPlusPlaceId"].isin(review_count[review_count>5].index))
    ],
    values="rating",
    index="gPlusUserId",
    columns="gPlusPlaceId",
)
util_mat

gPlusPlaceId,100199306044415996142,100205881314444220407,100254219887109012864,100282811086308265590,100700249890815282981,100701963402087948560,100863928547461103392,101029780710086662158,101097541684591248783,101250692374788073906,...,117456477230560896376,117461567266001867636,117463995629081312281,117547816554417724132,117828697210626616313,117848916184386580966,117883521543351079180,117897055725908382944,117994119729310504354,118287305628878461968
gPlusUserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002842042743258817,,,,,,,,,,,...,,,,,,,,,,
100205700132584907031,,,,,,,,,,,...,,,,,,,,,,
100288926844943293334,,,,,,,,,,,...,,,,,,,,,,
100364026395276833091,,,,,,,,,,,...,,,,,,,,,,
100375468260384609195,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118220314771111147334,,,,,,,,,,,...,,3.0,,,,,,,,
118234783420005686408,,,,,,,,,,,...,,,,,,,,,,
118252287369254921684,,,,,,,,,,,...,,,,,,,,,,
118325022531310056373,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# places that has a similar user rating provile as the current place
cur_place_corr = util_mat.corrwith(cur_place_rev.set_index("gPlusUserId")["rating"]).dropna()
# sample and rank
cur_place_corr.sample(frac=0.1).sort_values(ascending=False).to_frame(name="corr").join(places["name"]).head(10)

Unnamed: 0_level_0,corr,name
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1
104906831812811596204,0.774597,The Original Pancake House
109224633363000918002,0.756167,Billy Goat Tavern
115548193648290595548,0.742781,Wishbone Restaurant
111105500595448501663,0.600397,Rainforest Cafe
117072529310447548474,0.440086,Mike Ditka's Restaurant
110414189941733040442,0.408248,Twisted Spoke
110202234679870279189,0.381881,Al's Beef
117547816554417724132,0.316228,Pizzeria Due
101621464224083207515,0.29277,Old Town Pour House
101029780710086662158,0.184637,Little Goat


## 2. Model-free collaborative filtering

2) **User-user similarity based on item rating profile**

#### `Other people who liked this place also like ...`

- Select k nearest users, recommend most popular restaurants among them
- Require item rating profile from the current user
- Higher variance

## 3. Model based collaborative filtering

#### - The model: interaction matrix is the dot product of _latent features_ of users and items

#### - Matrix factorization: SVD, Funk SVD

#### - Deep learning embedding

In [12]:
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error

df = restaurant_reviews.rename(
    columns={
        "gPlusPlaceId": "i_id",
        "gPlusUserId": "u_id"
    }
)

train = df.sample(frac=0.8, random_state=0)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=0)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(learning_rate=0.01, regularization=0.02, n_epochs=100,
          n_factors=10, min_rating=1, max_rating=5)
svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

Preprocessing data...

Epoch 1/100  | val_loss: 1.11 - val_rmse: 1.05 - val_mae: 0.80 - took 1.6 sec
Epoch 2/100  | val_loss: 1.09 - val_rmse: 1.04 - val_mae: 0.80 - took 0.0 sec
Epoch 3/100  | val_loss: 1.07 - val_rmse: 1.03 - val_mae: 0.79 - took 0.0 sec
Epoch 4/100  | val_loss: 1.06 - val_rmse: 1.03 - val_mae: 0.79 - took 0.0 sec
Epoch 5/100  | val_loss: 1.05 - val_rmse: 1.02 - val_mae: 0.79 - took 0.0 sec
Epoch 6/100  | val_loss: 1.04 - val_rmse: 1.02 - val_mae: 0.78 - took 0.0 sec
Epoch 7/100  | val_loss: 1.03 - val_rmse: 1.02 - val_mae: 0.78 - took 0.0 sec
Epoch 8/100  | val_loss: 1.03 - val_rmse: 1.01 - val_mae: 0.78 - took 0.0 sec
Epoch 9/100  | val_loss: 1.03 - val_rmse: 1.01 - val_mae: 0.78 - took 0.0 sec
Epoch 10/100 | val_loss: 1.02 - val_rmse: 1.01 - val_mae: 0.78 - took 0.0 sec
Epoch 11/100 | val_loss: 1.02 - val_rmse: 1.01 - val_mae: 0.78 - took 0.0 sec
Epoch 12/100 | val_loss: 1.02 - val_rmse: 1.01 - val_mae: 0.78 - took 0.0 sec

Training took 3 sec


<funk_svd.svd.SVD at 0x14ed7dc8>

In [13]:
pred = svd.predict(test)
mae = mean_absolute_error(test['rating'], pred)
mae

0.7687392897207723

In [14]:
# pick a user from users latent factor matrix 
import random
a_user = svd.pu[random.randint(0, svd.pu.shape[0]-1),:]
a_user

array([-0.18283137,  0.16531507, -0.0608287 ,  0.17419338,  0.10318207,
        0.08645731, -0.24240123,  0.14659161, -0.0810965 , -0.08869372])

In [15]:
# get user rating of all items
import numpy as np
a_uesr_rating = np.matmul(svd.qi, a_user)+svd.global_mean
a_user_df = pd.DataFrame(data=a_uesr_rating, index=svd.item_dict.keys(), columns=["pred_rating"])
a_user_df.sample(frac=0.1).sort_values(by="pred_rating", ascending=False).join(restaurants["name"]).head(10)

Unnamed: 0,pred_rating,name
102324339366555832180,4.231696,Asian Sweets
102250889214598410789,4.229202,Niku Niku Toyo
117743854970367159482,4.206649,KFC Gurnee
111623329334501182946,4.206415,La Roqueta
113840703420596938063,4.205838,Pizza Hut
100123535875809706791,4.195857,Mike's Pizzeria
102509845343060103660,4.193117,Tong's Hunan Restaurant
110920984080575977359,4.190401,Over Easy Café
111646534064556834777,4.18986,Burrito Beach
117168274089711966594,4.189051,Restaurant Ararat


## 4. Content-Based Recommender

#### - Explicit features of items and users

#### - Feature processing

#### - Scalability 

In [16]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
# preprocessing
restaurants["num$"] = restaurants["price"].apply(lambda x: len(x) if type(x)==str else None)
df = restaurants[["lat", "long", "num$"]].join(pop[["popularity"]]).join(pd.DataFrame(data=svd.qi, index=svd.item_dict.keys()))

imputer = SimpleImputer(strategy="median")
scaler = MinMaxScaler()
X = imputer.fit_transform(df)
X = scaler.fit_transform(X)

nn = NearestNeighbors(n_neighbors=10, n_jobs=-1)
nn.fit(X)
d, i = nn.kneighbors(X[restaurants.index==cur_place])

df

Unnamed: 0_level_0,lat,long,num$,popularity,0,1,2,3,4,5,6,7,8,9
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
109732449745213650739,41.627895,-87.698000,,0.082450,,,,,,,,,,
104172507453522212414,41.776340,-87.873955,,0.048701,0.089212,-0.017286,0.059501,0.087992,-0.085542,0.095051,-0.008874,0.150383,-0.107815,0.030942
104594164305625743160,41.822511,-87.616570,2.0,1.750248,0.139988,-0.125279,0.118263,-0.180355,0.077776,0.073447,-0.149685,-0.009102,-0.068623,0.088835
116113404563544723933,41.711720,-88.067319,3.0,0.639256,-0.089051,-0.015773,0.116178,0.001340,-0.049817,-0.040515,0.009462,-0.101169,0.054930,0.215696
111893831217900737869,42.009931,-88.144971,3.0,0.152225,-0.009928,0.021961,-0.119395,-0.020592,0.169615,0.012975,-0.141713,-0.091019,-0.137992,-0.095903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112182063035786859065,41.890705,-87.630869,2.0,2.795242,-0.040544,0.103580,0.141328,0.014569,-0.217216,0.064556,-0.158159,-0.088433,0.077226,0.035924
113174876647727547296,42.016500,-88.146250,2.0,0.195087,0.035373,-0.019830,-0.013405,-0.032934,-0.013010,0.011152,0.081117,0.141722,-0.191811,0.079996
117976747639574641902,41.832525,-87.795600,2.0,0.243363,-0.078209,-0.030791,0.014238,0.097165,-0.081729,0.008089,0.074273,-0.147126,0.037470,0.076703
116836257060300466195,42.124622,-88.063506,,0.098969,-0.363291,-0.050278,-0.017214,-0.041382,-0.026811,0.036162,0.189780,-0.127241,-0.097566,0.085112


In [17]:
# our final recommendation
rec = places.loc[df.index[i[0]]]
rec["distance"] = d[0]
rec

Unnamed: 0_level_0,name,price,closed,lat,long,distance
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
113323940057430428803,Lou Malnati's Pizzeria - River North,$$,False,41.890371,-87.633859,0.0
101646397214680360675,Butcher & The Burger,,False,41.917938,-87.654308,0.327122
107759450539450251450,Wildberry Pancakes & Cafe,$$,False,41.884823,-87.622956,0.33214
110747772392848664111,Bandera,$$,False,41.891874,-87.623676,0.352148
103099812915938305491,Toro Sushi,$$,False,41.928973,-87.642624,0.35648
118189991821825790034,Julius Meinl Cafe,$$,False,41.947351,-87.663789,0.374537
101250692374788073906,Jake Melnick's Corner Tap,$$,False,41.895581,-87.626532,0.375384
108991109845699664902,American Junkie,$$,False,41.890626,-87.628697,0.376727
114662299966649482685,Palace Grill Restaurant,,False,41.881685,-87.662423,0.381238
100228374295811796305,Wakamono Sushi Bar,$$,False,41.942514,-87.644359,0.385854


#### References
1. https://developers.google.com/machine-learning/recommendation
2. https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada
3. https://towardsdatascience.com/recommender-systems-in-practice-cef9033bb23a
4. http://nicolas-hug.com/blog/matrix_facto_3