# Google Local Recommenders

#### Agnes Bao

The goal of this project is to show what I learned about the methods and algorithms of recommendation systems, demonstrated using the Google Local data (http://cseweb.ucsd.edu/~jmcauley/datasets.html#google_local)

### Show Me the Data

In [1]:
import warnings
warnings.filterwarnings('ignore')

import gzip
import pprint
import pandas as pd

def sneakpeek(fname):
    with gzip.open(fname) as f:
        print(fname)
        line = eval(next(f))
        pprint.pprint(line)

* Places

In [2]:
sneakpeek("data/places.clean.json.gz")

data/places.clean.json.gz
{'address': ['2615 Angler Ave', 'Hemet, CA 92545'],
 'closed': False,
 'gPlusPlaceId': '104699454385822125632',
 'gps': [33.703804, -117.003209],
 'hours': [['Monday', [['6:30 am--4:15 pm']]],
           ['Tuesday', [['6:30 am--4:15 pm']]],
           ['Wednesday', [['6:30 am--4:15 pm']], 1],
           ['Thursday', [['6:30 am--4:15 pm']]],
           ['Friday', [['6:30 am--4:15 pm']]],
           ['Saturday', [['6:30 am--4:15 pm']]],
           ['Sunday', [['6:30 am--4:15 pm']]]],
 'name': 'Diamond Valley Lake Marina',
 'phone': '(951) 926-7201',
 'price': None}


* Reviews:

In [3]:
sneakpeek("data/reviews.clean.json.gz")

data/reviews.clean.json.gz
{'categories': ['Giải Trí - Café'],
 'gPlusPlaceId': '108103314380004200232',
 'gPlusUserId': '100000010817154263736',
 'rating': 3.0,
 'reviewText': 'Chất lượng tạm ổn',
 'reviewTime': 'Jul 1, 2013',
 'reviewerName': 'an lam',
 'unixReviewTime': 1372686659}


* Users data is mostly unstructured and is not very useful

In [4]:
sneakpeek("data/users.clean.json.gz")

data/users.clean.json.gz
{'currentPlace': ['Thành phố Hồ Chí Minh, Việt Nam',
                  [[], 108230990, 1066296640, 1]],
 'education': [[[], [], [], [], [], 6],
               [['Đại học Kiến trúc tp Hồ chí minh, Việt nam',
                 'KS.XD',
                 [[], [1, 1, 2013], 1],
                 '',
                 '']]],
 'gPlusUserId': '100000010817154263736',
 'jobs': [['Tổng công ty IDICO',
           'Chuyên viên Kỹ thuật XD',
           [[1, 1, 1998], [1, 1, 2013], 1],
           '',
           '']],
 'previousPlaces': [['tp. Nam Định, Nam Định, Việt Nam',
                     [[], 204200000, 1061683330, 1]]],
 'userName': 'an lam'}


### After making the dataset easier to work with...

In [5]:
import dask.dataframe as dd
places = dd.read_parquet("data/places*.parquet")
places.head()

Unnamed: 0,name,price,closed,gPlusPlaceId,lat,long
0,Diamond Valley Lake Marina,,False,104699454385822125632,33.703804,-117.003209
1,Blue Ribbon Cleaners,,False,103054478949000078829,38.979759,-76.547538
2,Portofino,,False,109810290098030327104,43.22776,44.762726
3,T C's Referee Sports Bar,$$,False,100327153115986850675,43.529494,-96.792244
4,Carrefour - Palembang Square,,False,103368487323937936043,-2.976256,104.742662


In [6]:
reviews = dd.read_parquet("data/reviews_*.parquet")
reviews.head()

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,unixReviewTime,reviewTime,gPlusUserId
0,3.0,an lam,[Giải Trí - Café],108103314380004200232,1372687000.0,"Jul 1, 2013",100000010817154263736
1,5.0,HALİL TURGUT,[Turkish Cuisine],102194128241608748649,1342871000.0,"Jul 21, 2012",100000013500285534661
2,5.0,森田さとこ,"[Fishing, Pond Fish Supplier, Seafood Market]",101409858828175402384,1390654000.0,"Jan 25, 2014",100000021336848867366
3,5.0,森田さとこ,[Museum],101477177500158511502,1389188000.0,"Jan 8, 2014",100000021336848867366
4,4.0,森田さとこ,[Police],106994170641063333085,1390486000.0,"Jan 23, 2014",100000021336848867366


## 1. Popularity based recommender
I'm a new user searching for restaurants in 70 miles around (42.0480, -87.6843)...

You know nothing about me.

<img src="data/places.png">

In [7]:
# subset a smaller map area
coord = (42.0480, -87.6843)

def subset_places(df, coord, deg):
    return df[(df["lat"] > coord[0] - deg) & 
              (df["lat"] < coord[0] + deg) & 
              (df["long"] > coord[1] - deg) & 
              (df["long"] < coord[1] + deg)]

places = places.set_index("gPlusPlaceId").drop_duplicates()
places = subset_places(places, coord, deg=0.5)
places = places[~places["closed"]]
places = places.compute()

# subset restaurants
reviews = reviews.merge(places[["name"]], left_on="gPlusPlaceId", right_index=True, how="right")
reviews = reviews.compute()

reviews["categories"] = reviews["categories"].apply(lambda x: ";".join(x) if x is not None else None)
restaurant_reviews = reviews[reviews["categories"].str.contains("restaurant", case=False, na=False)]

# Size of data set we're working with:
print(f"Number of restaurants: {restaurant_reviews['gPlusPlaceId'].nunique()}")
print(f"Number of reviews: {restaurant_reviews.shape[0]}")

Number of restaurants: 8975
Number of reviews: 76921


In [8]:
# popular restaurants
# rating count and mean
pop = restaurant_reviews.groupby(["gPlusPlaceId", "name"])["rating"].agg(["count", "mean", "std"]).reset_index(level=1)
# rating count per month
first_review = pd.to_datetime(restaurant_reviews.groupby("gPlusPlaceId")["unixReviewTime"].min(), unit="s")
pop["count_per_month"] = pop["count"]/((pd.to_datetime("2020-01-01")-first_review).dt.days/30)
# popularity
pop["popularity"] = pop["count_per_month"]*pop["mean"]
# sample and rank
rec = pop.sample(frac=0.1).sort_values(by="popularity", ascending=False)
rec.head(10)

Unnamed: 0_level_0,name,count,mean,std,count_per_month,popularity
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107739044751658371768,Giordano's,208,4.110577,0.846866,1.221374,5.020552
100498053263705916302,Crisp,89,4.550562,0.639802,0.801561,3.647553
114591933512613513560,Lula Cafe,100,4.45,0.808728,0.683839,3.043082
106757378058059356379,P.F. Chang's,79,4.050633,0.845785,0.711285,2.881152
114750640397243234714,Lillie's Q,70,4.442857,0.694402,0.634058,2.817029
101250692374788073906,Jake Melnick's Corner Tap,71,4.253521,0.750587,0.651974,2.773186
115080027917661556608,Siena Tavern,43,4.418605,0.62612,0.519952,2.297461
106878921059236580288,Scofflaw,44,4.613636,0.57933,0.475676,2.194595
105470414311814116306,Berghoff Catering & Restaurant Group,98,3.806122,0.959576,0.575906,2.191969
115709739959060992854,Chicago Pizza & Oven Grinder Company,79,4.202532,0.897,0.521337,2.190937


## 2. Model-free collaborative filtering

1) **Item-item similarity based on user rating profile**

I liked "Lou Malnati's Pizzeria - River North"

#### `You liked this place. You might also like ...`
- recommend the k nearist items to this prefered item
- difficult for new items (information confinement area)
- less personalized

In [10]:
cur_place = "113323940057430428803" # Lou Malnati's Pizzeria - River North
# all reviews of this place
cur_place_rev = restaurant_reviews.loc[restaurant_reviews["gPlusPlaceId"]==cur_place, ["gPlusUserId", "gPlusPlaceId", "rating"]]
# all places this current place's reviewers also rated
# and get review counts for each place
review_count = restaurant_reviews[restaurant_reviews["gPlusUserId"].isin(cur_place_rev["gPlusUserId"])].groupby("gPlusPlaceId")["rating"].count()
# utility matrix
util_mat = pd.pivot_table(
    data=restaurant_reviews[
        (restaurant_reviews["gPlusUserId"].isin(cur_place_rev["gPlusUserId"])) &
        (restaurant_reviews["gPlusPlaceId"].isin(review_count[review_count>5].index))
    ],
    values="rating",
    index="gPlusUserId",
    columns="gPlusPlaceId",
)
util_mat

gPlusPlaceId,100199306044415996142,100205881314444220407,100254219887109012864,100282811086308265590,100520627144025256485,100700249890815282981,100701963402087948560,100863928547461103392,101029780710086662158,101097541684591248783,...,117487242634817497062,117547816554417724132,117828697210626616313,117848916184386580966,117883521543351079180,117897055725908382944,117981760190455907001,117994119729310504354,118287305628878461968,118341094737243669337
gPlusUserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002842042743258817,,,,,,,,,,,...,,,,,,,,,,
100205700132584907031,,,,,,,,,,,...,,,,,,,,,,
100288926844943293334,,,,,,,,,,,...,,,,,,,,,,
100364026395276833091,,,,,,,,,,,...,,,,,,,,,,
100375468260384609195,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118220314771111147334,,,,,,,,,,,...,,,,,,,,,,
118234783420005686408,,,,,,,,,,,...,,,,,,,,,,
118252287369254921684,,,,,,,,,,,...,,,,,,,,,,
118325022531310056373,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# places that has a similar user rating provile as the current place
cur_place_corr = util_mat.corrwith(cur_place_rev.set_index(["gPlusUserId"])["rating"]).dropna()
# sample and rank
cur_place_corr.sample(frac=0.1).sort_values(ascending=False).to_frame(name="corr").join(places["name"]).head(10)

Unnamed: 0_level_0,corr,name
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1
101702678014294138515,0.77246,Heaven on Seven
106039536035413651782,0.75,Chicago Cut Steakhouse
114851610796405270991,0.661438,Bongo Room
107739044751658371768,0.636919,Giordano's
107944093167831776994,0.342291,Pequod's Pizza
101621464224083207515,0.29277,Old Town Pour House
115709739959060992854,0.286972,Chicago Pizza & Oven Grinder Company
103575142154334635317,0.248018,Piece Brewery & Pizzeria
114506345829113475784,0.218218,McDonald's
110121240885311259456,0.176505,Mercadito Chicago


## 2. Model-free collaborative filtering

2) **User-user similarity based on item rating profile**

#### `Other people who liked this place also like ...`

- Select k nearest users, recommend most popular restaurants among them
- Require item rating profile from the current user
- Higher variance

## 3. Model based collaborative filtering

#### - The model: interaction matrix is the dot product of _latent features_ of users and items

#### - Matrix factorization: SVD, Funk SVD

#### - Deep learning embedding

In [12]:
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error

df = restaurant_reviews.rename(
    columns={
        "gPlusPlaceId": "i_id",
        "gPlusUserId": "u_id"
    }
)

train = df.sample(frac=0.8, random_state=0)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=0)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(learning_rate=0.01, regularization=0.02, n_epochs=100,
          n_factors=10, min_rating=1, max_rating=5)
svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

Preprocessing data...

Epoch 1/100  | val_loss: 1.06 - val_rmse: 1.03 - val_mae: 0.79 - took 0.7 sec
Epoch 2/100  | val_loss: 1.04 - val_rmse: 1.02 - val_mae: 0.78 - took 0.0 sec
Epoch 3/100  | val_loss: 1.02 - val_rmse: 1.01 - val_mae: 0.78 - took 0.0 sec
Epoch 4/100  | val_loss: 1.01 - val_rmse: 1.00 - val_mae: 0.78 - took 0.0 sec
Epoch 5/100  | val_loss: 1.00 - val_rmse: 1.00 - val_mae: 0.77 - took 0.0 sec
Epoch 6/100  | val_loss: 0.99 - val_rmse: 1.00 - val_mae: 0.77 - took 0.0 sec
Epoch 7/100  | val_loss: 0.98 - val_rmse: 0.99 - val_mae: 0.77 - took 0.0 sec
Epoch 8/100  | val_loss: 0.98 - val_rmse: 0.99 - val_mae: 0.77 - took 0.0 sec
Epoch 9/100  | val_loss: 0.98 - val_rmse: 0.99 - val_mae: 0.76 - took 0.0 sec
Epoch 10/100 | val_loss: 0.97 - val_rmse: 0.99 - val_mae: 0.76 - took 0.0 sec
Epoch 11/100 | val_loss: 0.97 - val_rmse: 0.98 - val_mae: 0.76 - took 0.0 sec
Epoch 12/100 | val_loss: 0.97 - val_rmse: 0.98 - val_mae: 0.76 - took 0.0 sec
Epoch 13/100 | val_loss: 0.97 - val_rmse:

<funk_svd.svd.SVD at 0x7faf8c9d2e90>

In [13]:
pred = svd.predict(test)
mae = mean_absolute_error(test['rating'], pred)
mae

0.7675678012036727

In [14]:
# pick a user from users latent factor matrix 
import random
a_user = svd.pu[random.randint(0, svd.pu.shape[0]-1),:]
a_user

array([ 0.08444672,  0.02228627, -0.04478465, -0.05332616,  0.10794631,
        0.001988  ,  0.04442098,  0.11977786, -0.06785335, -0.19495033])

In [15]:
# get user rating of all items
import numpy as np
a_uesr_rating = np.matmul(svd.qi, a_user)+svd.global_mean
a_user_df = pd.DataFrame(data=a_uesr_rating, index=svd.item_dict.keys(), columns=["pred_rating"])
a_user_df.sample(frac=0.1).sort_values(by="pred_rating", ascending=False).join(places["name"]).head(10)

Unnamed: 0,pred_rating,name
102665645105399820658,4.168841,Potbelly Sandwich Shop
109892629981685306367,4.161515,Nano sushi
114486776519301564362,4.145435,Red Lobster
118249673019416020139,4.145395,Fiamme
118082143550442702489,4.143318,El Lago
109075952281710943090,4.138922,KFC Chicago
110918984754336577233,4.136529,La Fiesta Azteca
103748049369509982958,4.134133,Vinci
110609540086200619649,4.133828,25 Degrees Chicago
115436799676387750515,4.133429,Wing Wah


## 4. Content-Based Recommender

#### - Explicit features of items and users

#### - Feature processing

#### - Scalability 

In [16]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
# preprocessing
places["num$"] = places["price"].apply(lambda x: len(x) if isinstance(x, str) else None)
df = places[["lat", "long", "num$"]].join(pop[["popularity"]], how="inner").join(pd.DataFrame(data=svd.qi, index=svd.item_dict.keys()))

imputer = SimpleImputer(strategy="median")
scaler = MinMaxScaler()
X = imputer.fit_transform(df)
X = scaler.fit_transform(X)

nn = NearestNeighbors(n_neighbors=10, n_jobs=-1)
nn.fit(X)
d, i = nn.kneighbors(X[df.index==cur_place])

df

Unnamed: 0_level_0,lat,long,num$,popularity,0,1,2,3,4,5,6,7,8,9
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100000116602613979588,41.586664,-87.716931,,0.113208,0.032273,-0.135647,-0.093660,-0.055921,-0.091884,0.300257,0.011193,-0.022051,0.038785,-0.120960
100009096662479920283,41.927056,-87.672115,2.0,0.250597,0.162955,0.062904,0.031999,0.144115,-0.053643,-0.040472,0.028285,-0.196695,-0.048578,-0.000702
100009897937377477102,41.968203,-87.688709,3.0,0.531324,0.040667,0.001651,-0.112040,0.082147,0.120970,0.069703,0.092514,0.196628,0.072796,0.093253
100012607573075242005,41.893757,-87.631407,2.0,1.235996,0.088247,0.014629,0.169352,0.019631,0.007552,0.127745,-0.115433,0.012671,0.275279,-0.026063
100014807162998766936,42.149062,-87.913639,2.0,0.371552,-0.017230,-0.089687,-0.155172,0.111763,0.228627,-0.148840,0.086316,-0.060088,-0.122987,-0.038294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118438862075330472211,41.748854,-88.164407,2.0,0.100200,0.015151,-0.135321,0.131881,-0.059544,0.061260,0.281669,-0.038277,-0.041848,-0.109864,0.103364
118441860743580892299,41.939442,-87.727292,3.0,0.253262,-0.149975,-0.210103,0.012632,-0.010874,-0.021271,0.014132,-0.072941,-0.058936,0.122129,-0.080521
118442803934982778674,41.902636,-87.628823,3.0,2.484012,0.043105,-0.132609,-0.037402,0.046790,-0.169297,-0.176525,-0.191418,-0.095721,0.004562,-0.084596
118443644338325378077,41.707274,-87.623712,,0.204409,0.253711,-0.164493,-0.110731,-0.201695,0.186837,-0.075046,0.090350,0.102381,-0.092117,-0.191431


In [17]:
# our final recommendation
rec = places.loc[df.index[i[0]]]
rec["distance"] = d[0]
rec

Unnamed: 0_level_0,name,price,closed,lat,long,num$,distance
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113323940057430428803,Lou Malnati's Pizzeria - River North,$$,False,41.890371,-87.633859,2.0,0.0
101646397214680360675,Butcher & The Burger,,False,41.917938,-87.654308,,0.327081
114750640397243234714,Lillie's Q,$$,False,41.910634,-87.674903,2.0,0.332704
114851610796405270991,Bongo Room,$$,False,41.867635,-87.626149,2.0,0.332942
103176555101355324253,Bub City,$$,False,41.890351,-87.630946,2.0,0.335195
112910599830164345191,West Egg Cafe,$$,False,41.893116,-87.620404,2.0,0.35955
106519799572085478346,Frontera Grill,,False,41.890476,-87.630956,,0.389674
107934800075000360329,Zaca Tacos,$$,False,41.785149,-87.722805,2.0,0.394726
109547921608822256407,Anna's Asian Bistro,$$,False,41.885544,-87.648074,2.0,0.395944
105122086657139882635,Rockwell's Neighborhood Grill,$$,False,41.965852,-87.693847,2.0,0.396625


#### References
1. https://developers.google.com/machine-learning/recommendation
2. https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada
3. https://towardsdatascience.com/recommender-systems-in-practice-cef9033bb23a
4. http://nicolas-hug.com/blog/matrix_facto_3