In [2]:
import pickle
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [3]:
DIR = "../attachment_intern/"

In [4]:
# user id to recommend restaurants for
USER_ID = "u00000"

# load model from pickle file
with open(DIR + "model.pkl", "rb") as f:
    model: NearestNeighbors = pickle.load(f)

In [5]:
# load user and restaurant data
user_df = pd.read_parquet(DIR + "user.small.parquet")
restaurant_df = pd.read_parquet(DIR + "restaurant.parquet").set_index("index")

In [6]:
user_df = pd.read_parquet(DIR + "user.parquet")
len(user_df)

100000

In [7]:
user_df.head()

Unnamed: 0,user_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_990,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999
0,u00000,0.193018,1.235771,0.478129,0.056068,0.79887,0.942501,-0.186956,-0.186927,-0.137488,...,0.28967,0.529229,0.658945,0.313935,1.343077,-0.160149,1.79604,0.078384,0.460348,0.22815
1,u00001,0.542445,0.061676,-0.172018,0.366714,0.690263,0.565948,0.030949,0.47896,0.033966,...,-0.564581,1.279485,-0.600161,-0.370372,1.984003,1.414539,1.877632,1.124837,2.27337,1.369663
2,u00002,0.501249,1.194607,0.750967,-0.078167,0.361179,0.124189,0.736151,1.427973,-0.676132,...,-0.185844,-0.247236,1.687951,0.849775,0.671021,-0.654202,-0.11887,0.419687,0.391202,-0.808181
3,u00003,0.613723,-0.652035,1.161827,0.122698,0.828054,-0.29604,0.563028,0.711285,0.097485,...,0.720368,0.070521,0.338198,0.885493,0.889271,0.778233,1.164597,0.572195,0.062597,-0.822071
4,u00004,0.17965,0.166768,0.337015,0.594367,0.897039,0.858783,-0.523686,0.813889,0.517246,...,0.404695,0.707832,0.700675,1.538892,0.114889,0.793582,-0.071324,1.282532,-0.052153,0.395376


In [8]:
restaurant_df.head(), len(restaurant_df)

(      restaurant_id            latitude           longitude
 index                                                      
 0             r0000  13.844851141246949   100.3086836106867
 1             r0001  13.781825896541061  100.84977532273388
 2             r0002  13.883572268183674   100.3085459374542
 3             r0003  14.066725452665281   100.7256760943531
 4             r0004  13.961173933525883  100.39580661948798,
 10000)

In [9]:
user_df[user_df["user_id"] == USER_ID].drop(columns="user_id").shape

(1, 1000)

In [10]:
# find 20 nearest neighbors to be recommend restaurants
dist, ind = model.kneighbors(
    user_df[user_df["user_id"] == USER_ID].drop(columns="user_id"), n_neighbors=20
)

dist, ind

(array([[23.66758003, 23.80081254, 23.83828907, 24.07205225, 24.16645254,
         24.18236662, 24.22441798, 24.23519259, 24.23628432, 24.26967065,
         24.27378404, 24.28877668, 24.29956383, 24.30405023, 24.30567711,
         24.33704798, 24.34151423, 24.34254504, 24.35122189, 24.35462439]]),
 array([[1737, 2116,  862, 1060, 5083, 4855, 7495, 7182, 9669, 8749, 2316,
         2159, 4255, 7596, 7523,  782, 7075, 2751, 3382, 9548]]))

In [11]:
# get restaurant id from restaurant indices returned from the model
recommend_df = restaurant_df.loc[ind[0]]

# set distance as restaurant score
recommend_df["score"] = dist[0]

In [12]:
ind[0].shape

(20,)

In [13]:
print(recommend_df[["restaurant_id", "score"]].to_json(orient="records", indent=2))

[
  {
    "restaurant_id":"r1737",
    "score":23.6675800283
  },
  {
    "restaurant_id":"r2116",
    "score":23.8008125402
  },
  {
    "restaurant_id":"r0862",
    "score":23.8382890655
  },
  {
    "restaurant_id":"r1060",
    "score":24.0720522494
  },
  {
    "restaurant_id":"r5083",
    "score":24.166452545
  },
  {
    "restaurant_id":"r4855",
    "score":24.182366617
  },
  {
    "restaurant_id":"r7495",
    "score":24.2244179801
  },
  {
    "restaurant_id":"r7182",
    "score":24.2351925876
  },
  {
    "restaurant_id":"r9669",
    "score":24.2362843205
  },
  {
    "restaurant_id":"r8749",
    "score":24.2696706463
  },
  {
    "restaurant_id":"r2316",
    "score":24.2737840441
  },
  {
    "restaurant_id":"r2159",
    "score":24.2887766768
  },
  {
    "restaurant_id":"r4255",
    "score":24.2995638295
  },
  {
    "restaurant_id":"r7596",
    "score":24.3040502318
  },
  {
    "restaurant_id":"r7523",
    "score":24.3056771101
  },
  {
    "restaurant_id":"r0782",
    "sc

In [14]:
request_df = pd.read_parquet(DIR + "request.parquet")
request_df.head()

Unnamed: 0,user_id,latitude,longitude,size,sort_dis,max_dis
0,u83153,14.068817,100.646536,50,0.0,5000.0
1,u45712,14.109562,100.69869,50,1.0,5000.0
2,u52829,13.727387,100.830825,50,1.0,5000.0
3,u11570,13.921809,100.468203,20,1.0,5000.0
4,u99991,13.804917,100.682749,50,,5000.0


In [15]:
request_df.to_dict(orient="records")

[{'user_id': 'u83153',
  'latitude': 14.068817471971206,
  'longitude': 100.64653564859792,
  'size': 50,
  'sort_dis': 0.0,
  'max_dis': 5000.0},
 {'user_id': 'u45712',
  'latitude': 14.109562142994584,
  'longitude': 100.6986897188348,
  'size': 50,
  'sort_dis': 1.0,
  'max_dis': 5000.0},
 {'user_id': 'u52829',
  'latitude': 13.72738690784919,
  'longitude': 100.83082543338223,
  'size': 50,
  'sort_dis': 1.0,
  'max_dis': 5000.0},
 {'user_id': 'u11570',
  'latitude': 13.921808757777047,
  'longitude': 100.46820296398501,
  'size': 20,
  'sort_dis': 1.0,
  'max_dis': 5000.0},
 {'user_id': 'u99991',
  'latitude': 13.804916705815488,
  'longitude': 100.68274869836898,
  'size': 50,
  'sort_dis': nan,
  'max_dis': 5000.0},
 {'user_id': 'u38657',
  'latitude': 13.755984152670258,
  'longitude': 100.52460461785762,
  'size': 50,
  'sort_dis': nan,
  'max_dis': 10000.0},
 {'user_id': 'u95302',
  'latitude': 14.030551991846853,
  'longitude': 100.79957194321223,
  'size': 20,
  'sort_dis':

In [16]:
request_df.isnull().sum()

user_id        0
latitude       0
longitude      0
size           0
sort_dis     589
max_dis      192
dtype: int64

In [17]:
request_df[request_df["user_id"] == "u83153"]

Unnamed: 0,user_id,latitude,longitude,size,sort_dis,max_dis
0,u83153,14.068817,100.646536,50,0.0,5000.0


In [18]:
request_df["sort_dis"].value_counts()

sort_dis
1.0    786
0.0    625
Name: count, dtype: int64

In [19]:
request_df["size"].value_counts()

size
50    1788
20     212
Name: count, dtype: int64

In [20]:
request_df["max_dis"].value_counts()

max_dis
5000.0     1411
1000.0      208
10000.0     189
Name: count, dtype: int64