In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer, util
from pathlib import Path
import os, ast
from typing import List
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Utility Functions

In [3]:
def load_embeddings(file_path: str) -> pd.DataFrame:
    embeddings_df = pd.read_csv(file_path)
    return embeddings_df

In [4]:
def compute_text_embeddings(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    model = SentenceTransformer(model_name)

    print("computing embeddings ", df.shape)
    features_to_encode = df["combined_text"].values

    embeddings = model.encode(features_to_encode)

    df["embeddings"] = embeddings.tolist()
    df["embeddings"] = df["embeddings"].to_numpy()

    file_name = f"{model_name}_zomato_embeddings.csv"
    file_path = f"{os.getcwd()}/../data/embeddings/{file_name}"
    print(file_path)
    df[["name", "embeddings"]].to_csv(file_path, index=False)

    print("embeddings computed")
    return df

In [5]:
def get_text_embeddings(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    file_name = f"{model_name}_zomato_embeddings.csv"
    file_path = Path(f"{os.getcwd()}/../data/embeddings/{file_name}")

    if file_path.is_file():
        embeddings_df = load_embeddings(file_path)
        embeddings_df["embeddings"] = embeddings_df["embeddings"].apply(lambda x: np.array(ast.literal_eval(x)))
    else:
        embeddings_df = compute_text_embeddings(df, model_name)
    return embeddings_df

In [6]:
def convert_to_tensor(query_vals, remaining_vals):

    if type(query_vals) is list:
        query_embeddings = torch.FloatTensor(query_vals).float()
    else:
        query_embeddings = torch.from_numpy(query_vals).float()
    remaining_embeddings = np.vstack(remaining_vals).astype(float)
    remaining_embeddings = torch.from_numpy(remaining_embeddings).float()
    return query_embeddings, remaining_embeddings

In [7]:
def compute_cosine_sim(query_vector, remaining_vector):
    results = util.cos_sim(query_vector, remaining_vector)
    return results.flatten().tolist()


## Recommendations

In [8]:
def recommend(query_name: str, df: pd.DataFrame):
    # TODO: change this automatically based on features selected
    df["combined_text"] = df["cuisine"] + " " + df["timing"] + " " + str(df["cost"]) + " " + str(df["rating"])
    embeddings_df = get_text_embeddings(df[["name", "combined_text"]], MODEL)

    query_cuisine = df.loc[df["name"] == query_name]["cuisine"].values[0]
    print(f"recommendations similar to {query_name} of {query_cuisine} cuisine are as follows \n")
    df_remaining = df.loc[df["name"] != query_name]

    query_embeddings = embeddings_df.loc[embeddings_df["name"] == query_name]["embeddings"].values[0]
    remaining_embeddings = embeddings_df.loc[embeddings_df["name"] != query_name]["embeddings"].values

    # converting them from numpy ndarray to tensors
    query_embeddings, remaining_embeddings = convert_to_tensor(query_embeddings, remaining_embeddings)

    results = compute_cosine_sim(query_embeddings, remaining_embeddings)
    df_remaining.loc[:, "sim_scores"] = results
    df_remaining = df_remaining.sort_values(by=["sim_scores"], ascending=False)

    return df_remaining.reset_index(drop=True)

In [9]:
MODEL = "all-MiniLM-L6-v2"

In [10]:
data = pd.read_csv("../data/Zomato_cleaned.csv")
data.head()

Unnamed: 0,name,voteCount,rating,address,cuisine,cost,timing
0,Peter Cat,12404,4.2,"18A, Park Street, Park Street Area, Kolkata","North Indian, Continental",1000.0,11am – 11:20pm (Mon-Sun)
1,Naturals Ice Cream,2498,4.9,"77/1/A, Ground Floor, Near West Bengal Board, ...","North Indian, Continental",200.0,11am – 12midnight (Mon-Sun)
2,Carpe Diem,4083,4.4,"18M, Park Street Area, Kolkata",Ice Cream,1000.0,"12noon – 12midnight (Mon, Tue, Wed, Thu, Fri..."
3,Barbeque Nation,5455,4.6,"1st Floor, 24, Park Center Building, Park Stre...",Ice Cream,1800.0,"12noon – 3:30pm, 6:30pm – 10:45pm (Mon-Sun)"
4,Flurys,4709,4.2,"15, Apeejay House, Park Street Area, Kolkata","Chinese, North Indian, Continental",400.0,7:30am – 11pm (Mon-Sun)


In [11]:
data.shape

(5193, 7)

In [12]:
data.dtypes

name          object
voteCount      int64
rating       float64
address       object
cuisine       object
cost         float64
timing        object
dtype: object

In [14]:
recommendations = recommend("Aahar", data)
recommendations.head()

recommendations similar to Aahar of Chinese, North Indian cuisine are as follows 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining.loc[:, "sim_scores"] = results


Unnamed: 0,name,voteCount,rating,address,cuisine,cost,timing,combined_text,sim_scores
0,Haji Malang,137,3.6,"257, Panchanantala Road, Howrah Maidan Area, H...","Chinese, North Indian",300.0,12noon – 10pm (Mon-Sun),"Chinese, North Indian 12noon – 10pm (Mon-Sun) ...",1.0
1,New Arsalaan Biryani,114,3.8,"189/B/1, Santoshpur Avenue, Jora Bridge, Santo...","Chinese, North Indian",200.0,12noon – 10pm (Mon-Sun),"Chinese, North Indian 12noon – 10pm (Mon-Sun) ...",1.0
2,Bhojon Rasik,30,3.3,"42/A, Gopal Lal Tagore Road, Near Ramkrishna M...","Chinese, North Indian",500.0,12noon – 10pm (Mon-Sun),"Chinese, North Indian 12noon – 10pm (Mon-Sun) ...",1.0
3,New Tandoori Nights,442,2.7,"Shop 31,CE Market, Sector 1, Salt Lake","Chinese, North Indian",250.0,12noon – 10pm (Mon-Sun),"Chinese, North Indian 12noon – 10pm (Mon-Sun) ...",1.0
4,One By One Cafe,37,3.3,"1/1 Ramani Chatterjee Road, Gariahat, Kolkata","Chinese, North Indian",500.0,12noon – 9pm (Mon-Sun),"Chinese, North Indian 12noon – 9pm (Mon-Sun) 0...",0.999736
