In [1]:
!pip install -U sentence-transformers
!pip install chromadb

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.1
Collecting chromadb
  Downloading chromadb-0.5.7-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.6.6-py2.py3-none-any.whl.metadata (2.0 k

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


## Importing and EDA: Restaurant Reviews

In [77]:
df = pd.read_csv('/content/Restaurant reviews.csv')

In [78]:
df.head(3)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Restaurant  10000 non-null  object 
 1   Reviewer    9962 non-null   object 
 2   Review      9955 non-null   object 
 3   Rating      9962 non-null   object 
 4   Metadata    9962 non-null   object 
 5   Time        9962 non-null   object 
 6   Pictures    10000 non-null  int64  
 7   7514        1 non-null      float64
dtypes: float64(1), int64(1), object(6)
memory usage: 625.1+ KB


In [80]:
df.isnull().sum()

Unnamed: 0,0
Restaurant,0
Reviewer,38
Review,45
Rating,38
Metadata,38
Time,38
Pictures,0
7514,9999


In [81]:
df.drop(columns=['7514','Pictures'], inplace=True)

In [82]:
df['Restaurant'].unique()

array(['Beyond Flavours', 'Paradise', 'Flechazo',
       'Shah Ghouse Hotel & Restaurant', 'Over The Moon Brew Company',
       "The Fisherman's Wharf", 'eat.fit', 'Shah Ghouse Spl Shawarma',
       'Hyper Local', 'Cream Stone', "Sardarji's Chaats & More",
       'Barbeque Nation', 'Absolute Sizzlers',
       'The Lal Street - Bar Exchange', "AB's - Absolute Barbecues",
       'KFC', 'NorFest - The Dhaba', 'Hotel Zara Hi-Fi',
       '10 Downing Street', 'Pakwaan Grand', '13 Dhaba',
       "Jonathan's Kitchen - Holiday Inn Express & Suites", 'B-Dubs',
       'Amul', 'SKYHY', 'Tiki Shack', 'Mustang Terrace Lounge',
       "3B's - Buddies, Bar & Barbecue", 'Behrouz Biryani',
       'Hunger Maggi Point', 'Pot Pourri', 'Pista House',
       'Marsala Food Company', 'Club Rogue',
       'Mazzo - Marriott Executive Apartments',
       'Green Bawarchi Restaurant', 'Banana Leaf Multicuisine Restaurant',
       'Kritunga Restaurant', 'The Glass Onion', 'Deli 9 Bistro',
       'Frio Bistro', 'Kara

In [83]:
df['Reviewer'].fillna('Unknown', inplace=True)

In [84]:
df['Rating'].sort_values().unique()

array(['1', '1.5', '2', '2.5', '3', '3.5', '4', '4.5', '5', 'Like', nan],
      dtype=object)

In [85]:
df[df['Rating']=='Like']['Review'].values

array(['One of the best pizzas to try. It served with the fresh crust and the topping of veggies are fresh and the taste of the ingredients was awesome and it is fully overloaded with Cheese. I would like to recommend to try every Time I wager for pizza'],
      dtype=object)

In [86]:
df[df['Rating'].fillna('nan')=='nan']

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time
8777,American Wild Wings,Unknown,,,,
8778,American Wild Wings,Unknown,,,,
8779,American Wild Wings,Unknown,,,,
8780,American Wild Wings,Unknown,,,,
8781,American Wild Wings,Unknown,,,,
8782,American Wild Wings,Unknown,,,,
8783,American Wild Wings,Unknown,,,,
8784,American Wild Wings,Unknown,,,,
8785,American Wild Wings,Unknown,,,,
8786,American Wild Wings,Unknown,,,,


NaN rating doesn't have any meaning. Additionally they had errors on Time Value. They will be deprecated if Review, Reviewer and Rating are nan

## Cleaning Data:
Creating a Cleaning function based on EDA Insights

In [87]:
from datetime import datetime
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

class Cleaning:
    def __init__(self, df: pd.DataFrame) -> None:
        self.df = df
        self.stop_words = set(stopwords.words('english'))

    def __time_string(self, x: str) -> str:
        try:
            x = x.strftime('%Y')
        except:
            x = datetime.today().strftime('%Y')
        return int(x)

    def __remove_punctuation(self, text: str) -> str:
        return ''.join(char for char in text if char not in punctuation)

    # Function to tokenize text and remove stop words
    def __tokenize_and_remove_stopwords(self, text_column: str) -> None:
        # Function to process each row
        def process_text(text):
            # Remove punctuation
            text = self.__remove_punctuation(text)
            # Tokenize the text
            tokens = word_tokenize(text.lower())  # Convert to lowercase
            # Remove stop words
            filtered_tokens = [word for word in tokens if word not in self.stop_words]
            return filtered_tokens

        # Apply the processing function to the specified text column
        self.df[text_column].fillna('NA', inplace=True)
        self.df[text_column] = self.df[text_column].apply(lambda x: str(x))
        self.df['tokens'] = self.df[text_column].apply(process_text)
        self.df['cleaned_text_column'] = self.df['tokens'].apply(lambda x: ' '.join(x))
        self.df.drop(columns=['tokens'], inplace=True)
        return self.df

    def __cleaning_nans(self) -> pd.DataFrame:
        # Dropping columns
        try:
            self.df.drop(columns=['7514', 'Pictures'], inplace=True)
        except:
            print('Columns already dropped')

        self.df['Reviewer'].fillna('Unknown', inplace=True)
        self.df.fillna(np.nan, inplace=True)
        self.df['drop_filter'] = self.df.apply(lambda x: pd.isna(x['Rating']) and pd.isna(x['Review']), axis=1)
        self.df = self.df[self.df['drop_filter'] != True]
        self.df.drop(columns=['drop_filter'], inplace=True)
        self.df['Time'] = pd.to_datetime(self.df['Time'])
        self.df['Time'] = self.df['Time'].apply(lambda x: self.__time_string(x))
        self.df['Rating'] = pd.to_numeric(self.df['Rating'], errors="coerce").fillna(0).astype("float")

        return self.df

    def __concatenate_columns(self, concat_cols: list) -> pd.DataFrame:
        # Function to concatenate column name and its content
        def concat_with_column_name(row):
            return ' '.join([f"{col}: {row[col]}" for col in concat_cols])

        # Apply the function to concatenate each row
        self.df['text'] = self.df.apply(concat_with_column_name, axis=1)

        return self.df

    def clean_and_tokenize(self, text_column: str, concat_cols:list) -> pd.DataFrame:
        self.df=self.__cleaning_nans()  # First clean the DataFrame
        self.df=self.__tokenize_and_remove_stopwords(text_column)  # Then tokenize and remove stop words
        concat_cols=[col for col in concat_cols if col != text_column] ## Removing text_column because is the the one used for tokenize and remove stopwords
        concat_cols.append('cleaned_text_column')
        self.df=self.__concatenate_columns(concat_cols)  # Concatenate specified columns
        return self.df


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
review_col='Review'
concat_cols=['Restaurant','Reviewer','Review']
cleaning_client=Cleaning(df)
df=cleaning_client.clean_and_tokenize(text_column=review_col, concat_cols=concat_cols)


Columns already dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.drop(columns=['drop_filter'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Time'] = pd.to_datetime(self.df['Time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['Time'] = self.df['Time'].apply(lambda x: self.__time_string(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

In [89]:
df.reset_index(inplace=True, drop=True)
df.head(3)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,cleaned_text_column,text
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5.0,"1 Review , 2 Followers",2019,ambience good food quite good saturday lunch c...,Restaurant: Beyond Flavours Reviewer: Rusha Ch...
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5.0,"3 Reviews , 2 Followers",2019,ambience good pleasant evening service prompt ...,Restaurant: Beyond Flavours Reviewer: Anusha T...
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5.0,"2 Reviews , 3 Followers",2019,must try great food great ambience thnx servic...,Restaurant: Beyond Flavours Reviewer: Ashok Sh...


## Sentence Transformer

In [16]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [90]:
embeddings = model.encode(df['text'],batch_size=64,show_progress_bar=True)
df['embeddings'] = embeddings.tolist()
df['ids'] = df.index
df['ids'] = df['ids'].astype('str')

Batches:   0%|          | 0/156 [00:00<?, ?it/s]

In [91]:
df.head(5)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,cleaned_text_column,text,embeddings,ids
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5.0,"1 Review , 2 Followers",2019,ambience good food quite good saturday lunch c...,Restaurant: Beyond Flavours Reviewer: Rusha Ch...,"[-0.050087057054042816, -0.013545840978622437,...",0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5.0,"3 Reviews , 2 Followers",2019,ambience good pleasant evening service prompt ...,Restaurant: Beyond Flavours Reviewer: Anusha T...,"[-0.030735014006495476, 0.01311657764017582, 0...",1
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5.0,"2 Reviews , 3 Followers",2019,must try great food great ambience thnx servic...,Restaurant: Beyond Flavours Reviewer: Ashok Sh...,"[-0.06927430629730225, -0.05464325472712517, -...",2
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5.0,"1 Review , 1 Follower",2019,soumen das arun great guy behavior sincerety g...,Restaurant: Beyond Flavours Reviewer: Swapnil ...,"[-0.019974758848547935, -0.0011745326919481158...",3
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5.0,"3 Reviews , 2 Followers",2019,food goodwe ordered kodi drumsticks basket mut...,Restaurant: Beyond Flavours Reviewer: Dileep c...,"[-0.03018971160054207, -0.0010741103906184435,...",4


## Vector DataBase: Chroma

In [20]:
import chromadb
from chromadb.utils import embedding_functions

In [64]:
chroma_client = chromadb.Client()
client_persistent = chromadb.PersistentClient(path="/content/data_embeddings")

In [92]:
db = client_persistent.create_collection(name='restaurant_reviews_3',
                                        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2"))

In [93]:
metadata_cols=['Restaurant','Reviewer','Rating','Time','Review']
db.add(
    ids = df['ids'].tolist(),
    embeddings = df['embeddings'].tolist(),
    metadatas = df[metadata_cols].to_dict('records')
)

In [94]:
db.peek(2)

{'ids': ['0', '1'],
 'embeddings': [[-0.050087057054042816,
   -0.013545840978622437,
   0.00020296133880037814,
   0.04322675243020058,
   -0.03488820791244507,
   0.036715637892484665,
   0.059151824563741684,
   -0.03245466575026512,
   -0.031814683228731155,
   -0.05993684381246567,
   0.07883401215076447,
   -0.10487404465675354,
   -0.020607106387615204,
   -0.06955479830503464,
   0.054888755083084106,
   -0.03979510813951492,
   0.18518050014972687,
   -0.017964240163564682,
   -0.038340747356414795,
   -0.141551211476326,
   -0.14375939965248108,
   -0.012753669172525406,
   0.07226092368364334,
   0.028189146891236305,
   -0.06421712040901184,
   0.040876347571611404,
   0.022109629586338997,
   -0.025376809760928154,
   -0.004283218644559383,
   -0.09179361164569855,
   -0.011559348553419113,
   0.13621118664741516,
   0.035785187035799026,
   0.0160738043487072,
   -0.02048197016119957,
   0.048501331359148026,
   0.08267231285572052,
   -0.06671672314405441,
   0.029959941

In [95]:
results = db.query(
    query_texts=['restaurant with indian food with best rating'],
    n_results=10
)

In [96]:
results

{'ids': [['5003',
   '8184',
   '7113',
   '2010',
   '7162',
   '7130',
   '7166',
   '7001',
   '7190',
   '9071']],
 'distances': [[0.5373148918151855,
   0.5391300320625305,
   0.5455983281135559,
   0.546081006526947,
   0.5503934621810913,
   0.5553286075592041,
   0.5660889148712158,
   0.5740500688552856,
   0.5783491134643555,
   0.5800901055335999]],
 'metadatas': [[{'Rating': 4.0,
    'Restaurant': 'Gal Punjab Di',
    'Review': "I've been to place many times. really nice food. high on price side but that is okay because they serve a good north indian food\nwould recommend tandoori momos from this place",
    'Reviewer': 'Smrati Saxena',
    'Time': 2019},
   {'Rating': 3.5,
    'Restaurant': 'Delhi-39',
    'Review': "If you are looking for some good north Indian food within your budget then you should give a try here . This restaurant serves decent food but there is no ambience so don't set your expectations for it . I have tried Paneer dishes and Chinese and those are rea

### Where
1. Estructura

`
{
    "metadata_field": {
        <Operator>: <Value>
    }
}
`
2. Operadores

$eq - equal to (string, int, float)

$ne - not equal to (string, int, float)

$gt - greater than (int, float)

$gte - greater than or equal to (int, float)

$lt - less than (int, float)

$lte - less than or equal to (int, float)

In [138]:
def search(query, rating:int = None, year:int = None, n_results:int = None)->pd.DataFrame:
  if not rating:
    rating=0
  if not year:
    year=0
  if not n_results:
    n_results=5

  results= db.query(
    query_texts=[query],
    n_results=10,
    where = { '$and': [
        {
        "Rating": {"$gte": rating} },
        {"Time": {"$gte": year}}
    ]}
    )
  results=pd.DataFrame(results['metadatas'][0])
  return results


In [125]:
query='Best french fries'
rating=4
year=2018
n_results=3
query_df=search(db,query, rating, year, n_results)

In [126]:
query_df['Review'][0]

'The Caramel Brownie was one of my favorites. The taste of sweet caramel alongwith the brownies was a perfect combination. When there is too much of sweet, you definitely need some savory. So we had some french fries. Although the sauce on top wasn’t that great the fries were spicy and awesome. ...\nRead full post on thegastronomictale.wordpress.com'

In [127]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.10-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.6.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)
Collecting websocke

In [139]:
import gradio as gr

current_year=int(datetime.today().strftime('%Y'))
## Assumption: db is executing and already loaded for query in backend
# Define possible genres
iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(lines=5, placeholder="Write your query over here...", label="Restaurant Reviews Query"),
        #gr.Dropdown(choices=restaurant, label="Restaurante"),
        gr.Slider(minimum=1, maximum=5, value=3, label="Minimum Rating"),
        gr.Number(minimum=2000, maximum=current_year, value=2015, label="Year of Review"),
        gr.Number(minimum=1, maximum=10, value=3, label="Number of Results")

    ],
    outputs=gr.Dataframe(type="pandas", label="Results"),
    title="Restaurant Rating",
    description="Write your query about a restaurant",
)

# Launch the interface
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://c6d2fc152848d53cd7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


