In [1]:
# The star import stays first so the explicit imports below still take
# precedence over any same-named objects that milvus_utils exports.
from milvus_utils import *
import html
import pickle
from datetime import datetime

import more_itertools
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [2]:
# Set up the "models/e5-large-v2" embedding model: inference runs on the CPU
# and the encoder is asked not to normalize the vectors it produces.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

hf_embeddings = HuggingFaceEmbeddings(
    model_name="models/e5-large-v2",
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

No sentence-transformers model found with name models/e5-large-v2. Creating a new one with MEAN pooling.


In [None]:
# HTML entities found in the review data and the characters they stand for:
#   &apos;  ->  '
#   &amp;   ->  &

In [49]:
# Sample sentence containing both HTML entities (&apos; and &amp;) so the
# clean-up expression in the next cell can be checked by eye.
string_test = (
    "yo this is a test of replacing typos, "
    "it&apos;s supposed to be easy &amp; fun"
)

In [50]:
# Swap each HTML entity for its literal character; the bare expression is the
# cell's displayed result (string_test itself is left unchanged).
(
    string_test
    .replace('&apos;', "'")
    .replace('&amp;', '&')
)

"yo this is a test of replacing typos, it's supposed to be easy & fun"

In [None]:
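# Fields stored for each restaurant record in the reviews pickle. The names on
# the right are presumably the variables used when the records were built
# (TODO confirm against the scraping code):
#   'review': just_review,
#   'food_rundown': food_rundown,
#   'cuisine': review_cuisine,
#   'perfect_for_tags': review_tags,
#   'price_range': review_price,
#   'review_date': review_date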

In [89]:
# Load the saved restaurant reviews. Later cells index the result first by
# restaurant name and then by field name (e.g. 'review_date').
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only open pickle files that come from a trusted source.
with open('infatuation_reviews_v2.pkl', 'rb') as file:
    resto_reviews = pickle.load(file)

In [101]:
# Parse one review's timestamp into a date object: keep only the text before
# the 'T' separator and read it as YYYY-MM-DD.
test_date = datetime.strptime(
    resto_reviews['Le Bernardin']['review_date'].partition('T')[0],
    "%Y-%m-%d",
).date()

In [102]:
# Print the parsed date to confirm the timestamp was reduced to YYYY-MM-DD.
print(test_date)

2022-11-01


In [5]:
# Build the chunked, metadata-tagged review documents that are embedded and
# stored further down in the notebook.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only open pickle files that come from a trusted source.
with open('infatuation_reviews_v2.pkl', 'rb') as file:
    resto_reviews = pickle.load(file)

# Sizes are measured with len(), i.e. in characters: chunks of up to 2500
# characters, with 800 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500,
                                               chunk_overlap=800,
                                               length_function=len)
final_docs = []
total_word_cnt = 0
for resto, review_info in resto_reviews.items():
    # html.unescape decodes every HTML entity (&apos;, &amp;, &quot;, ...) in a
    # single pass, so all text fields are cleaned the same way instead of
    # relying on hand-written replace chains applied in differing orders.
    cleaned_review = html.unescape(review_info['review'])
    # Split review text into chunks
    split_review_docs = text_splitter.create_documents([cleaned_review])
    # Loop through each chunk and assign metadata
    for doc in split_review_docs:
        doc.metadata['resto_name'] = html.unescape(resto)
#         doc.metadata['food_rundown'] = html.unescape(review_info['food_rundown'])
        doc.metadata['cuisine'] = review_info['cuisine']
        doc.metadata['perfect_for_tags'] = html.unescape(review_info['perfect_for_tags'])
        doc.metadata['price_range'] = review_info['price_range']
        # Keep only the date part of the timestamp (the text before the 'T')
        doc.metadata['review_date'] = review_info['review_date'].split('T')[0]
        # Add document to final list of documents
        final_docs.append(doc)
        total_word_cnt += len(doc.page_content.split())

print(f'Final total of {len(final_docs)} documents from {len(resto_reviews)} reviews.')
# Report 0 rather than raising ZeroDivisionError when no documents were produced
avg_word_cnt = total_word_cnt / len(final_docs) if final_docs else 0
print(f'Average word of count of each document: {avg_word_cnt}\n')

Final total of 268 documents from 254 reviews.
Average word of count of each document: 159.11194029850745



In [14]:
# Open the Milvus connection. connect() is not defined in this notebook;
# presumably it is provided by the milvus_utils star import (TODO confirm).
connect()

KeyboardInterrupt: 

In [4]:
# Display the names of the existing collections (helper presumably from
# milvus_utils; TODO confirm).
listCollections()

['chompt_resto_data', 'chompt_resto_data_cleaned', 'chompt_resto_data_2']

In [111]:
# Display the schema and summary details of the previously cleaned collection
# (helper presumably from milvus_utils; TODO confirm).
getCollectionDetails('chompt_resto_data_cleaned')

{'collection.schema': {'auto_id': True, 'description': 'Restaurant reviews to use for Chompt restaurant picker', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'resto_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'price_range', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'perfect_for_tags', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}]},
 'collection.description': 'Restaurant reviews to use for Chompt restaurant picker',
 'collection.name': 'chompt_resto_data_cleaned',
 'collection.is_empty': False,
 'collection.num_entities': 309,
 'collection.primary_field': {'name': 'pk', '

In [6]:
# Create the collection that the batch-insert and search cells below refer to
# by name. The commented-out calls are optional housekeeping (dropping the
# older collection, re-listing collections).
# dropCollection('chompt_resto_data_cleaned')
collection = createCollection('chompt_resto_data_cleaned_dec8')
# listCollections()

Creating collection: chompt_resto_data_cleaned_dec8
Created


In [None]:
# Get a handle to the original 'chompt_resto_data' collection over the
# 'default' connection alias. NOTE(review): this rebinds `collection`, which an
# earlier cell assigns from createCollection(); run order matters.
collection = Collection(name='chompt_resto_data', using='default')

In [None]:
# Look up the stored rows whose restaurant name starts with "Le Bernardin",
# returning only the name and text fields, then display the first match.
res = collection.query(
    expr="resto_name like 'Le Bernardin%'",
    output_fields=['resto_name', 'text'],
)
res[0]

In [7]:
# Close the Milvus connection (helper presumably from milvus_utils; TODO
# confirm). The vector-store cells below pass their own connection_args.
disconnect()

In [None]:
# sub_final_docs = final_docs[:10]
# vector_store = Milvus.from_documents(sub_final_docs,
#                                     collection_name=MILVUS_COLLECTION_NAME,
#                                     embedding=hf_embeddings,
#                                     connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT})

In [114]:
# Sanity check: number of chunked documents ready to be inserted.
len(final_docs)

268

In [8]:
# Group the documents into consecutive batches of at most 30 so they can be
# embedded and inserted one batch at a time.
# sub_doc = [final_docs[0]]
sub_docs = [*more_itertools.batched(final_docs, 30)]

In [120]:
# Sanity check: size of the first batch.
len(sub_docs[0])

30

In [9]:
# Embed and store the documents batch by batch, printing a progress line after
# each batch. `vector_store` ends up bound to the store returned for the last
# batch.
for batch_no, doc_batch in enumerate(sub_docs, start=1):
    vector_store = Milvus.from_documents(
        list(doc_batch),
        collection_name='chompt_resto_data_cleaned_dec8',
        embedding=hf_embeddings,
        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    )
    print(f"Stored sub list {batch_no} in Milvus!")

# vector_store = Milvus(collection_name='chompt_resto_data_cleaned',
#                     embedding_function=hf_embeddings,
#                     connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT})

Stored sub list 1 in Milvus!
Stored sub list 2 in Milvus!
Stored sub list 3 in Milvus!
Stored sub list 4 in Milvus!
Stored sub list 5 in Milvus!
Stored sub list 6 in Milvus!
Stored sub list 7 in Milvus!
Stored sub list 8 in Milvus!
Stored sub list 9 in Milvus!


In [13]:
# Attach to the already-populated collection without inserting anything, using
# the same embedding model for queries.
vector_store = Milvus(
    collection_name='chompt_resto_data_cleaned_dec8',
    embedding_function=hf_embeddings,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
)

[93m[has_collection] retry:4, cost: 0.27s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:19530: Failed to connect to remote host: Connection refused>[0m
[93m[has_collection] retry:5, cost: 0.81s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:19530: Failed to connect to remote host: Connection refused>[0m
[93m[has_collection] retry:6, cost: 2.43s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:19530: Failed to connect to remote host: Connection refused>[0m
[93m[has_collection] retry:7, cost: 3.00s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:19530: Failed to connect to remote host: Connection refused>[0m


KeyboardInterrupt: 

In [10]:
# Free-text request used to exercise the similarity search below.
query = (
    "Going on a date and want to eat really good Chinese food "
    "in downtown Manhattan that will impress my date."
)

In [12]:
# Run the similarity search for `query` with no metadata filter. The
# commented-out lines are a variant that also passes a price-range expression.
# search_expr = 'price_range == \"$$\"'
# result_docs = vector_store.similarity_search(query, search_expr)
result_docs = vector_store.similarity_search(query)

KeyboardInterrupt: 

In [85]:
# Print every retrieved chunk with its restaurant name and price range.
# The tags are read from the metadata as well but are not printed.
for resto in result_docs:
    meta = resto.metadata
    resto_name, price_range, perfect_for = (
        meta['resto_name'],
        meta['price_range'],
        meta['perfect_for_tags'],
    )
    review = resto.page_content
    print(f"\nRestaurant: {resto_name}\nReview: {review}\nPrice Range: {price_range}\n")


Restaurant: Winner on Franklin
Review: The Breakfast Sandwich

This is a tried and true sandwich. If you live near Winner on Franklin, we hope you know this already. If not, swing by and eat it. Soon. You should add the optional Chinese sausage, but the custardy eggs with homemade chili crisp will be good even without it.


Patties

The patties here are crispy on the outside, smooth and meaty on the inside, and also $6 or more. We like them, but in an area full of excellent patties, we usually opt for something else. 


Chicken Katsu

When they actually have it, this is one of the best things on the menu. The dinner version comes on furikake brioche, but we prefer the lunchtime variety, which comes between two thick slices of milk bread, is about the size of your head, and slathered with Taiwanese-style three-cup sauce. An underrated part of consuming sandwiches is eating all the things that fall out of them after, and this sandwich produces a lot of these post-sandwich bites. 


Vegg