In [24]:
import pickle
import pinecone
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema.document import Document
import more_itertools
from datetime import datetime
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import os
from dotenv import load_dotenv
from tqdm.auto import tqdm
from uuid import uuid4

In [13]:
# Read variables from the local .env file into the process environment.
load_dotenv()

# Disabled alternative: OpenAI-hosted embeddings.
# model_name = 'text-embedding-ada-002'
# embeddings = OpenAIEmbeddings(
#                 document_model_name=model_name,
#                 query_model_name=model_name,
#                 openai_api_key=os.getenv('OPENAI_API_KEY')
#             )

# Settings for the e5-large-v2 embeddings model: run on CPU and
# leave the produced vectors un-normalized.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Model is loaded from the "models/e5-large-v2" path.
embeddings_model = HuggingFaceEmbeddings(
    model_name="models/e5-large-v2",
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

No sentence-transformers model found with name models/e5-large-v2. Creating a new one with MEAN pooling.


In [10]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('infatuation_reviews_v2.pkl', 'rb') as file:
    resto_reviews = pickle.load(file)

# Chunks are at most 2500 characters long, with 800 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500,
                                     chunk_overlap=800,
                                     length_function=len)
final_docs = []
total_word_cnt = 0
# resto_reviews is keyed by restaurant name; each value holds that restaurant's review fields.
for resto, details in resto_reviews.items():
    # Undo the HTML entities left in the review text ("&quot" also appears without its semicolon).
    cleaned_review = details['review'].replace('&apos;', "'").replace("&amp;", "&").replace('&quot;', '"').replace("&quot", '"')
    # Metadata is identical for every chunk of a review, so build it once per restaurant.
    metadata = {
        'resto_name': resto.replace("&amp;", "&").replace('&apos;', "'"),
        'cuisine': details['cuisine'],
        'perfect_for_tags': details['perfect_for_tags'].replace("&amp;", "&").replace('&apos;', "'"),
        'price_range': details['price_range'],
        # Keep only the part of the timestamp before the 'T' separator.
        'review_date': details['review_date'].split('T')[0],
    }
    # Split review text into chunks
    split_review_docs = text_splitter.create_documents([cleaned_review])
    # Loop through each chunk and assign metadata
    for doc in split_review_docs:
        doc.metadata.update(metadata)
        # Add document to final list of documents
        final_docs.append(doc)
        total_word_cnt += len(doc.page_content.split())

print(f'Final total of {len(final_docs)} documents from {len(resto_reviews)} reviews.')
# Guard against an empty input file, which would otherwise raise ZeroDivisionError.
avg_word_cnt = total_word_cnt / len(final_docs) if final_docs else 0
print(f'Average word of count of each document: {avg_word_cnt}\n')

Final total of 268 documents from 254 reviews.
Average word of count of each document: 159.11194029850745



In [11]:
# Name of the Pinecone index, taken from the environment
index_name = os.getenv('PINECONE_INDEX_NAME')

# Connect the Pinecone client using the API key and environment from the environment variables
pinecone.init(api_key=os.getenv('PINECONE_API_KEY'),
              environment=os.getenv('PINECONE_ENVIRONMENT'))

# Disabled: one-time creation of the index.
# pinecone.create_index(
#     name=index_name,
#     metric='cosine',
#     dimension=1024
# )

In [12]:
# Split docs into batches of 30 so each Pinecone upload stays small
sub_docs = list(more_itertools.batched(final_docs, 30))
print(f"Original reviews list ({len(final_docs)} reviews) split into smaller lists ({len(sub_docs)} lists)")
print(f"Start time: {datetime.now()}")
for count, doc_list in enumerate(sub_docs):
    # Pass the embeddings model object created in the setup cell. The name
    # `embeddings` is only assigned in commented-out code, or rebound to a plain
    # list of vectors by the upsert cell, so it is not a usable embedder here.
    vector_store = Pinecone.from_documents(list(doc_list),
                                        embedding=embeddings_model,
                                        index_name=index_name)
    print(f"Stored sub list {count+1} in Pinecone!")
print("Finished storing Infatuation reviews in Pinecone... Broncos Country! Let's Ride!!")
print(f"End time: {datetime.now()}")

Original reviews list (268 reviews) split into smaller lists (9 reviews)
Stored sub list 1 in Pinecone!
Stored sub list 2 in Pinecone!
Stored sub list 3 in Pinecone!
Stored sub list 4 in Pinecone!
Stored sub list 5 in Pinecone!
Stored sub list 6 in Pinecone!
Stored sub list 7 in Pinecone!
Stored sub list 8 in Pinecone!
Stored sub list 9 in Pinecone!
Finished storing Infatuation reviews in Pinecone... Broncos Country! Let's Ride!!


In [61]:
# Rebuild the index from scratch.
# WARNING: deleting the index permanently removes every vector stored in it.
# Only delete when the index actually exists, so this cell also runs on a
# fresh project instead of failing on a missing index.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# `dimension` must equal the length of the vectors that will be upserted.
pinecone.create_index(
    name=index_name,
    metric='cosine',
    dimension=1024
)

# Handle used by the upsert cell; the stats are displayed as the cell output.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [41]:
suffix_to_add = '_fruit'
my_list = ['apple', 'banana', 'orange']

# Build a new list where every element is prefixed with a label that embeds
# `suffix_to_add`, then rebind `my_list` to it.
prefix = f'The fruit is: {suffix_to_add}. Tester: '
labelled = []
for fruit in my_list:
    labelled.append(prefix + fruit)
my_list = labelled

print(my_list)

['The fruit is: _fruit. Tester: apple', 'The fruit is: _fruit. Tester: banana', 'The fruit is: _fruit. Tester: orange']


In [62]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('infatuation_reviews_v3.pkl', 'rb') as file:
    resto_reviews = pickle.load(file)

# Chunks are at most 2500 characters long, with 800 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500,
                                 chunk_overlap=800,
                                 length_function=len)

# Minimum number of pending chunks that triggers an upsert.
batch_limit = 30
# 1-based number of the batch currently being filled (used in the log lines).
batch_count = 1

upsert_chunks = []
upsert_metadatas = []

total_word_cnt = 0


def _upsert_batch(chunks, metadatas):
    """Embed ``chunks`` and upsert them, with their ``metadatas``, into ``index``.

    Each vector gets a fresh random UUID as its id. ``chunks`` and
    ``metadatas`` must be the same length and in the same order.
    """
    ids = [str(uuid4()) for _ in chunks]
    vectors = embeddings_model.embed_documents(chunks)
    index.upsert(vectors=list(zip(ids, vectors, metadatas)))


start_time = datetime.now()
print(f"Start time: {start_time}")
for i, resto in enumerate(tqdm(resto_reviews)):
    print(f"Chunking and preparing review #{i}...")
    # Clean up review data: undo the HTML entities left in the text
    # ("&quot" also appears without its trailing semicolon).
    cleaned_review = resto['review'].replace('&apos;', "'").replace("&amp;", "&").replace('&quot;', '"').replace("&quot", '"')
    cleaned_resto_name = resto['resto_name'].replace("&amp;", "&").replace('&apos;', "'")
    cleaned_resto_tags = resto['perfect_for_tags'].replace("&amp;", "&").replace('&apos;', "'")
    # Keep only the part of the timestamp before the 'T' separator.
    review_date = resto['review_date'].split('T')[0]

    # Set the metadata for this restaurant review
    metadata = {
        'resto_name': cleaned_resto_name,
        'cuisine': resto['cuisine'],
        'perfect_for_tags': cleaned_resto_tags,
        'price_range': resto['price_range'],
        'review_date': review_date,
        'image_url': resto['resto_image'],
    }
    # Split review into chunks and prepend the tags/cuisine to each chunk's text
    review_chunks = text_splitter.split_text(cleaned_review)
    review_chunks = [f"Perfect for: {cleaned_resto_tags}. Serves {resto['cuisine']}. " + chunk_text
                     for chunk_text in review_chunks]
    # Create metadata dicts for each chunk: its position within the review,
    # its text, and the restaurant-level metadata.
    chunk_metadatas = [{
        "chunk": chunk_no, "text": chunk_text, **metadata
    } for chunk_no, chunk_text in enumerate(review_chunks)]
    # Append chunks to list of chunks (to be upserted to Pinecone)
    upsert_chunks.extend(review_chunks)
    # Append chunks metadatas to list of metadatas (also to be upserted to Pinecone)
    upsert_metadatas.extend(chunk_metadatas)
    # If we're at the batch_limit, store chunks in Pinecone
    if len(upsert_chunks) >= batch_limit:
        print(f"Batch full. Upserting dangerwich batch #{batch_count}...")
        _upsert_batch(upsert_chunks, upsert_metadatas)
        print(f"Finished upserting dangerwich batch #{batch_count}!!!")
        # Advance the counter so each batch is logged with its own number.
        batch_count += 1
        upsert_chunks = []
        upsert_metadatas = []

print("Upserting left over dangerwiches...")
# Skip the final upsert when the last batch emptied the buffers exactly.
if upsert_chunks:
    _upsert_batch(upsert_chunks, upsert_metadatas)
print("Finished upserting all dangerwiches... BRONCOS COUNTRY. LET'S RIDE!!!")

end_time = datetime.now()
print(f"End time: {end_time}")
print(f"Total elapsed time: {end_time - start_time}")

Start time: 2023-12-08 20:05:38.309933


  0%|          | 0/255 [00:00<?, ?it/s]

Chunking and preparing review #0...
Chunking and preparing review #1...
Chunking and preparing review #2...
Chunking and preparing review #3...
Chunking and preparing review #4...
Chunking and preparing review #5...
Chunking and preparing review #6...
Chunking and preparing review #7...
Chunking and preparing review #8...
Chunking and preparing review #9...
Chunking and preparing review #10...
Chunking and preparing review #11...
Chunking and preparing review #12...
Chunking and preparing review #13...
Chunking and preparing review #14...
Chunking and preparing review #15...
Chunking and preparing review #16...
Chunking and preparing review #17...
Chunking and preparing review #18...
Chunking and preparing review #19...
Chunking and preparing review #20...
Chunking and preparing review #21...
Chunking and preparing review #22...
Batch full. Upserting dangerwich batch #1...
Finished upserting dangerwich batch #1!!!
Chunking and preparing review #23...
Chunking and preparing review #24..

Batch full. Upserting dangerwich batch #1...
Finished upserting dangerwich batch #1!!!
Chunking and preparing review #223...
Chunking and preparing review #224...
Chunking and preparing review #225...
Chunking and preparing review #226...
Chunking and preparing review #227...
Chunking and preparing review #228...
Chunking and preparing review #229...
Chunking and preparing review #230...
Chunking and preparing review #231...
Chunking and preparing review #232...
Chunking and preparing review #233...
Chunking and preparing review #234...
Chunking and preparing review #235...
Chunking and preparing review #236...
Chunking and preparing review #237...
Chunking and preparing review #238...
Chunking and preparing review #239...
Chunking and preparing review #240...
Chunking and preparing review #241...
Chunking and preparing review #242...
Chunking and preparing review #243...
Chunking and preparing review #244...
Chunking and preparing review #245...
Chunking and preparing review #246...
C

In [57]:
# Free-text description of what the diner is looking for
query = (
    "Getting dinner with a group of friends on a Saturday night "
    "and want a restaurant with good music and vibes near the East Village"
)
# Wrap the existing Pinecone index so it can be searched with the embeddings model
vector_store = Pinecone.from_existing_index(index_name, embeddings_model)
# The 4 best matches come back as Langchain Documents carrying the review text and its metadata
top_restos = vector_store.similarity_search(query, k=4)

In [58]:
# Print a short summary (name, review chunk, tags, price) for every search hit
for resto in top_restos:
    meta = resto.metadata
    print(
        f"\nRestaurant: {meta['resto_name']}"
        f"\nReview: {resto.page_content}"
        f"\nPerfect for: {meta['perfect_for_tags']}"
        f"\nPrice Range: {meta['price_range']}\n"
    )


Restaurant: Dora's Restaurant
Review: Perfect for: Big Groups, Walk-Ins, Peruvian. Serves Peruvian. The dinner you have before you go out is very important. It sets the tone for the rest of the evening, and if you go somewhere boring, you may end up falling asleep standing up later on the dance floor. So, the next time you’re looking for a relatively affordable pre-going out dinner in the East Village, go to Dora’s, a Peruvian spot where the sangria comes in at least six different flavors, and the music plays at a club-level volume. The large, casual space has plastic chairs, a full bar, and a Good Vibes sign that is only sometimes turned on. Order a bunch of ceviches to share, a few rounds of sangria, and maybe even a paella. And if your night begins and ends here, we think that’d be just fine with Dora.
Perfect for: Big Groups, Walk-Ins, Peruvian
Price Range: $


Restaurant: Ella Funt
Review: Perfect for: Date Night, French. Serves French. Half the city’s restaurants are currently i

In [31]:
# Sample containing the quote entity both with and without its trailing semicolon
test_str = '&quot;right&quot'
# Replace the full form first, then the semicolon-less form
cleaned_review = test_str
for entity in ('&quot;', '&quot'):
    cleaned_review = cleaned_review.replace(entity, '"')

In [33]:
# Show the cleaned string to check the result of the entity replacements
print(cleaned_review)

"right"
