In [11]:
#os.environ['OPENAI_API_KEY'] = <your-api-key>
import openai
import os

api_key = os.getenv('OPENAI_API_KEY')

https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_5_to_9.ipynb

In [12]:

import pandas as pd
import re
from python_functions import data_loader
# Load the processed data
Hotel_Reviews = data_loader()

#### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

In [13]:
Hotel_Reviews

Unnamed: 0,Hotel_Name,Review_Date,Positive_Review,Negative_Review,Reviewer_Score,Reviewer_Nationality,address,price,Original_Positive_Review,Original_Negative_Review,date_object,month,num_date_object
0,11City Rooms,02-12-2023 00:00:00,On arrival we were warmly welcomed and made to...,Nothing,10.0,India,Chania Altstadt Chania,€ 311,On arrival we were warmly welcomed and made to...,Nothing,2023-02-12,2,0.117808
1,11City Rooms,10-18-2024 00:00:00,The location was great right in the middle of ...,This isn't specific to the property but parkin...,10.0,United States,Chania Altstadt Chania,€ 311,The location was great right in the middle of ...,This isn't specific to the property but parkin...,2024-10-18,10,0.800000
2,11City Rooms,10-15-2024 00:00:00,The location was perfect. I loved the fair siz...,The Small Room with Balcony is exactly that. I...,10.0,United Kingdom,Chania Altstadt Chania,€ 311,The location was perfect. I loved the fair siz...,The Small Room with Balcony is exactly that. I...,2024-10-15,10,0.791781
3,11City Rooms,10-07-2024 00:00:00,Second stay here perfect as always. Really rec...,Nothing the room was quite this time and reall...,10.0,France,Chania Altstadt Chania,€ 311,Second stay here perfect as always. Really rec...,Nothing the room was quite this time and reall...,2024-10-07,10,0.769863
4,11City Rooms,10-07-2024 00:00:00,Everything was perfect !!! Giannis was so nice...,The room is a bit noisy during the night just ...,10.0,France,Chania Altstadt Chania,€ 311,Everything was perfect !!! Giannis was so nice...,The room is a bit noisy during the night just ...,2024-10-07,10,0.769863
...,...,...,...,...,...,...,...,...,...,...,...,...,...
149230,Zorbas,08-30-2022 00:00:00,,,10.0,Austria,Soúgia,€ 283,,,2022-08-30,8,0.663014
149231,Zorbas,08-10-2022 00:00:00,,,10.0,Austria,Soúgia,€ 283,,,2022-08-10,8,0.608219
149232,Zorbas,08-05-2022 00:00:00,,,10.0,Greece,Soúgia,€ 283,,,2022-08-05,8,0.594521
149233,Zorbas,07-24-2022 00:00:00,,,10.0,Czech Republic,Soúgia,€ 283,,,2022-07-24,7,0.561644


In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter



# Merge the columns using string concatenation
Hotel_Reviews['MergedColumn'] = (
     '' +'Hotel: ' + Hotel_Reviews['Hotel_Name'] + 
    '. Positive Guest Review: ' + Hotel_Reviews['Positive_Review'] + 
    '. ' +'Hotel: ' + Hotel_Reviews['Hotel_Name'] + 
    '. Negative Guest Review: '+ Hotel_Reviews['Negative_Review'] + "\n"
)
# Select the first 100 rows of the merged column
used_data = Hotel_Reviews['MergedColumn'][:500]
used_data = used_data.dropna()
# Specify the file name
file_name = "used_data.txt"

# Save the data to a text file
with open(file_name, 'w') as f:
    for line in used_data:
        f.write(line + '\n')

    


In [15]:

from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings

raw_documents = TextLoader('used_data.txt').load()


text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    separators=["\n"],
    chunk_size=200, 
    chunk_overlap=50)


splits = text_splitter.split_documents(raw_documents)

embeddingsAI = OpenAIEmbeddings()




# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
folder_path = "./Chroma/chroma_db_reviews_crete_merged_test"
if not os.path.exists(folder_path):
    vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embeddingsAI,persist_directory=folder_path)
else:
    vectorstore = Chroma(persist_directory=folder_path,embedding_function=embeddingsAI)

retriever = vectorstore.as_retriever()

In [7]:
len(splits)


13926

In [8]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Please answer the question and provide a summary of the review your answer is based on. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [9]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "Which reviews talk about the internet connection in Olympic Palladium?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

  return [loads(doc) for doc in unique_docs]


19

In [10]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

print(final_rag_chain.invoke({"question":question}))

The negative guest reviews for Olympic Palladium mention issues with the TV, WiFi, and balcony view.


In [1]:
import os
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.load import dumps, loads
from operator import itemgetter

def process_question(question):
    embeddingsAI = OpenAIEmbeddings()

    folder_path = "./Chroma/chroma_db_reviews_crete_merged"

    vectorstore = Chroma(persist_directory=folder_path,embedding_function=embeddingsAI)

    retriever = vectorstore.as_retriever()

    # Multi Query: Different Perspectives
    template = """You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines. Please answer the question and provide a summary of the review your answer is based on. Original question: {question}"""
    prompt_perspectives = ChatPromptTemplate.from_template(template)

    # Generate queries
    generate_queries = (
        prompt_perspectives 
        | ChatOpenAI(temperature=0) 
        | StrOutputParser() 
        | (lambda x: x.split("\n"))
    )

    # Define a function to get unique union of documents
    def get_unique_union(documents: list[list]):
        """ Unique union of retrieved docs """
        # Flatten list of lists, and convert each Document to string
        flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
        # Get unique documents
        unique_docs = list(set(flattened_docs))
        # Return
        return [loads(doc) for doc in unique_docs]

    # Retrieve documents
    retrieval_chain = generate_queries | retriever.map() | get_unique_union
    docs = retrieval_chain.invoke({"question": question})

    # RAG
    template = """Answer the following question based on this context:

    {context}

    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(temperature=0)

    final_rag_chain = (
        {"context": retrieval_chain, 
         "question": itemgetter("question")} 
        | prompt
        | llm
        | StrOutputParser()
    )

    # Invoke the final chain and return the result
    return final_rag_chain.invoke({"question": question})

# Example usage
question = "Does the Hilton Vienna have good Wifi in the room?"
result = process_question(question)
print(result)

  vectorstore = Chroma(persist_directory=folder_path,embedding_function=embeddingsAI)
  return [loads(doc) for doc in unique_docs]


Based on the provided context, the reviews are about Hotel Arena and not Hilton Vienna. Therefore, it cannot be determined if the Hilton Vienna has good Wifi in the room based on this information.
