In [3]:
#os.environ['OPENAI_API_KEY'] = <your-api-key>
import openai
import os

# Read the OpenAI key from the environment rather than hardcoding it;
# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv('OPENAI_API_KEY')

Reference notebook (LangChain "RAG from scratch", parts 5 to 9): https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_5_to_9.ipynb

In [2]:

import pandas as pd
import re
from python_functions import data_loader


# Load the processed data through the project's data_loader helper
Hotel_Reviews = data_loader()

#### INDEXING ####

# --- Load blog ---
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Restrict HTML parsing to elements carrying one of these CSS classes
post_filter = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_filter},
)
blog_docs = loader.load()

# --- Split ---
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter built via the tiktoken encoder: 300-unit chunks overlapping by 50
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)
splits = text_splitter.split_documents(blog_docs)

# --- Index ---
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed every chunk with OpenAI embeddings and store the vectors in Chroma
vectorstore = Chroma.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=splits,
)

# Retriever interface over the vector store, used for similarity lookups
retriever = vectorstore.as_retriever()

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter



# Merge the hotel name and both review texts into one string per review.
# Rows where any of the source columns is missing end up as NaN.
Hotel_Reviews['MergedColumn'] = (
    'Hotel: ' + Hotel_Reviews['Hotel_Name'] +
    '. Positive Guest Review: ' + Hotel_Reviews['Positive_Review'] +
    '. ' + 'Hotel: ' + Hotel_Reviews['Hotel_Name'] +
    '. Negative Guest Review: ' + Hotel_Reviews['Negative_Review'] + "\n"
)

# Select the merged reviews to export. This cell previously relied on a
# `used_data` variable that is only created further down the notebook, so it
# failed on a fresh kernel. NaN rows are dropped because they cannot be
# concatenated with '\n' when writing. Append `.head(100)` for a small sample.
used_data = Hotel_Reviews['MergedColumn'].dropna()

# Specify the file name
file_name = "used_data.txt"

# Save the data to a text file. The encoding is explicit so non-ASCII
# characters in the reviews are written identically on every platform.
with open(file_name, 'w', encoding='utf-8') as f:
    for line in used_data:
        f.write(line + '\n')

    

- Define the get_embeddings function: it takes a text input, tokenizes it, passes it through the BERT model, and returns the mean of the last hidden state as the text's embedding.
- Define topics and keywords: a dictionary of topics, each associated with a list of relevant keywords.
- Example texts: docs[:5] is assumed to contain the example texts to be classified.
- Similarity threshold: a threshold of 0.25 determines whether a text is similar enough to a topic.
- Classify texts: for each text,
  - compute the embedding of the text;
  - for each topic, compute the average embedding of its keywords;
  - calculate the cosine similarity between the text's embedding and each topic's average keyword embedding;
  - assign the text to every topic whose similarity exceeds the threshold.
- Output: print the topics under which each text is classified, or indicate that no topic matches above the threshold.

In [22]:
class TextSummarizer:
    """Summarize review text through the OpenAI chat-completions API.

    Each public method builds a system prompt and a user prompt, sends them
    to the model named in ``openai_model`` and returns the text of the first
    completion choice.

    NOTE(review): the prompt strings are kept verbatim (including their
    spelling) so the requests sent to the model are unchanged.
    """

    # ChatGPT model that we will be using everywhere
    openai_model = "gpt-4o-mini"

    def __init__(self):
        """Store the API key returned by ``fetch_api_key`` on the instance."""
        self.apikey = self.fetch_api_key()

    def fetch_api_key(self):
        """Return the ``OPENAI_API_KEY`` environment variable (None if unset)."""
        # In here would be all the code required to fetch the api key ...
        return os.environ.get("OPENAI_API_KEY")

    def _chat(self, system_prompt, user_prompt):
        """Send one system + user message pair and return the reply text.

        Parameters:
            system_prompt: content of the ``system`` message.
            user_prompt: content of the ``user`` message.

        Returns:
            The ``content`` of the first choice's message.

        Any exception raised by the request propagates to the caller; the
        client is closed in all cases.
        """
        # Imported locally: the notebook only does `import openai`, so the
        # bare name `OpenAI` is otherwise undefined on a fresh kernel.
        from openai import OpenAI

        client = OpenAI(api_key=self.apikey)
        try:
            completion = client.chat.completions.create(
                model=self.openai_model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return completion.choices[0].message.content
        finally:
            # Release the client's resources even when the request fails
            client.close()

    def average_sentences(self, list_of_sentences):
        """Combine several reviews into one summarized review.

        Parameters:
            list_of_sentences: iterable of review strings; they are joined
                with newline characters before being placed in the prompt.

        Returns:
            The single review text produced by the model.
        """
        system_prompt = "You are an assistant that is able to read several reviews and then combine them into a single summarized review. The reviews will be sent to you with a newline character \n separating them. You will return a single review."
        prompt = "Here is a list of multiple reviews that I want you to summarize and rewrite as a single review that is roughly the same length as the input reviews. The reviews are separated by newline characters \n as follows: {sentences}"
        prompt = prompt.format(sentences="\n".join(list_of_sentences))
        return self._chat(system_prompt, prompt)

    def summarize_text(self, input_text):
        """Summarize a piece of text; the prompt asks for 4 sentences or less.

        Parameters:
            input_text: the text to summarize. It is inserted into the prompt
                with ``str.format``, so a non-string is sent as its ``str()``.

        Returns:
            The summary text produced by the model.
        """
        system_prompt = "You are an assistant that is able to read a piece of text and summarize it. Your summary will be 4 sentences or less."
        user_prompt = "Here is the text to be summarized below the newline character.\n {text}".format(text=input_text)
        return self._chat(system_prompt, user_prompt)

    def summarize_text_topic(self, input_text):
        """Summarize text overall and per topic.

        The system prompt requests a one-sentence general summary plus one
        sentence for each of the topics Room, Food and Drinks, Location,
        Internet and Work, and Surprise, and lists keywords for each topic.

        Parameters:
            input_text: the text to summarize. It is inserted into the prompt
                with ``str.format``, so a non-string is sent as its ``str()``.

        Returns:
            The summary text produced by the model.
        """
        # Continuation-line whitespace is part of the prompt; keep it as is.
        system_prompt = """You are an fair but critical assistant that is able to read a piece of text and summarize it. Please provide a one sentence general summary.
                 Additionally you will write  a summary sentence on each of the for topics: Room, Food and Drinks, Location, Internet and Work and Surprise.
                 Here are some keywords for each topic 
                "Room": ["room", "rooms", "upgrade", "clean", "tidy", "large", "bathroom", "bed", "TV", "shower"] all aspects of describing the status of the room,
                "Food and Drinks": ["drinks", "cocktails", "bottle", "breakfast", "dinner", "menu", "caffee", "tee", "delicious", "continental", "waiter","restaurant "] all aspects describing the quality of food like breakfast and bar,
                "Location": ["close", "far", "next", "park", "train", "bicicle", "car", "walk", "tee", "building", "neighborhood", "cab service", "airport", "subway", "stairs"] all aspects describing the location, surrounding and connection of the hotel,
                "Internet and Work": ["wifi", "Internet", "connection", "work", "password", "computer", "meeting", "signal"] all aspects describing abilty to work from the hotel with a focus on internet connection,
                "Surprise": ["everything", "honestly", "surprising", "change", "unfortunately", "refund"] all aspects which are supringly and not expected by the reviewer,
                Feel free to say that the reviews do not specifically address certain topics.
                 """
        user_prompt = "Here is the text to be summarized below the newline character.\n {text}".format(text=input_text)
        return self._chat(system_prompt, user_prompt)




In [12]:
# Build the review list in this cell: `docs` was previously only created in a
# later cell, so this call failed on Restart & Run All.
docs = Hotel_Reviews['MergedColumn'].dropna().tolist()

gpt_summarizer = TextSummarizer()

# Merge the first five reviews into a single representative review
avg_sentence = gpt_summarizer.average_sentences(docs[:5])

print(avg_sentence)

This hotel offers a very comfortable stay with cozy beds and pleasant facilities, including a lovely garden that’s perfect for summer evenings. The staff is friendly and exceptionally helpful, particularly in organizing daytime activities. Located in a quiet area, it provides great access to public transportation, making it easy to explore nearby restaurants and shops. While the bathrooms could use some ventilation, and the rooms are on the smaller side, the overall experience is enjoyable. The hotel also serves a delightful breakfast and has a laundrette conveniently located nearby for longer stays. Overall, it’s a great value for London, and I would definitely recommend staying here again.


In [14]:
# summarize_text formats its argument straight into the prompt, so join the
# reviews into one string; passing the list would send its Python repr
# (brackets, quotes and escaped newlines) to the model.
summarized_output_text = gpt_summarizer.summarize_text("\n".join(docs[:5]))
summarized_output_text

"The hotel is praised for its comfortable beds, friendly staff, and lovely garden facilities, with a quiet location that's close to public transport and local eateries. Guests appreciate the helpfulness of the staff and the bonus of a laundrette nearby for long-term travelers. While some noted the older facilities and small bathrooms, the overall experience is seen as a great value for London, with many indicating they would stay again."

0         11City Rooms
1         11City Rooms
2         11City Rooms
3         11City Rooms
4         11City Rooms
              ...     
149230          Zorbas
149231          Zorbas
149232          Zorbas
149233          Zorbas
149234          Zorbas
Name: Hotel_Name, Length: 149235, dtype: object

In [26]:
# Number of reviews sent to the model in one request; the full column is far
# too large for a single prompt.
SAMPLE_SIZE = 100

# All merged review strings, skipping rows where the merge produced NaN
used_data = Hotel_Reviews['MergedColumn'].dropna()
docs = used_data.tolist()

gpt_summarizer = TextSummarizer()

# summarize_text_topic requires the text to summarize; it was previously
# called with no argument, which raises TypeError.
summarized_output_text = gpt_summarizer.summarize_text_topic(' '.join(docs[:SAMPLE_SIZE]))
print(summarized_output_text)

The reviews indicate that the hotel is generally well-regarded for its comfortable accommodations, excellent staff, and convenient location, particularly for accessing public transport and local amenities.

**Room**: Most guests found the rooms to be clean, comfortable, and well-appointed, with some noting the spaciousness and great amenities like powerful showers and comfortable beds.

**Food and Drinks**: The breakfast received high praise for its variety and quality, though some guests commented on additional charges for meals beyond breakfast.

**Location**: The hotel boasts a fantastic location, being close to the Earl's Court tube station and within walking distance to many attractions, while maintaining a quieter atmosphere.

**Internet and Work**: Many reviews mentioned good internet connectivity, making it convenient for guests needing to work or stay connected during their visit.

**Surprise**: Guests appreciated unexpected perks like complimentary room upgrades and thoughtfu

In [20]:

# Keep only rows that have a merged review text
reviews_present = Hotel_Reviews.dropna(subset=['MergedColumn'])

# NOTE(review): the slice starts at 1, so the first remaining row is skipped;
# confirm this is intended rather than [:2000].
used_data = reviews_present[1:2000]

# Map each hotel name to all of its reviews joined by single spaces
grouped_reviews = {
    hotel_name: ' '.join(review_texts)
    for hotel_name, review_texts in used_data.groupby('Hotel_Name')['MergedColumn']
}


In [23]:

# Create the summarizer used for every hotel
gpt_summarizer = TextSummarizer()

# One topic-wise summary per hotel (one API request per hotel)
summarized_reviews = {
    hotel: gpt_summarizer.summarize_text_topic(reviews)
    for hotel, reviews in grouped_reviews.items()
}

# Show every hotel together with its summary
for hotel, summary in summarized_reviews.items():
    print(f"{hotel}: {summary}")

11City Rooms: **General Summary:** The reviews for 11City Rooms in Chania highlight a mix of positive experiences regarding its location, cleanliness, and friendly owners, alongside concerns about the room size, noise from the vicinity, and some challenging amenities.

**Room:** Most guests noted that the rooms were clean and pleasantly decorated, though many found them to be quite small with limitations in bathroom space.

**Food and Drinks:** There were only occasional mentions of welcome drinks or snacks provided, with no extensive review on breakfast or dining options.

**Location:** Guests consistently praised the hotel's central location in the heart of Chania, making it convenient for exploring the city, despite some noise issues from nearby bars.

**Internet and Work:** The reviews did not specifically address internet access or working conditions within the hotel, indicating a focus on leisure rather than business amenities.

**Surprise:** Guest experiences varied greatly, wit