<a href="https://colab.research.google.com/github/anilrahultadepalli/Webchat1/blob/main/web_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import json
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [None]:
# Page to scrape; the commented-out line below is an alternative target URL
url = "https://www.uchicago.edu/"
# url = "https://www.javatpoint.com/dynamic-programming"

### Function to Scrape Data from a URL

In [None]:
def _unique_texts(tags):
    """Return the non-empty stripped text of each tag, de-duplicated in first-seen order."""
    texts = (tag.get_text(strip=True) for tag in tags)
    # dict.fromkeys keeps insertion order, so duplicates are dropped without reordering
    return list(dict.fromkeys(text for text in texts if text))


def scrapeData(url, timeout=10):
    """Download a web page and extract its heading and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the call indefinitely.

    Returns:
        On success, a dict with the keys "url", "headings" (text of h1-h6
        tags, all h1 first, then h2, ...), "paragraphs" (text of p tags) and
        "other_text" (currently always an empty list).
        On failure (network error or a non-200 status), a dict with the keys
        "url" and "error".
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
    }

    # Report connection problems through the same "error" dict that callers
    # already check for, instead of letting the exception propagate.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        return {
            "url": url,
            "error": f"Failed to fetch content: {exc}"
        }

    print(f"Fetching {url} - Status Code: {response.status_code}")

    if response.status_code != 200:
        return {
            "url": url,
            "error": f"Failed to fetch content, status code: {response.status_code}"
        }

    soup = BeautifulSoup(response.text, "html.parser")

    # Headings are gathered level by level (every h1, then every h2, ...)
    heading_tags = (
        tag
        for level in range(1, 7)
        for tag in soup.find_all(f"h{level}")
    )

    return {
        "url": url,
        "headings": _unique_texts(heading_tags),
        "paragraphs": _unique_texts(soup.find_all("p")),
        # Extraction of text outside headings/paragraphs is disabled; the key
        # is kept (empty) because chunk_data looks for it.
        "other_text": []
    }

### Saving Extracted Data in JSON Format

In [None]:
def save_to_json(data, output_dir):
    """Write ``data`` as indented JSON to ``<output_dir>/<domain>.json``.

    The file name comes from ``data["url"]``: the last piece after splitting
    on "//", cut at the first "/". ``output_dir`` is created when missing.
    The destination path is printed once the file has been written.
    """
    # Derive the domain part of the URL to name the file
    after_scheme = data["url"].split("//")[-1]
    host = after_scheme.partition("/")[0]
    destination = os.path.join(output_dir, host + ".json")

    # Make sure the target directory is there before opening the file
    os.makedirs(output_dir, exist_ok=True)

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

    print(f"Data saved to {destination}")


In [None]:
pwd

'/Users/saiteja/Programming/url-chat-bot'

In [None]:
ls

[34mchroma_db[m[m/      [34mscraped_data[m[m/   web-chat.ipynb


In [None]:
# Folder that receives the JSON dump of the scraped page
output_directory = "./scraped_data"

# Scrape the page and show the raw result
scraped_data = scrapeData(url)
print(json.dumps(scraped_data, indent=4))
data = scraped_data  # second name for the same dict; the chunking cells read `data`

# Persist only successful scrapes; otherwise report why the fetch failed
if "error" in scraped_data:
    print(scraped_data["error"])
else:
    save_to_json(scraped_data, output_directory)

Fetching https://www.uchicago.edu/ - Status Code: 200
{
    "url": "https://www.uchicago.edu/",
    "headings": [
        "The Day Tomorrow Began",
        "Go 'Inside the Lab' at UChicago",
        "LATEST NEWS",
        "We value rigorous inquiry",
        "We foster independent thinking",
        "Transformative education",
        "Field-defining research",
        "We advance ideas and humanity",
        "Intellectual freedom",
        "Community impact",
        "Global impact",
        "We call Chicago home"
    ],
    "paragraphs": [
        "A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life.",
        "UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges.",
        "UChicago researchers have contributed to some of the world\u2019s greatest discoveries, advancements, and bodies of knowledg

In [None]:
!pip install -q langchain pinecone-client sentence-transformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires tokenizers<=0.20.3,>=0.13.2, but you have tokenizers 0.21.0 which is incompatible.[0m[31m
[0m

### Chunking Data

In [None]:
from langchain.schema import Document

def chunk_data(data, chunk_size=256):
    """Turn scraped page text into a list of LangChain ``Document`` chunks.

    The lists stored under 'headings', 'paragraphs' and 'other_text' (each
    optional) are concatenated in that order, joined with blank lines, and
    split by a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size`` and no overlap.

    Args:
        data: Mapping produced by the scraper.
        chunk_size: Value passed to the splitter as ``chunk_size``.

    Returns:
        A list of ``Document`` objects, one per chunk.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)

    # Gather whichever text sections are present, keeping a fixed order
    pieces = []
    for section in ('headings', 'paragraphs', 'other_text'):
        if section in data:
            pieces.extend(data[section])

    # One Document per split piece of the combined text
    return [
        Document(page_content=piece)
        for piece in splitter.split_text("\n\n".join(pieces))
    ]


### Embedding and Uploading to a Vector Database (Pinecone)

In [None]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone-backed vector store for ``index_name``.

    If the index already exists it is loaded as-is and ``chunks`` is not
    used. Otherwise a serverless cosine index (aws / us-east-1, dimension
    1536) is created and ``chunks`` are embedded and inserted into it.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The LangChain ``Pinecone`` vector store bound to the index.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    # NOTE(review): no credentials are passed here; presumably the client reads
    # them from the environment loaded via dotenv — confirm.
    client = pinecone.Pinecone()

    # Original author's note: 512 dimensions works as well
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    if index_name not in client.list_indexes().names():
        # No such index yet: create it, then embed and upload the chunks
        print(f'Creating index {index_name} and embeddings ...', end='')
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        client.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=serverless,
        )
        store = Pinecone.from_documents(chunks, embedder, index_name=index_name)
    else:
        # Index is already populated: just attach to it
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        store = Pinecone.from_existing_index(index_name, embedder)

    print('Ok')
    return store


In [None]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to remove; the default 'all' removes
            every index returned by the client's ``list_indexes()``.
    """
    import pinecone
    client = pinecone.Pinecone()

    # Single-index case first, so the bulk path below reads straight through
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    names = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for name in names:
        client.delete_index(name)
    print('Ok')


## Asking and Getting Answers

In [None]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Store exposing ``as_retriever``; used for similarity search.
        q: The question passed to the chain's ``invoke``.
        k: Number of results requested from the retriever.

    Returns:
        Whatever the chain's ``invoke`` call returns.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    # gpt-3.5-turbo at temperature 1, "stuff" chain, top-k similarity retriever
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': k},
        ),
    )
    return qa_chain.invoke(q)


## Using Chroma as a Vector DB

In [None]:
pip install -q chromadb

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.47.0 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.[0m[31m
Note: you may need to restart the kernel to use updated packages.


In [None]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Embed ``chunks`` and store them in a Chroma vector store on disk.

    Args:
        chunks: Documents to embed and insert.
        persist_directory: Directory Chroma is told to persist its data to.

    Returns:
        The Chroma vector store built from ``chunks``.
    """
    # Import Chroma from langchain_community, the same package the Pinecone
    # helper in this file uses, instead of the deprecated langchain.vectorstores path.
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Instantiate an embedding model from OpenAI (smaller version for efficiency)
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    # NOTE(review): if persist_directory already contains data, from_documents
    # presumably adds to it rather than replacing it, so chunks from earlier
    # runs/URLs may still be retrieved — confirm.
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)

    return vector_store


In [None]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open the Chroma vector store persisted in ``persist_directory``.

    Args:
        persist_directory: Directory holding the persisted Chroma data.

    Returns:
        A Chroma vector store bound to that directory, using the same
        embedding model configuration as ``create_embeddings_chroma``.
    """
    # Import Chroma from langchain_community, the same package the Pinecone
    # helper in this file uses, instead of the deprecated langchain.vectorstores path.
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Queries must be embedded with the same model that was used at creation time
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    # Attach to the persisted store, supplying the embedding function for queries
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    return vector_store


In [None]:
pip install -U langchain-community -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install -q langchain_openai

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Split the scraped page text (held in `data`) into Document chunks with chunk_size=256
chunks = chunk_data(data, chunk_size=256)

# Embed the chunks and store them in a Chroma vector store; create_embeddings_chroma
# uses the text-embedding-3-small model and persists to ./chroma_db by default
vector_store = create_embeddings_chroma(chunks)

## Adding Memory (Chat History)

In [None]:
from langchain.chains import ConversationalRetrievalChain  # conversational question answering over a retriever
from langchain.memory import ConversationBufferMemory  # keeps the running chat history
from langchain_openai import ChatOpenAI

# Chat model; temperature=0 keeps randomness to a minimum
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Similarity search over the vector store, returning the 5 closest chunks
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 5},
)

# Conversation history, exposed to the chain under the 'chat_history' key
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

# Tie model, retriever and memory together into one conversational chain
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    verbose=False,  # switch to True for verbose logging while debugging
)


In [None]:
# create a function to ask questions
def ask_question(q, chain):
    """Send question ``q`` to ``chain`` and return the result of its ``invoke`` call.

    The question is passed as a dict under the 'question' key.
    """
    payload = {'question': q}
    return chain.invoke(payload)

In [None]:
# NOTE(review): this repeats the chunk/embed step from the earlier cell.
# Assuming cells run top to bottom, `crc` was already built from the earlier
# store's retriever, so rebinding `vector_store` here does not change what the
# chain searches. Re-embedding presumably also adds the same chunks to
# ./chroma_db a second time — confirm, and drop this cell if it is redundant.
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

### Loop for asking questions

In [None]:
# Interactive chat loop: keep asking the conversational chain until the user
# types one of the exit words.
while True:
    # Surrounding whitespace is dropped so input such as ' quit' is still recognised
    q = input('Your question: ').strip()

    # Nothing typed: prompt again instead of sending an empty question
    if not q:
        continue

    # Compare against whole words. The previous substring test against
    # 'exit quit bye' also matched empty input and fragments such as 'it' or 'e'.
    if q.lower() in ('exit', 'quit', 'bye'):
        print('Bye bye!')
        break

    result = ask_question(q, crc)
    print(result['answer'])
    print('-' * 100)


Your question:  Give a summary of the data


The data involves solving problems using a tabulation technique instead of recursion to avoid stack overflow issues and overhead. Results are stored in a matrix to keep track of intermediate values. An example is given with an array containing 0 and 1 values at specific positions.
----------------------------------------------------------------------------------------------------


Your question:  What are the key take aways


The key takeaways from using a tabulation technique instead of recursion to avoid stack overflow issues and overhead are:
1. By using tabulation, we solve problems iteratively and store results in a matrix.
2. This approach eliminates the need for recursion, thus avoiding stack overflow issues.
3. Storing intermediate results in a matrix allows for efficient reuse of values and optimization of the solution.

An example involving an array with 0 and 1 values at specific positions could be a problem where we need to find the maximum sum of non-adjacent elements in an array. By using tabulation, we can create a matrix to store the maximum sum at each index, considering whether to include the current element or skip it based on the condition of not selecting adjacent elements. This approach efficiently solves the problem without the overhead of recursion.
----------------------------------------------------------------------------------------------------


Your question:  quit


Bye bye!
