In [1]:
from hyperlink_parser import get_domain_hyperlinks
from collections import deque

# Get a list of links from a starting URL
def crawl(starting_url, domain, max_depth=100):
    queue = deque([starting_url])
    seen = {starting_url}
    while queue:
        url = queue.pop()
        next_links = get_domain_hyperlinks(domain, url)
        for link in next_links: 
            if link not in seen:
                queue.append(link)
                seen.add(link)
        if len(seen) >= max_depth:
            seen = list(seen)[:max_depth]
            break
    return seen

domain = "porsche.com/stories"
start_url = "https://www.porsche.com/stories/"

links = crawl(start_url, domain, 10)

In [2]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(links)
documents = loader.load()
print(len(documents))

USER_AGENT environment variable not set, consider setting it to identify your requests.


10


In [3]:
import re
# Define a function to clean document content
def clean_content(text):
    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
    text = re.sub(r'\s{2,}', ' ', text)  # Replace multiple spaces or tabs with a single space
    return text.strip()  # Remove leading/trailing whitespace

# clean content
for doc in documents:
    doc.page_content = clean_content(doc.page_content)

Split loaded documents into smaller chunks.
Options:
- **RecursiveCharacterTextSplitter:** Chunks are created by recursively splitting text through a hierarchy of separators (e.g., paragraphs, then sentences, then words) to keep chunks coherent. Best when maintaining logical structure is important.
- **SpacyTextSplitter:** Chunks are formed based on linguistic structure, such as sentences or phrases, using spaCy’s NLP capabilities. Ideal for linguistically complex text where context is crucial.'
- **NLTKTextSplitter:** This method splits text at sentence boundaries, leveraging NLTK’s sentence tokenization. Good for sentence-level chunking when sentence coherence is needed.
- **SentenceTransformersTokenTextSplitter:** Chunks are created based on a specific token count, which aligns well with embedding models that require token-limited inputs. Useful for tasks that involve embeddings or semantic search.
- **CharacterTextSplitter:** Text is divided into chunks based solely on a character limit, regardless of sentence or word boundaries. Suitable for fast, simple chunking without regard for linguistic structure.

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk size is the maximum number of characters that a chunk can contain.
# Chunk overlap is the number of characters that should overlap between two adjacent chunks.
# A smaller chunk size will result in more chunks, while a larger chunk size will result in fewer chunks. A larger chunk overlap will result in more chunks sharing common characters, while a smaller chunk overlap will result in fewer chunks sharing common characters. 
# If the text is highly structured, such as code or HTML, you may want to use a larger chunk size. If the text is less structured, such as a novel or a news article, you may want to use a smaller chunk size.

text_splitter=RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=10) #TODO: add this to list of hyperparameters
splits = text_splitter.split_documents(documents)
print(len(splits), splits[1])

264 page_content='Culture Porsche Experience Stories Discover experience stories from the world of Porsche. From track to travel experiences, join us on journeys to places that inspire our curiosity for adventure. Download zen Porsche wallpapers from The Recharge Guide by Porsche and MICHELIN Download background images of the breathtaking landscapes and towns of Provence, featuring the Porsche Macan with custom Michelin tyres and pro kitesurfer Rita Arnaus. And immerse yourself in serenity Read more The perfect' metadata={'source': 'https://porsche.com/stories/experience', 'title': 'Porsche Experience Stories', 'description': 'From road trips to events, travel to adventure, this is the Porsche experience.', 'language': 'en-international'}


In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Create 
# The default embedding model in HuggingFace Embeddings is sentence-transformers/all-mpnet-base-v2 with 768 dimension. We use a smaller model all-MiniLM-L6-v2 with dimension 384 so indexing runs faster.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cuda'})

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cuda'})
  from tqdm.autonotebook import tqdm, trange


In [ ]:
from langchain.vectorstores import Chroma
from uuid import uuid4
DB_PATH = 'vectorstore/db_chroma'
db = Chroma.from_documents(documents=splits,embedding=embeddings,collection_name="porsche_stories_new",persist_directory=DB_PATH)

# Chroma: a light-weight and in memory so it's easy to get started with and use for local development.
#FAISS (Facebook AI Similarity Search): a vector store that supports search in vectors that may not fit in RAM and is appropriate for production use. #TODO: switch to FAISS when running on AWS



In [11]:
embeddings.embed_documents("The quick brown fox jumps over the lazy dog")

[[-0.05895751714706421,
  0.05387330800294876,
  0.008316735737025738,
  0.02068045549094677,
  0.002201046794652939,
  -0.03443445265293121,
  0.08473177254199982,
  0.1077682226896286,
  0.058957356959581375,
  -0.01663867197930813,
  0.062118154019117355,
  -0.0740492194890976,
  -0.030303219333291054,
  0.03807584568858147,
  -0.004251397680491209,
  -0.00798637606203556,
  0.0031452623661607504,
  -0.07220274209976196,
  -0.015011952258646488,
  -0.07136200368404388,
  -0.08817397803068161,
  0.045202042907476425,
  0.009071126580238342,
  0.04195641726255417,
  -0.001292905188165605,
  -0.014840212650597095,
  -0.03942633792757988,
  0.007077527232468128,
  -0.036413609981536865,
  -0.042899239808321,
  -0.04982226714491844,
  0.009418354369699955,
  -0.022699307650327682,
  0.01023806445300579,
  -0.004453912377357483,
  -0.07801613956689835,
  -0.07069578021764755,
  -0.005806361325085163,
  0.03875812143087387,
  0.012042270973324776,
  -0.05325382947921753,
  -0.0963793620467

In [11]:
import langchain
from queue import Queue
from typing import Any
from langchain.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import LLMResult
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts.prompt import PromptTemplate
from anyio.from_thread import start_blocking_portal #For model callback streaming
from langchain.vectorstores import Chroma

langchain.debug=True 

#vector db path
DB_CHROMA_PATH = 'vectorstore/db_chroma'

#Llama2 TGI models host port
LLAMA3_8B_HOSTPORT = "http://localhost:8080/" #Replace the locahost with the IP visible to the machine running the notebook

model_dict = {
    "8b-instruct" : LLAMA3_8B_HOSTPORT,
}

system_message = {"role": "system", "content": "You are a helpful assistant."}
llm = HuggingFaceTextGenInference(
    inference_server_url=LLAMA3_8B_HOSTPORT,
    max_new_tokens=512,
    top_k=10,
    top_p=0.9,
    typical_p=0.95,
    temperature=0.6,
    repetition_penalty=1,
    do_sample=True,
    streaming=True
)
system_prompt = ""

template = """
Use the following pieces of context to answer the question. If no context provided, answer like a AI assistant.
{context}
Question: {question}
""" 
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={'device': 'cuda'})
db = Chroma(persist_directory=DB_CHROMA_PATH)
retriever = db.as_retriever(
        search_kwargs={"k": 6}
    )
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,     
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        ),
    }
)

In [6]:
result = qa_chain({"query": "Why choose Llama?"})
print(result)

  db = Chroma(persist_directory=DB_CHROMA_PATH)


In [9]:
print(db.embeddings)

None
