In [1]:
from hyperlink_parser import get_domain_hyperlinks
from collections import deque

# Get a list of links from a starting URL
def crawl(starting_url, domain, max_depth=100):
    queue = deque([starting_url])
    seen = {starting_url}
    while queue:
        url = queue.pop()
        next_links = get_domain_hyperlinks(domain, url)
        for link in next_links: 
            if link not in seen:
                queue.append(link)
                seen.add(link)
        if len(seen) >= max_depth:
            seen = list(seen)[:max_depth]
            break
    return seen

domain = "porsche.com/stories"
start_url = "https://www.porsche.com/stories/"

links = crawl(start_url, domain, 10)

In [4]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(links)
documents = loader.load()
print(len(documents))

10


In [5]:
import re
# Define a function to clean document content
def clean_content(text):
    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
    text = re.sub(r'\s{2,}', ' ', text)  # Replace multiple spaces or tabs with a single space
    return text.strip()  # Remove leading/trailing whitespace

# clean content
for doc in documents:
    doc.page_content = clean_content(doc.page_content)

Split loaded documents into smaller chunks.
Options:
- **RecursiveCharacterTextSplitter:** Chunks are created by recursively splitting text through a hierarchy of separators (e.g., paragraphs, then sentences, then words) to keep chunks coherent. Best when maintaining logical structure is important.
- **SpacyTextSplitter:** Chunks are formed based on linguistic structure, such as sentences or phrases, using spaCy’s NLP capabilities. Ideal for linguistically complex text where context is crucial.'
- **NLTKTextSplitter:** This method splits text at sentence boundaries, leveraging NLTK’s sentence tokenization. Good for sentence-level chunking when sentence coherence is needed.
- **SentenceTransformersTokenTextSplitter:** Chunks are created based on a specific token count, which aligns well with embedding models that require token-limited inputs. Useful for tasks that involve embeddings or semantic search.
- **CharacterTextSplitter:** Text is divided into chunks based solely on a character limit, regardless of sentence or word boundaries. Suitable for fast, simple chunking without regard for linguistic structure.

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk size is the maximum number of characters that a chunk can contain.
# Chunk overlap is the number of characters that should overlap between two adjacent chunks.
# A smaller chunk size will result in more chunks, while a larger chunk size will result in fewer chunks. A larger chunk overlap will result in more chunks sharing common characters, while a smaller chunk overlap will result in fewer chunks sharing common characters. 
# If the text is highly structured, such as code or HTML, you may want to use a larger chunk size. If the text is less structured, such as a novel or a news article, you may want to use a smaller chunk size.

text_splitter=RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=10) #TODO: add this to list of hyperparameters
splits = text_splitter.split_documents(documents)
print(len(splits), splits[1])

254 page_content='Culture Three top Porsche Motorsport drivers on their road to success How racing dreams started early for Michael Christensen, Kévin Estre and Thomas Preining 29 April 2024 What do you need to become a top racing driver? How do you get there? And what attracts you in the first place? A trio of Porsche race stars explain Ask any young racing driver about what they love most about motorsport and the thrill of driving and the competition is usually top of their list. But these are not the only' metadata={'source': 'https://porsche.com/stories/dreams/porsche-motorsport-drivers-roads-to-success', 'title': 'Top Porsche race drivers on their roads to motorsport glory', 'description': 'How leading Porsche Motorsport racing drivers, Michael Christensen, Kévin Estre and Thomas Preining, made it to the top', 'language': 'en-international'}


In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Create 
# The default embedding model in HuggingFace Embeddings is sentence-transformers/all-mpnet-base-v2 with 768 dimension. We use a smaller model all-MiniLM-L6-v2 with dimension 384 so indexing runs faster.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") #, model_kwargs={'device': 'cuda'})

In [10]:
from langchain.vectorstores import Chroma
from uuid import uuid4
DB_PATH = 'vectorstore/db_chroma'
db = Chroma.from_documents(documents=splits,embedding=embeddings,collection_name="porsche_stories_new",persist_directory=DB_PATH)

# Chroma: a light-weight and in memory so it's easy to get started with and use for local development.
#FAISS (Facebook AI Similarity Search): a vector store that supports search in vectors that may not fit in RAM and is appropriate for production use. #TODO: switch to FAISS when running on AWS



In [15]:
import langchain
from queue import Queue
from typing import Any
from langchain.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import LLMResult
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts.prompt import PromptTemplate
from anyio.from_thread import start_blocking_portal #For model callback streaming
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama
langchain.debug=True 

#vector db path
DB_CHROMA_PATH = 'vectorstore/db_chroma'
MODEL_NAME = "llama3.2"
system_message = {"role": "system", "content": "You are a helpful assistant."}




In [16]:
# Load CHROMA vecotre store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = Chroma(embedding_function=embeddings,collection_name="porsche_stories_new",persist_directory=DB_CHROMA_PATH)

In [None]:

llm = Ollama(model="llama3.2", base_url="http://localhost:11434")



In [22]:
system_prompt = ""
template = """
Use the following pieces of context to answer the question. If no context provided, answer like a AI assistant.
{context}
Question: {question}
""" 

retriever = db.as_retriever(
        search_kwargs={"k": 6}
    )


In [23]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,     
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        ),
    }
)


In [24]:
result = qa_chain({"query": "What is porsche?"})

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is porsche?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is porsche?",
  "context": "a technical trainer teaching the next generation of Porsche Classic technicians, who tells us about ORIGINALE magazine and her tips to keep a Porsche in top condition Read more What is F.A.T. Mankei? Over 2200m up Austria’s highest mountain is an all-new version of a venue that’s been serving motorists driving up one of the world’s great roads for years Read more Community of Porsche: what does Porsche mean to you? We asked some members of our community from around the globe what Porsche means\n\nlatest Porsche Collection, this is the story of tartan at Pors

In [28]:
print(result['result'])

The possibilities are endless! Based on the provided context, here are some ideas you might consider:

1. **Take a road trip**: Explore the beautiful locations mentioned in the article, such as the Alps or Dubai, and enjoy the scenic views while driving your Porsche.
2. **Attend a Porsche event**: Look for events like the ones mentioned in the article, which showcase stunning cars, offer driving experiences, and provide opportunities to meet fellow car enthusiasts.
3. **Go camping with your Porsche**: Use your roof tent to camp in various locations, such as the great outdoors or scenic spots, and enjoy the thrill of sleeping under the stars in your vehicle.
4. **Join a Porsche Club**: Connect with other Porsche owners and enthusiasts through local clubs, which often offer events, track days, and opportunities to share knowledge and passion for the brand.
5. **Take a driving course**: Engage in track days or experience courses that allow you to push your Porsche's performance limits and

In [27]:
result = qa_chain({"query": "What ideas can I do with my porsche?"})

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What ideas can I do with my porsche?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What ideas can I do with my porsche?",
  "context": "chowing down on burgers, scintillating Porsche cars and unforgettable locations? You’ll find them all here in this selection of some of the best Porsche moments from our community and featured on the Porsche Instagram channel Read more Porsche in 2022: end of year review From Sean Wotherspoon’s Taycan Cross Turismo art car to gatherings of beautiful classic Porsche vehicles, an IPO to breaking lots of records, we’ve rounded up all the action from a brilliant year in one exciting fun film Read\n\nPart of its lively crea