In [174]:
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaLLM

In [175]:
import requests
from bs4 import BeautifulSoup

class Website:
    """Scrape the readable text and the outgoing links of a web page.

    Attributes:
        url: The address the instance was created for.
        text: Newline-joined text of the page's paragraphs and headings. When
            ``extend_search`` is true, the text of each linked page that could
            be fetched is appended to it.
        links: Absolute http(s) URLs found on the page. Always a list; empty
            when the page exposes no usable links.
    """

    def __init__(self, url, *, extend_search=False):
        """Download ``url`` and populate ``text`` and ``links``.

        Args:
            url: Page to download.
            extend_search: When true, also download every page in ``links``
                and append its text to ``text``.
        """
        self.url = url
        text, links = self._get_contents(url)
        self.text = text
        # Always define ``links`` so callers never hit AttributeError on a
        # page that has no anchors.
        self.links = self._parse_links(links)

        if extend_search:
            for link in self.links:
                try:
                    linked_text, _ = self._get_contents(link)
                except requests.RequestException as exc:
                    # One unreachable link should not abort the whole crawl.
                    print(f"Skipping {link}: {exc}")
                    continue
                self.text += linked_text
                print("Text from a link was added")

    @staticmethod
    def _get_contents(link, timeout=10):
        """Download ``link`` and return ``(text, hrefs)``.

        Args:
            link: URL to fetch.
            timeout: Seconds passed to ``requests.get`` so a stalled server
                cannot hang the notebook.

        Returns:
            A tuple of the newline-joined text of all ``<p>`` and
            ``<h1>``-``<h6>`` elements (empty string when the document has no
            ``<body>``) and the raw ``href`` values of the remaining ``<a>``
            tags (entries may be ``None`` or relative).
        """
        response = requests.get(link, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        if soup.body:
            # Drop elements that carry no brochure-worthy text.
            for irr in soup.body(['script', 'style', 'img', 'input', 'iframe', 'button', 'nav', 'footer']):
                irr.decompose()
            text = "\n".join(
                el.get_text(strip=True)
                for el in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
            )
        else:
            text = ""
        # NOTE: anchors are collected after the removal above, so links that
        # live inside removed elements (e.g. <nav>, <footer>) are not returned.
        links = [anchor.get('href') for anchor in soup.find_all('a')]

        return text, links

    def _parse_links(self, links):
        """Turn raw ``href`` values into unique, absolute http(s) URLs.

        Relative, root-relative and protocol-relative links are resolved
        against ``self.url``; bare ``www.`` links get an ``https://`` scheme.
        Empty values, malformed URLs and non-web schemes (``mailto:``,
        ``javascript:``, ``tel:`` ...) are dropped. First-seen order is kept.

        Args:
            links: Iterable of ``href`` strings; items may be ``None``.

        Returns:
            List of absolute URLs without duplicates.
        """
        parsed_links = []
        seen = set()
        for link in links or []:
            if not link:
                continue
            link = link.strip()
            if not link:
                continue
            if link.startswith("www."):
                # Scheme-less host: without a scheme urljoin would treat it
                # as a relative path.
                link = "https://" + link
            try:
                absolute = urljoin(self.url, link)
                scheme = urlparse(absolute).scheme
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); ignore it.
                continue
            if scheme not in ("http", "https"):
                continue
            if absolute not in seen:
                seen.add(absolute)
                parsed_links.append(absolute)
        return parsed_links

In [176]:
# System message for the link-selection step. The doubled braces are literal
# braces escaped for the prompt template, so the JSON example reaches the model
# unchanged.
link_system_prompt = """
You are provided with a list of links on a webpage.
You must decide which of these links would be the most relevant to include in a brochure about the company.
Do not include Terms of Service, privacy, or email links.
Respond in valid JSON format exactly as shown in the example below.
Make sure that all keys and string values (including URLs) are enclosed in double quotes.

Example:
{{
    "links": [
        {{ "type": "about page", "url": "https://full.url/goes/here/about" }},
        {{ "type": "careers page", "url": "https://full.url/goes/here/careers" }}
    ]
}}
"""

# Human message for the link-selection step; ``{links}`` is filled with the
# newline-separated links returned by ``get_links``.
link_user_prompt = """
    Here is the list of links on the website:
    {links}.
    
    Please decide which of these are relevant web links for a brochure about the company:
"""

def get_links(url):
    """Scrape ``url`` and return its links in the shape the link prompt expects.

    Args:
        url: Address of the page whose links should be listed.

    Returns:
        Dict with a single ``"links"`` key holding the page's links joined by
        newlines (an empty string when the page yields no links).
    """
    website = Website(url)
    # Guard against a Website instance that carries no ``links`` attribute.
    links = getattr(website, "links", None) or []
    return {"links": "\n".join(links)}

# Mistral model accessed through Ollama; reused by both chains in this notebook.
llm = OllamaLLM(model="mistral")

# The system message carries the selection rules, the human message the scraped links.
link_selection_messages = [
    ("system", link_system_prompt),
    ("human", link_user_prompt),
]
prompt = ChatPromptTemplate.from_messages(link_selection_messages)

In [177]:
from langchain_core.output_parsers import JsonOutputParser

# Pipeline: scrape the links -> fill the prompt -> query the model -> parse its JSON reply.
chain = (
    RunnableLambda(get_links)
    | prompt
    | llm
    | JsonOutputParser()
)
rel_links = chain.invoke('https://www.langchain.com/')

In [178]:
def get_text_for_all_links(relevant_links):
    """Fetch every selected page and merge the scraped text into one context string.

    Args:
        relevant_links: Mapping with a ``"links"`` list whose items are dicts
            holding a ``"url"`` and, optionally, a ``"type"`` label (the JSON
            produced by the link-selection chain).

    Returns:
        One section per fetched page: a ``Page: <type>`` heading line followed
        by the page text, with sections separated by a blank line. Empty string
        when there is nothing to fetch or nothing could be fetched.
    """
    sections = []
    for link in relevant_links.get("links") or []:
        # The list comes from a model reply, so tolerate malformed entries.
        if not isinstance(link, dict):
            continue
        url = link.get("url")
        if not url:
            continue
        try:
            page_text = Website(url).text
        except requests.RequestException as exc:
            # A single unreachable URL should not discard the other pages.
            print(f"Skipping {url}: {exc}")
            continue
        # Keep the heading on its own line so it does not fuse with the text.
        sections.append(f"Page: {link.get('type', 'unknown')}\n{page_text}")
    return "\n\n".join(sections)
    
# Scraped text of every page the model selected, used as brochure context below.
text = get_text_for_all_links(rel_links)

In [179]:
# System message for the brochure step; ``{content}`` receives the scraped text.
# The backslash after "emphasis." is a line continuation inside the string
# literal, so the newline that follows it is not part of the prompt.
make_brochure_system_template = """
You are an expert at analyzing the contents of several pages from a company website.
Using the provided content, create a short, attractive brochure for the company.
Include details that capture the company's essence solely based on the context provided.
Return your response in Markdown format, using appropriate headings, bullet points, and emphasis.\

Context:

{content}
"""

# Human message for the brochure step; ``{company_name}`` is supplied at invoke time.
make_brochure_user_template = """
Create a short brochure for {company_name} company:
"""

# Chat prompt combining the two templates above (system first, then human).
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", make_brochure_system_template),
        ("human", make_brochure_user_template)
    ]
)

In [180]:
# Brochure pipeline: fill the prompt -> query the model -> return the reply as a plain string.
final_chain = final_prompt | llm | StrOutputParser()

In [181]:
# Generate the brochure from the scraped context; the company name is passed explicitly.
brochure = final_chain.invoke({"content":text, "company_name": "LangChain"})

In [182]:
from IPython.display import Markdown, display
# Render the model's Markdown reply as formatted output instead of raw text.
display(Markdown(brochure))

 **LangChain Brochure**

Welcome to LangChain - The Future of AI Applications!

Transform the way you build and deploy AI applications with our suite of powerful tools, designed to help you innovate faster and deliver reliable, high-quality solutions.

What makes us stand out?

1. **LangChain**: Our flagship product provides a seamless, scalable platform for building intelligent agents that understand, learn, and adapt to complex tasks in real-time.

2. **LangGraph**: The heart of our technology, LangGraph offers advanced features like human-in-the-loop, persistence/memory, and streaming, allowing you to create smarter, more contextually aware agents.

3. **LangSmith**: Simplify the development, testing, and monitoring of your AI applications with LangSmith - an all-in-one toolkit that speeds up the iterative process for building high-performance AI models.

4. **Interrupt: The AI Agent Conference by LangChain** - Join us this May as we gather the brightest minds in the AI industry to explore the future of AI agents and the role they will play in shaping our world.

5. **Integrations**: We offer seamless integrations with popular testing frameworks like Pytest, Vitest/Jest, allowing you to focus on building your AI applications without worrying about the underlying infrastructure.

Join the likes of LinkedIn, Uber, Replit, Elastic, and many more, who are already leveraging our technology for real-world production use cases.

Ready to revolutionize your AI development? Start your journey with LangChain today!