In [1]:
import requests
from bs4 import BeautifulSoup

def scrape_url(url, timeout=30):
    """Download ``url`` and print the stripped text of every ``<p>`` element.

    NOTE: a later cell defines another ``scrape_url`` that returns the
    paragraphs as a list; once that cell has run it replaces this function.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        None. Each paragraph is written to stdout on its own line.
    """
    # Without a timeout an unresponsive host would block the kernel forever.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    for paragraph in soup.find_all('p'):
        print(paragraph.text.strip())
# python_jobs = results.find_all(
#     "p", string=lambda text: "python" in text.lower()
# )
# python_job_elements = [
#     p_element.parent.parent.parent for p_element in python_jobs
# ]

# for job_element in python_job_elements:
#     print(job_element)
#     # title_element = job_element.find("h2", class_="title")
#     # company_element = job_element.find("h3", class_="company")
#     # location_element = job_element.find("p", class_="location")
#     # print(title_element.text.strip())
#     # print(company_element.text.strip())
#     # print(location_element.text.strip())
#     link_url = job_element.find_all("a")[1]["href"]
#     print(f"Apply here: {link_url}\n")
#     print()


In [2]:
import qdrant_client as qc
import qdrant_client.http.models as qmodels
import uuid
import csv
import openai
import os
import gradio as gr

# Take the OpenAI key from the environment instead of hard-coding it.
# ``setdefault`` leaves an already-exported OPENAI_API_KEY untouched and only
# falls back to an empty placeholder when the variable is missing, so running
# this cell never wipes a key that was configured outside the notebook.
openai.api_key = os.environ.setdefault("OPENAI_API_KEY", "")

# Qdrant connection and collection settings shared by the helpers below.
client = qc.QdrantClient(url="localhost")
METRIC = qmodels.Distance.DOT  # distance function passed to VectorParams
DIMENSION = 1536  # vector size passed to VectorParams in create_index

# Embedding model name sent to openai.Embedding.create in embed_text.
MODEL = "text-embedding-ada-002"

def embed_text(text):
    """Return the embedding OpenAI produces for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` with the module-level
    ``MODEL`` and returns the ``embedding`` field of the first entry in the
    response's ``data`` list.
    """
    first_item = openai.Embedding.create(model=MODEL, input=text)['data'][0]
    return first_item['embedding']

def create_index(collection_name):
    """Call ``client.recreate_collection`` for ``collection_name``.

    The collection is configured with vectors of size ``DIMENSION`` that are
    compared using ``METRIC``.
    """
    vector_params = qmodels.VectorParams(size=DIMENSION, distance=METRIC)
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=vector_params,
    )
# create_index('ask_a_met')

def delete_index(collection_name):
    """Ask the Qdrant client to delete the collection ``collection_name``."""
    target = collection_name
    client.delete_collection(collection_name=target)
    
def create_vector(content):
    """Build the ``(id, vector, payload)`` triple for one piece of text.

    Args:
        content: Text to embed and store.

    Returns:
        A tuple ``(point_id, vector, payload)`` where ``point_id`` is a
        32-character hexadecimal UUID string, ``vector`` is the result of
        ``embed_text(content)`` and ``payload`` is ``{"text": content}``.
    """
    vector = embed_text(content)
    # Use a real UUID rendered as 32 hex characters. The previous id was the
    # decimal form of a uuid1 cut to 32 digits, which discards part of the
    # UUID and can come out shorter than 32 characters; it also shadowed the
    # builtin ``id``.
    point_id = uuid.uuid4().hex
    payload = {"text": content}
    return point_id, vector, payload

def add_doc_to_index(collection_name, content):
    """Embed ``content`` and upsert it into ``collection_name`` as one point.

    The id, vector and payload come from ``create_vector`` and are sent to
    Qdrant as a ``Batch`` holding exactly one entry.
    """
    point_id, embedding, metadata = create_vector(content)
    single_point_batch = qmodels.Batch(
        ids=[point_id],
        vectors=[embedding],
        payloads=[metadata],
    )
    client.upsert(collection_name=collection_name, points=single_point_batch)

def query_index(collection_name, query, top_k=5):
    """Search ``collection_name`` for the stored texts closest to ``query``.

    Args:
        collection_name: Qdrant collection to search.
        query: Text that is embedded with ``embed_text`` and used as the
            query vector.
        top_k: Value passed to the search as ``limit``.

    Returns:
        A list of ``(text, score)`` tuples, one per hit, in the order the
        client returned them.
    """
    # No query filter or custom search params are applied.
    hits = client.search(
        collection_name=collection_name,
        query_vector=embed_text(query),
        limit=top_k,
        with_payload=True,
    )

    scored_texts = []
    for hit in hits:
        scored_texts.append((hit.payload["text"], hit.score))
    return scored_texts

In [3]:
import requests
from bs4 import BeautifulSoup

def scrape_url(url, timeout=30):
    """Download ``url`` and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        A list with the whitespace-stripped text of each paragraph, in
        document order. Paragraphs without text appear as empty strings.
    """
    # Without a timeout an unresponsive host would block the caller forever.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    return [paragraph.text.strip() for paragraph in soup.find_all('p')]

def get_completion_openai(messages, model='gpt-3.5-turbo', temperature=0, max_tokens=200):
    """Run a chat completion and return the text of the first choice.

    Args:
        messages: Chat messages forwarded to ``openai.ChatCompletion.create``.
        model: Chat model name.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    request = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message['content']
        
def understand_page(url, query):
    """Answer ``query`` using the paragraphs scraped from ``url``.

    The page's paragraphs are indexed into a temporary ``'demo'`` collection,
    the five best matches for ``query`` are supplied to the chat model as
    system messages, and the model's reply is returned.

    Args:
        url: Page to scrape with ``scrape_url``.
        query: Question to ask about the page.

    Returns:
        The completion text returned by ``get_completion_openai``.
    """
    collection = 'demo'
    contents = scrape_url(url)
    create_index(collection)
    try:
        for content in contents:
            # Empty <p> elements come back as '' and the embeddings endpoint
            # rejects empty input, so skip them instead of failing the request.
            if not content.strip():
                continue
            add_doc_to_index(collection, content)

        matches = query_index(collection, query, 5)
        messages = [
            {'role': 'system', 'content': str(text)} for text, _score in matches
        ]
        messages.append({'role': 'user', 'content': query})
        return get_completion_openai(messages)
    finally:
        # Always drop the temporary collection, even when indexing, search or
        # the completion call raises, so failed requests do not leak it.
        delete_index(collection)

In [4]:
# Gradio front end: a greeting/answer area, a URL box, a question box and two
# buttons. "Continue" answers the question and hides the inputs; "Restart"
# restores the greeting and shows the inputs again.
with gr.Blocks() as demo:
    with gr.Row():
        md0 = gr.Markdown("""Hi, you can give me a url and ask me any question about the content in the url ^^""")

    with gr.Row(visible=True) as r0:
        url = gr.Textbox(label="Enter url", placeholder="https://www.bbc.com/news/world-us-canada-66076558")

    with gr.Row(visible=True) as r1:
        question = gr.Textbox(label="Enter your question", placeholder="how long was the search before the Canadian teen was found?")

    with gr.Row(visible=True) as r2:
        button = gr.Button("Continue")

    with gr.Row(visible=True) as r3:
        button2 = gr.Button("Restart")

    def submit(page_url, user_question):
        """Show the answer in the markdown area and hide rows r0, r1 and r2."""
        answer = understand_page(page_url, user_question)
        return {
            md0: answer,
            r0: gr.update(visible=False),
            r1: gr.update(visible=False),
            r2: gr.update(visible=False),
        }

    def restart():
        """Restore the greeting, clear both textboxes and show rows r0-r2."""
        return {
            md0: """Hi, give me a url and ask me any question about the content in the url ^^""",
            url: '',
            question: '',
            r0: gr.update(visible=True),
            r1: gr.update(visible=True),
            r2: gr.update(visible=True),
        }

    # Handlers return dicts keyed by component, so every component they touch
    # must be listed in ``outputs``.
    button.click(fn=submit, inputs=[url, question], outputs=[md0, r0, r1, r2, r3])

    button2.click(fn=restart, inputs=None, outputs=[md0, r0, r1, r2, r3, question, url])

demo.launch(share=True, debug=False)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://9d604598a2154eda17.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# Smoke-test the pipeline on two articles. Each answer is printed explicitly:
# a notebook cell only displays the value of its last expression, so the
# first answer would otherwise be computed and silently thrown away.
# The loop variables are deliberately not called ``url``/``question`` so the
# Gradio components with those names are left intact.
examples = [
    (
        "https://www.bbc.com/news/world-us-canada-66076558",
        'how long was the search before the Canadian teen was found?',
    ),
    (
        "https://www.bbc.com/travel/article/20230703-sonoran-style-shrimp-tacos",
        'who is husband of Marissa Gencarelli?',
    ),
]
for example_url, example_question in examples:
    print(understand_page(example_url, example_question))
