In [94]:
import pandas as pd
from pathlib import Path
import os
from llama_index.readers.web import UnstructuredURLLoader
from urllib.parse import urlparse
import re
import pickle
import requests

In [29]:
def is_valid_url(url):
    """Return True when ``url`` parses into both a scheme and a network location.

    Args:
        url: Candidate URL, normally a string read from the spreadsheet.

    Returns:
        bool: True if ``urlparse`` yields a non-empty ``scheme`` and a
        non-empty ``netloc``; False otherwise, including when ``url``
        cannot be parsed at all.
    """
    try:
        result = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. an unbalanced IPv6 bracket).
        # TypeError / AttributeError: non-string values such as numbers or
        # NaN, which urlparse tries (and fails) to decode as bytes.
        return False
    return bool(result.scheme and result.netloc)


def is_sharepoint_url(url):
    """Return True when ``url`` mentions ``sharepoint.com``, ignoring case.

    Args:
        url: Candidate URL, normally a string read from the spreadsheet.

    Returns:
        bool: True if the text ``sharepoint.com`` occurs anywhere in ``url``
        (compared case-insensitively); False otherwise, including for
        non-string values.
    """
    if not isinstance(url, str):
        # Numbers, NaN or None cannot be SharePoint links; answering False
        # avoids the TypeError the ``in`` test would raise on them.
        return False
    # Host names are case-insensitive, so "SharePoint.com" must match too.
    return 'sharepoint.com' in url.lower()



def is_pdf(url, timeout=10):
    """Return True when a HEAD request for ``url`` reports a PDF content type.

    Args:
        url: Address to probe; redirects are followed.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host could block the call
            indefinitely.

    Returns:
        bool: True if the response's ``Content-Type`` media type is
        ``application/pdf``; False if it is anything else, if the header is
        missing, or if the request fails.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return False

    # The header may be absent, may carry parameters
    # ("application/pdf; charset=binary") and is case-insensitive, so compare
    # only the normalised media type.
    content_type = response.headers.get('content-type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()
    return media_type == 'application/pdf'

# Load the dataset

In [36]:
# Workbook holding both the article links and the client questions; it is
# looked up in the ``agent_data`` folder beside the current working directory.
filepath = os.path.join(
    os.path.dirname(os.getcwd()),
    'agent_data',
    'Generative_AI_Client_Questions_and_Lighthouse_Contents.xlsx',
)

# Article sheet: skip the first three rows, replace every run of non-word
# characters in the column labels with "_", and keep only title and link.
data = (
    pd.read_excel(filepath, sheet_name='Content Links (Future Of X)', skiprows=3)
    .rename(columns=lambda label: re.sub(r'\W+', '_', label))
    .loc[:, ["Title", "Link"]]
)

# Lookup table: link -> article title.
data_dict = data.set_index('Link')['Title'].to_dict()

# Distinct, non-missing questions from the master sheet.
Qs = (
    pd.read_excel(filepath, sheet_name='Master Data Source')
    .dropna(subset=['Question'])['Question']
    .unique()
)
# Every question starts out flagged False.
Qs_dict = dict.fromkeys(Qs, False)

# All links that are actually present (rows with a missing link are dropped).
article_links = data['Link'].dropna().to_list()

# Remove invalid URLs and SharePoint URLs

In [73]:
# is_pdf() issues a network request, so probe each distinct link exactly once
# and reuse the answer for both lists below instead of asking the server twice.
pdf_flags = {url: is_pdf(url) for url in article_links}

# Links that parse as URLs, are not SharePoint links and are not PDFs.
valid_non_pdf_links = [
    url for url in article_links
    if is_valid_url(url) and not is_sharepoint_url(url) and not pdf_flags[url]
]
# Every link whose HEAD response reported a PDF.
pdf_urls = [url for url in article_links if pdf_flags[url]]

non_pdf_url_path = os.path.join(os.path.dirname(os.getcwd()), 'agent_data',
             'valid_non_pdf_urls.pkl')
pdf_url_path = os.path.join(os.path.dirname(os.getcwd()), 'agent_data',
             'pdf_urls.pkl')

# Save the urls to disk
with open(non_pdf_url_path, 'wb') as f:
    pickle.dump(valid_non_pdf_links, f)
with open(pdf_url_path, 'wb') as f:
    pickle.dump(pdf_urls, f)

# Load urls

In [74]:
# Read the pickled list of PDF URLs back from the agent_data folder.
pdf_url_path = os.path.join(
    os.path.dirname(os.getcwd()), 'agent_data', 'pdf_urls.pkl'
)
with open(pdf_url_path, 'rb') as pkl_file:
    pdf_urls2 = pickle.load(pkl_file)
    

# Chunk the url contents before embedding

In [None]:
# Build a loader over the non-PDF links and load their contents as documents.
# continue_on_failure=True is passed so the loader is asked to keep going when
# a URL cannot be loaded.
loader = UnstructuredURLLoader(
    urls=valid_non_pdf_links,
    continue_on_failure=True,
    headers={"User-Agent": "value"},
)
valid_non_pdf_docs = loader.load_data()

# Update the metadata for each document object

In [61]:
# Replace each document's metadata with just two keys: the article title
# (looked up in data_dict by the document's source URL) and the source itself.
for doc in valid_non_pdf_docs:
    source_url = doc.metadata['source']
    doc.metadata = {'title': data_dict[source_url], 'source': source_url}