# testing doc loading

In [46]:
import requests
from bs4 import BeautifulSoup

In [67]:
'''
Loads sample documents into vector store, and sets retrieval
'''

from langchain_community.document_loaders import WebBaseLoader
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

docs2 = [WebBaseLoader(url).load() for url in urls]

In [68]:
docs2

[[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final res

In [49]:
doc_list = []

In [50]:
from langchain_core.documents import Document

In [51]:
document = Document(
    page_content="content here",
    metadata={"source": "blog"}
)

In [72]:
documents = [Document(page_content="hello world", metadata={"source": "blog", "title": "Anthony Blog", "date":"2025-02-11"})]

In [73]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter

# loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
# documents = loader.load()

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(documents)

In [53]:
# URL to scrape
url = 'https://ant52ho.github.io/'

# Send an HTTP GET request
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get all text
    all_text = soup.get_text(separator='\n', strip=True)

In [54]:
all_text

'Anthony Ho\nYou need to enable JavaScript to run this app.'

# get personal website

In [55]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Set up Chrome options for headless browsing (no GUI)
chrome_options = Options()
# chrome_options.add_argument("--headless")

# Path to your chromedriver
service = Service('/Users/anthony/Downloads/chromedriver-mac-arm64/chromedriver')  # e.g., 'chromedriver' if it's in the same directory

# Start the browser
driver = webdriver.Chrome(service=service, options=chrome_options)

# Target website
driver.get(url)

# Allow time for JavaScript to load content
time.sleep(3)  # Adjust based on how fast the site loads

# Get the page source after JS has rendered
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# Extract all text
all_text = soup.get_text(separator='\n', strip=True)

# Close the browser
driver.quit()


In [56]:
doc_list.append(Document(page_content=all_text, metadata={"source": "blog", "title": "Anthony Blog", "date":"2025-02-11"}))

# google docs

In [58]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow

# Define the required scopes
SCOPES = ['https://www.googleapis.com/auth/drive.readonly', 'https://www.googleapis.com/auth/documents.readonly']

# Authentication flow
credential_path = '/Users/anthony/Downloads/client_secret_581484771789-4smnvatsvekn5rjunqs408336jf3le8j.apps.googleusercontent.com.json'

flow = InstalledAppFlow.from_client_secrets_file(credential_path, SCOPES)
creds = flow.run_local_server(port=0)

# Build API clients for Drive and Docs
drive_service = build('drive', 'v3', credentials=creds)
docs_service = build('docs', 'v1', credentials=creds)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=581484771789-4smnvatsvekn5rjunqs408336jf3le8j.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A57210%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.readonly+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocuments.readonly&state=RnDVXOA9CmxCP3upHjzni5Ja3p9TMp&access_type=offline


In [64]:
def list_all_owned_google_docs():
    query = "mimeType='application/vnd.google-apps.document' and 'me' in owners"
    all_docs = []
    page_token = None

    while True:
        response = drive_service.files().list(
            q=query,
            pageSize=100,
            fields="nextPageToken, files(id, name, createdTime)",
            pageToken=page_token
        ).execute()

        all_docs.extend(response.get('files', []))
        page_token = response.get('nextPageToken', None)

        if not page_token:
            break

    return all_docs

docs = list_all_owned_google_docs()

In [60]:
len(docs)

314

In [61]:
def extract_text_from_doc(doc_id):
    doc = docs_service.documents().get(documentId=doc_id).execute()
    content = doc.get('body', {}).get('content', [])
    
    text = ''
    for element in content:
        if 'paragraph' in element:
            for para_elem in element['paragraph'].get('elements', []):
                text += para_elem.get('textRun', {}).get('content', '')

    return text.strip()



# # Extract and print text from all documents
for doc in docs:
    text = extract_text_from_doc(doc['id'])
    doc_list.append(Document(page_content=text, metadata={"source": "Google Documents", "title": doc['name'], "date": doc['createdTime']}))

In [63]:
import pickle
# Save to pickle file
with open('doc_list.pkl', 'wb') as f:
    pickle.dump(doc_list, f)

In [76]:
doc_list

[Document(metadata={'source': 'blog', 'title': 'Anthony Blog', 'date': '2025-02-11'}, page_content='Anthony Ho\nYou need to enable JavaScript to run this app.\nAnthony Ho\nContents\nAbout\nExperience\nProjects\nBlog\nResume\nHello, I\'m\nAnthony Ho\nExperience\nProjects\nContact\nVisit on desktop for a\nbetter experience!\n**WIP**\nAbout\nHello, I\'m Anthony Ho. I\'m a 4th year student pursuing a Computer Science and Business Administration Double Degree at the University of Waterloo and Wilfrid Laurier University.\nI\'ve done a handful of data-related projects and Python-based software development. Feel free to\nview my resume\nor contact me at\nanthony52ho@gmail.com\n.\nIceland 2024 with my good friend Kevin\nExperience\nNDT Research Assistant\nUniversity of Waterloo | September 2024 ~ present\nUniversity of Waterloo\nSeptember 2024 ~ present\nDeveloping MATLAB code to control a high-power ultrasonic system from Verasonics for nondestructive evaluation of construction materials.\nMAT

In [75]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(doc_list)

Created a chunk of size 2400, which is longer than the specified 1000
Created a chunk of size 1765, which is longer than the specified 1000
Created a chunk of size 1365, which is longer than the specified 1000
Created a chunk of size 1453, which is longer than the specified 1000
