In [None]:
import requests
from bs4 import BeautifulSoup
from os import listdir
from os import path
from os import mkdir


In [None]:
from os import makedirs
from urllib.parse import quote_plus

query = "vix volatility"
# URL-encode the query so spaces and reserved characters (&, #, +) cannot
# corrupt the query string.
url = f"https://scholar.google.com/scholar?hl=en&as_sdt=0%2C31&q={quote_plus(query)}&btnG="
# create a dir based on the query
directory = path.join("pdfs", query.replace(" ", "_"))
print(directory)
# makedirs also creates the parent "pdfs" directory; a plain mkdir raises
# FileNotFoundError when that parent does not exist yet.
makedirs(directory, exist_ok=True)

In [None]:
# Fetch the search page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status surfaces HTTP errors (for example a
# rate-limit response) instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# Each search hit is a <div> whose class attribute is exactly this string.
search_results = soup.find_all("div", attrs={"class": "gs_r gs_or gs_scl"})
# Show a count rather than dumping every matched element into the output.
print(f"found {len(search_results)} search results")

In [None]:
# Collect the first link of every search result that points at a PDF.
pdfs = []
for result in search_results:
    anchor = result.find("a")
    # A result without an anchor, or an anchor without href, is skipped
    # instead of raising TypeError / KeyError.
    link = anchor.get("href") if anchor is not None else None
    if not link:
        continue
    # endswith("pdf") already covers the ".pdf" case the original also tested.
    if link.endswith("pdf"):
        print(link)
        pdfs.append(link)

In [None]:
from pypdf import PdfReader


def add_eof_marker_to_documents(directory):
    """Rewrite every ``*.pdf`` file in ``directory`` so it ends with ``%%EOF``.

    For a file that already contains the marker, every occurrence is removed
    and a single marker is appended at the end. For a file without the marker,
    the last 6 bytes are dropped and the marker is appended in their place.
    Files are overwritten in place; files without a ``.pdf`` suffix are left
    untouched.

    Args:
        directory: Path of the directory whose PDF files are rewritten.
    """
    EOF_MARKER = b"%%EOF"
    pdf_names = [name for name in listdir(directory) if name.endswith(".pdf")]
    for name in pdf_names:
        pdf_path = f"{directory}/{name}"
        with open(pdf_path, "rb") as source:
            raw = source.read()

        if EOF_MARKER not in raw:
            print(f"did not find EOF_MARKER in {pdf_path}")
            print(raw[-8:])  # show the tail that is about to be replaced
            patched = raw[:-6] + EOF_MARKER
        else:
            print(f"found EOF_MARKER in {pdf_path}")
            patched = raw.replace(EOF_MARKER, b"") + EOF_MARKER

        with open(pdf_path, "wb") as target:
            target.write(patched)

In [None]:
# Download every collected PDF link into the query directory.
# The loop variable is deliberately not called `url`, so the notebook-level
# search URL is not overwritten when this cell runs.
for pdf_url in pdfs:
    file_name = pdf_url.split("/")[-1]  # get file name
    try:
        pdf_response = requests.get(pdf_url, timeout=30)
        pdf_response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # One dead link should not abort the remaining downloads, and an
        # error page must not be saved under a .pdf name.
        print(f"Failed to download {pdf_url}: {exc}")
        continue

    with open(path.join(directory, file_name), "wb") as f:
        f.write(pdf_response.content)

In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Page that lists the hedge fund letters to scrape.
url = "https://finmasters.com/hedge-fund-letters-to-investors/"

# Retry policy for make_request (defined below in this cell).
# max_retries is the default for its `retries` argument, and the request loop
# runs while that counter is above zero, so a value of 1 means one attempt.
max_retries = 1
retry_delay = 5  # seconds to sleep between attempts

# Update this to the path where you placed the Chrome WebDriver
chrome_driver_path = "/chrome/chromedriver.exe"


def make_request(url, retries=max_retries, timeout=30):
    """GET ``url``, retrying after connection-level failures.

    Args:
        url: Address to request.
        retries: Total number of attempts to make. Zero or a negative value
            makes no request and returns ``None``.
        timeout: Seconds to wait for the server before an attempt counts as
            failed; without it a stalled connection would block forever.

    Returns:
        The ``requests`` response of the first attempt that does not raise
        (its HTTP status is NOT checked here), or ``None`` when every attempt
        raised.
    """
    total_attempts = retries
    for attempt in range(1, total_attempts + 1):
        try:
            return requests.get(url, timeout=timeout)
        except (requests.exceptions.RequestException, ConnectionError) as e:
            print(f"Error: {e}")
            if attempt < total_attempts:
                # Count against the budget actually passed in, not the global
                # default, so the message stays correct for custom `retries`.
                print(f"Retrying... ({attempt + 1} of {total_attempts})")
                time.sleep(retry_delay)
            else:
                print("Max retries reached. Skipping this URL.")
    return None


# webbrowser is used by the fallback below and was never imported in this
# notebook; WebDriverException is the base class of Selenium's driver errors.
import webbrowser

from selenium.common.exceptions import WebDriverException

response = make_request(url)

# Check if the request was successful
if response is not None and response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all the links to PDFs
    pdf_links = soup.find_all("a", href=lambda href: href and href.endswith(".pdf"))

    # Create a directory to save the PDFs
    output_dir = "pdfs/hedge_fund_letters"
    os.makedirs(output_dir, exist_ok=True)

    # Set up the Selenium WebDriver so Chrome saves PDFs straight into
    # output_dir instead of opening them in the built-in viewer.
    options = webdriver.ChromeOptions()
    options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": os.path.abspath(output_dir),
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True,
        },
    )

    # NOTE(review): recent Selenium 4 releases no longer accept
    # `executable_path` and expect a Service object instead - confirm against
    # the installed Selenium version.
    driver = webdriver.Chrome(executable_path=chrome_driver_path, options=options)

    # Download and save each PDF. The try/finally guarantees the browser is
    # shut down even when a download raises unexpectedly.
    try:
        for link in pdf_links:
            pdf_url = link["href"]
            pdf_name = os.path.join(output_dir, os.path.basename(pdf_url))

            # Skip files that a previous run already downloaded.
            if os.path.exists(pdf_name):
                print(f"Already downloaded, skipping: {pdf_url}")
                continue

            try:
                driver.get(pdf_url)
                time.sleep(5)  # Wait for the download to start
            except WebDriverException:
                # Navigation failures surface as WebDriverException subclasses
                # (NoSuchElementException is one of them), so this handler
                # also covers everything the original one caught.
                print(f"Failed to download: {pdf_url}")
                print("Opening the URL in the default web browser...")
                webbrowser.open(pdf_url)
    finally:
        driver.quit()
else:
    print("Failed to fetch the webpage")

In [None]:
from paperqa import Docs
import os
from os import path
from pypdf import PdfReader, errors
import pickle
import openai


def is_valid_pdf(file_path):
    """Report whether ``file_path`` can be opened and handed to ``PdfReader``.

    Any exception raised while opening the file or constructing the reader is
    printed and treated as "not a valid PDF".

    Args:
        file_path: Path of the file to check.

    Returns:
        ``True`` when the reader was constructed without raising, otherwise
        ``False``.
    """
    try:
        with open(file_path, "rb") as pdf_stream:
            PdfReader(pdf_stream)
    except (Exception, errors.PdfReadError) as exc:
        print(exc)
        return False
    else:
        return True


openai.api_key = os.environ["OPENAI_API_KEY"]
docs = Docs()
# pdf_path = "pdfs/hedge_fund_letters"
pdf_path = "pdfs/distributed_systems_papers"

# TODO: PdfReadError: EOF marker not found then just skip
# TODO: after downloading PDFs, check if valid, if not delete them

# Add every readable PDF in pdf_path to the document index.
for file_name in os.listdir(pdf_path):
    file_path = os.path.join(pdf_path, file_name)
    if not is_valid_pdf(file_path):
        print(f"skipping {file_name}")
        continue

    print(f"adding {file_name}")
    try:
        docs.add(file_path)
    except Exception as e:
        # Keep indexing the remaining files, but report the failure instead
        # of silently dropping the document.
        print(f"failed to add {file_name}: {e}")

# Name the pickle after the source directory so that switching pdf_path above
# cannot overwrite the index built from a different collection.
pickle_name = f"{os.path.basename(pdf_path)}.docs.pickle"
with open(pickle_name, "wb") as handle:
    pickle.dump(docs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import os
import pickle
from paperqa import Docs

# This cell uses `openai` but did not import it; import it here so the cell
# does not depend on an earlier cell having been run.
import openai

openai.api_key = os.environ["OPENAI_API_KEY"]

# Load the previously built index. No fresh Docs() is created first because
# the loaded object would replace it immediately.
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# you created yourself.
with open("hedge_fund_letters.docs.pickle", "rb") as handle:
    docs = pickle.load(handle)

answer = docs.query(
    "Return a breakdown of invested stocks sorted by the hedge fund's name."
)
# Show the result; otherwise it is discarded when a later cell reassigns
# `answer`.
print(answer.formatted_answer)

In [None]:
# Ask the loaded document index a follow-up question and show the formatted
# answer.
answer = docs.query(
    "what stocks did most of the funds invest in?"
)
print(answer.formatted_answer)


In [None]:
import os
import pickle
from paperqa import Docs

# This cell uses `openai` but did not import it; import it here so the cell
# does not depend on an earlier cell having been run.
import openai

openai.api_key = os.environ["OPENAI_API_KEY"]

# Load the previously built index. No fresh Docs() is created first because
# the loaded object would replace it immediately.
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# you created yourself.
with open("spx_options_papers.docs.pickle", "rb") as handle:
    docs = pickle.load(handle)

answer = docs.query("how does vix affect SPX options?")
print(answer.formatted_answer)