In [None]:
!pip install fastapi aiohttp beautifulsoup4 google-cloud-storage

import requests
from fastapi import FastAPI, HTTPException
from typing import List
import asyncio
import aiohttp
import os
import time
import logging
from bs4 import BeautifulSoup
from google.cloud import storage

# Directory where downloaded PDFs are written before being uploaded; every
# helper in this file reads and writes this single shared location.
TEMP_DIR: str = "/tmp/pdfs"

# Configure root logging: INFO level, each message prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

# FastAPI application object on which the /process-pdfs/ route is registered.
app = FastAPI()


async def download_pdf(pdf_url: str) -> None:
    """Download a single PDF into ``TEMP_DIR``.

    The local filename is the last segment of the URL *path*; the query string
    and fragment are ignored so ``.../report.pdf?v=2`` is stored as
    ``report.pdf`` (and is therefore still recognised as a PDF later on).

    Args:
        pdf_url: Absolute URL of the PDF to fetch.

    Raises:
        ValueError: If the URL path has no final segment to use as a filename.
        aiohttp.ClientResponseError: If the server responds with status >= 400.
    """
    from urllib.parse import urlparse  # local import: only needed here

    pdf_filename = os.path.basename(urlparse(pdf_url).path)
    if not pdf_filename:
        raise ValueError(f"Cannot derive a filename from URL: {pdf_url}")
    pdf_path = os.path.join(TEMP_DIR, pdf_filename)
    logging.info(f"Downloading {pdf_filename}...")
    start_time = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.get(pdf_url) as response:
            # Fail before creating the file so an HTML error page is never
            # saved (and later uploaded) under a .pdf name.
            response.raise_for_status()
            with open(pdf_path, "wb") as f:
                # Stream in 64 KiB pieces to keep memory flat on large files.
                async for chunk in response.content.iter_chunked(64 * 1024):
                    f.write(chunk)
    end_time = time.time()
    logging.info(f"Downloaded {pdf_filename} in {end_time - start_time:.2f} seconds")


async def download_pdfs(pdf_urls: List[str]) -> None:
    """Download every URL in ``pdf_urls`` concurrently.

    One ``download_pdf`` coroutine is created per URL and all of them are
    awaited together; the first exception raised by any of them propagates.
    """
    total = len(pdf_urls)
    logging.info(f"Downloading {total} PDFs...")
    pending = [download_pdf(pdf_url) for pdf_url in pdf_urls]
    await asyncio.gather(*pending)


def list_pdf_files() -> List[str]:
    """Return the names of entries in ``TEMP_DIR`` that end with ``.pdf``.

    Names are bare filenames (no directory part), in ``os.listdir`` order.
    """
    pdf_names: List[str] = []
    for entry in os.listdir(TEMP_DIR):
        if entry.endswith(".pdf"):
            pdf_names.append(entry)
    return pdf_names


async def upload_pdf(filename: str, gcs_bucket_name: str, prefix: str) -> None:
    """Upload one file from ``TEMP_DIR`` to a Google Cloud Storage bucket.

    The google-cloud-storage calls are synchronous, so they are run in the
    event loop's default executor; this keeps the loop responsive and lets
    several uploads started with ``asyncio.gather`` overlap.

    Args:
        filename: Name of the file inside ``TEMP_DIR``.
        gcs_bucket_name: Name of the destination bucket.
        prefix: Object-name prefix ("folder"). Leading/trailing slashes are
            dropped and an empty prefix places the object at the bucket root,
            so the object name never starts with "/" or contains "//".
    """
    blob_name = "/".join(part for part in (prefix.strip("/"), filename) if part)

    def _upload_blocking() -> None:
        # Each call builds its own client, so nothing is shared across threads.
        client = storage.Client()
        bucket = client.get_bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        logging.info(f"Uploading {filename}...")
        start_time = time.time()
        blob.upload_from_filename(os.path.join(TEMP_DIR, filename))
        end_time = time.time()
        logging.info(f"Uploaded {filename} in {end_time - start_time:.2f} seconds")

    await asyncio.get_running_loop().run_in_executor(None, _upload_blocking)


async def upload_to_gcs(pdf_filenames: List[str], gcs_bucket_name: str, prefix: str) -> None:
    """Upload all named files to the given bucket under ``prefix``.

    One ``upload_pdf`` coroutine is created per filename and all of them are
    awaited together; the first exception raised by any of them propagates.
    """
    count = len(pdf_filenames)
    logging.info(f"Uploading {count} PDFs to Google Cloud Storage...")
    uploads = [upload_pdf(name, gcs_bucket_name, prefix) for name in pdf_filenames]
    await asyncio.gather(*uploads)


def cleanup_temporary_directory(pdf_filenames: List[str]) -> None:
    """Delete the given files from ``TEMP_DIR`` and then the directory itself.

    Cleanup is best-effort: a file that is already gone, or a directory that
    cannot be removed (for example because it still holds other files), is
    logged as a warning instead of raising, so a finished upload is not
    reported as a failure because of leftover temp state.

    Args:
        pdf_filenames: Bare filenames (relative to ``TEMP_DIR``) to delete.
    """
    for filename in pdf_filenames:
        file_path = os.path.join(TEMP_DIR, filename)
        try:
            os.remove(file_path)
        except FileNotFoundError:
            logging.warning(f"Temporary file {file_path} was already removed")
    try:
        os.rmdir(TEMP_DIR)
    except FileNotFoundError:
        # Directory already gone: nothing left to clean.
        pass
    except OSError as exc:
        # Typically "directory not empty": leave the remaining files alone.
        logging.warning(f"Temporary directory {TEMP_DIR} not removed: {exc}")
        return
    logging.info("Temporary directory cleaned up")


@app.post("/process-pdfs/")
async def process_pdfs(web_location: str, gcs_bucket_name: str, prefix: str) -> dict:
    """Mirror every PDF linked from a web page into a GCS bucket.

    Fetches ``web_location``, collects all ``<a href>`` targets ending in
    ``.pdf``, downloads them into ``TEMP_DIR``, uploads them to
    ``gcs_bucket_name`` under ``prefix`` and removes the temporary files.

    Args:
        web_location: URL of the HTML page listing the PDFs.
        gcs_bucket_name: Destination bucket name.
        prefix: Object-name prefix inside the bucket.

    Returns:
        A dict with a single ``"message"`` key on success.

    Raises:
        HTTPException: Status 500 when the page cannot be fetched or does not
            answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import: only needed here

    # Create temporary directory if it doesn't exist
    os.makedirs(TEMP_DIR, exist_ok=True)

    # requests is synchronous: run it in the default executor so the event
    # loop is not blocked, and bound the wait with a timeout.
    loop = asyncio.get_running_loop()
    try:
        response = await loop.run_in_executor(
            None, lambda: requests.get(web_location, timeout=30)
        )
    except requests.RequestException as exc:
        raise HTTPException(status_code=500, detail="Failed to fetch web location") from exc
    if response.status_code != 200:
        raise HTTPException(status_code=500, detail="Failed to fetch web location")

    soup = BeautifulSoup(response.text, "html.parser")
    # Resolve each href against the final (post-redirect) page URL so that
    # relative, root-relative and absolute links all yield valid URLs.
    pdf_urls = [
        urljoin(response.url, link["href"])
        for link in soup.find_all("a", href=True)
        if link["href"].endswith(".pdf")
    ]

    try:
        await download_pdfs(pdf_urls)
        pdf_filenames = list_pdf_files()
        await upload_to_gcs(pdf_filenames, gcs_bucket_name, prefix)
    finally:
        # Clean up even on failure; otherwise PDFs left behind would be picked
        # up by list_pdf_files() and uploaded by the next request.
        # NOTE(review): TEMP_DIR is shared by all requests, so concurrent
        # calls can still see each other's files.
        cleanup_temporary_directory(list_pdf_files())
    logging.info("PDF download and upload process completed")
    return {"message": "PDF download and upload process completed successfully"}
