# Running ETL to Build the Document Corpus

This notebook walks through the process for setting up the corpus of Full Stack documents that the bot searches over.

In each case, we have to
- Extract data from its natural habitat, like YouTube or GitHub
- Transform it into a format that is useful for our purposes
- Load it into our database in that format

hence the acronym "ETL".

In [1]:
# Runs the repo's `make secrets` target through IPython's `!` shell escape.
# In the recorded run below it verified the Modal token and created the `mongodb-fsdl` secret.
!make secrets  # you'll need credentials for Mongo and Modal to run this

###
# 🥞: Loaded config from .env
###
python3 -m pip install -qqq -r requirements.txt
###
# 🥞: If you haven't gotten a Modal token yet, run make modal-token
###
Verifying token against [4;34mhttps://api.modal.com[0m
[32mToken verified successfully[0m
Token written to [35m/Users/candidosales/[0m[95m.modal.toml[0m
Created a new secret [32m'mongodb-fsdl'[0m with the keys [32m'MONGODB_USER'[0m, [32m'MONGODB_URI'[0m,
[32m'MONGODB_PASSWORD'[0m

Use it in to your Modal app using:

[40m                                                                                [0m
[92;40m@stub[0m[91;40m.[0m[97;40mfunction[0m[97;40m([0m[97;40msecret[0m[91;40m=[0m[97;40mmodal[0m[91;40m.[0m[97;40mSecret[0m[91;40m.[0m[97;40mfrom_name[0m[97;40m([0m[93;40m"[0m[93;40mmongodb-fsdl[0m[93;40m"[0m[97;40m)[0m[97;40m)[0m[40m                   [0m
[96;40mdef[0m[97;40m [0m[92;40msome_function[0m[97;40m([0m[97;40m)[0m[97;40m:[0m[40m                         

In [2]:
import json
from pathlib import Path
import pprint
from urllib.parse import parse_qs, urlparse

from IPython.display import IFrame, YouTubeVideo

from etl import markdown, pdfs, shared, videos
from etl.shared import display_modal_image

# Pretty-printer (2-space indent) reused by later cells to show document metadata and query results.
pp = pprint.PrettyPrinter(indent=2)

## PDFs: arXiv Papers

```bash
!modal run etl/pdfs.py --json-path data/llm-papers.json
```

In [3]:
# Show `shared.image` (presumably the Modal container image shared by the ETL steps — confirm in etl/shared.py).
display_modal_image(shared.image)

In [4]:
# Show `pdfs.image` (presumably the Modal container image for the PDF step — confirm in etl/pdfs.py).
display_modal_image(pdfs.image)

In [None]:
# Read the paper index and collect the URL of every PDF listed in it.
papers_path = Path("data") / "llm-papers.json"
pdf_infos = json.loads(papers_path.read_text())
pdf_urls = [pdf["url"] for pdf in pdf_infos]

with pdfs.stub.run():
    # extract_pdf runs on Modal, once per URL
    pdf_chunks = pdfs.extract_pdf.map(pdf_urls, return_exceptions=True)
    # each pdf creates multiple documents, so we flatten them into one list
    documents = shared.unchunk(pdf_chunks)

In [None]:
# Inspect the metadata of the first document in the flattened list.
pp.pprint(documents[0]["metadata"])

In [None]:
# Embed the first document's source URL inline, as a visual check that the
# metadata points back at the right paper.
IFrame(src=documents[0]["metadata"]["source"], width=800, height=400)

In [None]:
# Load step: send the documents to `shared.add_to_document_db` in batches.
with shared.stub.run():
    # we split our document list into 10 pieces, so that we don't open too many connections
    chunked_documents = shared.chunk_into(documents, 10)
    # list() drains the iterator returned by .map — presumably needed so every batch
    # is processed before the run context exits; confirm against Modal's .map semantics
    list(shared.add_to_document_db.map(chunked_documents))

In [None]:
# Sanity check: fetch a single stored arXiv document to confirm the load worked.
with shared.stub.run():
    # pull only arxiv papers; raw string so that `\.` reaches the regex engine
    # as an escaped dot instead of being an invalid Python escape sequence
    query = {"metadata.source": {"$regex": r"arxiv\.org", "$options": "i"}}
    # project out the text field, it can get large
    projection = {"text": 0}
    # get just one result to show it worked
    # NOTE(review): the later query cells pass collection="ask-fsdl" explicitly;
    # this one relies on the function's default — confirm they agree.
    result = shared.query_one_document_db.call(query, projection)

pp.pprint(result)

## Markdown Files: Lectures

```bash
!modal run etl/markdown.py --json-path data/lectures-2022.json
```

In [None]:
# Show `markdown.image` (presumably the Modal container image for the markdown step — confirm in etl/markdown.py).
display_modal_image(markdown.image)

In [None]:
# Read the lecture corpus description: two base URLs plus the list of lectures.
markdown_path = Path("data") / "lectures-2022.json"
markdown_corpus = json.loads(markdown_path.read_text())

website_url = markdown_corpus["website_url_base"]
md_url = markdown_corpus["md_url_base"]
lectures = markdown_corpus["lectures"]

# peek at the first entry to see what a lecture record looks like
lectures[0]

In [None]:
with markdown.stub.run():
    # the same base URLs are passed to every to_documents call
    url_kwargs = {"website_url": website_url, "md_url": md_url}
    lecture_chunks = markdown.to_documents.map(
        lectures,
        kwargs=url_kwargs,
        return_exceptions=True,
    )
    # each lecture creates multiple documents, so we flatten them into one list
    documents = shared.unchunk(lecture_chunks)

In [None]:
# Inspect the metadata of the second document in the flattened list.
pp.pprint(documents[1]["metadata"])

In [None]:
# Embed the second document's source URL inline, as a visual check that the
# metadata points back at the right lecture page.
IFrame(src=documents[1]["metadata"]["source"], width=800, height=400)

In [None]:
# Load step: send the lecture documents to `shared.add_to_document_db` in batches.
with shared.stub.run():
    # split into 10 pieces before mapping, as in the PDF section
    chunked_documents = shared.chunk_into(documents, 10)
    # list() drains the iterator returned by .map — presumably needed so every batch
    # is processed before the run context exits; confirm against Modal's .map semantics
    list(shared.add_to_document_db.map(chunked_documents))

In [None]:
# Sanity check: fetch a single stored lecture document to confirm the load worked.
with shared.stub.run():
    # project out the text field, it can get large
    projection = {"text": 0}
    # pull only lectures (case-insensitive match on the source field)
    query = {
        "metadata.source": {
            "$regex": "lecture",
            "$options": "i",
        }
    }
    # get just one result to show it worked
    result = shared.query_one_document_db.call(
        query,
        projection,
        collection="ask-fsdl",
    )

pp.pprint(result)

## Videos: YouTube Transcripts

In [None]:
# Show `videos.image` (presumably the Modal container image for the video step — confirm in etl/videos.py).
display_modal_image(videos.image)

In [None]:
# Read the video index and collect the id of every video listed in it.
videos_path = Path("data") / "videos.json"
video_infos = json.loads(videos_path.read_text())
video_ids = [video["id"] for video in video_infos]

# peek at the first entry to see what a video record looks like
video_infos[0]

In [None]:
with videos.stub.run():
    # extract_subtitles runs on Modal, once per video id
    subtitle_chunks = videos.extract_subtitles.map(
        video_ids,
        return_exceptions=True,
    )
    # each video creates multiple documents, so we flatten them into one list
    documents = shared.unchunk(subtitle_chunks)

In [None]:
# Inspect the metadata of the second document in the flattened list.
pp.pprint(documents[1]["metadata"])

In [None]:
# The source URL carries the video id in its `v` query parameter and the start
# offset in `t`, written as seconds with a trailing "s". Parsing the query string
# (rather than splitting on "?v=" / "&t=") keeps this working if the parameters
# are reordered or extra ones are appended.
source_url = documents[1]["metadata"]["source"]
query_params = parse_qs(urlparse(source_url).query)
id_str, time_str = query_params["v"][0], query_params["t"][0]

# Embed the video, starting playback at the document's offset.
YouTubeVideo(id_str, start=int(time_str.strip("s")), width=800, height=400)

In [None]:
# Load step: send the video documents to `shared.add_to_document_db` in batches.
with shared.stub.run():
    # split into 10 pieces before mapping, as in the PDF section
    chunked_documents = shared.chunk_into(documents, 10)
    # list() drains the iterator returned by .map — presumably needed so every batch
    # is processed before the run context exits; confirm against Modal's .map semantics
    list(shared.add_to_document_db.map(chunked_documents))

In [None]:
# Sanity check: fetch a single stored video document to confirm the load worked.
with shared.stub.run():
    # project out the text field, it can get large
    projection = {"text": 0}
    # pull only YouTube videos (case-insensitive match on the source field)
    query = {
        "metadata.source": {
            "$regex": "youtube",
            "$options": "i",
        }
    }
    # get just one result to show it worked
    result = shared.query_one_document_db.call(
        query,
        projection,
        collection="ask-fsdl",
    )

pp.pprint(result)