In [2]:
from bs4 import BeautifulSoup
import requests
import re
import json
from google.colab import drive

# ✅ Mount Google Drive
drive.mount('/content/drive')

# ✅ Target folder in your Drive
save_path = "/content/drive/MyDrive/bens_bible/jefferson_verses.json"

# ✅ Pages to scrape
pages = [15, 30, 45, 60, 75, 82]
base_url = "https://thejeffersonbible.com/index.php/tot{}"

all_verses = []

def clean_and_expand(raw):
    """Convert a raw opentjblink string into a clean list of verses."""
    verses = []
    parts = [p.strip() for p in raw.split(";") if p.strip()]
    for part in parts:
        part = part.replace("%20", " ").strip()
        if "," in part:
            book = part.split()[0].capitalize()
            chapter = part.split()[1].split(":")[0]
            refs = part.split()[1].split(":")[1].split(",")
            for r in refs:
                r = r.strip()
                if ":" in r:
                    verses.append(f"{book} {r}")
                else:
                    verses.append(f"{book} {chapter}:{r}")
        else:
            tokens = part.split()
            tokens[0] = tokens[0].capitalize()
            verses.append(" ".join(tokens))
    return verses

# ✅ Loop through all pages
for page in pages:
    url = base_url.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    for a in soup.find_all("a", href=True):
        if "javascript:opentjblink" in a["href"]:
            match = re.search(r"opentjblink\('([^']+)'\)", a["href"])
            if match:
                all_verses.extend(clean_and_expand(match.group(1)))

# ✅ Save as JSON in your Google Drive
with open(save_path, "w", encoding="utf-8") as f:
    json.dump(all_verses, f, indent=2)

print(f"✅ Saved {len(all_verses)} verses to {save_path}")


Mounted at /content/drive
✅ Saved 220 verses to /content/drive/MyDrive/bens_bible/jefferson_verses.json


In [15]:
import json
import requests
from collections import defaultdict
import re

# ✅ Paths
json_path = "/content/drive/MyDrive/bens_bible/jefferson_verses.json"
output_path = "/content/drive/MyDrive/bens_bible/jefferson_bible_web_offline.txt"

# ✅ Normalize book names
BOOK_MAP = {
    "Mathew": "Matthew",
    "Matthew": "Matthew",
    "Mark": "Mark",
    "Luke": "Luke",
    "John": "John"
}

# ✅ Load Jefferson references
with open(json_path, "r", encoding="utf-8") as f:
    jeff_refs = json.load(f)
print(f"Loaded {len(jeff_refs)} Jefferson references")

# ✅ Download and parse TehShrike WEB JSON (including poetry lines)
base_url = "https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/"
books = ["Matthew", "Mark", "Luke", "John"]

verse_map = defaultdict(lambda: defaultdict(dict))

for book in books:
    url = f"{base_url}{book.lower()}.json"
    print(f"Downloading {url} ...")
    r = requests.get(url)
    book_data = r.json()

    temp_verses = defaultdict(lambda: defaultdict(list))

    for entry in book_data:
        if entry.get("type") in ["paragraph text", "verse", "paragraph end", "line text"]:
            if "chapterNumber" not in entry or "verseNumber" not in entry:
                continue
            c = int(entry["chapterNumber"])
            v = int(entry["verseNumber"])
            value = entry["value"].strip()
            if value:  # skip completely blank lines
                temp_verses[c][v].append(value)

    # Merge multiple lines for each verse
    for c, verses in temp_verses.items():
        for v, parts in verses.items():
            verse_map[book][c][v] = " ".join(parts).strip()

print("✅ Finished loading WEB Bible locally.")

# ✅ Build Jefferson Bible text
output_lines = []

for ref in jeff_refs:
    match = re.match(r"(\w+) (\d+):(\d+)(?:-(\d+))?", ref)
    if not match:
        continue
    raw_book = match.group(1)
    book = BOOK_MAP.get(raw_book, raw_book).capitalize()
    chap = int(match.group(2))
    start = int(match.group(3))
    end = int(match.group(4)) if match.group(4) else start

    for vnum in range(start, end + 1):
        text = verse_map[book].get(chap, {}).get(vnum)
        if text and text.strip():
            output_lines.append(text)

# ✅ Save clean output to Drive
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print(f"✅ Jefferson Bible (WEB) saved to {output_path} with {len(output_lines)} lines.")


Loaded 220 Jefferson references
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/matthew.json ...
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/mark.json ...
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/luke.json ...
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/john.json ...
✅ Finished loading WEB Bible locally.
✅ Jefferson Bible (WEB) saved to /content/drive/MyDrive/bens_bible/jefferson_bible_web_offline.txt with 1086 lines.


In [16]:
import re

input_path = "/content/drive/MyDrive/bens_bible/jefferson_bible_web_offline.txt"
output_path = "/content/drive/MyDrive/bens_bible/jefferson_bible_latex.tex"

# Simple LaTeX escaper
def escape_latex(text):
    specials = {
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
        "\\": r"\textbackslash{}"
    }
    for k, v in specials.items():
        text = text.replace(k, v)
    return text

lines = []
with open(input_path, "r", encoding="utf-8") as f:
    verses = [escape_latex(line.strip()) for line in f if line.strip()]

page_counter = 1
new_page = True

for i, verse in enumerate(verses):
    if new_page:
        lines.append("\\chapterornament")
        lines.append(f"\\section*{{Page {page_counter}}}\n")
        # Add drop cap for first letter
        first_letter = verse[0]
        rest = verse[1:]
        lines.append(f"\\lettrine{{{first_letter}}}{{{rest}}}\n")
        new_page = False
    else:
        lines.append(verse + "\n")

    # Example rule: every ~20 verses = new page (tweak later to Jefferson's actual pages)
    if (i + 1) % 20 == 0:
        page_counter += 1
        new_page = True
        lines.append("\n")

# Write to file
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(f"✅ LaTeX file saved to {output_path}")


✅ LaTeX file saved to /content/drive/MyDrive/bens_bible/jefferson_bible_latex.tex


In [17]:
import json
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re

# ---- SETTINGS ----
pages = [15, 30, 45, 60, 75, 82]
base_html = "https://thejeffersonbible.com/index.php/tot{}"
base_web = "https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/"
books = ["Matthew", "Mark", "Luke", "John"]

output_path = "/content/drive/MyDrive/bens_bible/jefferson_bible_latex.tex"

BOOK_MAP = {
    "Mathew": "Matthew",
    "Matthew": "Matthew",
    "Mark": "Mark",
    "Luke": "Luke",
    "John": "John"
}

# ---- STEP 1: SCRAPE DESCRIPTIONS & VERSES ----
sections = []

for p in pages:
    url = base_html.format(p)
    print(f"Scraping {url} ...")
    soup = BeautifulSoup(requests.get(url).text, "html.parser")

    for row in soup.find_all("tr"):
        desc_tag = row.find("td", class_="column-2")
        link_tag = row.find("a", href=True)

        if not desc_tag or not link_tag:
            continue

        description = " ".join(desc_tag.text.strip().split())
        href = re.search(r"opentjblink\('([^']+)'\)", link_tag["href"])
        if not href:
            continue

        # Expand verses
        raw = href.group(1)
        raw = raw.replace("%20", " ")
        parts = [p.strip() for p in raw.split(";") if p.strip()]
        verses = []

        for part in parts:
            if "," in part:
                book = part.split()[0].capitalize()
                chapter = part.split()[1].split(":")[0]
                for v in part.split()[1].split(":")[1].split(","):
                    v = v.strip()
                    if ":" in v:
                        verses.append(f"{book} {v}")
                    else:
                        verses.append(f"{book} {chapter}:{v}")
            else:
                tokens = part.split()
                tokens[0] = tokens[0].capitalize()
                verses.append(" ".join(tokens))

        sections.append({"description": description, "verses": verses})

print(f"✅ Found {len(sections)} sections.")

# ---- STEP 2: LOAD WEB BIBLE (TehShrike JSON) ----
verse_map = defaultdict(lambda: defaultdict(dict))

for book in books:
    url = f"{base_web}{book.lower()}.json"
    print(f"Downloading {url} ...")
    data = requests.get(url).json()

    temp = defaultdict(lambda: defaultdict(list))
    for entry in data:
        if entry.get("type") in ["paragraph text", "verse", "paragraph end", "line text"]:
            if "chapterNumber" not in entry or "verseNumber" not in entry:
                continue
            c = int(entry["chapterNumber"])
            v = int(entry["verseNumber"])
            text = entry["value"].strip()
            if text:
                temp[c][v].append(text)

    for c, verses in temp.items():
        for v, parts in verses.items():
            verse_map[book][c][v] = " ".join(parts).strip()

print("✅ WEB Bible loaded.")

# ---- STEP 3: BUILD LATEX OUTPUT ----
def escape_latex(text):
    specials = {
        "&": r"\&", "%": r"\%", "$": r"\$", "#": r"\#",
        "_": r"\_", "{": r"\{", "}": r"\}", "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}", "\\": r"\textbackslash{}"
    }
    for k, v in specials.items():
        text = text.replace(k, v)
    return text

latex_lines = []

for sec in sections:
    latex_lines.append("\\chapterornament")
    latex_lines.append(f"\\section*{{{escape_latex(sec['description'])}}}\n")

    first = True
    for ref in sec["verses"]:
        match = re.match(r"(\w+) (\d+):(\d+)(?:-(\d+))?", ref)
        if not match:
            continue
        raw_book = match.group(1)
        book = BOOK_MAP.get(raw_book, raw_book).capitalize()
        chap = int(match.group(2))
        start = int(match.group(3))
        end = int(match.group(4)) if match.group(4) else start

        for vnum in range(start, end + 1):
            text = verse_map[book].get(chap, {}).get(vnum)
            if not text:
                continue
            text = escape_latex(text)
            if first:
                latex_lines.append(f"\\lettrine{{{text[0]}}}{{{text[1:]}}}\n")
                first = False
            else:
                latex_lines.append(text + "\n")

# ---- STEP 4: SAVE LATEX ----
with open(output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(latex_lines))

print(f"✅ LaTeX Jefferson Bible saved to {output_path}")


Scraping https://thejeffersonbible.com/index.php/tot15 ...
Scraping https://thejeffersonbible.com/index.php/tot30 ...
Scraping https://thejeffersonbible.com/index.php/tot45 ...
Scraping https://thejeffersonbible.com/index.php/tot60 ...
Scraping https://thejeffersonbible.com/index.php/tot75 ...
Scraping https://thejeffersonbible.com/index.php/tot82 ...
✅ Found 82 sections.
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/matthew.json ...
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/mark.json ...
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/luke.json ...
Downloading https://raw.githubusercontent.com/TehShrike/world-english-bible/refs/heads/master/json/john.json ...
✅ WEB Bible loaded.
✅ LaTeX Jefferson Bible saved to /content/drive/MyDrive/bens_bible/jefferson_bible_latex.tex


In [3]:
import json

# Load the same JSON or scraped data that created the LaTeX
json_path = "/content/drive/MyDrive/bens_bible/jefferson_verses.json"

with open(json_path, "r", encoding="utf-8") as f:
    jeff_sections = json.load(f)

print(f"Total sections: {len(jeff_sections)}\n")

for i, section in enumerate(jeff_sections, 1):
    desc = section.get("description", "").strip()
    print(f"{i}. {desc}")


Total sections: 220



AttributeError: 'str' object has no attribute 'get'

In [4]:
import requests
from bs4 import BeautifulSoup

pages = [15, 30, 45, 60, 75, 82]
base_html = "https://thejeffersonbible.com/index.php/tot{}"

descriptions = []

for p in pages:
    soup = BeautifulSoup(requests.get(base_html.format(p)).text, "html.parser")
    for row in soup.find_all("tr"):
        desc_tag = row.find("td", class_="column-2")
        if desc_tag:
            desc = " ".join(desc_tag.text.strip().split())
            if desc:
                descriptions.append(desc)

print(f"Total descriptions: {len(descriptions)}\n")
for i, d in enumerate(descriptions, 1):
    print(f"{i}. {d}")


Total descriptions: 66

1. Mary and Joseph travel to Bethlehem, where Jesus is born. he is circumcised & named & they return to Nazareth. at 12 years of age he accompanies his parents to Jerusalem and returns.
2. John baptizes in Jordan. Jesus is baptized at 30 years of age.
3. drives the traders out of the temple. he baptises but retires into Galilee on the death of John.
4. he teaches in the Synagogue.
5. explains the Sabbath. call of his disciples.
6. the Sermon on the Mount. exhorts.
7. a woman anointeth him.
8. precepts.
9. parable of the fig tree.
10. precepts
11. parable of the fig tree.
12. precepts
13. parable of the Sower.
14. precepts parable of the Tares.
15. precepts parable of the Tares.
16. precepts. parable of new wine in old bottles.
17. precepts. parable of new wine in old bottles.
18. a prophet hath no honor in his own country.
19. mission, instruction, return of apostles.
20. precepts
21. precepts
22. parable of the wicked servant.
23. parable of the wicked servant.