<a href="https://colab.research.google.com/github/avocadopelvis/nagamese-english/blob/main/scrape/bible.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# load libraries
import requests
from bs4 import BeautifulSoup as bs

import csv
from itertools import zip_longest

import re
import pandas as pd

In [None]:
# load webpage content
# A timeout keeps a stalled connection from hanging the cell indefinitely, and
# raise_for_status() stops the notebook from silently parsing an HTTP error
# page as if it were the Bible text.
naga = requests.get("https://cdn.door43.org/u/Door43-Catalog/nag_isv/master/print_all.html", timeout=60)
naga.raise_for_status()
eng = requests.get("https://cdn.door43.org/u/WycliffeAssociates/en_ulb/3fe47367e0/print_all.html", timeout=60)
eng.raise_for_status()

# convert to a beautiful soup object
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and bs4 does not warn about guessing).
soup_naga = bs(naga.content, "html.parser")
soup_eng = bs(eng.content, "html.parser")

# print(soup_naga.prettify())

In [None]:
# get text from a book
def text(soup, div):
  """Return the cleaned, single-line text of one book.

  Args:
    soup: beautiful soup object for the whole page.
    div: id attribute of the book's <div> | pass as string.

  Returns:
    The book's text with heading elements, footnote blocks and [bracketed]
    spans removed, every run of whitespace collapsed to one space, and no
    leading or trailing whitespace.

  Raises:
    ValueError: if the page contains no <div> with the given id.

  Note:
    Headings and footnote divs are removed from `soup` itself, so the object
    passed in is modified by this call.
  """
  book = soup.find("div", attrs = {"id": div})
  if book is None:
    raise ValueError(f"no <div> with id {div!r} found in the page")

  # Drop each heading element outright. Clearing only `.string` would miss
  # headings that have more than one child node, letting their text (and any
  # digits in it) leak into the corpus.
  for heading_tag in book.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
    heading_tag.decompose()

  # Remove all div elements with class="footnotes"
  for footnote in book.find_all('div', class_='footnotes'):
    footnote.decompose()

  # Extract remaining text
  raw = book.get_text(separator=' ')

  # Remove square brackets and anything between them. This runs before the
  # whitespace collapse so a removed span does not leave a double space behind.
  without_brackets = re.sub(r'\[[^\]]*\]', '', raw)

  # Replace consecutive whitespace characters with a single space
  collapsed = re.sub(r'\s+', ' ', without_brackets)

  return collapsed.strip()

# splits the corpus based on verse number
def split_corpus(corpus):
  """Break a body of text into pieces at every run of digits.

  Args:
    corpus: body of text.

  Returns:
    List of the non-empty pieces found between digit runs, each stripped of
    surrounding whitespace. Any digit run is a split point, so a numeral
    inside the text splits it just like a verse number does.
  """
  verses = []
  for chunk in re.compile(r'\d+').split(corpus):
    trimmed = chunk.strip()
    # pieces that are empty or whitespace-only are skipped
    if trimmed:
      verses.append(trimmed)
  return verses

# save eng-naga text as csv
def save_csv(path, header, eng_verses, naga_verses):
  """Write two parallel verse lists to a two-column CSV file.

  Args:
    path: file path | pass as string.
    header: names of the column headers | pass as list containing headers.
    eng_verses: values for the first column.
    naga_verses: values for the second column.

  Returns:
    None. Prints a confirmation message once the file has been written.

  The lists are paired by position; when one is shorter, its missing cells
  are written as empty strings.
  """
  rows = zip_longest(eng_verses, naga_verses, fillvalue = "")

  # The encoding is pinned to UTF-8 so non-ASCII verse text is written the
  # same way regardless of the platform's default encoding.
  with open(path, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Write the header row
    writer.writerow(header)

    # Write one row per verse pair
    writer.writerows(rows)

  # Report success only after the file has been closed, and therefore flushed.
  print("CSV file created successfully.")

In [None]:
# get div names
# keep every <div> whose id attribute is set and contains '.html'
divs = soup_naga.find_all("div", id = lambda div_id: div_id and '.html' in div_id)
id_names = list(map(lambda tag: tag['id'], divs))

# show the ids, then how many there are, one per line
print(id_names, len(id_names), sep="\n")

['41-MAT.html', '42-MRK.html', '43-LUK.html', '44-JHN.html', '45-ACT.html', '46-ROM.html', '47-1CO.html', '48-2CO.html', '49-GAL.html', '50-EPH.html', '51-PHP.html', '52-COL.html', '53-1TH.html', '54-2TH.html', '55-1TI.html', '56-2TI.html', '57-TIT.html', '58-PHM.html', '59-HEB.html', '60-JAS.html', '61-1PE.html', '62-2PE.html', '63-1JN.html', '64-2JN.html', '65-3JN.html', '66-JUD.html', '67-REV.html']
27


The New Testament contains 27 books.

In [None]:
# extract one book from both translations and export it as a parallel CSV
div = '48-2CO.html'
eng = text(soup_eng, div)
naga = text(soup_naga, div)

eng_verses = split_corpus(eng)
naga_verses = split_corpus(naga)

print(len(eng_verses), len(naga_verses))

# save_csv pairs the two lists by position and pads the shorter one with
# blanks, so unequal counts mean the tail rows are padded and earlier rows may
# be shifted against each other. Make that visible instead of exporting quietly.
if len(eng_verses) != len(naga_verses):
  print(f"WARNING: verse counts differ for {div} "
        f"({len(eng_verses)} English vs {len(naga_verses)} Nagamese); "
        "rows in the CSV may not line up verse-for-verse.")

# save corpus
# with open("text.txt", 'w') as file:
#     file.write(eng)

# save as csv
# NOTE(review): this path presumably needs Google Drive mounted at
# /content/drive; no mount call is visible in this notebook, so confirm.
filename = "second-corinthians"
save_csv(f"/content/drive/MyDrive/MTP/{filename}.csv", ["English", "Nagamese"], eng_verses, naga_verses)

257 256
CSV file created successfully.
