In [36]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
import re
from tqdm.auto import tqdm

# Load the first-page URL of every textbook to scrape (one URL per line).
# The encoding is pinned so the read does not depend on the platform default,
# and blank lines are dropped so they are not later treated as URLs.
with open('../bin/textbook-first-pages.txt', 'r', encoding='utf-8') as f:
    textbook_first_pages = [line.strip() for line in f.read().splitlines() if line.strip()]

In [7]:
def key_terms(soup):
    """Return the href of the first anchor whose text is exactly "Next".

    Only anchors that carry a non-empty ``href`` are considered. Returns
    ``None`` when no such anchor exists.

    NOTE(review): despite its name, this function does not extract key terms;
    its body looks up the "Next" link just like ``next_section`` does.
    Confirm whether it is still needed.
    """
    next_hrefs = (
        anchor.get('href')
        for anchor in soup.find_all('a', href=True)
        if anchor.text == "Next" and anchor.get('href')
    )
    return next(next_hrefs, None)

def next_section(soup):
    """Return the href of the page's "Next" link, or ``None`` if there is none.

    Args:
        soup: A parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        The ``href`` of the first anchor whose text is exactly "Next" and whose
        ``href`` is non-empty; ``None`` when no anchor qualifies.
    """
    # The leftover debugging print of every anchor was removed: it flooded the
    # cell output with one line per link on every page visited.
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href')
        if anchor.text == "Next" and href:
            return href
    return None

def section_generator(url):
    """Follow the chain of "Next" links starting at ``url``, printing each hop.

    Each page is fetched and parsed, its "Next" link is looked up with
    ``next_section``, and the resolved URL of the following page is printed
    (``None`` is printed once a page has no "Next" link, which ends the walk).

    NOTE(review): despite the name, this function does not yield anything; it
    only prints and returns ``None``.

    Args:
        url: Absolute URL of the first page to visit.
    """
    # Local import so this block stays self-contained within the notebook.
    from urllib.parse import urljoin

    while url:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        next_href = next_section(soup)
        # Page links can be relative (StaxBook has to join hrefs onto a base
        # URL), so resolve against the current page before requesting it.
        # urljoin leaves absolute hrefs unchanged.
        url = urljoin(url, next_href) if next_href else None
        print(url)

None


## Class for Scraping OpenStax

### Assumptions and Shortcuts
 - Ignoring any pages whose names do not start with digit-dash-digit, so '1-1-introduction' is fine but '1-Introduction' and 'Index' are ignored.
 - Ignoring linebreaks. While these might be informative of document structure, I do not want to deal with them right now.

In [71]:
class StaxBook:
    """Scraper for one OpenStax textbook, identified by the URL of its first page.

    Creating an instance fetches the first page and records every linked
    section whose href starts with digits-dash-digits (e.g. '1-1-introduction').
    ``scrape_all`` then downloads each section and stores its text and the
    character spans of its key terms.

    Attributes:
        first_page_url: The URL passed to the constructor.
        base_url: Everything before '/pages/' in ``first_page_url``.
        title: The last path component of ``base_url``.
        sections: List of dicts with keys 'section' and 'url'; ``scrape_all``
            adds 'text' and 'labels' to each dict.
    """

    def __init__(self, first_page_url):
        self.first_page_url = first_page_url
        # A non-functional URL used only as a prefix for building section URLs.
        self.base_url = first_page_url.split('/pages/')[0]
        self.title = self.base_url.split('/')[-1]
        self.sections = []
        self.get_section_urls()

    def get_section_urls(self):
        """Fetch the first page and append every numbered section to ``self.sections``.

        Only links whose href begins with digits-dash-digits are kept; the
        matched prefix (e.g. '1-1') becomes the 'section' value.
        """
        soup = BeautifulSoup(requests.get(self.first_page_url).text,
                             features='html.parser')

        # NOTE(review): this class string looks auto-generated by the site's
        # styling toolchain and may change when the site is rebuilt; confirm.
        for a in soup.find_all('a',
                               class_='styled__ContentLink-sc-18yti3s-1 cRIWDW',
                               href=True):
            match = re.search(r'^\d+-\d+', a['href'])
            if match:
                self.sections.append({
                    'section': match[0],
                    'url': '/'.join([self.base_url, 'pages', a['href']]),
                })

    def scrape_one(self, soup):
        """Flatten one parsed section into text pieces and key-term spans.

        Every non-whitespace string under ``soup`` has its whitespace runs
        collapsed to single spaces and gets one trailing space appended, so
        that joining the pieces yields readable text.

        Args:
            soup: Parsed element exposing ``.strings``.

        Returns:
            ``(text_elements, keyterm_spans)``. ``keyterm_spans`` holds
            ``(start, end)`` offsets into ``''.join(text_elements)`` for every
            string whose direct parent has ``data-type="term"``. Spans cover
            the term only, excluding any surrounding spaces.
        """
        text_elements = []
        keyterm_spans = []
        total_str_len = 0

        for e in soup.strings:
            if not isinstance(e, str) or e.isspace():
                continue

            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            normalized = re.sub(r'\s+', ' ', e.text)

            if e.parent.attrs.get('data-type', None) == 'term':
                # Exclude whitespace that the markup put inside the term
                # element so the span slices out exactly the term.
                leading = len(normalized) - len(normalized.lstrip(' '))
                start = total_str_len + leading
                keyterm_spans.append((start, start + len(normalized.strip(' '))))

            text = normalized + ' '
            text_elements.append(text)
            total_str_len += len(text)

        return text_elements, keyterm_spans

    def scrape_all(self):
        """Download every section and store its 'text' and 'labels' in place.

        A page without a ``<div id="main-content">`` is recorded with empty
        text and no labels instead of aborting the whole run.
        """
        for section in tqdm(self.sections):
            html = requests.get(section['url']).content.decode('utf-8')
            main_content = BeautifulSoup(html, features='html5lib').find(
                'div', id='main-content')

            if main_content is None:
                section['text'] = ''
                section['labels'] = []
                continue

            section_text, section_labels = self.scrape_one(main_content)
            section['text'] = ''.join(section_text)
            section['labels'] = section_labels

In [72]:
# Build one StaxBook per first-page URL. Constructing a StaxBook fetches that
# page to collect the book's section URLs.
stax_books = list(map(StaxBook, textbook_first_pages))

In [77]:
# Download and parse every section of every book; this fills in each
# section's 'text' and 'labels' entries.
for stax_book in stax_books:
    stax_book.scrape_all()

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

In [163]:
import pandas as pd

def make_book_frame(texts):
    """Combine the scraped sections of several books into one DataFrame.

    Args:
        texts: Iterable of objects with a ``sections`` attribute: a list of
            dicts holding 'section', 'url', 'text' and 'labels', where
            'labels' is a list of ``(start, end)`` offsets into 'text'.

    Returns:
        A DataFrame with one row per section and the columns 'book',
        'section', 'url', 'text', 'terms' and 'term_indices'. 'book' is the
        fifth '/'-separated piece of the URL, 'terms' holds the substrings of
        'text' selected by the spans, and 'term_indices' is the original
        'labels' column. An empty frame with these columns is returned when
        there are no sections at all.
    """
    columns = ['book', 'section', 'url', 'text', 'terms', 'term_indices']

    # Drop books without sections; with nothing left, pd.concat would raise.
    book_frames = [pd.DataFrame(book.sections) for book in texts]
    book_frames = [frame for frame in book_frames if not frame.empty]
    if not book_frames:
        return pd.DataFrame(columns=columns)

    sections = pd.concat(book_frames).reset_index(drop=True)

    # Slice each key term out of its section text. An explicit object Series
    # keeps one list per row regardless of the list lengths.
    terms = pd.Series(
        [[text[st:end] for (st, end) in spans]
         for text, spans in zip(sections['text'], sections['labels'])],
        index=sections.index,
        dtype=object,
    )

    df = (sections
          .assign(book=lambda x: x['url'].str.split('/', expand=True)[4],
                  terms=terms)
          .rename(columns={'labels': 'term_indices'})
         )
    return df[columns]
        
# One row per scraped section across all books.
book_frame = make_book_frame(stax_books)

In [169]:
# Preview a random sample of sections. The seed is fixed so that re-running
# the cell shows the same rows.
book_frame.sample(15, random_state=0)

Unnamed: 0,book,section,url,text,terms,term_indices
1830,college-physics-2e,13-3,https://openstax.org/books/college-physics-2e/...,Learning Objectives By the end of this section...,"[ideal gas law, Boltzmann constant, mole, Avog...","[(3548, 3561), (3811, 3829), (9307, 9311), (95..."
2167,college-physics-ap-courses-2e,25-3,https://openstax.org/books/college-physics-ap-...,Learning Objectives By the end of this section...,"[refraction, index of refraction, law of refra...","[(595, 605), (4931, 4950), (10033, 10050)]"
3192,principles-economics-2e,16-2,https://openstax.org/books/principles-economic...,Learning Objectives By the end of this sectio...,"[Insurance, premiums, private insurance, Unemp...","[(293, 302), (507, 515), (4826, 4843), (5098, ..."
2126,college-physics-ap-courses-2e,20-2,https://openstax.org/books/college-physics-ap-...,Learning Objectives By the end of this section...,"[Ohm’s law, resistance, ohmic, ohm, simple cir...","[(1021, 1030), (1447, 1457), (2076, 2081), (24..."
3631,psychology-2e,7-3,https://openstax.org/books/psychology-2e/pages...,Learning Objectives By the end of this section...,"[problem-solving strategy, trial and error, al...","[(1250, 1274), (1445, 1460), (2491, 2500), (32..."
146,calculus-volume-2,3-7,https://openstax.org/books/calculus-volume-2/p...,Learning Objectives 3.7.1 Evaluate an integral...,"[improper integral, traffic accidents, probabi...","[(2226, 2243), (5896, 5913), (6327, 6338), (21..."
1990,college-physics-2e,34-5,https://openstax.org/books/college-physics-2e/...,Learning Objectives By the end of this section...,"[complexity, Chaos]","[(841, 851), (4845, 4850)]"
803,statistics,11-3,https://openstax.org/books/statistics/pages/11...,Tests of independence involve using a conting...,"[contingency table, test of independence]","[(39, 56), (111, 131)]"
3394,principles-macroeconomics-ap-courses-2e,7-3,https://openstax.org/books/principles-macroeco...,Learning Objectives By the end of this section...,"[macro economy, cyclical unemployment, labor m...","[(1049, 1062), (1610, 1631), (1857, 1869), (33..."
4317,introduction-intellectual-property,1-4,https://openstax.org/books/introduction-intell...,Learning Objectives After completing this sect...,[],[]


In [167]:
# Write the combined sections to a parquet file, without the DataFrame index.
book_frame.to_parquet('../data/open-stax-texts-terms.parquet', index=False)