In [1]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
import re
from tqdm.auto import tqdm

# One first-page URL per line. Surrounding whitespace is stripped and blank
# lines are skipped so that an empty string is never handed to StaxBook as a URL.
with open('../bin/textbook-first-pages.txt', 'r', encoding='utf-8') as f:
    textbook_first_pages = [line.strip() for line in f if line.strip()]

## Class for Scraping OpenStax

### Assumptions and Shortcuts
 - Ignoring any pages whose link does not start with digits-dash-digits, so '1-1-introduction' is kept but '1-Introduction' and 'Index' are ignored.
 - Ignoring linebreaks. While these might be informative of document structure, I do not want to deal with them right now.

In [2]:
class StaxBook:
    """Scraper for a single OpenStax textbook, seeded with its first-page URL.

    Creating an instance issues one HTTP request to collect the section links
    (see ``get_section_urls``). ``scrape_all`` then downloads every section and
    stores its text and key-term spans on the entries of ``self.sections``.
    """

    # Exact ``class`` attribute of the table-of-contents links that are collected.
    # NOTE(review): looks like a build-generated class name -- confirm it is
    # still current if no sections are found.
    TOC_LINK_CLASS = 'styled__ContentLink-sc-18yti3s-1 cRIWDW'

    # Seconds to wait on a request before giving up instead of hanging forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, first_page_url):
        """Store the URLs derived from ``first_page_url`` and load the section list.

        Parameters
        ----------
        first_page_url : str
            URL of a page of the book; it must contain ``/pages/``.
        """
        self.first_page_url = first_page_url
        self.base_url = first_page_url.split('/pages/')[0] # a non-functional URL used to build other URLs
        self.title = self.base_url.split('/')[-1]
        self.sections = []  # one dict per section: 'section', 'url', later 'text' and 'labels'
        self.get_section_urls()

    def get_section_urls(self):
        """Append a ``{'section', 'url'}`` dict to ``self.sections`` per numbered link.

        Only links whose ``href`` starts with digits-dash-digits are kept.

        Raises
        ------
        requests.HTTPError
            If the first page responds with an error status.
        """
        response = requests.get(self.first_page_url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, features='html.parser')

        for a in soup.find_all('a', class_=self.TOC_LINK_CLASS, href=True):
            match = re.search(r'^\d+-\d+', a['href'])
            if match:
                self.sections.append({
                    'section': match[0],
                    'url': '/'.join([self.base_url, 'pages', a['href']]),
                })

    def scrape_one(self, soup):
        """Extract the text pieces and key-term spans from one parsed section.

        Parameters
        ----------
        soup : object with a ``strings`` iterable, or None
            The parsed content element. ``None`` yields two empty lists.

        Returns
        -------
        (list of str, list of tuple)
            The whitespace-normalised text pieces, each ending in one space,
            and ``(start, end, css_class)`` spans that index into the
            concatenation of those pieces. ``css_class`` is the first class of
            the enclosing ``data-type="term"`` element, or ``None`` when that
            element has no class.
        """
        text_elements = []
        keyterm_spans = []
        total_str_len = 0

        if soup is None:
            return text_elements, keyterm_spans

        for string in soup.strings:
            if not isinstance(string, str) or string.isspace():
                continue

            text = re.sub(r'\s+', ' ', string.text) + ' ' # add a space at the end of each element
            parent_attrs = string.parent.attrs
            if parent_attrs.get('data-type', None) == 'term':
                # A missing or empty class list means "no label"; fall back to
                # [None] so the span is still recorded rather than dropped.
                classes = parent_attrs.get('class') or [None]
                keyterm_spans.append((
                    total_str_len,
                    total_str_len + len(text) - 1, # -1 because we do not want the space we added earlier
                    classes[0],
                ))
            text_elements.append(text)
            total_str_len += len(text)

        return text_elements, keyterm_spans

    def scrape_all(self):
        """Download every section and store its ``'text'`` and ``'labels'``.

        A section whose page has no ``div#main-content`` gets empty text and
        labels, and a message naming its URL is printed.

        Raises
        ------
        requests.HTTPError
            If a section page responds with an error status.
        """
        for section in tqdm(self.sections, leave=False):
            response = requests.get(section['url'], timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content.decode('utf-8'),
                                 features='html5lib').find('div', id='main-content')
            if soup is None:
                print(f"No main-content found at {section['url']}; storing empty text")
            section_text, section_labels = self.scrape_one(soup)
            section['text'] = ''.join(section_text)
            section['labels'] = section_labels

In [3]:
# Building a StaxBook requests that book's first page to collect its section links.
stax_books = [
    StaxBook(first_page_url)
    for first_page_url in textbook_first_pages
]

In [4]:
# Download and parse every section of every book. The outer bar counts books;
# scrape_all draws its own per-section bar for each one.
for book in tqdm(stax_books):
    book.scrape_all()

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

In [135]:
import pandas as pd

def make_book_frame(texts):
    """Flatten scraped books into one DataFrame with one row per section.

    Parameters
    ----------
    texts : iterable
        Objects exposing a ``sections`` list of dicts with the keys
        ``section``, ``url``, ``text`` and ``labels``, where ``labels`` holds
        ``(start, end, css_class)`` tuples.

    Returns
    -------
    pandas.DataFrame
        Columns ``book``, ``section``, ``url``, ``text``, ``bold_terms``,
        ``no_bold_terms``, ``bold_term_offsets`` and ``no_bold_term_offsets``.
        The frame is empty (same columns) when no sections are available.
    """
    columns = ['book', 'section', 'url', 'text', 'bold_terms', 'no_bold_terms',
               'bold_term_offsets', 'no_bold_term_offsets']
    # A term without a CSS class is a bold term. Accept both spellings of
    # "no class" (None and the empty string) so neither is silently missed.
    bold_labels = (None, '')
    no_bold_labels = ('no-emphasis',)

    def select_offsets(labels, wanted):
        # Keep the (start, end) part of every span whose label is wanted.
        return [(st, end) for (st, end, label) in labels if label in wanted]

    def slice_terms(frame, offsets_column):
        # Cut each span out of its own row's text.
        return pd.Series(
            [[text[st:end] for (st, end) in offsets]
             for text, offsets in zip(frame['text'], frame[offsets_column])],
            index=frame.index,
            dtype=object,
        )

    book_frames = [pd.DataFrame(book.sections) for book in texts]
    book_frames = [frame for frame in book_frames if not frame.empty]
    if not book_frames:
        # pd.concat raises on an empty list; return the expected shape instead.
        return pd.DataFrame(columns=columns)

    df = (pd
          .concat(book_frames)
          .reset_index(drop=True)
          .assign(book=lambda x: x.url.str.split('/', expand=True)[4], # 5th '/'-separated piece of the URL
                  bold_term_offsets=lambda x: x['labels'].apply(
                      lambda labels: select_offsets(labels, bold_labels)),
                  no_bold_term_offsets=lambda x: x['labels'].apply(
                      lambda labels: select_offsets(labels, no_bold_labels)),
                 )
          .assign(bold_terms=lambda x: slice_terms(x, 'bold_term_offsets'),
                  no_bold_terms=lambda x: slice_terms(x, 'no_bold_term_offsets'),
                 )
         )
    return df[columns]
        
# One row per scraped section across all books.
book_frame = make_book_frame(texts=stax_books)

In [137]:
# Fixed seed so the preview shows the same rows on every run.
book_frame.sample(12, random_state=42)

Unnamed: 0,book,section,url,text,bold_terms,no_bold_terms,bold_term_offsets,no_bold_term_offsets
2681,university-physics-volume-2,11-5,https://openstax.org/books/university-physics-...,Learning Objectives By the end of this section...,"[Motors, magnetic dipole, magnetic dipole moment]",[commutator],"[(264, 270), (5066, 5081), (5118, 5140)]","[(815, 825)]"
1481,biology-ap-courses,30-4,https://openstax.org/books/biology-ap-courses/...,"Learning Objectives In this section, you will ...",[],[],[],[]
2946,introduction-anthropology,15-7,https://openstax.org/books/introduction-anthro...,"Learning Outcomes By the end of this section, ...",[],"[Broadcast media, Heath, Abu-Lughod, King Faro...",[],"[(373, 388), (2125, 2130), (3269, 3279), (3684..."
535,introductory-statistics,1-6,https://openstax.org/books/introductory-statis...,Stats Lab Sampling Experiment Class Time: Nam...,[],[],[],[]
1599,chemistry-2e,16-1,https://openstax.org/books/chemistry-2e/pages/...,Learning Objectives By the end of this section...,"[spontaneous process, nonspontaneous process]",[],"[(553, 572), (632, 654)]",[]
3621,psychology-2e,5-3,https://openstax.org/books/psychology-2e/pages...,Learning Objectives By the end of this section...,"[cornea, pupil, iris, lens, fovea, retina, pho...","[vision, color vision]","[(1351, 1357), (1537, 1542), (2003, 2007), (21...","[(1238, 1244), (10879, 10891)]"
693,precalculus-2e,4-4,https://openstax.org/books/precalculus-2e/page...,"Learning Objectives In this section, you will:...",[],"[parent function, horizontal shift, vertical s...",[],"[(10294, 10309), (10558, 10574), (13682, 13696..."
1572,chemistry-2e,10-6,https://openstax.org/books/chemistry-2e/pages/...,Learning Objectives By the end of this section...,"[unit cell, simple cubic structure, simple cub...","[Bragg, Crick, Watson, Wilkins, Franklin]","[(1748, 1757), (2535, 2557), (2593, 2615), (34...","[(29086, 29091), (31419, 31424), (31437, 31443..."
4456,organizational-behavior,7-2,https://openstax.org/books/organizational-beha...,Describe a content theory of motivation. The t...,"[need, Hedonism,, manifest need, Instincts, pr...",[],"[(199, 203), (535, 544), (990, 1003), (1097, 1...",[]
978,anatomy-and-physiology-2e,26-3,https://openstax.org/books/anatomy-and-physiol...,Learning Objectives By the end of this section...,"[Hyponatremia, Hypernatremia, Hypokalemia, Hyp...",[],"[(4061, 4073), (5333, 5346), (6430, 6441), (72...",[]


In [None]:
# Fixed seed so the exported sample file is identical from run to run.
book_frame.sample(10, random_state=42).to_csv('../data/open-stax-texts-terms-10-sample.csv', index=False)

In [138]:
# Persist the full frame (every section, not a sample) as parquet.
# NOTE(review): the file name says "10-sample" although the whole frame is
# written, and a later cell reads '../data/open-stax-texts-terms.parquet' --
# confirm which path is intended before relying on either file.
book_frame.to_parquet('../data/open-stax-texts-terms-10-sample.parquet', index=False)

# Descriptives



|  | Value |
|---|---|
| textbooks | 50 |
| sections | 4,892 |
|Mean Word Count | 3,098 |
|Total Keyterms (bold) | 27,930 |
|Total Keyterms (no bold) | 18,876 |
|Total Keyterms (total) | 46,903 |
|Mean Keyterms (bold) | 5.71 |
|Mean Keyterms (no bold) | 3.86 |
|Mean Keyterms (total) | 9.6 |


In [1]:
import pandas as pd
# Reload the saved corpus from disk and render it.
book_frame = pd.read_parquet(path='../data/open-stax-texts-terms.parquet')
display(book_frame)

Unnamed: 0,book,section,url,text,terms,term_indices
0,algebra-and-trigonometry-2e,1-1,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[natural numbers, whole numbers, integers, rat...","[[2133, 2148], [2561, 2574], [2682, 2690], [33..."
1,algebra-and-trigonometry-2e,1-2,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[scientific notation, scientific notation, sci...","[[23288, 23307], [24718, 24737], [26614, 26633]]"
2,algebra-and-trigonometry-2e,1-3,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[principal square root, radical, radicand, rad...","[[1512, 1533], [1766, 1773], [1817, 1825], [18..."
3,algebra-and-trigonometry-2e,1-4,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[polynomial, coefficient, term of a polynomial...","[[1599, 1609], [1821, 1832], [2005, 2025], [21..."
4,algebra-and-trigonometry-2e,1-5,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[greatest common factor, greatest common facto...","[[1731, 1753], [2370, 2392], [6703, 6721]]"
...,...,...,...,...,...,...
4887,principles-management,18-3,https://openstax.org/books/principles-manageme...,What are external sources of technology and in...,"[Mergers/acquisitions (M&A), Joint ventures, s...","[[1241, 1267], [1765, 1779], [2117, 2132], [21..."
4888,principles-management,18-4,https://openstax.org/books/principles-manageme...,What are internal sources of technology and in...,[research and development (R&D)],"[[192, 222]]"
4889,principles-management,18-5,https://openstax.org/books/principles-manageme...,How and why do entrepreneurs need to develop M...,"[Entrepreneurial activities, value proposition]","[[57, 83], [223, 240]]"
4890,principles-management,18-6,https://openstax.org/books/principles-manageme...,"No matter what method is used, what skills do ...","[Organizational learning, explicit knowledge, ...","[[1084, 1107], [1472, 1490], [1547, 1562], [50..."


In [64]:
# Corpus-wide totals: add per-section counts, drop the listed source columns,
# then sum what remains.
(
    book_frame
    .assign(
        word_count=lambda frame: frame.text.str.split().str.len(),
        num_terms_bold=lambda frame: frame.terms.map(len),
        num_terms_no_bold=lambda frame: frame.no_emphasis_terms.map(len),
        num_terms_total=lambda frame: frame.term_indices.map(len),
    )
    .drop(columns=['book', 'section', 'url', 'text', 'terms', 'no_emphasis_terms', 'term_indices'])
    .sum()
)


book                 algebra-and-trigonometry-2ealgebra-and-trigono...
section              1-11-21-31-41-51-62-12-22-32-42-52-62-73-13-23...
terms                [natural numbers, whole numbers, integers, rat...
no_emphasis_terms    [Pythagorean Theorem, rational number, rationa...
term_indices         [(2133, 2148, None), (2561, 2574, None), (2682...
word_count                                                    15154646
num_terms_bold                                                   27930
num_terms_no_bold                                                18876
num_terms_total                                                  46903
dtype: object