In [1]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
import re
from tqdm.auto import tqdm

with open('../bin/textbook-first-pages.txt', 'r') as f:
    textbook_first_pages = f.read().splitlines()

## Class for Scraping OpenStax

### Assumptions and Shortcuts
 - Ignoring any pages that do not start with digit-dash-digit, so '1-1-introduction' is fine but '1-Introudction' and 'Index' are ignored
 - Ignoring linebreaks. While these might be informative of document structure, I do not want to deal with them right now.

In [2]:
class StaxBook:
    def __init__(self, first_page_url):
        self.first_page_url = first_page_url
        self.base_url = first_page_url.split('/pages/')[0] # a non-functional URL used to build other URLs
        self.title = self.base_url.split('/')[-1]
        self.sections = []
        self.get_section_urls()
        
    def get_section_urls(self):
        soup = BeautifulSoup(requests.get(self.first_page_url).text,
                             features='html.parser')
        
        for a in soup.find_all('a',
                               class_='styled__ContentLink-sc-18yti3s-1 cRIWDW',
                               href=True):
            match = re.search(r'^\d+-\d+', a['href'])
            if match:
                self.sections.append({
                    'section': match[0],
                    'url': '/'.join([self.base_url, 'pages', a['href']]),
                })
        
    def scrape_one(self, soup):
        text_elements = []
        keyterm_spans = []
        total_str_len = 0

        for e in soup.strings:
            if isinstance(e, str) and not e.isspace():
                text = re.sub('\s+', ' ', e.text) + ' ' # add a space at the end of each element
                if e.parent.attrs.get('data-type', None) == 'term':
                    try:
                        keyterm_spans.append((total_str_len,
                                              total_str_len + len(text) - 1, # -1 because we do not want the space we added earlier
                                              e.parent.attrs.get('class', [''])[0] # returns list, so set default to list containing None and pull first elem
                                             ))
                    except Exception as e:
                        print(e)
                text_elements.append(text)
                total_str_len += len(text)
                
        return text_elements, keyterm_spans
                     
    def scrape_all(self):
        for section in tqdm(self.sections, leave=False):
            soup = BeautifulSoup(requests.get(section['url']).content.decode('utf-8'),
                                 features='html5lib').find('div', id='main-content')
            section_text, section_labels = self.scrape_one(soup)
            section['text'] = ''.join(section_text)
            section['labels'] = section_labels

In [3]:
stax_books = [StaxBook(url) for url in textbook_first_pages]

In [4]:
for book in tqdm(stax_books):
    book.scrape_all()

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/114 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

In [77]:
import pandas as pd

def make_book_frame(texts):
    book_frames = [pd.DataFrame(book.sections) for book in texts]
    df = (pd
          .concat(book_frames)
          .reset_index(drop=True)
          .assign(book=lambda x: x.url.str.split('/', expand=True)[4],
                  terms=lambda x: x.apply(
                      lambda row: [row.text[st:end] 
                                   for (st, end, label) in row.labels
                                   if not label], axis=1
                  ),
                  no_emphasis_terms=lambda x: x.apply(
                      lambda row: [row.text[st:end] 
                                   for (st, end, label) in row.labels
                                   if label=='no-emphasis'],
                      axis=1),
                 )
          .rename(columns={'labels': 'term_indices'})
         )
    return df[['book', 'section', 'url', 'text', 'terms', 'no_emphasis_terms', 'term_indices']]
        
book_frame = make_book_frame(stax_books)

In [None]:
book_frame.sample(10).to_csv('../data/open-stax-texts-terms-10-sample.csv', index=False)

In [29]:
import pyarrow as pa
TermType = pa.map_(pa.uint64(), pa.uint64(), pa.string())
# pa.array((st, end, label), type=TermType) 
{'st': st, 'end': end, 'label': label}

In [93]:
import pyarrow as pa

TermType = pa.struct([
    ('st', pa.uint64()),
    ('end', pa.uint64()),
    pa.field('label', pa.string(), nullable=True),
])


In [None]:
for section in book_frame.term_indices[:500]:
    for (st, end, label) in section:
        try:
            pa.array((st, end, label), type=TermType)
        except Exception as e:
            print(e)
            print(int(st), end, label)

In [94]:
test_frame = book_frame.assign(term_indices = lambda x: [
    pa.array(
    [{'st': st, 'end': end, 'label': label} for (st, end, label) in y],
        type=TermType)
    for y in x.term_indices
])
test_frame.sample(10)

Unnamed: 0,book,section,url,text,terms,no_emphasis_terms,term_indices
560,introductory-statistics,5-3,https://openstax.org/books/introductory-statis...,The exponential distribution is often concer...,"[exponential distribution, continuous random v...",[],"((st, end, label), (st, end, label), (st, end,..."
3835,writing-guide,2-4,https://openstax.org/books/writing-guide/pages...,"Learning Outcomes By the end of this section, ...",[],[National Association for the Advancement of C...,"((st, end, label), (st, end, label))"
2378,microbiology,13-1,https://openstax.org/books/microbiology/pages/...,Learning Objectives By the end of this section...,"[fomite, autoclave, sterilization, sterilant, ...","[botulism, endospore, biological safety levels...","((st, end, label), (st, end, label), (st, end,..."
3609,psychology-2e,3-2,https://openstax.org/books/psychology-2e/pages...,Learning Objectives By the end of this section...,"[nervous system, Glial cells, Neurons, semiper...","[Neurons, neuron, action potential, neurotrans...","((st, end, label), (st, end, label), (st, end,..."
1752,college-physics-2e,1-4,https://openstax.org/books/college-physics-2e/...,Learning Objectives By the end of this section...,[approximations],[],"((st, end, label))"
2975,introduction-political-science,2-2,https://openstax.org/books/introduction-politi...,"Learning Outcomes By the end of this section, ...","[rational, irrational, heuristics, fake news]","[Empirical, Kahneman, thinking fast, thinking ...","((st, end, label), (st, end, label), (st, end,..."
599,introductory-statistics,12-4,https://openstax.org/books/introductory-statis...,"The correlation coefficient, r , tells us abo...",[],[],()
2976,introduction-political-science,2-3,https://openstax.org/books/introduction-politi...,"Learning Outcomes By the end of this section, ...",[motivated reasoning],"[empirical, Green Party, Thunberg, Cuban Missi...","((st, end, label), (st, end, label), (st, end,..."
155,calculus-volume-2,5-4,https://openstax.org/books/calculus-volume-2/p...,Learning Objectives 5.4.1 Use the comparison t...,"[comparison test, limit comparison test]",[],"((st, end, label), (st, end, label))"
4710,principles-finance,7-3,https://openstax.org/books/principles-finance/...,"Learning Outcomes By the end of this section, ...",[single payment or lump sum],[Excel],"((st, end, label), (st, end, label))"


In [95]:
test_frame.term_indices[4799]

<pyarrow.lib.StructArray object at 0x7f848fe22440>
-- is_valid: all not null
-- child 0 type: uint64
  [
    142,
    326,
    521,
    1465,
    2503,
    2607,
    2827,
    4102
  ]
-- child 1 type: uint64
  [
    180,
    343,
    556,
    1491,
    2520,
    2619,
    2841,
    4119
  ]
-- child 2 type: string
  [
    null,
    null,
    null,
    null,
    null,
    null,
    null,
    null
  ]

In [98]:
test_frame = book_frame.assign(st = lambda x: x.term_indices.astype(str))
test_frame.sample(10)

Unnamed: 0,book,section,url,text,terms,no_emphasis_terms,term_indices
1665,chemistry-atoms-first-2e,6-4,https://openstax.org/books/chemistry-atoms-fir...,Learning Objectives By the end of this section...,"[mass percentage, volume percentage, mass-volu...",[],"[(906, 921, None), (5200, 5217, None), (6504, ..."
2926,introduction-anthropology,11-4,https://openstax.org/books/introduction-anthro...,"Learning Outcomes By the end of this section, ...","[Marriage, incest taboo, Monogamy, Serial mono...","[Miriam Zeitzen, Bao, bride price, Sitlhou, Ne...","[(810, 818, None), (2601, 2613, None), (5074, ..."
4182,introduction-business,1-3,https://openstax.org/books/introduction-busine...,What are the primary features of the world’s e...,"[economic system, Economics, Capitalism, commu...","[General Motors, Federal Reserve Board, Genera...","[(264, 279, None), (460, 469, None), (1192, 12..."
1674,chemistry-atoms-first-2e,8-4,https://openstax.org/books/chemistry-atoms-fir...,Learning Objectives By the end of this section...,"[mean free path, diffusion, rate of diffusion,...",[],"[(711, 725, None), (1216, 1225, None), (2375, ..."
954,anatomy-and-physiology-2e,23-3,https://openstax.org/books/anatomy-and-physiol...,Learning Objectives By the end of this section...,"[oral cavity, labia, labial frenulum, oral ves...",[],"[(747, 758, None), (889, 894, None), (1316, 13..."
311,college-algebra-corequisite-support-2e,7-4,https://openstax.org/books/college-algebra-cor...,"Learning Objectives In this section, you will:...",[partial fraction],"[partial fraction decomposition, rational expr...","[(9566, 9596, 'no-emphasis'), (9737, 9756, 'no..."
956,anatomy-and-physiology-2e,23-5,https://openstax.org/books/anatomy-and-physiol...,Learning Objectives By the end of this section...,"[small intestine, duodenum, hepatopancreatic a...",[],"[(1108, 1123, None), (2299, 2307, None), (2761..."
4370,introductory-business-statistics,2-6,https://openstax.org/books/introductory-busine...,Consider the following data set. 4; 5; 6; 6; ...,[],[],[]
2048,college-physics-ap-courses-2e,9-6,https://openstax.org/books/college-physics-ap-...,Learning Objectives By the end of this section...,[],[],[]
304,college-algebra-corequisite-support-2e,6-5,https://openstax.org/books/college-algebra-cor...,"Learning Objectives In this section, you will:...","[Product Property, Quotient Property, Power Pr...","[pH, one-to-one, product rule for logarithms, ...","[(1022, 1038, None), (1335, 1352, None), (1681..."


In [99]:
pa.Table.from_pandas(test_frame)

pyarrow.Table
book: string
section: string
url: string
text: string
terms: list<item: string>
  child 0, item: string
no_emphasis_terms: list<item: string>
  child 0, item: string
term_indices: string
----
book: [["algebra-and-trigonometry-2e","algebra-and-trigonometry-2e","algebra-and-trigonometry-2e","algebra-and-trigonometry-2e","algebra-and-trigonometry-2e",...,"principles-management","principles-management","principles-management","principles-management","principles-management"]]
section: [["1-1","1-2","1-3","1-4","1-5",...,"18-3","18-4","18-5","18-6","18-7"]]
url: [["https://openstax.org/books/algebra-and-trigonometry-2e/pages/1-1-real-numbers-algebra-essentials","https://openstax.org/books/algebra-and-trigonometry-2e/pages/1-2-exponents-and-scientific-notation","https://openstax.org/books/algebra-and-trigonometry-2e/pages/1-3-radicals-and-rational-exponents","https://openstax.org/books/algebra-and-trigonometry-2e/pages/1-4-polynomials","https://openstax.org/books/algebra-and-tri

In [96]:
test_frame.to_parquet('../data/open-stax-texts-terms.parquet', index=False)

ArrowInvalid: ('Could not convert <pyarrow.lib.StructArray object at 0x7f84a43c4d60>\n-- is_valid: all not null\n-- child 0 type: uint64\n  [\n    2133,\n    2561,\n    2682,\n    3314,\n    5577,\n    7536,\n    8365,\n    9922,\n    10036,\n    10157,\n    ...\n    19791,\n    21654,\n    21843,\n    22253,\n    22628,\n    26530,\n    26586,\n    26701,\n    30672,\n    31190\n  ]\n-- child 1 type: uint64\n  [\n    2148,\n    2574,\n    2690,\n    3330,\n    5595,\n    7548,\n    8381,\n    9937,\n    10049,\n    10165,\n    ...\n    19812,\n    21683,\n    21878,\n    22281,\n    22662,\n    26538,\n    26594,\n    26721,\n    30680,\n    31197\n  ]\n-- child 2 type: string\n  [\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    ...\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null,\n    null\n  ] with type pyarrow.lib.StructArray: did not recognize Python value type when inferring an Arrow data type', 'Conversion failed for column term_indices with type object')

In [37]:
book_frame.to_csv('../data/open-stax-texts-terms.csv', index=False)

# Descriptives



|  | Value |
|---|---|
| textbooks | 50 |
| sections | 4,892 |
|Mean Word Count | 3,098 |
|Total Keyterms (bold) | 27,930 |
|Total Keyterms (no bold) | 18,876 |
|Total Keyterms (total) | 46,903 |
|Mean Keyterms (bold) | 5.71 |
|Mean Keyterms (no bold) | 3.86 |
|Mean Keyterms (total) | 9.6 |


In [1]:
import pandas as pd
book_frame = pd.read_parquet('../data/open-stax-texts-terms.parquet')
display(book_frame)

Unnamed: 0,book,section,url,text,terms,term_indices
0,algebra-and-trigonometry-2e,1-1,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[natural numbers, whole numbers, integers, rat...","[[2133, 2148], [2561, 2574], [2682, 2690], [33..."
1,algebra-and-trigonometry-2e,1-2,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[scientific notation, scientific notation, sci...","[[23288, 23307], [24718, 24737], [26614, 26633]]"
2,algebra-and-trigonometry-2e,1-3,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[principal square root, radical, radicand, rad...","[[1512, 1533], [1766, 1773], [1817, 1825], [18..."
3,algebra-and-trigonometry-2e,1-4,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[polynomial, coefficient, term of a polynomial...","[[1599, 1609], [1821, 1832], [2005, 2025], [21..."
4,algebra-and-trigonometry-2e,1-5,https://openstax.org/books/algebra-and-trigono...,"Learning Objectives In this section, you will:...","[greatest common factor, greatest common facto...","[[1731, 1753], [2370, 2392], [6703, 6721]]"
...,...,...,...,...,...,...
4887,principles-management,18-3,https://openstax.org/books/principles-manageme...,What are external sources of technology and in...,"[Mergers/acquisitions (M&A), Joint ventures, s...","[[1241, 1267], [1765, 1779], [2117, 2132], [21..."
4888,principles-management,18-4,https://openstax.org/books/principles-manageme...,What are internal sources of technology and in...,[research and development (R&D)],"[[192, 222]]"
4889,principles-management,18-5,https://openstax.org/books/principles-manageme...,How and why do entrepreneurs need to develop M...,"[Entrepreneurial activities, value proposition]","[[57, 83], [223, 240]]"
4890,principles-management,18-6,https://openstax.org/books/principles-manageme...,"No matter what method is used, what skills do ...","[Organizational learning, explicit knowledge, ...","[[1084, 1107], [1472, 1490], [1547, 1562], [50..."


In [64]:
book_frame.assign(
    word_count = lambda x: x.text.str.split().str.len(),
    num_terms_bold = lambda x: x.terms.apply(len),
    num_terms_no_bold = lambda x: x.no_emphasis_terms.apply(len),
    num_terms_total = lambda x: x.term_indices.apply(len),
).drop(columns=['book', 'section', 'url', 'text', 'terms', 'no_emphasis_terms', 'term_indices']).sum()


book                 algebra-and-trigonometry-2ealgebra-and-trigono...
section              1-11-21-31-41-51-62-12-22-32-42-52-62-73-13-23...
terms                [natural numbers, whole numbers, integers, rat...
no_emphasis_terms    [Pythagorean Theorem, rational number, rationa...
term_indices         [(2133, 2148, None), (2561, 2574, None), (2682...
word_count                                                    15154646
num_terms_bold                                                   27930
num_terms_no_bold                                                18876
num_terms_total                                                  46903
dtype: object