This notebook scrapes the Wikipedia pages listed under the [fields of mathematics](https://en.wikipedia.org/wiki/Category:Fields_of_mathematics) category and its subcategories. The resulting text corpus can be used as context for word/document embeddings.

## Preliminaries

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install requests beautifulsoup4 tqdm nltk



In [None]:
import sys
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter
from urllib.parse import urljoin

In [None]:
# Register tqdm with pandas so that `progress_apply` is available on Series/DataFrames.
tqdm.pandas()

## Functions

In [None]:
def get_category_members(category, cmtype='subcat|page'):
    """
    Fetch the direct members of a Wikipedia category via the MediaWiki API.

    Only the category's own members are returned; descending into
    subcategories is left to the caller. Result batches are followed through
    the API's ``continue`` token until the listing is complete.

    Args:
        category (str) : category name without the "Category:" prefix
        cmtype (str) : value sent as the API's ``cmtype`` parameter
            (pipe-separated member types)

    Returns:
        list : the ``categorymembers`` entries of every batch, in the order
            the API returned them

    Raises:
        requests.HTTPError : if the API answers with an HTTP error status
        requests.Timeout : if a request does not complete within 30 seconds
        RuntimeError : if the JSON payload contains an ``error`` entry
    """
    api = 'https://en.wikipedia.org/w/api.php'
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmtype": cmtype,
        "cmlimit": "max",
        "format": "json"
    }
    members = []

    while True:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(api, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            # Surface API-level failures explicitly instead of a KeyError on 'query'.
            raise RuntimeError(f"Wikipedia API error for category [{category}]: {data['error']}")
        members.extend(data.get('query', {}).get('categorymembers', []))
        if 'continue' not in data:
            break
        # Merge the continuation token into the next request.
        params.update(data['continue'])

    return members

def get_page_urls(category, skip_categories, collected_pages=None, processed_categories=None, categories_level1=None):
    """
    Recursively collect page URLs from a category and its subcategories.

    Args:
        category (str) : name of the category to process, without the
            "Category:" prefix
        skip_categories (list) : name fragments; a subcategory whose name
            contains any of them is not descended into. Empty fragments are
            ignored.
        collected_pages (list) : accumulator of page URLs; created on the
            first call and shared with the recursive calls
        processed_categories (set) : categories that have already been
            processed or skipped, to avoid infinite loops
        categories_level1 (list) : category names that are announced with a
            "PRIMARY CATEGORY" line when they are encountered

    Returns:
        list : ``collected_pages``, holding one URL per page found
    """
    if collected_pages is None:
        collected_pages = []
    if processed_categories is None:
        processed_categories = set()
    if categories_level1 is None:
        categories_level1 = ()

    if category in processed_categories:
        return collected_pages

    processed_categories.add(category)
    members = get_category_members(category)

    base_url = 'https://en.wikipedia.org/wiki/'
    category_prefix = 'Category:'
    # An empty fragment is a substring of every name, so a blank entry (e.g. from a
    # trailing newline in the skip file) would otherwise cause every subcategory to be skipped.
    skip_fragments = [fragment for fragment in skip_categories if fragment]

    for member in members:
        title = member['title']
        if title.startswith(category_prefix):
            # Strip only the leading prefix, not later occurrences inside the name.
            subcategory = title[len(category_prefix):].strip()
            if subcategory in categories_level1:
                print('\nPRIMARY CATEGORY:', subcategory)
            # The substring test also covers exact matches against a skip entry.
            if any(fragment in subcategory for fragment in skip_fragments):
                # Remember skipped categories as handled.
                processed_categories.add(subcategory)
            else:
                print(f'Recursively processing subcategory [{subcategory}] with parent category [{category}]...')
                # NOTE(review): categories_level1 is not forwarded, so the PRIMARY CATEGORY
                # line is only printed for direct members of the top-level call - confirm intended.
                get_page_urls(subcategory, skip_fragments, collected_pages, processed_categories)
        else:
            collected_pages.append(base_url + title.replace(' ', '_'))

    return collected_pages

def get_wikipedia_page_id(title):
    """
    Look up the page ID of an English Wikipedia page by its title.

    Args:
        title (str) : page title, sent as the API's ``titles`` parameter

    Returns:
        int or None : the ``pageid`` of the first entry in the response's
            ``pages`` mapping; None when that entry has no ``pageid`` or
            the response contains no pages

    Raises:
        requests.HTTPError : if the API answers with an HTTP error status
        requests.Timeout : if the request does not complete within 30 seconds
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title
    }
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    # Tolerate payloads without "query"/"pages" instead of raising KeyError.
    pages = response.json().get("query", {}).get("pages", {})
    first_page = next(iter(pages.values()), None)
    if first_page is None:
        return None
    return first_page.get("pageid")

def scrape_wikipedia_page(url):
    """
    Scrape one Wikipedia page into a DataFrame with one row per section.

    All <p> and <h2> elements of the page are visited in document order. The
    paragraph text found between two consecutive <h2> headings becomes the
    content of the section named by the first of the two headings. Paragraphs
    after the last <h2> are not emitted, because rows are only flushed when a
    heading is reached.

    Rows titled 'See also' or 'References' are dropped. The text before the
    first heading ends up in a row whose title is missing after the shift;
    NOTE(review): whether the ``isin([None, ...])`` filter also drops that row
    depends on how pandas compares None with the NaN produced by ``shift`` -
    verify with the pandas version in use.

    Args:
        url (str) : URL of the page to fetch

    Returns:
        pandas.DataFrame : columns 'section-title', 'section-content',
            'page-title' (text of the first <h1>, or None if there is none)
            and 'page-url'

    Raises:
        requests.Timeout : if the request does not complete within 30 seconds
    """
    url_data_dict = {
        'section-title' : [],
        'section-content' : []
    }
    # A timeout keeps one stalled connection from hanging a long scraping run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Replace math images with their alt text; drop every other image
    for img in soup.find_all('img'):
        try:
            if 'mwe-math-element' in img.get('class', []):
                # Default to '' (a string): replace_with needs text, not a list.
                img.replace_with(img.get('alt', ''))
            else:
                img.decompose()
        except AttributeError:
            # Best effort: leave an image in place if it cannot be manipulated.
            continue

    edit_suffix = '[edit]'
    current_content = []
    for element in soup.find_all(['p', 'h2']):
        if element.name == 'h2':
            current_header = element.get_text(strip=True)
            if current_header.endswith(edit_suffix):
                current_header = current_header[:-len(edit_suffix)]
            # The row for a heading stores the text that PRECEDED it; the
            # shift below re-aligns each text with the heading it follows.
            url_data_dict['section-title'].append(current_header)
            url_data_dict['section-content'].append(' '.join(current_content))
            current_content = []
        else:
            current_content.append(element.get_text())

    df_url_data = pd.DataFrame(url_data_dict)
    df_url_data['section-title'] = df_url_data['section-title'].shift(1)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view of the original.
    df_url_data = df_url_data[~df_url_data['section-title'].isin([None, 'See also', 'References'])].copy()
    h1 = soup.find('h1')
    df_url_data['page-title'] = h1.get_text(strip=True) if h1 is not None else None
    df_url_data['page-url'] = url

    return df_url_data

def scrape_wikipedia_urls(url_list, output_name, checkpoint_every=100):
    """
    Scrape every URL in ``url_list`` and save the combined sections as JSON.

    Args:
        url_list (iterable) : page URLs, scraped in iteration order
        output_name (str) : path of the JSON file (``orient='records'``)
        checkpoint_every (int) : write the partial results to
            ``output_name`` after every this many pages. Pass 1 to checkpoint
            after each page (slow for long lists, because each checkpoint
            rewrites the whole file) or 0/None to write only at the end.

    Returns:
        pandas.DataFrame : all scraped sections whose 'section-content' is
            neither None nor empty, with a fresh 0..n-1 index
    """
    frames = []
    for count, url in enumerate(tqdm(url_list), start=1):
        # Collect per-page frames in a list; concatenating inside the loop
        # would copy all previously scraped rows on every iteration.
        frames.append(scrape_wikipedia_page(url))
        if checkpoint_every and count % checkpoint_every == 0:
            pd.concat(frames, axis=0, ignore_index=True).to_json(output_name, orient='records')

    if frames:
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        # Empty input: return (and save) an empty frame with the usual columns.
        df = pd.DataFrame(columns=['section-title', 'section-content', 'page-title', 'page-url'])
    df = df[~df['section-content'].isin([None, ''])].reset_index(drop=True)
    df.to_json(output_name, orient='records')
    return df

In [None]:
###  DATA PREPROCESSING  ###

def clean_content(content, remove_char=False):
    """
    Normalize a text to lowercase letters separated by single spaces.

    Args:
        content (str) : raw text
        remove_char (bool) : if True, also delete single-character tokens

    Returns:
        str : the cleaned text (leading/trailing whitespace is collapsed to
            one space, not stripped)
    """
    # Ordered (pattern, replacement) steps applied to the lowercased text
    substitutions = [(r'[^a-z\s]', ' ')]      # anything but a-z / whitespace -> space
    if remove_char:
        substitutions.append((r'\b[a-z0-9]\b', ''))  # isolated single characters
    substitutions.append((r'\s+', ' '))       # whitespace runs (incl. newlines) -> one space

    text = content.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def count_docs_per_word(docs):
    """
    Count, for every distinct item, how many documents contain it.

    Each document is reduced to the set of its items before counting, so an
    item is counted at most once per document. Documents are expected to be
    tokenized (iterables of words); a plain string would be counted per
    character.

    Returns:
        collections.Counter : item -> number of documents containing it
    """
    return Counter(word for doc in docs for word in set(doc))

def get_shared_words(docs, overshare_percent=0.7, undershare_value=2):
    """
    Split the vocabulary into words that are too common and too rare.

    Args:
        docs (list) : tokenized documents
        overshare_percent (float) : a word is "overshared" when the fraction
            of documents containing it is at least this value
        undershare_value (int) : a word that is not overshared is
            "undershared" when fewer than this many documents contain it

    Returns:
        tuple : (overshared_words, undershared_words), both sets
    """
    total_docs = len(docs)
    doc_frequency = count_docs_per_word(docs)

    def is_overshared(docs_with_word):
        return docs_with_word / total_docs >= overshare_percent

    overshared_words = {
        word for word, docs_with_word in doc_frequency.items()
        if is_overshared(docs_with_word)
    }
    # Overshared takes precedence: a word is only undershared if it is not overshared.
    undershared_words = {
        word for word, docs_with_word in doc_frequency.items()
        if not is_overshared(docs_with_word) and docs_with_word < undershare_value
    }
    return overshared_words, undershared_words

def remove_words(df, remove_list, col_in, col_out):
    """
    Delete whole-word occurrences of the given words from a text column.

    Args:
        df (pandas.DataFrame) : frame to modify in place
        remove_list (iterable) : words to delete (matched literally, between
            word boundaries)
        col_in (str) : name of the column holding the source text
        col_out (str) : name of the column that receives the filtered text

    Returns:
        pandas.DataFrame : the same ``df`` object, with ``col_out`` set
    """
    escaped_words = [re.escape(word) for word in remove_list]
    alternation = '|'.join(escaped_words)
    word_regex = re.compile(r'\b(?:{})\b'.format(alternation))
    # Matches are replaced with '', so surrounding whitespace is left as-is.
    df[col_out] = df[col_in].str.replace(word_regex, '', regex=True)
    return df

In [None]:
def download_df(df, output_name, format='json', index=False):
    """
    Save a DataFrame to a file and hand that file to Colab's ``files.download``.

    Args:
        df (pandas.DataFrame) : data to save
        output_name (str) : path of the file to write
        format (str) : 'csv' or 'xlsx'; any other value writes JSON with
            ``orient='records'``
        index (bool) : whether to write the index (CSV and Excel only; not
            used for JSON)
    """
    # Local import: the notebook's import cell does not bring `files` into
    # scope, so without it this function raises NameError on a fresh kernel.
    from google.colab import files

    if format == 'csv':
        df.to_csv(output_name, index=index)
    elif format == 'xlsx':
        df.to_excel(output_name, index=index)
    else:
        df.to_json(output_name, orient='records')
    files.download(output_name)

## Get Wikipedia corpus for math topics

### Get URLs for math topics

In [None]:
# Read one entry per line and drop blank lines. A trailing newline would otherwise
# produce an empty entry, and because get_page_urls matches skip entries as substrings,
# an empty entry matches (and therefore skips) every subcategory.
with open('wiki-categories-remove.txt', 'r') as file:
    skip_categories = [line for line in file.read().splitlines() if line]
with open('categories-level1.txt', 'r') as file:
    categories_level1 = [line for line in file.read().splitlines() if line]

math_wiki_urls = get_page_urls('Fields_of_mathematics', skip_categories, categories_level1=categories_level1)

In [None]:
# Save page URLs as text file
# One URL per line
with open('math-wiki-urls.txt', 'w') as file:
    file.writelines(f'{url}\n' for url in math_wiki_urls)

### Scrape pages

In [None]:
# Load Wikipedia page URLs
# Blank lines are dropped: a trailing newline would otherwise add an empty "URL"
# to the set, which cannot be requested when the pages are scraped.
# NOTE(review): this reads 'urls-categories-remove.txt', not the 'math-wiki-urls.txt'
# written earlier in this notebook - presumably a filtered copy prepared elsewhere; confirm.
with open('urls-categories-remove.txt', 'r') as file:
    math_wiki_urls = {line for line in file.read().splitlines() if line}

print('Number of pages to scrape:', len(math_wiki_urls))

Number of pages to scrape: 16829


In [None]:
# Sort the URL set so pages are scraped (and rows are saved) in a reproducible order;
# the iteration order of a set of strings can differ from one kernel session to the next.
df_wiki = scrape_wikipedia_urls(sorted(math_wiki_urls), 'data-wiki.json')

In [None]:
# Re-save the scraped sections as JSON records and pass the file to files.download.
download_df(df_wiki, 'data-wiki.json')

### Clean content

In [None]:
# Reload the scraped sections and collect the distinct page titles
df_wiki = pd.read_json('data-wiki.json')
title_column = df_wiki['page-title']
page_titles = {page_title for page_title in title_column.to_list()}
print('Number of unique pages:', len(page_titles))

Number of unique pages: 16196


In [None]:
# Look up the Wikipedia page ID of every distinct page title (one API request per title).
# The dict is filled entry by entry, so the IDs fetched so far remain available
# in `title_to_id_dict` if the loop is interrupted.
title_to_id_dict = {}
for title in tqdm(page_titles, desc='Fetching page IDs'):
    title_to_id_dict[title] = get_wikipedia_page_id(title)

Fetching page IDs: 100%|██████████| 16196/16196 [1:12:23<00:00,  3.73it/s]


In [None]:
# Clean every section with clean_content: lowercase, letters and whitespace only,
# and (remove_char=True) single-character tokens removed. Shows a progress bar.
df_wiki['content-cleaned'] = df_wiki['section-content'].progress_apply(clean_content, remove_char=True)

100%|██████████| 74300/74300 [00:25<00:00, 2962.65it/s]


In [None]:
# Save the frame (now including 'content-cleaned') as JSON records and pass the file to files.download.
download_df(df_wiki, 'data-wiki-cleaned.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Form corpus
* Remove stop words
* Remove words that appear in at least 10 percent of documents
* Remove words that appear in fewer than 3 documents

In [None]:
# Reload the cleaned sections and tokenize each one on whitespace
df_wiki = pd.read_json('data-wiki-cleaned.json')
cleaned_sections = df_wiki['content-cleaned']
wiki_docs = cleaned_sections.str.split().to_list()
print('Number of documents:', len(wiki_docs))

Number of documents: 74300


In [None]:
# Get English stop words
import nltk
# `stopwords` must be imported explicitly; without this line the cell raises
# NameError on a fresh kernel.
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords_list = stopwords.words('english')
# Strip every non-alphanumeric character (e.g. "you're" -> "youre")
stopwords_list = [re.sub(r'[^a-zA-Z0-9]', '', word) for word in stopwords_list]
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Thresholds: a word is overshared when at least this fraction of documents contain it,
# and undershared when fewer than this many documents contain it.
OVERSHARE_PERCENT = 0.1
UNDERSHARE_VALUE = 3

overshared_words, undershared_words = get_shared_words(
    wiki_docs,
    overshare_percent=OVERSHARE_PERCENT,
    undershare_value=UNDERSHARE_VALUE,
)
print('Number of overshared words', len(overshared_words))
print('Overshared words:', overshared_words)
print('\nNumber of undershared words', len(undershared_words))
print('Undershared words:', undershared_words)

Number of overshared words 98
Overshared words: {'non', 'known', 'finite', 'may', 'into', 'only', 'is', 'also', 'real', 'however', 'no', 'their', 'by', 'form', 'there', 'defined', 'point', 'using', 'both', 'its', 'general', 'given', 'left', 'called', 'which', 'number', 'mathematics', 'have', 'order', 'the', 'theory', 'function', 'all', 'as', 'functions', 'with', 'can', 'field', 'such', 'are', 'that', 'some', 'on', 'each', 'or', 'right', 'an', 'dimensional', 'in', 'more', 'be', 'but', 'so', 'any', 'linear', 'every', 'to', 'complex', 'above', 'these', 'thus', 'we', 'first', 'example', 'at', 'set', 'when', 'space', 'than', 'many', 'two', 'this', 'displaystyle', 'of', 'over', 'they', 'group', 'not', 'it', 'was', 'following', 'and', 'same', 'numbers', 'other', 'between', 'theorem', 'case', 'from', 'since', 'for', 'then', 'has', 'if', 'used', 'where', 'one', 'let'}

Number of undershared words 42746


In [None]:
# Union of LaTeX residue, overshared/undershared words and stop words, de-duplicated.
# NOTE(review): this list reuses the name of the remove_words() function defined
# earlier in the notebook, which is no longer callable once this cell has run.
remove_words = list(
    {'operatorname', 'displaystyle'}
    | set(overshared_words)
    | set(undershared_words)
    | set(stopwords_list)
)
print('Number of words to remove:', len(remove_words))

Number of words to remove: 42965


In [None]:
# Remove words with parallel processing
import concurrent.futures
remove_words_set = set(remove_words)

def filter_words(doc):
    """Return the words of `doc` that are not in `remove_words_set`, in their original order."""
    return [word for word in doc if word not in remove_words_set]

# executor.map yields results in input order, so wiki_corpus[i] is the filtered
# version of wiki_docs[i]; collecting futures with as_completed would return the
# documents in completion order instead. chunksize sends the documents to the
# worker processes in batches rather than as one task per document.
with concurrent.futures.ProcessPoolExecutor() as executor:
    filtered_docs = executor.map(filter_words, wiki_docs, chunksize=1000)
    wiki_corpus = list(tqdm(filtered_docs, total=len(wiki_docs)))

 23%|██▎       | 16817/74300 [3:22:09<11:31:00,  1.39it/s]
100%|██████████| 74300/74300 [00:41<00:00, 1788.73it/s]


In [None]:
# Write the corpus as plain text: one document per line, words separated by spaces
with open('wikipedia-docs.txt', 'w') as file:
    file.writelines(f"{' '.join(doc)}\n" for doc in wiki_corpus)