In [37]:
import requests
from bs4 import BeautifulSoup
import re
from sympy import preview 
import pandas as pd
import os

In [31]:
# Step 1: Fetch the Mathematics Portal Page
url = "https://en.wikipedia.org/wiki/Portal:Mathematics"
response = requests.get(url)

# Step 2: Parse the HTML Content
soup = BeautifulSoup(response.content, 'html.parser')

# Step 3: Extract Links
links = soup.find_all("a")

# Step 4: Filter and Validate Links
valid_links = []
for link in links:
    href = link.get("href", "")
    # Check if the link is a valid Wikipedia link related to mathematics
    if re.match(r'^/wiki/', href) and not re.search(r'/\w+:', href):
        valid_links.append(href)

https_links = [f"https://en.wikipedia.org{link}" for link in valid_links]

In [46]:
len(https_links)

703

In [35]:
def parse_for_equations(url):
    page = url.split("/")[-1]
    # Fetch the HTML content of a Wikipedia page
    response = requests.get(url)
    html_content = response.content
    
    # Regular expression pattern to match the "displaystyle" block
    pattern = r'{\\displaystyle .*?}'
    
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Find all LaTeX equations and their corresponding images
    equation_strings = []
    latex_equations = soup.find_all("math")
    for i, equation in enumerate(latex_equations):
        latex_text = equation.text
        
        match = re.search(pattern, latex_text)
        if match:
            cleaned_equation = latex_text[match.start():-2]
            try:
                preview(f"$${cleaned_equation}$$", viewer='file', filename=f'latex_images/{page}_{i}.png', euler=False)
                equation_strings.append({"name": f"{page}_{i}", "latex": cleaned_equation})
            except RuntimeError as e:
                print(f"Equation {repr(cleaned_equation)} failed to render")
    
    df = pd.DataFrame(data=equation_strings)
    df.to_csv(f'latex_strings/{page}.csv', index=False)

In [45]:
for url in https_links:
    page = url.split("/")[-1]
    existing_pages = [s[:-4] for s in os.listdir("latex_strings")]
    if page not in existing_pages:
        print(url)
        parse_for_equations(url)

https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Euler%27s_Identity
https://en.wikipedia.org/wiki/Mathematics
https://en.wikipedia.org/wiki/Mathematical_notation
https://en.wikipedia.org/wiki/Mathematical_proof
https://en.wikipedia.org/wiki/Mathematical_object
https://en.wikipedia.org/wiki/Number
Equation '{\\displaystyle {\\text{∞}}}' failed to render
https://en.wikipedia.org/wiki/Point_(geometry)
https://en.wikipedia.org/wiki/Euclidean_space
Equation '{\\displaystyle {\\begin{aligned}|BC|^{2}&={\\overrightarrow {BC}}\\cdot {\\overrightarrow {BC}}{\\vphantom {\\frac {(}{}}}\\\\[2mu]&={\\Bigl (}{\\overrightarrow {BA}}+{\\overrightarrow {AC}}{\\Bigr )}\\cdot {\\Bigl (}{\\overrightarrow {BA}}+{\\overrightarrow {AC}}{\\Bigr )}\\\\[4mu]&={\\overrightarrow {BA}}\\cdot {\\overrightarrow {BA}}+{\\overrightarrow {AC}}\\cdot {\\overrightarrow {AC}}-2{\\overrightarrow {AB}}\\cdot {\\overrightarrow {AC}}\\\\[6mu]&={\\overrightarrow {AB}}\\cdot {\\overrightarrow {AB}}+{\\overr