In [65]:
from bs4 import BeautifulSoup
import re

def is_mathjax_present(all_text: str, verbose: bool = True) -> int:
    r"""Count the MathJax-looking expressions found in ``all_text``.

    Three delimiter styles are searched for:

    * ``$$...$$``  - the content may not contain a line break;
    * ``\(...\)``  - the content may not contain a ``)`` character;
    * ``\[...\]``  - ``.`` does not cross a newline (no regex flags are used).

    A delimited span is only counted when it contains at least one alphabetic
    character, which filters out runs such as ``$$$$$``.

    NOTE(review): because the ``\(...\)`` branch excludes every ``)`` from its
    content, inline math such as ``\(f(x)\)`` is not detected. Confirm whether
    that is intended before relying on the resulting counts.

    Args:
        all_text: Text to scan.
        verbose: When true (the default, which keeps the original behaviour),
            print one ``matched``/``skipped`` line per candidate span. Pass
            ``False`` to scan silently, e.g. when looping over a large corpus.

    Returns:
        The number of accepted spans. ``0`` means nothing was found, so the
        result can also be used directly in a boolean context.
    """
    # Regular expression pattern to match MathJax expressions
    mathjax_pattern = r'\$\$([^\r\n]+?)\$\$|\\\(([^)]*?)\\\)|\\\[(.*?)\\\]'

    matched_texts = []
    for match in re.finditer(mathjax_pattern, all_text):
        start, end = match.span()
        span_text = match.group(0)
        # Require at least one alphabetical character anywhere in the span
        # (delimiters included); otherwise treat it as noise.
        if any(char.isalpha() for char in span_text):
            matched_texts.append(span_text)
            if verbose:
                print(f"matched ({start}-{end}):\t{span_text}")
        elif verbose:
            print(f"skipped({start}-{end}):\t{span_text}")

    # Number of accepted expressions (0 when none were found).
    return len(matched_texts)

# Example usage
# Raw string literal: "\[" and "\]" are not valid escape sequences in a normal
# string, so the non-raw form triggers an invalid-escape warning on recent
# Python versions. The resulting text is unchanged.
html_content = r"""This is a MathJax equation: $$x^2 + y^2 = r^2$$ and another one: \[E=mc^2\]."""

# Four dollar signs leave no content between the delimiters, so nothing matches.
example_2 = """Test $$$$"""

print(is_mathjax_present(html_content))
print("="*20)
print(is_mathjax_present(example_2))

matched (28-47):	$$x^2 + y^2 = r^2$$
matched (65-75):	\[E=mc^2\]
2
0


In [51]:
# Dataset source: bigcode/cc-sample (presumably a Hugging Face Hub dataset id - TODO confirm).

# Local path of the JSON Lines file (one JSON document per line) that is scanned for MathJax.
data_path = "./data/cc_minhash.jsonl"

In [68]:
import json
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement.
from tqdm.notebook import tqdm

counter = 0       # documents with at least one accepted MathJax span
full_counter = 0  # every document read from the file
characters = 0    # summed length of `content` over the MathJax documents

# Stream the file line by line (one JSON object per line) instead of loading
# it whole; the encoding is explicit so the result does not depend on the
# platform default.
with open(data_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        full_counter += 1
        data = json.loads(line)
        if is_mathjax_present(data["content"]) > 0:
            # Separator printed after the per-match lines of a MathJax document.
            print("============")
            counter += 1
            characters += len(data["content"])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm(f):


0it [00:00, ?it/s]

matched (2138-2162):	\(\text{sin} \times 35\)
matched (2248-2262):	\(\text{sin}\)
matched (2282-2289):	\(\pi\)
matched (6574-6588):	\(\text{sin}\)
matched (6593-6607):	\(\text{cos}\)
matched (8317-8331):	\(\text{tan}\)
skipped(423-428):	$$$$$
skipped(6962-6967):	$$$$$
matched (7644-7651):	\(\le\)
matched (8214-8223):	\(\beta\)
matched (14868-14888):	\(3 \times 10^{-6}\)
matched (14911-14931):	\(3 \times 10^{-5}\)
matched (15013-15033):	\(1 \times 10^{-8}\)
matched (199-204):	\(n\)
matched (323-328):	\(p\)
matched (332-339):	\([n]\)
matched (477-482):	\(c\)
matched (956-967):	\(1+\ln n\)
matched (197-209):	\(z_{TWAS}\)
matched (326-333):	\(z_T\)
matched (411-470):	$$ z_{TWAS} = \frac{w^T_{ge}z_T}{\sqrt{w^T_{ge}Vw_{ge}}} $$
matched (477-487):	\(w_{ge}\)
matched (560-567):	\(z_T\)
matched (634-646):	\(z_{EWAS}\)
matched (691-750):	$$ z_{EWAS} = \frac{w^T_{me}z_T}{\sqrt{w^T_{me}Vw_{me}}} $$
matched (757-767):	\(w_{me}\)
matched (18961-18971):	$$$ and $$
skipped(2562-2567):	$$$$$
skipped(53

In [69]:
# Rough characters-per-token ratio used for the token estimate below.
CHARS_PER_TOKEN = 3.5

# Guard the ratio so an empty input file reports 0.00% instead of raising
# ZeroDivisionError.
mathjax_percentage = counter / full_counter * 100 if full_counter else 0.0

print(f"All docs:      {full_counter}")
print(f"MathJax docs:  {counter}")
print(f"Percentage:    {mathjax_percentage:.2f}%")
print(f"Characters:    {characters}")
print(f"Tokens (app.): {characters/CHARS_PER_TOKEN:.2f}")

All docs:      1008080
MathJax docs:  289
Percentage:    0.03%
Characters:    5119511
Tokens (app.): 1462717.43
