In [80]:
import bz2
import networkx as nx
import numpy as np
import os
import time

from typing import Tuple, List

INDEX = 'dumps/enwiktionary-latest-pages-articles-multistream-index.txt.bz2'
ARTICLES = 'dumps/enwiktionary-latest-pages-articles-multistream.xml.bz2'


def split_index(index_string: str) -> Tuple[int, int, str]:
    """
    Split one line of the multistream index into its components.

    An index line has the form ``<byte offset>:<article ID>:<title>``.
    Only the first two colons are field separators; any further colon is
    part of the title itself (e.g. ``Appendix:Variations of "ta"``), so the
    line is split at most twice.

    Parameters
    ----------
    index_string : str
        A string containing the byte offset, article ID, and title of an
        article, separated by colons. A trailing newline is allowed.

    Returns
    -------
    Tuple[int, int, str]
        A tuple containing the byte offset, article ID, and title of the
        article. The title has surrounding whitespace removed.

    Raises
    ------
    ValueError
        If the byte offset or article ID field is not an integer.
    IndexError
        If the line has fewer than three colon-separated fields.
    """
    # maxsplit=2 keeps colons inside the title intact instead of truncating
    # the title at its first colon.
    parts = index_string.split(':', 2)
    byte_offset = int(parts[0])
    article_id = int(parts[1])
    title = parts[2].strip()
    return byte_offset, article_id, title


def get_article(index_string: str) -> str:
    """
    Retrieve the content of an article given its index string.

    The byte offset from the index line is used to seek into ``ARTICLES``.
    Data is decompressed from there until the ``<title>`` element of the
    requested article *and* the ``</page>`` tag that follows it have both
    been seen, or until the bz2 stream / file ends.

    Parameters
    ----------
    index_string : str
        A string containing the byte offset, article ID, and title of an article.

    Returns
    -------
    str
        The content of the article, unprocessed, starting at the ``<page>``
        tag that precedes the title and ending with ``</page>``. If the
        title or its closing tag cannot be found, the string
        ``"Article not found or incomplete."`` is returned instead.
    """
    byte_offset, _, title = split_index(index_string)

    # Search on raw bytes so the buffer never has to be re-decoded per chunk.
    title_tag = f'<title>{title}</title>'.encode('utf-8')
    start_tag = b'<page>'
    end_tag = b'</page>'

    decompressor = bz2.BZ2Decompressor()
    decompressed_data = bytearray()
    title_pos = -1
    end_pos = -1
    search_from = 0  # everything before this offset has already been scanned

    with open(ARTICLES, 'rb') as f:
        f.seek(byte_offset)

        # Stop at end-of-stream: feeding a finished decompressor raises EOFError.
        while not decompressor.eof:
            chunk = f.read(1024)
            if not chunk:
                break
            decompressed_data += decompressor.decompress(chunk)

            if title_pos == -1:
                title_pos = decompressed_data.find(title_tag, search_from)
                if title_pos == -1:
                    # Re-scan only the tail that could hold a tag split
                    # across two chunks.
                    search_from = max(0, len(decompressed_data) - len(title_tag) + 1)
                    continue
                search_from = title_pos

            # Only a </page> located *after* the requested title ends the
            # article; earlier ones belong to other pages in the same stream.
            end_pos = decompressed_data.find(end_tag, search_from)
            if end_pos != -1:
                break
            search_from = max(title_pos, len(decompressed_data) - len(end_tag) + 1)

    if title_pos == -1 or end_pos == -1:
        return "Article not found or incomplete."

    # Back up to the opening <page> tag instead of assuming a fixed amount of
    # whitespace between it and the title.
    page_start = decompressed_data.rfind(start_tag, 0, title_pos)
    if page_start == -1:
        page_start = title_pos

    page_bytes = decompressed_data[page_start:end_pos + len(end_tag)]
    # Convert bytes to string only for the extracted article.
    return page_bytes.decode('utf-8', errors='replace')

In [81]:
def split_article(article: str) -> List[str]:
    """
    Break an article into its per-language sections.

    Currently a placeholder: the argument is not inspected and an empty
    list is returned for every input.

    Parameters
    ----------
    article : str
        The content of an article.

    Returns
    -------
    List[str]
        Intended to hold one string per language section; at present
        always an empty list.
    """
    language_sections: List[str] = []
    return language_sections


In [82]:
# Read every line of the compressed index into memory and report how long
# the load took.
start = time.perf_counter()
with bz2.open(INDEX, mode='rt', encoding='utf-8') as f:
    indices = list(f)
end = time.perf_counter()
print(f'Loaded {len(indices)} indices in {end - start:.2f} seconds.')

Loaded 9292639 indices in 15.08 seconds.


In [91]:
import re
import random
# Spot-check: look up one line of the loaded index, fetch the matching
# article from the dump, and print its raw page XML.
test_index = 22964
article = get_article(indices[test_index])
print(article)

<page>
    <title>ta</title>
    <ns>0</ns>
    <id>26555</id>
    <revision>
      <id>80102692</id>
      <parentid>79640642</parentid>
      <timestamp>2024-06-03T03:25:47Z</timestamp>
      <contributor>
        <username>WingerBot</username>
        <id>2024159</id>
      </contributor>
      <minor />
      <comment>replace &lt;* {{audio|vi|LL-Q9199 (vie)-Penn Zero MSSJ-ta.wav|a=Hà Nội}}&gt; with &lt;* {{audio|vi|LL-Q9199 (vie)-Penn Zero MSSJ-ta.wav|a=Hanoi}}&gt; (clean up audio captions)</comment>
      <origin>80102692</origin>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="53250" sha1="tjcuvgg9n8jkk100f7u44nzwjm08w13" xml:space="preserve">{{also|Appendix:Variations of &quot;ta&quot;}}
==Translingual==

===Symbol===
{{mul-symbol}}

# {{ISO 639|1}}

==English==

===Etymology 1===
{{unc|en}}, but possibly young child's pronunciation of {{m-lite|en|thanks}}&lt;ref&gt;{{R:OED Online|code=196719|date=1989}}&lt;/ref&gt;&lt;ref&gt;&quot;[https://en

In [155]:



# Gather the distinct tags from each language section of the article, up to
# and including the first section that contains the English heading.
tags = set()
for section in get_language_sections(article):
    tags.update(get_tags_from_section(section))
    if '==English==' in section:
        break

# Show every collected tag, one per line.
for tag in tags:
    print(tag)

('Symbol', '{{mul-symbol}}\n\n# {{ISO 639|1}}')
('Etymology 1', "{{unc|en}}, but possibly young child's pronunciation of {{m-lite|en|thanks}}&lt;ref&gt;{{R:OED Online|code=196719|date=1989}}&lt;/ref&gt;&lt;ref&gt;&quot;[https://en.oxforddictionaries.com/definition/ta?locale=en ta]&quot; in Oxford Living Dictionaries&lt;/ref&gt; or an acronym for 'thanks a lot'.\n\nAlternatively, derived from {{bor-lite|en|da|tak}}, from {{der-lite|en|non|þǫkk}}, from {{der-lite|en|gem-pro|sc=Latn|*þankō}}, {{m-lite|gem-pro|sc=Latn|*þankaz}}.")
('Alternative forms', '* {{l-lite|en|taa}}')
('Pronunciation', '* {{IPA-lite|en|/tɑː/|[tʰɑː]}}\n* {{audio|en|en-au-ta.ogg|a=AU}}\n* {{rhymes-lite|en|ɑː|s=1}}')
('Interjection', "{{en-interj}}\n\n# {{lb|en|colloquial|chiefly|Commonwealth}} [[thanks|Thanks]].\n#: {{ux-lite|en|'''Ta''' for the cup of tea.}}\n# {{lb|en|Canada|baby talk}} [[give]] (imperative)\n#: {{ux-lite|en|Mommy needs the bottle back. '''Ta'''!}}")
('Usage notes', "The expression ''[[ta ta]]'' dif