In [None]:
import bz2
from bs4 import BeautifulSoup
import os
import bisect
import re
import time

# Plan is to build a data pipeline for wiki data tables
# --> What tool should be used for this lineage? How to 'solve'
#  -> the data-pipeline // lineage problem. 
# Step 1. Download the wiki files (assume completed atm)
# Step 2. Unzip/load the index file
# -> Extract the "key" space file (the perspective defining)
# Step 3. "turn the .bz2 text file data (per key)"

# oh, i want a fork in the DAG. maybe i should use dagster
# ooh, i kinda want (or wish) that whatever notebook I write here
# can get turned into an executable, monitorable, and reproducible pipeline.
# Step 4. From each key, get the boolean of "contains a table"
# Step 5. (this contains a table) requires a "is a template file" -> template-lookup -> template-execute
#  -> Templates are also.. a DAG right, in theory? (can templates lookup other templates)
#   this is something like a  .groupby(page_key).get('templates-referenced') and then .mapto(template) 
#   (apply the templates to those keys)

# -> Extract the "text-block" (with templates applied) for any document
# and then keep the documents when they have a table in them (filter)

# # From documents with tables on it, extract all of the tables into pandas dataframes


In [None]:
# Filesystem locations used throughout the notebook: the index text file and
# the bz2 dump of the `enwiki-20220101` multistream export, plus the directory
# that receives anything this notebook writes.
mountpath = '/media/share/DataSaves'
_DUMP_NAME = "enwiki-20220101-pages-articles-multistream"
_DUMP_DIR = "/".join((mountpath, _DUMP_NAME))
indexpath = "/".join((_DUMP_DIR, _DUMP_NAME + "-index.txt"))
DUMP_FILE = "/".join((_DUMP_DIR, _DUMP_NAME + ".xml.bz2"))
OUTPUT_WRITING_PATH = '.'

In [None]:
# utilities (?)

def get_byte_after(start_bytes, goal):
    """Return the ``(start, end)`` pair of offsets found by bisecting for ``goal``.

    Args:
        start_bytes: Ascending list of offsets (required for the bisect to be
            meaningful).
        goal: Offset to look for.

    Returns:
        A tuple of the first entry that is ``>= goal`` and the entry that
        immediately follows it in ``start_bytes``.

    Raises:
        IndexError: If there is no such entry, or it has no successor.
    """
    position = bisect.bisect_left(start_bytes, goal)
    span_start = start_bytes[position]
    span_end = start_bytes[position + 1]
    return span_start, span_end

def get_index_for_title(index_lookup, index_file, title):
    """Return the byte offset recorded for ``title``, or ``None`` if unknown.

    Args:
        index_lookup: Mapping from exact page title to byte offset.
        index_file: Optional iterable of raw index lines whose first
            ``:``-separated field is the integer byte offset. It is only
            consulted when ``title`` is not a key of ``index_lookup``. May be
            ``None``, in which case no fallback scan happens.
        title: Page title to look up.

    Returns:
        The offset from ``index_lookup`` when the title is an exact key;
        otherwise the offset of the first index line that *contains* ``title``
        as a substring; otherwise ``None``.
    """
    try:
        return index_lookup[title]
    except KeyError:
        # Only a genuinely missing key should trigger the slow linear scan;
        # anything else (KeyboardInterrupt, programming errors) must surface.
        pass
    for line in index_file or ():
        if title in line:
            return int(line.split(':')[0])
    return None

def get_page_from_byte_offset(start, end):
    """Return every ``page`` element found between two byte offsets of the dump.

    The text produced by ``get_xml_pages(start, end)`` is parsed with
    BeautifulSoup using the ``lxml`` parser, and the result of
    ``find_all("page")`` is returned.
    """
    return BeautifulSoup(get_xml_pages(start, end), "lxml").find_all("page")
    
def get_page(index_lookup, start_bytes, title, index_file=None):
    """Return the parsed page whose title matches ``title``, or ``None``.

    Args:
        index_lookup: Mapping from page title to the byte offset of its block.
        start_bytes: Ascending list of block offsets, used to find where the
            block containing the page ends.
        title: Title to search for.
        index_file: Optional raw index lines, forwarded to
            ``get_index_for_title`` as a fallback lookup.

    Returns:
        The page with exactly this title when the block contains one;
        otherwise the first page whose title contains ``title``
        case-insensitively; otherwise ``None`` (after printing the titles that
        were inspected).
    """
    index = get_index_for_title(index_lookup, index_file, title)
    if index is None:
        # Without an offset there is no block to decompress.
        print(f"couldn't find [{title}] in the index")
        return None
    pages = get_page_from_byte_offset(*get_byte_after(start_bytes, index))
    # Prefer an exact title so that e.g. "Template:A" is not shadowed by an
    # earlier "Template:AB" in the same block.
    for page in pages:
        if page.title.string == title:
            return page
    wanted = title.lower()
    for page in pages:
        if wanted in page.title.string.lower():
            return page
    print(f"couldn't find [{title}] in...")
    for page in pages:
        print(page.title.string)
    print("=========== RETURNING NONE =================")
    # The message above promises None; returning the list made callers that
    # expect a single page fail later in confusing ways.
    return None

def get_xml_pages(start, end, dump_file=None):
    """Decompress the bz2 data stored in ``[start, end)`` of the dump file.

    Args:
        start: Byte offset at which the compressed data begins.
        end: Byte offset at which the next stream begins (or any value past
            the end of the file for the last one).
        dump_file: Path of the file to read. Defaults to the module-level
            ``DUMP_FILE``.

    Returns:
        The decompressed bytes decoded as UTF-8 text. An empty string is
        returned when the span is empty or negative.
    """
    path = DUMP_FILE if dump_file is None else dump_file
    length = end - start
    if length <= 0:
        # A non-positive size would make read() slurp the rest of the file.
        return ""
    with open(path, 'rb') as f:
        f.seek(start)
        # Read the whole span: dropping the final byte truncates the stream,
        # so the decompressor never sees its end. Asking for more bytes than
        # remain in the file is harmless.
        compressed = f.read(length)
    return bz2.BZ2Decompressor().decompress(compressed).decode("utf-8")

# page iterator
def page_iterator(start_bytes, start=0, end=None):
    """Yield every page of the blocks numbered ``start`` up to (not including) ``end``.

    Args:
        start_bytes: Ascending list of block offsets; block ``i`` spans
            ``start_bytes[i]`` to ``start_bytes[i + 1]``.
        start: Index of the first block to read.
        end: Index one past the last block to read. ``None`` means "through
            the last block"; ``0`` means "no blocks". Values beyond the last
            block are clamped.

    Yields:
        The page objects returned by ``get_page_from_byte_offset`` for each
        block, in order.
    """
    last_block = len(start_bytes) - 1
    # `end or ...` would treat an explicit 0 as "no limit", so test for None.
    stop = last_block if end is None else min(end, last_block)
    for i in range(start, stop):
        yield from get_page_from_byte_offset(start_bytes[i], start_bytes[i + 1])

In [None]:
# Load every line of the index. The file is read as UTF-8 explicitly so that
# non-ASCII titles do not depend on the platform's default encoding.
with open(indexpath, "r", encoding="utf-8") as f:
    index_file = f.readlines()
# Distinct offsets in ascending order; the offset is the integer before the
# first ':' on each line.
start_bytes = sorted({int(line.split(":", 1)[0]) for line in index_file})
# Sentinel one byte past the end of the dump so that the last real offset also
# has a successor.
start_bytes.append(os.path.getsize(DUMP_FILE) + 1)

In [None]:
# Sanity check: show the first five block offsets.
start_bytes[:5]

In [None]:
# Build a title -> byte-offset mapping from the raw index lines. The first
# field of a line is the offset, the second is ignored here, and everything
# after it is the title (which may itself contain ':').
index_lookup = {}
for index_line in index_file:
    stream_offset, _second_field, *title_fields = index_line.split(':')
    page_title = ':'.join(title_fields).rstrip()
    index_lookup[page_title] = int(stream_offset)

In [None]:
# create a "document" perspective feature, start_bytes index on 'wiki-page'

In [None]:
# Attempt to find all the template files referenced, count them
# Every indexed title that starts with the "Template:" prefix.
template_names = [title for title in index_lookup if title.startswith('Template:')]
# omg, ~767802~ 731503 template!? ('template' is in i) vs. (i.startswith('Template:'))

In [None]:
# Template titles with the leading "Template:" prefix (9 characters) removed.
template_keys = {name[len('Template:'):] for name in template_names}

In [None]:
# Byte offset of the block holding each template page, in template_names order.
template_indexes = list(map(index_lookup.__getitem__, template_names))

In [None]:
# Share of dump blocks that contain at least one template page.
template_block_count = len(set(template_indexes))
# The last entry of start_bytes is the end-of-file sentinel, not a block.
total_block_count = len(start_bytes) - 1
# Format as a real percentage; the raw fraction (e.g. 0.8) was mislabelled.
print(f"Percentage of blocks that have template pages: {template_block_count / total_block_count:.2%}")

In [None]:
# 80%, not much speedup to not just go through and full table-scan (seems like.. that's ~16 hours? from first glance)
# 0.0 % 0.23284292221069336 EST 5059508.585977554 s
# 0.04602123419745871 % 28.618114471435547 EST 62149.757650656386 s
# 0.09204246839491742 % 58.4118857383728 EST 63400.3052304324 s
# going to assume it's not worth writing the cost of the complexity of this algorithm (20% speedup)
#  -> just go through and full table-scan (this is the O(N) loop)
# hm. it's just a set-intersection though... sorta. (start_bytes & template_indexes) | (start_bytes & (template_indexes +1))
# oh, but, that's enough for all the information, not enough for the "spans"
# spans requires 2: (start_bytes & template_indexes), (start_bytes & (template_indexes +1))
# maybe flatten them or something? and then add to the "start_bytes" iterator, if it sees a duplicate, assume this is a "break" and go to the next segment
# keeps it monotonically increasing and "fault-tolerant" (0->0 is an okay request to the subsystem); the fallback is then the "full set"
# so, like 0 1 2 .. 45 46, and then "indexing" into it with [0 0 5 5 9 12 12]  when parsed by neighbor (0, 0), (0, 5), (5, 9), (9, 12), (12, 12), covers the full "set that contains"
# but then in a "coded" style parser, (0, 0) include row 0, then skip to (5, 5), just 5, then skip to include (9, 12)
# --> I wonder what the fastest version of this algorithm is (like, some sorta subset identifying label using "monotonically increasing" integer of spans)
# i guess like, at _worst case_, you have the noisy subset (eg. 50% of them are included, in the peak of the 2^N of subset space)
# -- in that case, the "worst-case" solution, just [0 0 1 1 2 2 3 3 4 4 5 5 ...] .. is used. 
#   but in best case, [55 56] (single index, for instance), its pretty cheap to give the span
# --> This is a distraction --- I need to go back to original problem (maybe this is why i said (first) but didn't listen)

In [None]:
# FULL SCAN OPERATION, LIKELY TO BE EXPENSIVE

# estimated_total = len(start_bytes)*100
# st = time.time()
# template_pages = {}
# for i, page in enumerate(page_iterator(start_bytes)):
#     if page.title.text.startswith('Template:'):
#         template_pages[page.title.text] = page
#         print("Found a template page")
#         if len(template_pages) > 1000:
#             print("we're up to 1000")
#             break
#     if i%10000 == 0:
#         # estimated time remaining:
#         print(i*100 / estimated_total, '%', time.time() - st, 'EST', (time.time() - st) * (estimated_total - i) / (i+1), 's')

In [None]:
# for match in re.finditer(r'\{\{(.*)?\}\}', template_pages['Template:Periodic table'].text):
#     template_internals = match.groups()[0]
#     if template_internals in template_keys:
#         print(template_internals)

In [None]:
# Same expression as `estimated_total` in the commented-out full scan:
# presumably a rough page count assuming ~100 pages per block — TODO confirm.
len(start_bytes)*100

In [None]:
# r'\{\{(.*)?\}\}' -> get tuples of all matches for "expressions"
# r'(.*)' -> matches anything

def get_matches(start_bytes, expression, start=0, end=None, pagelimit=None, limit=None):
    """Yield one tuple per regex match found in the text of the iterated pages.

    Args:
        start_bytes: Block offsets, forwarded to ``page_iterator``.
        expression: Regular expression with at least one capturing group.
        start: First block index, forwarded to ``page_iterator``.
        end: End block index, forwarded to ``page_iterator``.
        pagelimit: When truthy, iteration stops after finishing the first page
            whose zero-based position exceeds this value.
        limit: When truthy, iteration stops after finishing the first page at
            which the running match count exceeds this value.

    Yields:
        ``(page, page title text, match number within the page, text captured
        by the first group)``.
    """
    total_matches = 0
    pages = page_iterator(start_bytes, start=start, end=end)
    for page_number, page in enumerate(pages):
        # Both limits are evaluated only after a page is fully processed, so
        # a page's matches are never cut off half-way.
        for match_number, hit in enumerate(re.finditer(expression, page.text)):
            captured = hit.group(1)
            total_matches += 1
            yield (page, page.title.text, match_number, captured)
        if pagelimit and page_number > pagelimit:
            break
        if limit and total_matches > limit:
            break


In [None]:
# Collect a sample of distinct `{{...}}` matches; stop once the set holds more
# than 5000 entries.
exampleofdc = set()
double_curly_matches = get_matches(start_bytes, r'\{\{(.*)?\}\}', start=0, end=None)
for found in double_curly_matches:
    exampleofdc.add(found)
    if len(exampleofdc) >= 5001:
        break

In [None]:
# Show the sampled matches whose captured text mentions "table".
# Each element is the 4-tuple yielded by get_matches; `'table' in x` on the
# tuple itself only tests whether some field *equals* 'table', so look inside
# the captured text (last field) instead.
for x in exampleofdc:
    template_internals = x[3]
    if template_internals and 'table' in template_internals:
        print(x)


In [None]:
#  '{| ' to ' |}'

# Sample matches of the `{| ... |}` pattern; stop once the set holds more than
# 5000 entries (get_matches itself is capped with limit=100000).
trial = set()
table_markup_matches = get_matches(start_bytes, r'\{\|(.*)?\|\}', start=0, end=None, limit=100000)
for found in table_markup_matches:
    trial.add(found)
    if len(trial) >= 5001:
        break

In [None]:
#  class="wikitable

# Sample occurrences of the literal `class="wikitable` text; stop once the set
# holds more than 5000 entries (get_matches itself is capped with limit=100000).
trial = set()
wikitable_class_matches = get_matches(start_bytes, r'(class="wikitable)', start=0, end=None, limit=100000)
for found in wikitable_class_matches:
    trial.add(found)
    if len(trial) >= 5001:
        break

In [None]:
# Number of distinct match tuples collected in `trial`.
len(trial)

In [None]:
# Take one element of `trial` (a set, so which one is arbitrary); the rest of
# the set is unpacked into `_`. Each element is a tuple as yielded by
# get_matches, so `page` is a tuple at this point.
page, *_ = trial

In [None]:
# Split the tuple held in `page`: its first item is rebound to `page`, the
# remaining items are collected in `meta`.
page, *meta= page

In [None]:
# def extract_tables(page):
#     # class="wikitable)
#     for match in re.finditer(r'\{\|([\S\s]*?)\|\}', page.text):
#         if 'table' in match.groups()[0]:
#             yield match.groups()[0]

# # print(page.text)

In [None]:
import pandas as pd

# Directory that receives the extracted `{| ... |}` blocks as parquet files.
goal_path = os.path.join(OUTPUT_WRITING_PATH, 'wikitable_extract_220814_test')
os.makedirs(goal_path, exist_ok=True)


def _write_block(rows, block_id):
    """Write ``rows`` to ``<goal_path>/<block_id>.parquet`` (pyarrow, gzip)."""
    frame = pd.DataFrame(rows, columns=('page_title_text', 'j', 'template_internals', 'page_text'))
    frame.to_parquet(os.path.join(goal_path, f'{block_id}.parquet'), engine='pyarrow', compression='gzip')


# Accumulate matches that mention "wikitable" and write them out in batches;
# each file is named after the running match index at the time of writing.
block_of_writing = []
for i, (page, page_title_text, j, template_internals) in enumerate(get_matches(start_bytes, r'\{\|([\S\s]*?)\|\}', start=0, end=None, limit=100000)):
    if 'wikitable' in template_internals:
        block_of_writing.append((page_title_text, j, template_internals, page.text))
        if len(block_of_writing) > 1000:
            print("Writing block")
            _write_block(block_of_writing, i)
            block_of_writing = []

# Flush the final partial batch; without this the last rows were dropped.
# A non-empty batch implies the loop ran, so `i` is defined here.
if block_of_writing:
    print("Writing block")
    _write_block(block_of_writing, i)
    block_of_writing = []


In [None]:
# Shell escapes for the extract directory. The (disabled) first line would
# delete its contents; the second lists the files with human-readable sizes.
# !rm -rf ./wikitable_extract_220814_test/*
!ls -lh ./wikitable_extract_220814_test

In [None]:
# NOTE(review): `table_candidates` is not defined in any visible cell, so this
# presumably fails on a fresh kernel — confirm where it should come from.
table_candidates[0]

In [None]:
# Print the text of whatever object the most recently run cell bound to `page`.
print(page.text)

In [None]:
# Does the current page's text contain a '|' character at all?
'|' in page.text

In [None]:
# Wikidata is a graph database (which is not what I'm looking at right now)
# I wonder if there's some... English <-> SPARQL writer that could "enumerate" a lot of the useful
# language things, and also on search, attempt to modify from them for new ones. 
#-> i just want to be able to query wikidata with text i guess is what im saying
#  unlock the power of the language, and unlock the power of the world  # I laughed at Co-pilot

In [None]:
# Display the current page's text as a raw string (escape sequences visible).
page.text

In [None]:
# Scan pages in dump order and keep those whose text mentions "wikitable",
# printing the running page number and title of each; stop once more than
# 1000 have been collected.
pages_with_tables = []
for i, page in enumerate(page_iterator(start_bytes)):
    if 'wikitable' not in page.text:
        continue
    pages_with_tables.append(page)
    print(f"{i} {page.title.string}")
    if len(pages_with_tables) >= 1001:
        break

In [None]:
# How many pages mentioning "wikitable" were collected.
len(pages_with_tables)

In [None]:
# Look up the page for the first title in template_names.
page = get_page(index_lookup, start_bytes, template_names[0])

In [None]:
# Print the text of the page found for the template title at position 1211.
print(get_page(index_lookup, start_bytes, template_names[1211]).text)

In [None]:
# Print the text of the object currently bound to `page`.
print(page.text)

In [None]:
# NOTE(review): `pages` is only a local variable inside get_page in the
# visible cells; at notebook scope this name looks undefined — confirm.
len(pages)

In [None]:
# Show the first collected template title.
template_names[0]

In [None]:
# Byte span of the block at or after offset 13130921384. get_byte_after needs
# the offset list as its first argument; the call previously omitted it.
get_byte_after(start_bytes, 13130921384)

In [None]:
from functools import cache

In [None]:
# get_page requires the lookup table and the offset list; the call previously
# passed only the title. The title is not necessarily an exact index key, so
# the raw index lines are supplied for the substring fallback.
test = get_page(index_lookup, start_bytes, 'NBA player statistics start', index_file=index_file)

In [None]:
# Print the text of the page fetched into `test`.
print(test.text)

In [None]:
# Parse the HTML tables of a live Wikipedia article with pandas (this fetches
# the URL over the network).
# NOTE(review): pandas is already imported in an earlier cell, and this rebinds
# `test`, which held the get_page result before.
import pandas as pd
test = pd.read_html('https://en.wikipedia.org/wiki/List_of_Daredevil_(TV_series)_characters')

In [None]:
# First table parsed from the article.
test[0]