In [4]:
import bz2
from bs4 import BeautifulSoup
import os
import bisect
import re
import time

# Plan is to build a data pipeline for wiki data tables
# --> What tool should be used for this lineage? How to 'solve'
#  -> the data-pipeline // lineage problem. 
# Step 1. Download the wiki files (assume completed atm)
# Step 2. Unzip/load the index file
# -> Extract the "key" space file (the perspective defining)
# Step 3. "turn the .bz2 text file data (per key)"

# oh, i want a fork in the DAG. maybe i should use dagster
# ooh, i kinda want (or wish) that whatever notebook I write here
# can get turned into an executable, monitorable, and reproducible pipeline.
# Step 4. From each key, get the boolean of "contains a table"
# Step 5. (this contains a table) requires a "is a template file" -> template-lookup -> template-execute
#  -> Templates are also.. a DAG right, in theory? (can templates lookup other templates)
#   this is something like a  .groupby(page_key).get('templates-referenced') and then .mapto(template) 
#   (apply the tempaltes to those keys)

# -> Extract the "text-block" (with templates applied) for any document
# and then keep the documents when they have a table in them (filter)

# # From documents with tables on it, extract all of the tables into pandas dataframes


In [110]:
mountpath = '/media/share/DataSaves'
indexpath = f"{mountpath}/enwiki-20220101-pages-articles-multistream/enwiki-20220101-pages-articles-multistream-index.txt"
DUMP_FILE = f"{mountpath}/enwiki-20220101-pages-articles-multistream/enwiki-20220101-pages-articles-multistream.xml.bz2"
OUTPUT_WRITING_PATH = '.'

In [6]:
# utilities (?)

def get_byte_after(start_bytes, goal):
    test = bisect.bisect_left(start_bytes, goal)
    return start_bytes[test], start_bytes[test+1]

def get_index_for_title(index_lookup, index_file, title):
    try:
        return index_lookup[title]
    except:
        for x in index_file:
            if title in x:
                return int(x.split(':')[0])

def get_page_from_byte_offset(start, end):
    raw = get_xml_pages(start, end)
    soup = BeautifulSoup(raw, "lxml")
    return soup.find_all("page")
    
def get_page(index_lookup, start_bytes, title, index_file=None):
    index = get_index_for_title(index_lookup, index_file, title)
    pages = get_page_from_byte_offset(*get_byte_after(start_bytes, index))
    print(len(pages))
    for page in pages:
        if title.lower() in page.title.string.lower():
            return page
    print(f"couldn't find [{title}] in...")
    for i, page in enumerate(pages):
        print(page.title.string)
    print("=========== RETURNING NONE =================")
    return pages

def get_xml_pages(start, end):
    decomp = bz2.BZ2Decompressor()
    with open(DUMP_FILE, 'rb') as f:
        f.seek(start)
        readback = f.read(end - start - 1)
        page_xml = decomp.decompress(readback).decode()
    return page_xml

# page iterator
def page_iterator(start_bytes, start=0, end=None):
    for i in range(start, end or len(start_bytes)-1):
        pages = get_page_from_byte_offset(start_bytes[i], start_bytes[i+1])
        for page in pages:
            yield page

In [7]:
with open(indexpath, "r") as f:
    index_file = f.readlines()
start_bytes = list(sorted(set([int(x.split(":")[0]) for x in index_file])))
start_bytes.append(os.path.getsize(DUMP_FILE) + 1)

In [8]:
start_bytes[:5]

[600, 676575, 2075813, 3540751, 4490493]

In [9]:
index_lookup = {}
for i in index_file:
    a, b, *title = i.split(':')
    index_lookup[':'.join(title).rstrip()] = int(a)

In [50]:
# create a "document" perspective feature, start_bytes index on 'wiki-page'

In [10]:
# Attempt to find all the template files referenced, count them
template_names = []
for i in index_lookup:
    if i.startswith('Template:'):
        template_names.append(i)
# omg, ~767802~ 731503 template!? ('template' is in i) vs. (i.startswith('Template:'))

In [11]:
template_keys = set([x[9:] for x in template_names])

In [12]:
template_indexes = [index_lookup[x] for x in template_names]

In [13]:
print(f"Percentage of blocks that have template pages: {len(set(template_indexes)) / len(start_bytes)}")

Percentage of blocks that have template pages: 0.8126613619524048


In [14]:
# 80%, not much speedup to not just go through and full table-scan (seems like.. that's ~16 hours? from first glance)
# 0.0 % 0.23284292221069336 EST 5059508.585977554 s
# 0.04602123419745871 % 28.618114471435547 EST 62149.757650656386 s
# 0.09204246839491742 % 58.4118857383728 EST 63400.3052304324 s
# going to assume it's not worth writing the cost of the complexity of this algorithm (20% speedup)
#  -> just go through and full table-scan (this is the O(N) loop)
# hm. it's just a set-intersection though... sorta. (start_bytes & template_indexes) | (start_bytes & (template_indexes +1))
# oh, but, that's enough for all the information, not enough for the "spans"
# spans requires 2: (start_bytes & template_indexes), (start_bytes & (template_indexes +1))
# maybe flatten them or something? and then add to the "start_bytes" iterator, if it sees a duplicate, assume this is a "break" and go to the next segment
# keeps it monotonoically increasing, a "fault-taulent" (0->0 is okay request to subsystem), then the fallback is the the "full set"
# so, like 0 1 2 .. 45 46, and then "indexing" into it with [0 0 5 5 9 12 12]  when parsed by neighbor (0, 0), (0, 5), (5, 9), (9, 12), (12, 12), covers the full "set that contains"
# but then in a "coded" style parser, (0, 0) include row 0, then skip to (5, 5), just 5, then skip to include (9, 12)
# --> I wonder what the fastest version of this algorithm is (like, some sorta subset identifiying label using "monotonically increasing" integer of spans)
# i guess like, at _worst case_, you have the noisy subset (eg. 50% of them are included, in the peak of the 2^N of subset space)
# -- in that case, the "worst-case" solution, just [0 0 1 1 2 2 3 3 4 4 5 5 ...] .. is used. 
#   but in best case, [55 56] (single index, for instance), its pretty cheap to give the span
# --> This is a distraction --- I need to go back to original problem (maybe this is why i said (first) but didn't listen)

In [15]:
# FULL SCAN OPERATION, LIKELY TO BE EXPENSIVE

# estimated_total = len(start_bytes)*100
# st = time.time()
# template_pages = {}
# for i, page in enumerate(page_iterator(start_bytes)):
#     if page.title.text.startswith('Template:'):
#         template_pages[page.title.text] = page
#         print("Found a template page")
#         if len(template_pages) > 1000:
#             print("we're up to 1000")
#             break
#     if i%10000 == 0:
#         # estimated time remaining:
#         print(i*100 / estimated_total, '%', time.time() - st, 'EST', (time.time() - st) * (estimated_total - i) / (i+1), 's')

In [17]:
# for match in re.finditer(r'\{\{(.*)?\}\}', template_pages['Template:Periodic table'].text):
#     template_internals = match.groups()[0]
#     if template_internals in template_keys:
#         print(template_internals)

In [18]:
len(start_bytes)*100

21729100

In [133]:
# r'\{\{(.*)?\}\}' -> get tuples of all matches for "expressions"
# r'(.*)' -> matches anything

def get_matches(start_bytes, expression, start=0, end=None, pagelimit=None, limit=None):
    c = 0
    for i, page in enumerate(page_iterator(start_bytes, start=start, end=end)):
        # extract all '{{.*}}' sections from page.text using regex
        for j, match in enumerate(re.finditer(expression, page.text)):
            template_internals = match.groups()[0]
            c += 1
            yield (page, page.title.text, j, template_internals) # page may not be necessary
        if pagelimit:
            if i > pagelimit:
                break
        if limit:
            if c > limit:
                break


In [21]:
exampleofdc = set()
for anydoublecurly in get_matches(start_bytes, r'\{\{(.*)?\}\}', start=0, end=None):
    exampleofdc.add(anydoublecurly)
    if len(exampleofdc) > 5000:
        break

In [26]:
for x in exampleofdc:
    if 'table' in x:
        print(x)


In [34]:
#  '{| ' to ' |}'

trial = set()
for anydoublecurly in get_matches(start_bytes, r'\{\|(.*)?\|\}', start=0, end=None, limit=100000):
    trial.add(anydoublecurly)
    if len(trial) > 5000:
        break

KeyboardInterrupt: 

In [39]:
#  class="wikitable

trial = set()
for anydoublecurly in get_matches(start_bytes, r'(class="wikitable)', start=0, end=None, limit=100000):
    trial.add(anydoublecurly)
    if len(trial) > 5000:
        break

In [41]:
len(trial)

43

In [43]:
page, *_ = trial

In [45]:
page, *meta= page

In [139]:
# def extract_tables(page):
#     # class="wikitable)
#     for match in re.finditer(r'\{\|([\S\s]*?)\|\}', page.text):
#         if 'table' in match.groups()[0]:
#             yield match.groups()[0]

# # print(page.text)

In [140]:
import pandas as pd
goal_path = os.path.join(OUTPUT_WRITING_PATH, 'wikitable_extract_220814_test')
os.makedirs(goal_path, exist_ok=True)
block_of_writing = []
for i, (page, page_title_text, j, template_internals) in enumerate(get_matches(start_bytes, r'\{\|([\S\s]*?)\|\}', start=0, end=None, limit=100000)):
    if 'wikitable' in template_internals:
        block_of_writing.append((page_title_text, j, template_internals, page.text))
        if len(block_of_writing) > 1000:
            print("Writing block")
            pd.DataFrame(block_of_writing, columns=('page_title_text', 'j', 'template_internals', 'page_text')).to_parquet(os.path.join(goal_path, f'{i}.parquet'), engine='pyarrow', compression='gzip')
            block_of_writing = []


Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block
Writing block


In [136]:
# !rm -rf ./wikitable_extract_220814_test/*
!ls -lh ./wikitable_extract_220814_test

total 5.1M
-rw-rw-r-- 1 jawaugh jawaugh 1.2M Aug 15 03:58 103.parquet
-rw-rw-r-- 1 jawaugh jawaugh 1.5M Aug 15 03:58 230.parquet
-rw-rw-r-- 1 jawaugh jawaugh 1.3M Aug 15 03:58 357.parquet
-rw-rw-r-- 1 jawaugh jawaugh 1.3M Aug 15 03:58 465.parquet


In [99]:
table_candidates[0]

(<page>
 <title>Anarchism</title>
 <ns>0</ns>
 <id>12</id>
 <revision>
 <id>1062508884</id>
 <parentid>1062326807</parentid>
 <timestamp>2021-12-28T22:10:54Z</timestamp>
 <contributor>
 <username>Whoop whoop pull up</username>
 <id>13157623</id>
 </contributor>
 <minor></minor>
 <model>wikitext</model>
 <format>text/x-wiki</format>
 <text bytes="98670" xml:space="preserve">{{short description|Political philosophy and movement}}
 {{other uses}}
 {{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}
 {{pp-semi-indef}}
 {{good article}}
 {{use British English|date=August 2021}}
 {{use dmy dates|date=August 2021}}
 {{anarchism sidebar}}
 {{basic forms of government}}
 '''Anarchism''' is a [[political philosophy]] and [[Political movement|movement]] that is sceptical of [[authority]] and rejects all involuntary, coercive forms of [[Social hierarchy|hierarchy]]. Anarchism calls for the abolition of the [[State (polity)|state]], which it holds to be unnecessary, undesirable

In [70]:
print(page.text)


Aristotle
0
308

1062509248
1060996698
2021-12-28T22:13:55Z

Whoop whoop pull up
13157623


wikitext
text/x-wiki
{{Short description|Classical Greek philosopher and polymath, founder of the Peripatetic School}}
{{other uses}}
{{pp-protected|small=yes}}
{{Use British English Oxford spelling|date=March 2020}}
{{good article}}
{{Use dmy dates|date=March 2020}}
{{Infobox philosopher
 | name=Aristotle
 | image=Aristotle Altemps Inv8575.jpg
 | caption=Roman copy in marble of a Greek bronze [[Bust (sculpture)|bust]] of Aristotle by [[Lysippos]], c. 330 BC, with modern [[alabaster]] [[mantle (clothing)|mantle]]
 | birth_date=384 BC{{efn-ua|That these dates (the first half of the Olympiad year 384/383 BC, and in 322 shortly before the death of Demosthenes) are correct was shown by [[August Boeckh]] (''Kleine Schriften'' VI 195); for further discussion, see [[Felix Jacoby]] on ''[[FGrHist]]'' 244 F 38. Ingemar Düring, ''Aristotle in the Ancient Biographical Tradition'', Göteborg, 1957, {{p.|253

In [58]:
'|' in page.text

True

In [1]:
# Wikidata is a graph database (which is not what I'm looking at right now)
# I wonder if there's some... English <-> Sparkle writer that could "enumerate" a lot of the useful 
# language things, and also on search, attempt to modify from them for new ones. 
#-> i just want to be able to query wikidata with text i guess is what im saying
#  unlock the power of the language, and unlock the power of the world  # I laughed at Co-pilot

In [79]:
page.text

'\nAccessibleComputing\n0\n10\n\n\n1002250816\n854851586\n2021-01-23T15:15:01Z\n\nElli\n20842734\n\n\nshel\nwikitext\ntext/x-wiki\n#REDIRECT [[Computer accessibility]]\n\n{{rcat shell|\n{{R from move}}\n{{R from CamelCase}}\n{{R unprintworthy}}\n}}\nkmysdltgexdwkv2xsml3j44jb56dxvn\n\n'

In [75]:
pages_with_tables = []
for i, page in enumerate(page_iterator(start_bytes)):
    if 'wikitable' in page.text:
        pages_with_tables.append(page)
        print(f"{i} {page.title.string}")
        if len(pages_with_tables) > 1000:
            break

1 Anarchism
17 Albedo
58 A
65 Alabama
67 Achilles
69 Abraham Lincoln
70 Aristotle
72 Academy Award for Best Production Design
73 Academy Awards
94 ASCII
98 Apollo
99 Andre Agassi
101 Austroasiatic languages
103 Afroasiatic languages
104 Andorra
107 American Football Conference
113 Alaska
115 Agriculture
120 Algae
123 Alkane
134 Acid
138 Apollo 11
142 Alkali metal
159 Asia
160 Aruba
161 Articles of Confederation
165 Atlantic Ocean
167 Angola
168 Demographics of Angola
170 Economy of Angola
172 Angolan Armed Forces
173 Foreign relations of Angola
176 Alberta
181 Actinopterygii
183 Albert Einstein
185 Albania
192 Aikido
209 Alexander the Great
212 Asparagales
216 Asteroid
224 Arabic
237 Altaic languages
238 Austrian German
243 Aegean Sea
245 Amsterdam
247 Audi
253 Apple Inc.
254 Aberdeenshire
258 American Civil War
266 Motor neuron disease
267 Abjad
268 Abugida
273 MessagePad
274 A. E. van Vogt
275 Anna Kournikova
280 Arsenic
281 Antimony
282 Actinium
283 Americium
284 Astatine
286 Arable

In [76]:
len(pages_with_tables)

1001

In [55]:
page = get_page(index_lookup, start_bytes, template_names[0])

100


In [60]:
print(get_page(index_lookup, start_bytes, template_names[1211]).text)

100

Template:Greek myth (other gods)
10
851350

1055664590
1055664549
2021-11-17T03:46:11Z

Dave12121212
38957032

wikitext
text/x-wiki
{{sidebar
|width = 16em
|bodystyle = background:ivory; border-collapse:collapse;
|navbarstyle = padding-right:0.3em;
|titlestyle = background:darkseagreen
|title = [[Greek mythology|Greek deities]]<br />series
|headingstyle = background:#B1CBB1

| heading1 = 
| content1 = {{plainlist|
* [[Greek primordial deities|Primordial deities]]
* [[Titans (mythology)|Titans]] and [[Twelve Olympians|Olympians]]
* [[Greek sea gods|Aquatic deities]]
* [[Chthonic|Chthonic deities]]
* [[List of Mycenaean deities|Mycenaean deities]]
}}

| heading2 = Other deities
| content2style = text-align:left
| content2 = 
{{div col|colwidth=6em}}
* [[Anemoi]]
* [[Apate]]
* [[Asclepius]]
* [[Iris (mythology)|Iris]]
* [[Leto]]
* [[Keres]]
* [[Muses]]
* [[Nymph]]s
* [[Pan (god)|Pan]]
* [[Cupid and Psyche|Psyche]]
{{div col end}}
|name = Greek myth (other gods)
}}<noinclude>{{documen

In [58]:
print(page.text)


Template:Periodic table
10
96081

1040800501
1014823581
2021-08-26T19:01:02Z

DePiep
199625

cleanup categoryies into block scheme
wikitext
text/x-wiki
<noinclude>{{pp-protected|reason=Persistent [[WP:Disruptive editing|disruptive editing]] on a high visibility template per request at RfPP.|small=yes}}</noinclude><div style="background:{{element color|table background}}; font-size:90%; border:1px solid {{element color|table border}}; width:100%; max-width:1800px; margin:0 auto; padding:2px; text-align:center; vertical-align:top; box-sizing: border-box;">
<div style="background:{{element color|table title}}; padding:2px 1em;font-weight:bold;">{{#invoke:navbar|navbar|collapsible=0|[[Periodic table]]}}</div><!--

--><div class="mw-collapsible-content">
{| style="border-spacing:1px;display:block;overflow:auto;width:100%;margin:0 !important"
|- style="line-height:125%; vertical-align:top;"
! style="text-align:left; width:5%" | <small>[[Group (periodic table)|Group]]</small>
! style="backgr

In [53]:
len(pages)

9

In [48]:
template_names[0]

'Template:Periodic table\n'

In [18]:
get_byte_after(13130921384)

(13130921384, 13130962509)

In [38]:
from functools import cache

In [76]:
test = get_page('NBA player statistics start')

100


In [79]:
print(test.text)


Template:NBA player statistics start
10
18180598

1051188539
975684223
2021-10-22T01:55:02Z

MusikBot II
29539620


Changed protection settings for "[[Template:NBA player statistics start]]": [[Wikipedia:High-risk templates|High-risk template or module]]: 3184 transclusions ([[User:MusikBot II/TemplateProtector|more info]]) ([Edit=Require extended confirmed access] (indefinite) [Move=Require administrator access] (indefinite))
wikitext
text/x-wiki
{| class="wikitable sortable" style="text-align:right;"
|+{{{caption|}}}
! scope="col"|Year
! scope="col"|Team
! scope="col"|{{abbr|GP|Games played}}
! scope="col"|{{abbr|GS|Games started}}
! scope="col"|{{abbr|MPG|Minutes per game}}
! scope="col"|{{abbr|FG%|Field goal percentage}}
! scope="col"|{{abbr|3P%|3-point field-goal percentage}}
! scope="col"|{{abbr|FT%|Free-throw percentage}}
! scope="col"|{{abbr|RPG|Rebounds per game}}
! scope="col"|{{abbr|APG|Assists per game}}
! scope="col"|{{abbr|SPG|Steals per game}}
! scope="col"|{{abbr|BPG|B

In [66]:
import pandas as pd
test = pd.read_html('https://en.wikipedia.org/wiki/List_of_Daredevil_(TV_series)_characters')

In [70]:
test[0]

Unnamed: 0_level_0,Character,Portrayed by,Appearances,Appearances,Appearances,Appearances
Unnamed: 0_level_1,Character,Portrayed by,First,Season 1,Season 2,Season 3
Unnamed: 0_level_2,Main characters,Main characters,Main characters,Main characters,Main characters,Main characters
0,Matt MurdockDaredevil,Charlie Cox,"""Into the Ring""",Main,Main,Main
1,Karen Page,Deborah Ann Woll,"""Into the Ring""",Main,Main,Main
2,Foggy Nelson,Elden Henson,"""Into the Ring""",Main,Main,Main
3,James Wesley,Toby Leonard Moore,"""Into the Ring""",Main,,
4,Leland Owlsley,Bob Gunton,"""Into the Ring""",Main,,
5,Wilson FiskKingpin,Vincent D'Onofrio,"""Into the Ring""",Main[a],Main[a],Main[a]
6,Claire Temple,Rosario Dawson,"""Cut Man""",Main,Main,
7,Vanessa Marianna-Fisk,Ayelet Zurer,"""Rabbit in a Snowstorm""",Main,,Guest
8,Ben Urich,Vondie Curtis-Hall,"""Rabbit in a Snowstorm""",Main,,
9,Frank CastlePunisher,Jon Bernthal,"""Bang""",,Main,
