In [1]:
from lxml import etree
import pandas as pd
import swifter
import numpy as np
import re
import mwparserfromhell as mwp
import pandoc
from tqdm import notebook

In [2]:
# Matches a wiki link whose body consists only of word characters, whitespace,
# hyphens and pipes: [[target]] or [[target|display text]].
# Whitespace and hyphens are accepted on both sides of the pipe, so links with
# a multi-word label such as [[united states|the united states]] are captured
# instead of being skipped. A link containing any other character (":", "(",
# ",", "#", ...) is still not matched, which keeps e.g. [[File:...]] out.
# A single character class is used on purpose: it accepts everything the
# earlier two-class form did and avoids two adjacent overlapping quantifiers.
LINK_REGEX = r"\[\[[\|\w\s\-]+\]\]"

In [3]:
# Lazily iterate over the <page> elements of the dump instead of loading the
# whole document at once; huge_tree=True is passed through to the lxml parser.
parse = etree.iterparse(
    "data/wiki_coords_utf8.xml",
    huge_tree=True,
    tag='page',
)

In [4]:
# Build one flat record per <page>: child tag -> child text.
# (`tag` is the first item of each iterparse tuple; it is not used here.)
dicts = []
for tag, page in parse:
    dicts.append({node.tag: node.text for node in page})

    # The text has already been copied into the record, so release the parsed
    # subtree. Without this, every processed <page> stays attached to the root
    # and the whole document accumulates in memory while streaming.
    page.clear()
    # Also drop the emptied preceding siblings still referenced by the parent.
    while page.getprevious() is not None:
        del page.getparent()[0]


In [5]:
df = pd.DataFrame(dicts)

In [6]:
# Lower-case every article title.
df["title"] = df["title"].str.lower()

In [7]:
def _extract_links(text):
    """Return the wiki links found in ``text``.

    Each LINK_REGEX match has its surrounding square brackets removed, is
    lower-cased and split on "|", so every link becomes a list of its parts.
    """
    links = []
    for raw in re.findall(LINK_REGEX, text):
        inner = raw.lstrip("[").rstrip("]")
        links.append(inner.lower().split("|"))
    return links


df["lnkd_art"] = df["text"].swifter.apply(_extract_links)

Pandas Apply:   0%|          | 0/1390234 [00:00<?, ?it/s]

In [13]:
def _link_labels(links):
    """Return the stripped label of each split link.

    The label is the second part when the link has more than one part,
    otherwise the only part.
    """
    labels = []
    for parts in links:
        shown = parts[1] if len(parts) > 1 else parts[0]
        labels.append(shown.strip())
    return labels


df["lnkd_art_txt"] = df["lnkd_art"].swifter.apply(_link_labels)

Pandas Apply:   0%|          | 0/1390234 [00:00<?, ?it/s]

In [14]:
def _link_targets(links):
    """Return the stripped first part (the link target) of each split link."""
    return [parts[0].strip() for parts in links]


df["lnkd_art_nms"] = df["lnkd_art"].swifter.apply(_link_targets)

Pandas Apply:   0%|          | 0/1390234 [00:00<?, ?it/s]

In [15]:
# Flatten the per-article lists of link targets into one list ...
all_art_names = []
for art_names in df["lnkd_art_nms"].values:
    all_art_names.extend(art_names)

# ... and reduce it to the distinct target names.
unique_art_names = set(all_art_names)

In [16]:
# Strip surrounding whitespace from every distinct target name (result is a list).
unique_art_names = list(map(str.strip, unique_art_names))

In [17]:
# Link targets that are also the (stripped) title of an article in df.
art_intersection = set(unique_art_names) & set(df["title"].str.strip().to_list())

In [18]:
# Separate set holding the matched titles, used for the per-row lookups.
intersecter = {*art_intersection}

In [13]:
def _targets_with_article(link_targets):
    """Return, as a list, the members of ``intersecter`` that occur in ``link_targets``."""
    return list(intersecter.intersection(link_targets))


df["present_geo_arts"] = (
    df["lnkd_art_nms"].swifter.progress_bar(True).apply(_targets_with_article)
)

Pandas Apply:   0%|          | 0/1390234 [00:00<?, ?it/s]

In [14]:
def _strip_markup(wikitext):
    """Parse ``wikitext`` with mwparserfromhell and return the result of ``strip_code()``."""
    return mwp.parse(wikitext).strip_code()


df["parsed_text"] = df["text"].swifter.apply(_strip_markup)

Pandas Apply:   0%|          | 0/1390234 [00:00<?, ?it/s]

In [15]:
# Persist the frame, including the derived link columns, as a Parquet file.
# Disabled alternative: chunked CSV export of the same frame.
# df.to_csv('wiki_coords_article_article_matched.gz', chunksize=100000)
df.to_parquet("data/wiki_coords_article_matched.parquet")

In [16]:
# Preview the first 20 rows as a visual sanity check of the columns.
df.head(20)

Unnamed: 0,title,coords,text,lnkd_art,lnkd_art_nms,present_geo_arts,parsed_text
0,alabama,{{coord|32.7794|-86.8287|dim:300000_region:US-...,{{short description|State in the southeastern ...,"[[northern flicker, yellowhammer], [dixie], [a...","[northern flicker, dixie, audemus jura nostra ...","[yokohama tire lpga classic, maxwell air force...",Alabama () is a state in the Southeastern regi...
1,algeria,{{coord|28|N|2|E|scale:10000000_type:country_r...,{{short description|Country in North Africa}}\...,"[[arabic], [kassaman], [algiers], [state relig...","[arabic, kassaman, algiers, state religion, ch...","[mediterranean sea, didouche mourad, nigeria, ...","Algeria, officially the People's Democratic Re..."
2,andorra,"{{coord|42|30|N|1|31|E|display=inline,title}}",{{distinguish|text = the Italian town of [[And...,"[[andora], [latin], [el gran carlemany], [ando...","[andora, latin, el gran carlemany, andorra la ...","[radio andorra, la massana, pont dels escalls,...","Andorra, officially the Principality of Andorr..."
3,alaska,{{Coord|64|50|N|147|43|W}},{{short description|State of the United States...,"[[ahtna language, ahtna], [alutiiq language, a...","[ahtna language, alutiiq language, english lan...","[trans-alaska pipeline system, worthington gla...",Alaska (; ; ; ; Yup'ik: Alaskaq; ) is a U.S. s...
4,american national standards institute,{{Coordinates|38|54|14|N|77|02|35|W}},{{Redirect2|American Standards Association|ANS...,"[[non-profit organization], [american english,...","[non-profit organization, american english, ce...","[american society of mechanical engineers, new...",The American National Standards Institute (ANS...
5,apollo 11,{{Coord|13|19|N|169|9|W|type:event|name=Apollo...,{{short description|First crewed space mission...,"[[buzz aldrin], [neil armstrong], [nasa], [gru...","[buzz aldrin, neil armstrong, nasa, grumman, s...","[ford island, goldstone deep space communicati...","Apollo 11 (July 16–24, 1969) was the spaceflig..."
6,apollo 8,{{Coord|8|8|N|165|1|W|type:event|name=Apollo 8...,{{Short description|First crewed space mission...,"[[earthrise], [william anders], [nasa], [north...","[earthrise, william anders, nasa, north americ...","[kennedy space center, atlantic ocean, supreme...","Apollo 8 (December 21–27, 1968) was the first ..."
7,aruba,{{Coord|12|31|07|N|70|02|09|W|type:city}},{{About|the island country}}\n{{short descript...,"[[aruba dushi tera], [sovereign state], [kingd...","[aruba dushi tera, sovereign state, kingdom of...","[bushiribana and balashi, hooiberg, hispaniola...","Aruba ( , , ) is an island country in the mid-..."
8,atlantic ocean,{{coord|0|N|25|W|region:ZZ_type:waterbody|disp...,"{{Short description|Ocean between Europe, Afri...","[[arctic], [antarctic], [list of ports and har...","[arctic, antarctic, list of ports and harbours...","[gulf of venezuela, sargasso sea, walvis ridge...",[[File:Atlantic Ocean to Africa.ogv|thumb|This...
9,angola,{{Coord|12|30|S|18|30|E|display=title}},{{Short description|Country on the west coast ...,"[[angola avante], [luanda], [christianity], [p...","[angola avante, luanda, christianity, protesta...","[cabinda province, portugal, carthage film fes...","Angola (; ), officially the Republic of Angola..."
