In [1]:
from lxml import etree
import pandas as pd
import numpy as np
import re
import geopandas
import mwparserfromhell as mwp
import pandoc

In [2]:
# Matches one MediaWiki internal link, `[[target]]` or `[[target|label]]`.
# The target is any run of characters other than brackets and pipes, so titles
# containing commas, parentheses, dots or apostrophes are found; the optional
# label may contain spaces and further pipes. Only non-capturing groups are
# used so that `re.findall` keeps returning the whole `[[...]]` string.
LINK_REGEX = r"\[\[[^\[\]|]+(?:\|[^\[\]]*)?\]\]"

In [3]:
# Stream "wiki_coords.xml" with lxml's incremental parser, yielding only
# <page> elements. huge_tree=True lifts the parser's built-in limits on tree
# depth and text length, which very large documents can exceed.
parse = etree.iterparse("wiki_coords.xml", huge_tree=True, tag='page')

In [4]:
# Collect one {child tag: child text} record per <page> element.
dicts = []
# iterparse yields (event, element) pairs; the event name itself is not needed.
for _event, page in parse:
    # A child without text (e.g. an empty element) is stored as None.
    dicts.append({node.tag: node.text for node in page})
    # Release the subtree that was just copied into the record; otherwise every
    # parsed page stays attached to the in-memory tree for the whole run.
    page.clear()
        

In [5]:
# One row per parsed page; the columns are the child tag names found in the records.
df = pd.DataFrame(dicts)

In [6]:
# Lower-case the titles so they can be compared with the lower-cased link
# targets extracted from the article text.
df.loc[:, "title"] = df.title.str.lower()

In [7]:
def _extract_links(wikitext):
    """Return the wiki links found in one article's markup.

    Every match of LINK_REGEX is stripped of its surrounding brackets,
    lower-cased and split on "|", so each link becomes ``[target]`` or
    ``[target, label, ...]``.

    A value that is not a string (for example the ``None`` stored for a page
    whose text node is empty, or NaN for a missing column value) yields an
    empty list instead of making ``re.findall`` raise ``TypeError``.
    """
    if not isinstance(wikitext, str):
        return []
    return [
        link.lstrip("[").rstrip("]").lower().split("|")
        for link in re.findall(LINK_REGEX, wikitext)
    ]


df.loc[:, "lnkd_art"] = df.text.apply(_extract_links)

In [45]:
def _link_targets(links):
    """Return the whitespace-trimmed first component of every link in `links`."""
    return [parts[0].strip() for parts in links]


df.loc[:, "lnkd_art_nms"] = df.lnkd_art.apply(_link_targets)

In [46]:
# Flatten the per-article name lists into one list, then de-duplicate it.
all_art_names = []
for art_names in df.lnkd_art_nms.values:
    all_art_names.extend(art_names)

# np.unique returns the distinct names in sorted order.
unique_art_names = np.unique(all_art_names)

In [47]:
# Trim surrounding whitespace from every name; the result is a plain list.
unique_art_names = list(map(str.strip, unique_art_names))

In [48]:
# Linked names that are also titles of articles in this dataset
# (np.intersect1d returns the sorted, unique common values).
art_intersection = np.intersect1d(unique_art_names, df.title)

In [49]:
# For each article keep only the linked names that are themselves article
# titles. Membership is tested against a set built once, instead of calling
# np.intersect1d against the full title array for every row (which re-sorts
# that array on each call).
_geo_titles = set(art_intersection)


def _linked_geo_titles(names):
    """Return the sorted, de-duplicated entries of `names` found in `art_intersection`.

    The result is a NumPy array with the dtype of `art_intersection`, holding
    the same values that ``np.intersect1d(art_intersection, names)`` yields.
    """
    return np.array(sorted(_geo_titles.intersection(names)), dtype=art_intersection.dtype)


df.loc[:, "present_geo_arts"] = df.lnkd_art_nms.apply(_linked_geo_titles)

In [51]:
# Persist the link-annotated frame to disk as JSON.
df.to_json('wiki_coords_article_article_matched.json')

In [2]:
# NOTE(review): `mpd` is not referenced by any cell visible in this notebook;
# the reload that follows goes through plain pandas (`pd`). Confirm whether
# modin is still needed.
import modin.pandas as mpd
import ray
# Start a local Ray runtime (presumably as modin's execution backend — confirm).
ray.init()

In [3]:
# Reload the link-annotated frame from its JSON file.
# NOTE(review): the execution counter restarts around here, so this presumably
# runs in a fresh kernel; `pd` must already be imported for this cell to work.
df = pd.read_json('wiki_coords_article_article_matched.json')


    import ray
    ray.init()

To request implementation, send an email to feature_requests@modin.org.


In [4]:
def parse_wiki(wikitext):
    """Convert MediaWiki markup to plain text with pandoc.

    Parameters
    ----------
    wikitext : str
        Raw article markup, read by pandoc as ``mediawiki``.

    Returns
    -------
    str
        The document written in pandoc's ``plain`` format, or ``''`` when the
        conversion raises an ordinary exception (best effort: one article that
        fails to convert does not abort a whole-column ``apply``).
    """
    try:
        document = pandoc.read(wikitext, format='mediawiki')
        return pandoc.write(document, format='plain')
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate and a long-running conversion can be stopped.
    except Exception:
        return ''

In [None]:
# Render every article's markup to plain text, one parse_wiki call per row.
df.loc[:, "parsed_text"] = df.text.apply(parse_wiki)

In [None]:
# Write the frame back to the same JSON file it was loaded from, overwriting
# it with whatever columns `df` holds at this point.
df.to_json('wiki_coords_article_article_matched.json')

In [22]:
# Inspect a random sample of rows; no random_state is passed, so the rows
# shown differ from run to run.
df.sample(20)

Unnamed: 0,title,coords,text,lnkd_art,lnkd_art_nms
136966,sarab-e khamzan-e kuchek,{{Coord missing|Kohgiluyeh and Boyer-Ahmad Pro...,{{Infobox settlement\n|official_name =Sarab-e ...,"[[list of countries, country], [provinces of i...","[list of countries, provinces of iran, countie..."
87624,keck hospital of usc,{{Coord|34.062362|N|118.202147|W|display=title}},{{Coord|34.062362|N|118.202147|W|display=title...,"[[university of southern california, usc], [un...","[university of southern california, university..."
55204,marine corps museum,{{Coord|38|52|30|N|76|59|32|W|display=title}},{{For|the new museum|National Museum of the Ma...,"[[marine corps historical society], [washingto...","[marine corps historical society, washington n..."
68282,"school of chemistry, university of edinburgh",{{Coord|55.924|-3.176|display=title}},{{Use dmy dates|date=July 2017}}\n{{Use Britis...,"[[edinburgh], [united kingdom], [university of...","[edinburgh, united kingdom, university of edin..."
126557,little wind river (wyoming),{{Coord|43.0102349|-108.8817904|type:river|dis...,"{{short description|River in Wyoming, United S...","[[united states], [wyoming], [wind river range...","[united states, wyoming, wind river range, wyo..."
62065,mit electrical engineering and computer scienc...,{{Coord|42|21|40.2|N|71|5|32.5|W|display=title}},The '''MIT Electrical Engineering and Computer...,"[[massachusetts institute of technology], [mas...","[massachusetts institute of technology, massac..."
51507,frauenfeld district,{{Coord|47|33|N|8|53|E|source:eowiki_region:CH...,{{Infobox settlement\n | official_name ...,"[[countries of the world, country], [cantons o...","[countries of the world, cantons of switzerlan..."
38173,zoo knoxville,{{Coord|35.9999|-83.8880|type:landmark_region:...,{{Infobox zoo\n|logo= Zoo Knoxville.png\n|zoo_...,"[[tennessee], [association of zoos and aquariu...","[tennessee, association of zoos and aquariums,..."
127076,uawa county,"{{Coord|38|22.3|S|178|17.815|E|display=inline,...",{{one source|date=May 2012}}\n{{Infobox Former...,"[[new zealand], [tolaga bay], [gisborne region...","[new zealand, tolaga bay, gisborne region, cou..."
88474,ãstã­ nad labem zoo,{{Coord|50|39|51.27|N|14|3|41.51|E|source:cswi...,{{Use dmy dates|date=February 2020}}\n{{Infobo...,"[[czech republic], [european association of zo...","[czech republic, european association of zoos ..."
