# Download data from WikiVoyage

In [None]:
import os
from pprint import pprint
import urllib
import json

# https://en.wikivoyage.org/wiki/Wikivoyage:Database_dump
url = 'https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-pages-articles.xml.bz2'

# Local file name of the archive: the last path component of the URL.
archive_name = url.split('/')[-1]

if not os.path.isfile(archive_name):
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # Download under a temporary name and rename only once the transfer has
    # finished, so an interrupted download is never mistaken for a complete
    # archive by the os.path.isfile() check on the next run.
    partial_name = archive_name + '.part'
    #this step takes some time
    urlretrieve(url, partial_name)
    os.rename(partial_name, archive_name)

# https://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python

# Decompress data

In [None]:
from bz2 import BZ2Decompressor

# The archive is the last path component of the URL; the decompressed file
# is that name with its final extension ('.bz2') removed.
archive = url.rpartition('/')[2]
decompressed = archive.rpartition('.')[0]

if not os.path.isfile(decompressed):
    with open(decompressed, 'wb') as unpacked, open(archive, 'rb') as packed:
        decompressor = BZ2Decompressor()
        # Stream the archive in 100 KiB chunks (this step takes some time).
        while True:
            chunk = packed.read(100 * 1024)
            if chunk == b'':
                break
            unpacked.write(decompressor.decompress(chunk))
        
# https://stackoverflow.com/questions/16963352/decompress-bz2-files

# Convert XML to dict, process to ignore redirects and get articles only

In [None]:
import sys
!{sys.executable} -m pip install xmltodict
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

import xmltodict

from collections import defaultdict
completed = 0
# title -> list of text fragments found for that title
articles = defaultdict(list)


# The JSON file acts as a cache: the XML is only parsed when it is missing.
if not os.path.isfile('wikivoyage_latest_articles_text.json'):
    #this step takes some time
    with open('enwikivoyage-latest-pages-articles.xml') as fd:
        doc = xmltodict.parse(fd.read())

    data = doc['mediawiki']['page']
    print('To process %s records' % len(data))

    for item in data:
        # Pages carrying a 'redirect' key are skipped.
        if 'redirect' not in item:
            try:
                articles[item['title']].append(item['revision']['text']['#text'])
            except KeyError:
                # Pages without a title or without any text are skipped on purpose.
                pass

        completed += 1
        if completed % 10000 == 0 or completed == len(data):
            print('Completed %s' % completed)

    print(len(articles))
    # Collapse each fragment list into one string. dict.items() is available
    # on both Python 2 and 3 (dict.iteritems() is Python 2 only).
    articles = {title: "".join(fragments) for title, fragments in articles.items()}

    with open('wikivoyage_latest_articles_text.json', 'w') as f:
        json.dump(articles, f)

    # Release the large intermediate structures.
    del doc
    del data
    del articles

# Extract lat long and parent article from text

In [None]:
import unicodedata
import re
# Load the {title: wikitext} mapping written by the previous step.
with open('wikivoyage_latest_articles_text.json', 'r') as f:
    consolidated = json.load(f)

print(len(consolidated))

# --- Filters for articles that are not destinations -------------------------

# Title prefixes that exclude an article (kept exactly as before; note that
# 'Module' is matched without a trailing colon).
NON_DESTINATION_PREFIXES = ('Module', 'Template:', 'Category:', 'File:',
                            'Wikivoyage:', 'MediaWiki:')
NON_DESTINATION_TITLES = ('Moon', 'Space')

# Tags that exclude an article; matched against the lower-cased text.
NON_DESTINATION_TAGS = (
    '{{outlinetopic}}', '{{usabletopic}}', '{{guidetopic}}', '{{startopic}}',
    '{{disamb}}', '{{disambig}}', '{{disambiguation}}',
    '{{itinerary}}',
    '{{usablephrasebook}}', '{{phrasebookguide}}',
    '{{stub}}',
    '{{historical}}',
)
# These two are matched case-sensitively against the original text.
TITLE_INDEX_TAG = '{{Title-Index page}}'
GALLERY_RE = re.compile(r'\{\{GalleryPageOf.*\}\}')

# Dive guides are skipped even when they carry coordinates.
DIVE_GUIDE_TAGS = ('{{usablediveguide}}', '{{outlinediveguide}}',
                   '{{guidediveguide}}', '{{stardiveguide}}')

# --- Destination type detection ---------------------------------------------

# (type, tags) pairs in priority order: the first type with a matching tag
# wins. Tags are matched against the lower-cased text.
DESTINATION_TYPE_TAGS = (
    ('airport', ('{{usableairport}}', '{{outlineairport}}', '{{guideairport}}')),
    ('city', ('{{usablecity}}', '{{outlinecity}}', '{{guidecity}}',
              '{{starcity}}', '{{ussblecity}}')),
    ('continent', ('{{usablecontinent}}', '{{outlinecontinent}}')),
    ('country', ('{{usablecountry}}', '{{outlinecountry}}',
                 '{{guidecountry}}', '{{starcountry}}')),
    ('district', ('{{usabledistrict}}', '{{outlinedistrict}}',
                  '{{guidedistrict}}', '{{stardistrict}}')),
    ('park', ('{{usablepark}}', '{{outlinepark}}', '{{guidepark}}',
              '{{starpark}}')),
    ('region', ('{{usableregion}}', '{{outlineregion}}', '{{guideregion}}',
                '{{starregion}}', '{{extraregion|subregion=yes}}',
                '{{extraregion|subregion=no}}', '{{extraregion}}')),
)

# --- Template extraction ----------------------------------------------------

# '.' does not match a newline, so each match stays on one line but is greedy
# up to the last '}}' on that line.
GEO_RE = re.compile(r'\{\{geo.*\}\}')                 # applied to lower-cased text
IS_PART_OF_UPPER_RE = re.compile(r'\{\{IsPartOf.*\}\}')  # applied to original text
IS_PART_OF_LOWER_RE = re.compile(r'\{\{isPartOf.*\}\}')  # applied to original text

# Fixes for inconsistent parent names: name found in the data -> name to use.
PARENT_FIXES = {
    'ko pha ngan': 'ko pha-ngan',
    'lowland shandong': 'shandong',
    'highland shandong': 'shandong',
    'coastal shandong': 'shandong',
    'southern delaware': 'delaware',
    'northern delaware': 'delaware',
    'central delaware': 'delaware',
    'burgraviate': 'south tyrol',
    'puster valley': 'south tyrol',
    'eisack valley': 'south tyrol',
    'bohemian-moravian highlands': 'highlands (czech republic)',
    'brahmanbaria district': 'chittagong division',
    'eastern desert': 'eastern desert (jordan)',
    'caribbean coast': 'caribbean coast (guatemala)',
    'santander (colombia)': 'santander (department, colombia)',
    'tripolitania': 'libya',
    'wooster area ohio': 'wooster area',
    'tatra mountains (poland)': 'tatra national park (poland)',
    'salcette': 'salcete',
    'eastern barbados': 'central eastern barbados',
    'east khasi hills': 'meghalaya',
    'samar': 'samar (philippines)',
}


def _has_any_tag(text, tags):
    """Return True when at least one of the literal *tags* occurs in *text*."""
    return any(tag in text for tag in tags)


def _ascii_fold(text):
    """Return *text* with accents and other non-ASCII characters dropped.

    The result is decoded back to text so that comparing it with a string
    literal behaves the same on Python 2 and Python 3 (on Python 3 the raw
    ``encode`` result is ``bytes`` and never equals a ``str``).
    """
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')


def _coordinate(geo_template, index):
    """Return field *index* of a '|'-separated geo template, cleaned up.

    Without a third field (e.g. ``{{geo|1.5|2.5}}``) the last coordinate
    still carries the closing ``}}``; a greedy match may also drag in the
    start of another template on the same line. Both are stripped here.
    """
    return geo_template.split('|')[index].replace('}', '').split('{{')[0].strip()


cleaned = {}
completed = 0
issues = 0

for article_name in consolidated:
    text = consolidated[article_name]
    lowered = text.lower()

    # ignore articles which are not destinations (from article name and article tags)
    is_destination = (
        not article_name.startswith(NON_DESTINATION_PREFIXES)
        and article_name not in NON_DESTINATION_TITLES
        and not _has_any_tag(lowered, NON_DESTINATION_TAGS)
        and TITLE_INDEX_TAG not in text
        and GALLERY_RE.search(text) is None
    )

    # Only articles with a geo template are kept; dive guides and one
    # explicitly named article are skipped.
    geo = GEO_RE.findall(lowered) if is_destination else []
    if (geo
            and not _has_any_tag(lowered, DIVE_GUIDE_TAGS)
            and article_name not in ['Commonwealth of Independent States']):

        name = article_name.replace('_', ' ').split('{{')[0].strip().lower()

        # Store these two accented names under their plain ASCII spelling.
        folded_name = _ascii_fold(name)
        if folded_name in ('brac', 'rugen'):
            name = folded_name

        # get lat long (from the last geo template found in the article)
        details_entry = {
            'latitude': _coordinate(geo[-1], 1),
            'longitude': _coordinate(geo[-1], 2),
            'ispartof': [],
        }

        # get parents: all {{IsPartOf...}} matches first, then {{isPartOf...}}
        for parts in IS_PART_OF_UPPER_RE.findall(text) + IS_PART_OF_LOWER_RE.findall(text):
            parent = parts.split('|')[1].replace('}', '').replace('_', ' ').split('{{')[0].strip().lower()

            #fixes for inconsistent data
            if parent in PARENT_FIXES:
                parent = PARENT_FIXES[parent]
            elif parent == 'chikmagalur (district)' and name != 'chikmagalur':
                parent = 'chikmagalur'
            elif _ascii_fold(parent) == 'rugen':
                parent = 'rugen'
            elif name == 'chikmagalur':
                parent = 'karnataka'

            details_entry['ispartof'].append(parent)

        # get destination type; no 'type' key is stored when nothing matches
        for destination_type, tags in DESTINATION_TYPE_TAGS:
            if _has_any_tag(lowered, tags):
                details_entry['type'] = destination_type
                break

        cleaned[name] = details_entry

    completed += 1
    if completed % 1000 == 0 or completed == len(consolidated):
        print('Completed: %s' % completed)


print('Total sorted: %s' % len(cleaned))

with open('destination_details.json', 'w') as f:
    json.dump(cleaned, f)

del consolidated

# Map out parent child relationship for all articles into a dictionary

In [None]:

# Reload the per-destination details saved by the extraction step.
with open('destination_details.json', 'r') as details_file:
    cleaned = json.load(details_file)


def map_destinations(mapped, current_dict):
    """Try to place the global ``destination`` under the global ``parent``.

    Works on the nested ``{name: {child: {...}}}`` tree rooted at the global
    ``destination_mapping``; ``current_dict`` is the (sub)tree searched by
    this call and is modified in place.

    Args:
        mapped: True when the destination has already been placed; the call
            is then a no-op.
        current_dict: the tree level to search (the top level on the first
            call, a nested dict on recursive calls).

    Returns:
        A ``(mapped, current_dict)`` tuple: whether the destination is now
        placed, and the same dict object that was passed in.
    """
    global destination_mapping, parent, destination
    if mapped:
        return mapped, current_dict

    if parent in current_dict:
        # setdefault keeps a subtree already attached under this destination
        # (e.g. when the same parent is listed twice) instead of replacing
        # it with an empty dict.
        current_dict[parent].setdefault(destination, {})
        mapped = True

        #find if any of the heads match this article
        if destination in destination_mapping:
            current_dict[parent][destination] = destination_mapping.pop(destination)

    #can only happen at top level
    # Identity check: "is this the top-level dict?" -- a deep equality
    # comparison of the whole tree is neither needed nor intended.
    elif destination in current_dict and current_dict is destination_mapping:
        current_dict[parent] = {}
        current_dict[parent][destination] = current_dict.pop(destination)
        mapped = True
        #find if any of the tails match this parent
        step_through_dict(False, destination_mapping)

    else:
        for next_level in current_dict:
            mapped, current_dict[next_level] = map_destinations(mapped, current_dict[next_level])
            if mapped:
                # Placed somewhere below this key; nothing left to search.
                break

    return mapped, current_dict


def step_through_dict(attached, current_dict):
    """Move the top-level ``parent`` entry under a nested occurrence of it.

    Searches ``current_dict`` depth-first for a key equal to the global
    ``parent`` below the top level. When one is found, the ``destination``
    subtree of the top-level ``parent`` entry is attached there and the
    top-level ``parent`` entry is removed from ``destination_mapping``.

    Args:
        attached: True when the move has already happened; the call is then
            a no-op.
        current_dict: the tree level to search.

    Returns:
        True when the subtree was re-attached (or ``attached`` was already
        true), otherwise the falsy ``attached`` value that was passed in.
    """
    global destination_mapping, destination, parent
    if attached:
        return attached

    # Iterate over a snapshot of the keys: a successful attach pops an entry
    # from the top-level dict, which an outer call may be iterating. A live
    # keys() view (Python 3) would raise "dictionary changed size during
    # iteration".
    for item in list(current_dict):
        # Identity check: only occurrences below the top level qualify.
        if item == parent and current_dict is not destination_mapping:
            current_dict[parent][destination] = destination_mapping.pop(parent)[destination]
            return True

        if current_dict[item] and step_through_dict(False, current_dict[item]):
            return True

    return attached
        
    
# Build the nested parent -> child tree from every (destination, parent) pair.
destination_mapping = {}
total_records = len(cleaned)
print('To process %s records' %total_records)
processed = 0

# NOTE: the loop variables must be called `destination` and `parent`:
# map_destinations() and step_through_dict() read them as globals.
for destination in cleaned:
    for parent in cleaned[destination]['ispartof']:
        mapped, destination_mapping = map_destinations(False, destination_mapping)

        if not mapped:
            # Could not be placed in the existing tree: start a new
            # top-level entry for this parent.
            destination_mapping[parent] = {destination: {}}

    processed += 1
    if processed == total_records or processed % 1000 == 0:
        print('Completed: %s' %processed)

# Persist the tree, then drop it from memory.
with open('destination_mapping.json', 'w') as mapping_file:
    json.dump(destination_mapping, mapping_file)

del destination_mapping

# Examine data and return to previous step to fix inconsistent spellings

In [None]:
# List the top-level keys of the saved tree so inconsistent spellings can be
# spotted by eye.
with open('destination_mapping.json', 'r') as f:
    destination_mapping = json.load(f)
for item in destination_mapping:
    # Call form of print, as in the earlier cells; works on Python 2 and 3.
    print(item)


# Retrieve the parent chain for the input destination

In [None]:
# Details of every destination, keyed by lower-cased destination name.
with open('destination_details.json', 'r') as details_file:
    details = json.load(details_file)

# Destination whose parent chain is looked up below.
destination = 'thailand'


def get_parent(current, chain=''):
    """Return the '|'-separated ancestor chain of a destination.

    Ancestors are looked up in the global ``details`` dict through each
    entry's ``'ispartof'`` list and prepended to the chain, so the result
    reads from the most distant ancestor down to the destination itself,
    e.g. ``'asia|thailand|bangkok'``.

    Args:
        current: destination name; lower-cased on the initial call.
        chain: chain built so far. Callers leave this empty; it is filled in
            by the recursive calls.

    Returns:
        The chain as a string. A name that is not in ``details`` yields just
        the lower-cased name itself.

    Note:
        Cyclic 'ispartof' data is not detected and would recurse without
        bound.
    """
    # Compare by value: identity (`is`) against a string literal is not a
    # reliable emptiness test.
    if chain == '':
        current = current.lower()
        chain = current

    try:
        parents = details[current]['ispartof']
    except KeyError:
        # Unknown destination (or top of the hierarchy): the chain is complete.
        return chain

    for parent in parents:
        # Prepend the parent, then continue with the parent's own ancestors.
        chain = get_parent(parent, '%s|%s' % (parent, chain))
    return chain
# Show the chain for the chosen destination, then release the details dict.
# Call form of print, as in the earlier cells; works on Python 2 and 3.
print(get_parent(destination))

del details

# Get child articles from input

In [None]:
# Details of every destination, keyed by lower-cased destination name.
with open('destination_details.json', 'r') as details_file:
    details = json.load(details_file)
    
def get_child(search):
    """Return the articles that list ``search`` as one of their parents.

    The lookup runs over the global ``details`` dict and compares each entry
    of an article's 'ispartof' list with the lower-cased ``search``. An
    article appears once for every matching parent entry.
    """
    return [
        article
        for article in details
        for parent in details[article]['ispartof']
        if parent == search.lower()
    ]
# List the direct children of 'thailand', then release the details dict.
for item in get_child('thailand'):
    # Call form of print, as in the earlier cells; works on Python 2 and 3.
    print(item)
del details

# Search destinations containing input

In [None]:
# Details of every destination, keyed by lower-cased destination name.
with open('destination_details.json', 'r') as details_file:
    details = json.load(details_file)
    
def search(input):
    """Return every destination name in the global ``details`` dict that
    contains the lower-cased ``input`` as a substring."""
    return [item for item in details if input.lower() in item]

# For every destination matching 'bugis', show its name, its parent chain
# and its direct children; then release the details dict.
# Call form of print, as in the earlier cells; works on Python 2 and 3.
for result in search('bugis'):
    print(result)
    print(get_parent(result))
    print(get_child(result))
del details