## The input files for this notebook are the output files of the destinations graph notebook, so that notebook must be run first.

# 1. Load partially cleaned data from destinations graph notebook

In [None]:
import json
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Article text keyed by article name (Step 4 in destinations graph notebook)
raw_text = _read_json('wikivoyage_latest_articles_text.json')

# Details of the destinations to process (Step 5 in destinations graph notebook)
destination_details = _read_json('destination_details.json')
    

# 2. Names in raw_text are unedited while those in destination_details are cleaned, so they need to be standardized

In [None]:
import unicodedata
def standardize_name(destination_name):
    """Return the cleaned, lower-case form of an article name.

    This follows the name cleaning process in step 5 of the destinations
    graph notebook: underscores become spaces, anything from the first
    '{{' onwards is dropped, and the result is stripped and lower-cased.
    Names whose ASCII folding is 'brac' or 'rugen' are returned in that
    ASCII form; every other name keeps its original characters.

    Args:
        destination_name: text (unicode) article name.

    Returns:
        The standardized name.
    """
    destination_name = destination_name.replace('_', ' ').split('{{')[0].strip().lower()
    # Decode back to text before comparing: on Python 3 the encoded value is
    # bytes and would never compare equal to the str literals below.
    ascii_name = unicodedata.normalize('NFKD', destination_name).encode('ascii', 'ignore').decode('ascii')
    if ascii_name in ('brac', 'rugen'):
        return ascii_name
    return destination_name

# 3. Extract specific parts of article text for more efficient extraction of themes

In [None]:
# create a dictionary with destination name as key and the relevant text as value
import re

# Heading spellings that end the 'See' section (the 'Do' headings) and that end
# both the 'See' and 'Do' sections (everything listed after them).
_DO_HEADINGS = ('==Do==', '== Do ==')
# NOTE(review): '== Eat and Drink==' has asymmetric spacing; presumably it
# mirrors a heading variant seen in the dump -- confirm before "fixing" it.
_LATER_HEADINGS = (
    '==Eat==', '== Eat ==',
    '==Eat and Drink==', '== Eat and Drink==',
    '==Buy==', '== Buy ==',
)

# Markup patterns, compiled once instead of on every article.
_TEMPLATE_RE = re.compile(r'{{.*?}}')
_SUBHEADING_RE = re.compile(r'===.*?===')
_HEADING_RE = re.compile(r'==.*?==')
_COMMENT_RE = re.compile(r'<!--.*?-->')
_LINK_TARGET_RE = re.compile(r'\[\[.*?\|')
_LINK_AT_EOL_RE = re.compile(r'\[\[.*?\]\]\n')
_PARENTHESES_RE = re.compile(r'\(.*?\)')
_STAR_TO_EOL_RE = re.compile(r'\*.*?\n')
_PIPE_TO_EOL_RE = re.compile(r'\|.*?\n')
_TEMPLATE_OPEN_RE = re.compile(r'{{\w*')
_HTTPS_RE = re.compile(r'https://.*?[\s|\n]')
_HTTP_RE = re.compile(r'http://.*?[\s|\n]')


def _section_after(article, heading, stop_headings):
    """Return the text following the first `heading`, cut at the earliest stop heading."""
    section = article.split(heading)[1]
    for stop in stop_headings:
        section = section.split(stop)[0]
    return section


def _relevant_wikitext(article):
    """Return the intro plus the 'See' and 'Do' sections of `article` ('' if none found)."""
    parts = []

    # the introductory text (everything before 'Get in') might indicate what
    # a destination is known for
    for heading in ('==Get in==', '== Get in =='):
        if heading in article:
            parts.append(article.split(heading)[0] + '\n')
            break

    # the 'See' and 'Do' sections are the most likely places to find
    # information on the destination's themes
    for heading in ('==See==', '== See =='):
        if heading in article:
            parts.append(_section_after(article, heading, _DO_HEADINGS + _LATER_HEADINGS))
            break

    # both 'Do' spellings are checked independently, so an article containing
    # both contributes two chunks
    for heading in _DO_HEADINGS:
        if heading in article:
            parts.append(_section_after(article, heading, _LATER_HEADINGS))

    return ''.join(parts)


def _strip_markup(text):
    """Remove tags, headers, links and URLs from `text` and normalise its whitespace."""
    text = _TEMPLATE_RE.sub('', text)
    text = _SUBHEADING_RE.sub('', text)
    text = _HEADING_RE.sub('', text)
    text = _COMMENT_RE.sub('', text)
    # reduce '[[target|label' to '[[label' until no piped link is left
    while _LINK_TARGET_RE.search(text):
        text = _LINK_TARGET_RE.sub('[[', text)
    text = _LINK_AT_EOL_RE.sub('\n', text)
    text = text.replace('[', '').replace(']', '').replace("'''", "").replace("''", "")

    text = _PARENTHESES_RE.sub('', text)
    # drop everything from a '*' or a '|' up to the end of that line
    text = _STAR_TO_EOL_RE.sub('', text)
    text = _PIPE_TO_EOL_RE.sub('', text)
    text = _TEMPLATE_OPEN_RE.sub('', text)
    text = text.replace('}}', '').replace('::', '')
    text = _HTTPS_RE.sub('', text)
    text = _HTTP_RE.sub('', text)

    # replace double linebreaks and double spaces with singles
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text.strip()


def _theme_text(article):
    """Return the cleaned single-line text used for theme extraction ('' if nothing usable)."""
    wikitext = _relevant_wikitext(article)
    if wikitext == '':
        return ''
    # only consider lines with more than 6 words before a linebreak
    # (if fewer than that likely irrelevant)
    long_rows = [row for row in _strip_markup(wikitext).split('\n') if len(row.split(' ')) > 6]
    return ' '.join(long_rows).strip()


consolidated = {}
print('To process %s records.' % len(raw_text))

for article_name in raw_text:
    destination = standardize_name(article_name)
    if destination not in destination_details:
        continue

    theme_text = _theme_text(raw_text[article_name])

    # only add destinations with more than 20 words in total to the final output for themes extraction
    if len(theme_text.split(' ')) > 20:
        consolidated[destination] = theme_text
        # report progress only right after a record was stored, so the same
        # count is not printed again for every skipped article
        if len(consolidated) % 1000 == 0:
            print('Completed %s' % len(consolidated))

# dump output into a file
with open('cleaned_text.json', 'w') as f:
    json.dump(consolidated, f)
print('Final output %s records.' % len(consolidated))

# release the large structures; per-article temporaries live inside the
# helpers above, so nothing else needs deleting (and nothing can be missing)
del consolidated
del raw_text

# 4. Create corpus

In [None]:
# Reload the cleaned article text written by the previous step.
with open('cleaned_text.json', 'r') as f:
    cleaned_text = json.load(f)

completed = 0

# Fix one ordering of the destinations: position n in `corpus` holds the text
# of the destination stored under key n in `id_lookup`.
destinations = list(cleaned_text)
id_lookup = dict(enumerate(destinations))
corpus = [cleaned_text[name] for name in destinations]

print('Corpus size: %s' %len(corpus))
print('Lookup size: %s' %len(id_lookup))

# Persist the position -> destination mapping.
with open('corpus_id_lookup.json', 'w') as f:
    json.dump(id_lookup, f)

# 5. Perform TFIDF on data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: use_idf=False and norm=None are passed below, so the resulting matrix
# should hold un-normalised term frequencies rather than IDF-weighted scores
# (see the TfidfVectorizer documentation).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 0.001, max_df=0.6, lowercase=False, use_idf=False, norm=None, stop_words = 'english')

tfidf_matrix =  tf.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back for older installations that do not have it yet.
try:
    feature_names = list(tf.get_feature_names_out())
except AttributeError:
    feature_names = tf.get_feature_names()
print('Number of unique words in corpus: %s ' % len(feature_names))

# 6. Transform the output to store it

In [None]:
# Map every destination to a {word: score} dict holding only its non-zero scores.
consolidated = {}
for doc_id in range(tfidf_matrix.shape[0]):
    # Densify one row at a time; converting the whole matrix with todense()
    # needs documents x vocabulary floats in memory at once.
    row_scores = tfidf_matrix[doc_id].toarray()[0].tolist()
    # Words stay as text: byte-string keys are rejected by json.dump on
    # Python 3, and text keys serialise identically on Python 2.
    consolidated[id_lookup[doc_id]] = {
        feature_names[word_id]: score
        for word_id, score in enumerate(row_scores)
        if score > 0
    }
    if len(consolidated) % 1000 == 0:
        print('Completed %s' % len(consolidated))

with open('tfidf_scores.json', 'w') as f:
    json.dump(consolidated, f)

# free the large intermediates before the next step
del consolidated
del tfidf_matrix
del feature_names

# 7. Get themes from word prominence in destination articles

In [None]:
# Keywords whose scores are added up to rate a destination for each theme.
themes_dict = {
    'beach': ['beach', 'beaches'],
    'shopping': ['shopping', 'malls'],
    'temples': ['temple', 'temples'],
    'surfing': ['surf', 'surfing', 'surfers'],
    'diving': ['dive', 'diving', 'divers'],
    'hiking': ['hike', 'hiking', 'hikers', 'trek', 'trekking', 'trekkers'],
    # 'family': ['families', 'kids', 'children', 'seniors'],
    'culture': ['culture', 'cultural', 'cultures'],
    'food': ['foodie', 'food', 'restaurants', 'delicacy', 'delicacies'],
    'museums': ['museums', 'museum'],
}

with open('tfidf_scores.json', 'r') as f:
    data=json.load(f)

# One list of (destination, score) pairs per theme.
consolidated = {theme: [] for theme in themes_dict}
completed = 0
for destination, word_scores in data.items():
    for theme, keywords in themes_dict.items():
        # add up the scores of the theme keywords present for this destination
        score = sum(word_scores[word] for word in keywords if word in word_scores)
        if score > 0:
            consolidated[theme].append((destination, score))
    completed += 1
    if completed % 1000 == 0:
        print('Completed %s/%s' %(completed, len(data)))

with open('destination_themes.json', 'w') as f:
    json.dump(consolidated, f)

# free the large intermediates
del data
del consolidated

# 8. Reuse functions created in the destinations graph notebook

In [None]:
def get_parent(current, chain=''):
    """Return the '|'-separated ancestry chain that ends at `current`.

    Each parent listed under destination_details[current]['ispartof'] is
    prepended to the chain, followed recursively by that parent's own
    parents, e.g. 'grandparent|parent|current'.

    Args:
        current: destination name; lower-cased on the initial call.
        chain: chain built so far. Callers leave it at the default; it is
            only passed explicitly by the recursive calls.

    Returns:
        The chain as a string. If `current` is not in destination_details,
        or has no 'ispartof' entry, the chain is returned unchanged.
    """
    # '==' rather than 'is': identity comparison against a str literal only
    # works by accident of interning.
    if chain == '':
        current = current.lower()
        chain = current
    try:
        parents = destination_details[current]['ispartof']
    except KeyError:
        return chain
    for parent in parents:
        chain = '%s|%s' % (parent, chain)
        chain = get_parent(parent, chain)
    return chain
# Quick check: show the ancestry chain of a sample destination.
print(get_parent('Thailand'))

def get_child(search):
    """Return the articles that list `search` among their 'ispartof' parents.

    Args:
        search: destination name; compared in lower case.

    Returns:
        A list of article names from destination_details. An article appears
        once for every 'ispartof' entry equal to the lower-cased `search`.
        Articles without an 'ispartof' entry are treated as having no
        parents, matching how get_parent handles them.
    """
    target = search.lower()  # hoisted: invariant across both loops
    return [
        article
        for article in destination_details
        for parent in destination_details[article].get('ispartof', ())
        if parent == target
    ]

# Quick check: list the articles directly under a sample destination.
print(get_child('Thailand'))

# 9. Get top destinations for given theme and region

In [None]:
with open('destination_themes.json', 'r') as f:
    destination_themes = json.load(f)

theme = 'beach'
region = 'Asia'
# highest theme score first
sorted_scores = sorted(destination_themes[theme], key=lambda t: t[1] * -1)

region_key = region.lower()
printed = 0
for name, theme_score in sorted_scores:
    ancestry = get_parent(name)  # computed once and reused for the output line
    # Keep only destinations inside the region that have no child articles.
    # NOTE(review): this is a substring test on the whole chain, so 'asia'
    # also matches e.g. 'australasia' -- confirm whether an exact match on
    # ancestry.split('|') is wanted instead.
    if region_key in ancestry and not get_child(name):
        print('%s (%s): %s' % (name.title(), theme_score, ancestry.title().replace('|', ' > ')))
        printed += 1
        # stop after the top five matches
        if printed == 5:
            break

del destination_themes

# 10. Get top themes for a given destination

In [None]:
with open('destination_themes.json', 'r') as f:
    destination_themes = json.load(f)

destination = 'paris'
destination_key = destination.lower()  # hoisted out of the nested loops

# collect (theme, score) for every theme in which the destination was scored
final_themes = [
    (theme, destination_score[1])
    for theme in destination_themes
    for destination_score in destination_themes[theme]
    if destination_key == destination_score[0]
]
# highest score first
sorted_scores = sorted(final_themes, key=lambda t: t[1] * -1)

print(get_parent(destination_key).title().replace('|', ' > '))
# print('') emits a blank line on Python 2 and 3 alike; a bare print() would
# output '()' on Python 2
print('')
for item in sorted_scores:
    print('%s (%s)' % (item[0].title(), item[1]))