In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
def read_json(filename):
    '''Read a JSON file and return its parsed content.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    The Python object produced by the JSON parser (for the files loaded in
    this notebook, a dictionary).

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If the file content is not valid JSON.
    '''
    # The context manager closes the file handle as soon as parsing is done,
    # instead of leaving it open until the garbage collector gets to it.
    with open(filename) as json_file:
        return json.load(json_file)

In [3]:
def append_two_dict(dict1, dict2):
    '''Merge two dictionaries, giving priority to the entries of `dict1`.

    Parameters
    ----------
    dict1 : dict
        Base dictionary; all of its key-value pairs are kept as they are.
    dict2 : dict
        Supplies additional pairs; a pair is used only when its key is not
        already present in `dict1`.

    Returns
    -------
    A new dictionary holding the merged pairs. Neither input is modified.
    The copy is shallow, so the values themselves are shared with the inputs.
    '''
    # Work on a copy: binding the result to `dict1` itself would silently add
    # the keys of `dict2` to the caller's dictionary.
    new_dict = dict1.copy()
    for key, val in dict2.items():
        if key not in new_dict:
            new_dict[key] = val
    return new_dict

In [4]:
def unicode_to_ascii(lst):
    '''Encode every string of `lst` with the ASCII codec.

    Characters that cannot be represented in ASCII are dropped ('ignore'
    error handler), which avoids encoding errors later on when the data is
    written into csv files. Returns a new list with one encoded item per
    input item, in the same order.
    '''
    encoded_items = []
    for text in lst:
        encoded_items.append(text.encode('ascii', 'ignore'))
    return encoded_items

In [5]:
def dic_str_to_num(dic):
    '''Turn the lists of numeric strings stored in a dictionary into lists of ints.

    Each value of `dic` is expected to be a list of strings such as '1,234';
    the thousands separators (commas) are stripped before the conversion.
    Returns a new dictionary with the same keys; `dic` itself is left untouched.
    '''
    def _parse_int(text):
        # '1,234' -> 1234
        return int(text.replace(',', ''))

    return {key: [_parse_int(text) for text in val_lst]
            for key, val_lst in dic.items()}

In [7]:
def encode_whole_dictionary(dic):
    '''Convert a whole dictionary from unicode to ascii.

    Both the keys and every list stored as a value are passed through
    `unicode_to_ascii`. Returns a new dictionary that maps each encoded key
    to its encoded list of values.
    '''
    ascii_keys = unicode_to_ascii(dic.keys())
    ascii_vals = [unicode_to_ascii(val) for val in dic.values()]

    # keys() and values() iterate in the same order, so pairing them up
    # position by position rebuilds the original key -> value association
    return dict(zip(ascii_keys, ascii_vals))

In [22]:
def write_json(name, dic):
    '''Dump `dic` as JSON into the file `name` + '.json'.

    The file is opened in write mode, so an existing file with the same
    name is overwritten. Nothing is returned.
    '''
    with open(name + '.json', 'w') as out_file:
        json.dump(dic, out_file)

In [26]:
def get_tag_cloud_lst(tag_cloud):
    '''Get the tag cloud list: [(tag name, its frequency)].

    Parameters
    ----------
    tag_cloud : dict
        Maps a key to an iterable of tags.

    Returns
    -------
    A list of (tag, count) tuples, where count is the number of times the tag
    occurs over all the values of `tag_cloud`, sorted from the most to the
    least frequent tag. An empty dictionary gives an empty list.
    '''
    # Count the tags directly while walking the lists. The previous version
    # concatenated all lists with the builtin `reduce` (Python 2 only, and
    # quadratic because every `+` copies the accumulated list).
    freq_dic = {}
    for tags in tag_cloud.values():
        for tag in tags:
            freq_dic[tag] = freq_dic.get(tag, 0) + 1

    # `sorted` accepts the items view on Python 3 as well as the list returned
    # on Python 2; like `list.sort` it is stable, so tags with equal counts
    # keep the iteration order of the dictionary.
    return sorted(freq_dic.items(), key=lambda pair: pair[1], reverse=True)

In [32]:
def reverse_dic(target, original_dic):
    '''Reverse a key-value dictionary to a value-key one.

    Parameters
    ----------
    target : iterable
        The items that become the keys of the result. Any iterable works,
        including one-shot iterators. Items must be hashable.
    original_dic : dict
        Maps a name to a list of items.

    Returns
    -------
    A dictionary mapping every item of `target` to the list of names whose
    item list contains it. A name is added once per occurrence of the item.
    Items of `target` that never occur map to an empty list; items that are
    not in `target` are ignored.
    '''
    dic = {item: [] for item in target}
    for name, item_lst in original_dic.items():
        for item in item_lst:
            # Test against the keys of `dic` rather than `target`: it is a
            # constant-time lookup instead of a scan of the whole list, and it
            # still works when `target` is an iterator that the line above has
            # already consumed.
            if item in dic:
                dic[item].append(name)
    return dic

In [8]:
# Load the category, tag-cloud and traveler-type JSON files of the US-only data set.
# NOTE(review): the file names below spell "traverler" — presumably this matches
# the files on disk; confirm before "fixing" the spelling.
us_category = read_json('./Data/museum_categories_USonly.json')
us_tag_cloud = read_json('./Data/tag_clouds_USonly.json')
us_traveler_type = read_json('./Data/traverler_type_USonly.json')

# Same three files for the world data set.
w_category = read_json('./Data/museum_categories_world.json')
w_tag_cloud = read_json('./Data/tag_clouds_world.json')
w_traveler_type = read_json('./Data/traverler_type_world.json')

# Merge the US and world dictionaries; when a key appears in both,
# the entry of the US dictionary (first argument) is the one kept.
category = append_two_dict(us_category, w_category)
tag_cloud = append_two_dict(us_tag_cloud, w_tag_cloud)
traveler_type = append_two_dict(us_traveler_type, w_traveler_type)

# convert the keys and values of all dictionaries from unicode to ascii
category = encode_whole_dictionary(category)
tag_cloud = encode_whole_dictionary(tag_cloud)
traveler_type = encode_whole_dictionary(traveler_type)

# convert the strings in the dictionary (e.g. '1,234') to integers
traveler_type = dic_str_to_num(traveler_type)

In [15]:
# Flatten the category lists of all entries into a single list. The nested
# comprehension replaces the builtin `reduce`, which only exists on Python 2
# and copies the accumulated list on every concatenation.
category_lst = [cat for cat_lst in category.values() for cat in cat_lst]
# Keep the distinct categories whose name mentions one of the keywords of
# interest. No backslash is needed: the brackets already continue the line.
target_category = [i for i in set(category_lst)
                   if 'Museum' in i or 'Galleries' in i
                   or 'Historic Sites' in i or 'Landmarks' in i]
target_category

['History Museums',
 'Military Museums',
 'Points of Interest & Landmarks',
 'Natural History Museums',
 'Art Museums',
 'Historic Sites',
 'Science Museums',
 'Museums',
 'Specialty Museums',
 'Art Galleries',
 "Children's Museums",
 'Sights & Landmarks']

In [34]:
# Map each target category to the list of keys of `category` (presumably the
# museum names — verify against the JSON files) whose category list contains it.
dic_cat = reverse_dic(target_category, category)
# Save the mapping as 'museum_types.json' (write_json appends the extension).
write_json('museum_types', dic_cat)

In [51]:
# Count how often each tag occurs over all tag clouds, most frequent first.
tag_freq_lst = get_tag_cloud_lst(tag_cloud)
# Keep the first 50 (tag, frequency) pairs and display them.
target_tags = tag_freq_lst[0:50]
target_tags

[('on display', 751),
 ('gift shop', 405),
 ('rainy day', 281),
 ('all ages', 275),
 ('couple of hours', 268),
 ('special exhibits', 188),
 ('few hours', 167),
 ('permanent collection', 162),
 ('two hours', 158),
 ('exhibits', 155),
 ('worth a visit', 154),
 ('free admission', 151),
 ('beautiful building', 136),
 ('audio guide', 135),
 ('well worth a visit', 130),
 ('great collection', 125),
 ('information', 117),
 ('interactive exhibits', 112),
 ('great for kids', 111),
 ('hands on activities', 111),
 ('interesting exhibits', 109),
 ('great exhibits', 108),
 ('entrance fee', 104),
 ('great place to visit', 100),
 ('well worth the visit', 85),
 ('guided tour', 81),
 ('amazing collection', 81),
 ('traveling exhibits', 79),
 ('interactive displays', 79),
 ('whole family', 78),
 ('kids and adults', 77),
 ('his life', 75),
 ('worth the trip', 73),
 ('local history', 73),
 ('nice collection', 72),
 ('great history', 70),
 ('civil war', 70),
 ('free entry', 67),
 ('interesting place', 67),
 

In [52]:
# Tags selected for the tag-based mapping.
tag_of_interest = ['gift shop', 'rainy day', 'all ages', 'beautiful building', 
                   'audio guide', 'great for kids', 'guided tour',
                   'interactive exhibits', 'interactive displays', 'modern art']
# Map each selected tag to the list of keys of `tag_cloud` whose tag list contains it.
dic_tag = reverse_dic(tag_of_interest, tag_cloud)
# Merge the two "interactive" tags into a single entry; going through a set
# removes the names that appear under both tags.
dic_tag['interactive'] = list(set(dic_tag['interactive exhibits'] + dic_tag['interactive displays']))
# remove the two key-value pairs since they have been combined into 'interactive'
dic_tag.pop('interactive exhibits') 
dic_tag.pop('interactive displays')
# Save the mapping as 'museum_tags.json' (write_json appends the extension).
write_json('museum_tags', dic_tag)