# Museum Recommendation System
## Cosine Similarity Metric Construction
Author: Anne Chen  
2016

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
from fancyimpute import KNN

Using Theano backend.


In [2]:
# Load the pre-dummified museum table and list its column names for reference
museum_df = pd.read_csv("dummified_df.csv")
list(museum_df.columns)

['Unnamed: 0',
 'Address',
 'Description',
 'FeatureCount',
 'Fee',
 'Langtitude',
 'Latitude',
 'LengthOfVisit',
 'MuseumName',
 'PhoneNum',
 'Rank',
 'Rating',
 'ReviewCount',
 'TotalThingsToDo',
 'Country',
 'State',
 'RankPercentage',
 'Ancient Ruins',
 'Architectural Buildings',
 'Arenas & Stadiums',
 'Art Galleries',
 'Art Museums',
 'Auto Race Tracks',
 'Ballets',
 'Battlefields',
 'Biking Trails',
 'Bodies of Water',
 'Bridges',
 'Castles',
 'Cemeteries',
 "Children's Museums",
 'City Tours',
 'Classes & Workshops',
 'Coffeehouses',
 'Concerts & Shows',
 'Cultural Tours',
 'Educational sites',
 'Factory Tours',
 'Flea & Street Markets',
 'Food & Drink',
 'Fun & Games',
 'Gardens',
 'Geologic Formations',
 'Gift & Specialty Shops',
 'Government Buildings',
 'Historic Sites',
 'Historic Walking Areas',
 'Historical & Heritage Tours',
 'History Museums',
 'Islands',
 'Lessons & Workshops',
 'Libraries',
 'Lighthouses',
 'Literary, Art & Music Tours',
 'Lookouts',
 'Military Bases 

In [3]:
# Drop the columns that are not used as features, keeping MuseumName as the
# row identifier. The columns are dropped by name rather than by position so
# that re-running this cell (or a change in the CSV layout) raises an error
# instead of silently removing the wrong columns.
cols_to_drop = ['Unnamed: 0', 'Address', 'Description', 'Fee', 'Langtitude',
                'Latitude', 'LengthOfVisit', 'PhoneNum', 'Country', 'State']
# Positional equivalent of cols_to_drop, kept for any later code that reads it
idx_to_drop = [museum_df.columns.get_loc(col) for col in cols_to_drop]
museum_df = museum_df.drop(cols_to_drop, axis=1)
museum_df.columns.tolist()

['FeatureCount',
 'MuseumName',
 'Rank',
 'Rating',
 'ReviewCount',
 'TotalThingsToDo',
 'RankPercentage',
 'Ancient Ruins',
 'Architectural Buildings',
 'Arenas & Stadiums',
 'Art Galleries',
 'Art Museums',
 'Auto Race Tracks',
 'Ballets',
 'Battlefields',
 'Biking Trails',
 'Bodies of Water',
 'Bridges',
 'Castles',
 'Cemeteries',
 "Children's Museums",
 'City Tours',
 'Classes & Workshops',
 'Coffeehouses',
 'Concerts & Shows',
 'Cultural Tours',
 'Educational sites',
 'Factory Tours',
 'Flea & Street Markets',
 'Food & Drink',
 'Fun & Games',
 'Gardens',
 'Geologic Formations',
 'Gift & Specialty Shops',
 'Government Buildings',
 'Historic Sites',
 'Historic Walking Areas',
 'Historical & Heritage Tours',
 'History Museums',
 'Islands',
 'Lessons & Workshops',
 'Libraries',
 'Lighthouses',
 'Literary, Art & Music Tours',
 'Lookouts',
 'Military Bases & Facilities',
 'Military Museums',
 'Mines',
 'Monuments & Statues',
 'Movie Theaters',
 'Museums',
 'National Parks',
 'Natural Hi

In [4]:
# Feature-only view of the table: remove the MuseumName identifier column.
# It is dropped by name (it sits at position 1 here) so the result does not
# depend on the current column order of museum_df.
no_name_df = museum_df.drop('MuseumName', axis=1)
no_name_df.columns.tolist()

['FeatureCount',
 'Rank',
 'Rating',
 'ReviewCount',
 'TotalThingsToDo',
 'RankPercentage',
 'Ancient Ruins',
 'Architectural Buildings',
 'Arenas & Stadiums',
 'Art Galleries',
 'Art Museums',
 'Auto Race Tracks',
 'Ballets',
 'Battlefields',
 'Biking Trails',
 'Bodies of Water',
 'Bridges',
 'Castles',
 'Cemeteries',
 "Children's Museums",
 'City Tours',
 'Classes & Workshops',
 'Coffeehouses',
 'Concerts & Shows',
 'Cultural Tours',
 'Educational sites',
 'Factory Tours',
 'Flea & Street Markets',
 'Food & Drink',
 'Fun & Games',
 'Gardens',
 'Geologic Formations',
 'Gift & Specialty Shops',
 'Government Buildings',
 'Historic Sites',
 'Historic Walking Areas',
 'Historical & Heritage Tours',
 'History Museums',
 'Islands',
 'Lessons & Workshops',
 'Libraries',
 'Lighthouses',
 'Literary, Art & Music Tours',
 'Lookouts',
 'Military Bases & Facilities',
 'Military Museums',
 'Mines',
 'Monuments & Statues',
 'Movie Theaters',
 'Museums',
 'National Parks',
 'Natural History Museums',

In [7]:
# Impute missing values using k-nearest-neighbours imputation.
# k = 40 is the value that yielded a decent prediction when classifying
# 'Rating' in 'TripAdvisor_Rating_Prediction.ipynb'.
# MuseumName is dropped by name so the result does not depend on column order.
no_name_df = museum_df.drop('MuseumName', axis=1)
X_filled_knn = KNN(k = 40).complete(no_name_df)
length = no_name_df.shape[0]
# Reuse the source frame's own index so the imputed rows stay aligned with
# museum_df['MuseumName'] whenever the two are combined on index.
imputed_df = pd.DataFrame(data = X_filled_knn,
                          index = no_name_df.index,
                          columns = no_name_df.columns)

imputed_df.head()

Computing pairwise distances between 1603 samples
Computing distances for sample #1/1603, elapsed time: 1.454
Computing distances for sample #101/1603, elapsed time: 2.093
Computing distances for sample #201/1603, elapsed time: 2.768
Computing distances for sample #301/1603, elapsed time: 3.387
Computing distances for sample #401/1603, elapsed time: 4.007
Computing distances for sample #501/1603, elapsed time: 4.632
Computing distances for sample #601/1603, elapsed time: 5.251
Computing distances for sample #701/1603, elapsed time: 5.874
Computing distances for sample #801/1603, elapsed time: 6.498
Computing distances for sample #901/1603, elapsed time: 7.151
Computing distances for sample #1001/1603, elapsed time: 7.814
Computing distances for sample #1101/1603, elapsed time: 8.874
Computing distances for sample #1201/1603, elapsed time: 9.736
Computing distances for sample #1301/1603, elapsed time: 10.613
Computing distances for sample #1401/1603, elapsed time: 11.690
Computing dista

Unnamed: 0,FeatureCount,Rank,Rating,ReviewCount,TotalThingsToDo,RankPercentage,Ancient Ruins,Architectural Buildings,Arenas & Stadiums,Art Galleries,...,descri_pol,descri_sub,LengthOfVisit__1-2 hours,LengthOfVisit__2-3 hours,LengthOfVisit__<1 hour,LengthOfVisit__More than 3 hours,Country__Other,Country__USA,Fee__No,Fee__Yes
0,3.0,8.0,4.5,6309.0,398.0,2.01005,0.0,0.0,0.0,0.0,...,0.49,0.506667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,12.0,2.0,5.0,36627.0,1028.0,0.194553,0.0,0.0,0.0,0.0,...,0.318182,0.477273,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,11.0,1.0,5.0,15611.0,319.0,0.31348,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,4.0,4.5,2564.0,231.0,1.731602,0.0,0.0,0.0,0.0,...,0.1,0.4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5.0,1.0,5.0,15532.0,614.0,0.162866,0.0,0.0,0.0,0.0,...,0.3,0.3375,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [8]:
# Re-attach the MuseumName column in front of the imputed feature columns,
# save the combined table for the app, then summarise it.
merged_df = pd.concat(
    [museum_df['MuseumName'], imputed_df],
    axis=1,
)
# (the name-less table could be exported instead with
#  imputed_df.to_csv('imputed_df_no_name.csv'))
merged_df.to_csv('./app/data/imputed_df_with_name.csv')
merged_df.describe()

Unnamed: 0,FeatureCount,Rank,Rating,ReviewCount,TotalThingsToDo,RankPercentage,Ancient Ruins,Architectural Buildings,Arenas & Stadiums,Art Galleries,...,descri_pol,descri_sub,LengthOfVisit__1-2 hours,LengthOfVisit__2-3 hours,LengthOfVisit__<1 hour,LengthOfVisit__More than 3 hours,Country__Other,Country__USA,Fee__No,Fee__Yes
count,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,...,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0
mean,1.007486,16.555209,4.4267,1540.467249,243.429195,9.297117,0.001248,0.006862,0.001871,0.006862,...,0.17722,0.39355,0.149719,0.123518,0.021834,0.054273,0.388022,0.611978,0.032439,0.115409
std,2.149806,28.900707,0.285016,3906.633803,308.115561,10.134265,0.035311,0.082579,0.043234,0.082579,...,0.159632,0.190507,0.356907,0.329134,0.146187,0.226627,0.487452,0.487452,0.177219,0.319614
min,0.0,1.0,2.5,32.0,1.0,0.0693,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,4.5,213.0,48.0,2.941176,0.0,0.0,0.0,0.0,...,0.095239,0.310554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,6.0,4.5,510.0,142.0,6.25,0.0,0.0,0.0,0.0,...,0.166667,0.38926,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,1.0,19.0,4.5,1257.0,314.5,12.5,0.0,0.0,0.0,0.0,...,0.24556,0.484623,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,27.0,397.0,5.0,63112.0,2279.0,100.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Function Development (Just Trying) 

In [9]:
def get_museum_lst(target_museum_input):
    '''Split a ';'-delimited selection string into a list of museum names.

    Whatever precedes the first ';' is dropped, so ';A;B' yields ['A', 'B'].
    '''
    pieces = target_museum_input.split(';')
    del pieces[0]  # discard the piece in front of the first separator
    return pieces

def get_master_srt_lst(museum_lst):
    '''Collect the top-five lists of every museum in museum_lst into one
    flat list, in the order the museums are given.'''
    combined = []
    for museum in museum_lst:
        combined.extend(get_top_five_for_one(museum))
    return combined

def sort_list(lst):
    '''Return a new list holding the entries of lst ordered by their second
    element, largest first; lst itself is not modified.'''
    ordered = list(lst)
    ordered.sort(key=lambda entry: entry[1], reverse=True)
    return ordered

def get_top_five_for_one(target_museum):
    '''Get the five museums most similar to one museum by cosine similarity.

    Reads the module-level frames museum_df (for its 'MuseumName' column) and
    imputed_df (the feature rows); the two are expected to be row-aligned.

    Args:
        target_museum: name of the museum to find neighbours for. The first
            row whose 'MuseumName' equals it is used.

    Returns:
        Up to five [museum name, cosine similarity] lists, most similar
        first. The target's own row is never included.

    Raises:
        IndexError: if no row of museum_df is named target_museum.
    '''
    is_target = (museum_df['MuseumName'] == target_museum).values
    if not is_target.any():
        raise IndexError('museum not found in museum_df: %r' % (target_museum,))
    # Work with row positions throughout so names and feature rows stay
    # matched even if the frames do not carry a default 0..n-1 index.
    target_pos = int(is_target.argmax())  # position of the first match
    names = museum_df['MuseumName'].values
    features = imputed_df.values

    # A single call scores the target against every row at once.
    input_vec = features[target_pos].reshape(1, -1)
    similarities = cosine_similarity(input_vec, features)[0]

    # Leave the target out by position rather than assuming it sorts first:
    # another row with identical features ties with it and could otherwise
    # push the target itself into the results.
    cos_sim = [[names[i], similarities[i]]
               for i in range(features.shape[0]) if i != target_pos]
    return sort_list(cos_sim)[:5]

def lst_to_dic(lst):
    '''Build a dictionary from a list of entries: the first element of each
    entry is the key and the second its value (a later duplicate key
    overwrites an earlier one).'''
    return dict((entry[0], entry[1]) for entry in lst)

def to_json(name, dic):
    '''Serialise dic as JSON into the file named name + '.json'.'''
    with open(name + '.json', 'w') as out_file:
        json.dump(dic, out_file)

def get_sorted_dic(lst):
    '''Number the entries of lst starting at 1, mapping each number to a
    new [first element, second element] list built from that entry.'''
    return dict(
        (rank, [entry[0], entry[1]])
        for rank, entry in enumerate(lst, 1)
    )
        
def exclude_selected(museum_lst, srt_lst):
    '''Return the entries of srt_lst whose first element is not in
    museum_lst, keeping their original order.'''
    remaining = []
    for entry in srt_lst:
        if entry[0] in museum_lst:
            continue
        remaining.append(entry)
    return remaining

In [10]:
# Reload the saved table; its first CSV column is discarded (to_csv writes the
# row index there by default).
museum_df = pd.read_csv("./app/data/imputed_df_with_name.csv")
museum_df = museum_df.drop(museum_df.columns[0], axis=1)
# Feature table for the similarity computation: everything except the museum
# name and the ReviewCount / TotalThingsToDo columns. They are named explicitly
# (they sit at positions 0, 4 and 5) so the selection does not depend on
# column order.
imputed_df = museum_df.drop(['MuseumName', 'ReviewCount', 'TotalThingsToDo'], axis=1)

In [11]:
# Preview the first rows of the reloaded table
museum_df.head()

Unnamed: 0,MuseumName,FeatureCount,Rank,Rating,ReviewCount,TotalThingsToDo,RankPercentage,Ancient Ruins,Architectural Buildings,Arenas & Stadiums,...,descri_pol,descri_sub,LengthOfVisit__1-2 hours,LengthOfVisit__2-3 hours,LengthOfVisit__<1 hour,LengthOfVisit__More than 3 hours,Country__Other,Country__USA,Fee__No,Fee__Yes
0,Newseum,3.0,8.0,4.5,6309.0,398.0,2.01005,0.0,0.0,0.0,...,0.49,0.506667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,The Metropolitan Museum of Art,12.0,2.0,5.0,36627.0,1028.0,0.194553,0.0,0.0,0.0,...,0.318182,0.477273,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,The National WWII Museum,11.0,1.0,5.0,15611.0,319.0,0.31348,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Denver Museum of Nature & Science,0.0,4.0,4.5,2564.0,231.0,1.731602,0.0,0.0,0.0,...,0.1,0.4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Art Institute of Chicago,5.0,1.0,5.0,15532.0,614.0,0.162866,0.0,0.0,0.0,...,0.3,0.3375,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [32]:
def get_unique_recom(master_srt_lst):
    '''Remove repeated museums from a recommendation list.

    Args:
        master_srt_lst: sequence of entries whose first element is the
            museum name and whose second element is its similarity score.

    Returns:
        A new list of [name, score] lists containing only the first entry
        seen for each name, in the order they appear in master_srt_lst.
    '''
    seen = set()  # names already emitted; gives O(1) membership checks
    uni_lst = []
    for entry in master_srt_lst:
        name = entry[0]
        if name in seen:
            continue
        seen.add(name)
        uni_lst.append([name, entry[1]])
    return uni_lst

In [38]:
# Selected museums, ';'-separated with a leading ';'
target_museum_input = ';British Museum;The Metropolitan Museum of Art'
# target_museum_input = ';Science Museum'
museum_lst = get_museum_lst(target_museum_input)
master_srt_lst = get_master_srt_lst(museum_lst)
# Rank before de-duplicating: get_unique_recom keeps the first entry it sees
# for each museum, so sorting first guarantees that a museum recommended for
# several inputs keeps its highest similarity.
sorted_lst = sort_list(master_srt_lst)
uni_lst = get_unique_recom(sorted_lst)
# Do not recommend the museums the user already selected
top_lst = exclude_selected(museum_lst, uni_lst)
sorted_dic = get_sorted_dic(top_lst)
to_json('./app/data/testing_top_five', sorted_dic)

In [39]:
# Ranked recommendations keyed by rank (1 = highest similarity)
sorted_dic

{1: ['The Museum of Modern Art (MoMA)', 0.99906464372393389],
 2: ['Chrysler Museum of Art', 0.99899748432815461],
 3: ['Kelvingrove Art Gallery and Museum', 0.99888187628993297],
 4: ['Tenement Museum', 0.99839705749069696],
 5: ['Philadelphia Museum of Art', 0.99806614954842421],
 6: ['Derby Museum and Art Gallery', 0.99769219462338976],
 7: ['Dubai Museum', 0.99765541530093471],
 8: ['Birmingham Museum & Art Gallery', 0.99757124988299339],
 9: ['The Walters Art Museum', 0.9974297895439872]}

In [20]:
# NOTE(review): this cell's execution count (20) is lower than the pipeline
# cell's, and its saved output lists one museum twice, so the output looks
# stale; re-run to refresh.
top_lst

[['The Museum of Modern Art (MoMA)', 0.99906464372393389],
 ['Chrysler Museum of Art', 0.99899748432815461],
 ['Kelvingrove Art Gallery and Museum', 0.99888187628993297],
 ['Tenement Museum', 0.99839705749069696],
 ['The Museum of Modern Art (MoMA)', 0.99815697155943672],
 ['Philadelphia Museum of Art', 0.99806614954842421],
 ['Derby Museum and Art Gallery', 0.99769219462338976],
 ['Dubai Museum', 0.99765541530093471],
 ['Birmingham Museum & Art Gallery', 0.99757124988299339],
 ['The Walters Art Museum', 0.9974297895439872]]

In [13]:
# Features that are included in calculating the cosine similarity.
# The print call is written so it produces the same text on Python 2 and 3.
print('number of features: %d' % len(imputed_df.columns))
imputed_df.columns

number of features: 219


Index([u'FeatureCount', u'Rank', u'Rating', u'RankPercentage',
       u'Ancient Ruins', u'Architectural Buildings', u'Arenas & Stadiums',
       u'Art Galleries', u'Art Museums', u'Auto Race Tracks',
       ...
       u'descri_pol', u'descri_sub', u'LengthOfVisit__1-2 hours ',
       u'LengthOfVisit__2-3 hours ', u'LengthOfVisit__<1 hour ',
       u'LengthOfVisit__More than 3 hours ', u'Country__Other',
       u'Country__USA', u'Fee__No ', u'Fee__Yes '],
      dtype='object', length=219)

In [14]:
# Tabulate the feature names, one per row, alongside their position
pd.DataFrame({'colname':imputed_df.columns.values})

Unnamed: 0,colname
0,FeatureCount
1,Rank
2,Rating
3,RankPercentage
4,Ancient Ruins
5,Architectural Buildings
6,Arenas & Stadiums
7,Art Galleries
8,Art Museums
9,Auto Race Tracks
