In [82]:
# the essentials
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
%matplotlib inline

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics import pairwise_distances

# scipy
from scipy.sparse import coo_matrix

import re
import pickle
import random
from tqdm.notebook import tqdm as tqdm  # progress bar for x in tqdm(range(100))

# Collaborative Filtering

In [83]:
# Load the pickled city information
with open('city_info_final.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

# the wifi and link columns are not needed
df = df.drop(columns=['wifi', 'link'])

# remove manhattan because New York City is already a city we have in the data
df = df.loc[df['city'] != 'manhattan'].reset_index()

In [84]:
df.shape

(1010, 19)

In [85]:
df.head()

Unnamed: 0,index,city,country,city_and_country,score,cost/month,fun,safety,quality_of_life,walkability,happiness,nightlife,friendly_to_foreigners,english,avg_trip_length_days,return_rate_percent,hotel_price_night,airbnb_price_night,visitors
0,0,buenos-aires,Argentina,buenos aires argentina,4.88,1026.0,Good,Okay,Good,Great,Good,Great,Good,Okay,25.0,16.0,34.0,24.0,"[/@krausefx, /@aczuleta, /@alexanderjoo, /@sil..."
1,1,bangkok,Thailand,bangkok thailand,4.73,1522.0,Good,Good,Good,Great,Good,Great,Great,Okay,7.0,18.0,31.0,51.0,"[/@remyp, /@manas, /@timrael, /@dimqen, /@dani..."
2,2,mexico-city,Mexico,mexico city mexico,4.72,1493.0,Good,Okay,Good,Great,Good,Good,Good,Okay,14.0,14.0,30.0,31.0,"[/@rohit, /@nadiaronquillo, /@evelienal, /@bri..."
3,3,canggu,Indonesia,canggu indonesia,4.69,1389.0,Good,Great,Good,Okay,Okay,Good,Great,Good,28.0,17.0,21.0,58.0,"[/@tris, /@joytravels, /@guar47, /@mariebriand..."
4,4,chiang-mai,Thailand,chiang mai thailand,4.68,1126.0,Good,Great,Good,Great,Good,Okay,Good,Okay,28.0,15.0,25.0,41.0,"[/@meedamian, /@zapperen, /@john, /@kymellis, ..."


## References

**This section builds the dictionaries/lists that are used later as lookups for:**  
    - city names
    - country names
    - indexes

In [86]:
# Controls how many content-based recommendations are returned:
# wiki_svd_recommender returns total_num_recs - 1 cities, because the nearest
# entry (expected to be the input city itself) is not counted.
total_num_recs = 21

In [87]:
# "city country" label for every row of df, in row order; zipped against each
# user's visit flags further down to build the per-user travel history
city_and_country = df['city_and_country']
city_and_country.head()

0    buenos aires argentina
1          bangkok thailand
2        mexico city mexico
3          canggu indonesia
4       chiang mai thailand
Name: city_and_country, dtype: object

In [88]:
# city names as stored in df['city'], in row order; used later as the column
# labels of the user-by-city matrix
cities = df['city']
cities.head()

0    buenos-aires
1         bangkok
2     mexico-city
3          canggu
4      chiang-mai
Name: city, dtype: object

In [89]:
# row index -> "city country" label; used to turn the integer indexes produced by
# the fuzzy matcher and the pairwise distances back into readable names
city_country_dict = dict(df['city_and_country'])
city_country_dict

{0: 'buenos aires argentina',
 1: 'bangkok thailand',
 2: 'mexico city mexico',
 3: 'canggu indonesia',
 4: 'chiang mai thailand',
 5: 'budapest hungary',
 6: 'taipei taiwan',
 7: 'prague czechia',
 8: 'medellin colombia',
 9: 'sofia bulgaria',
 10: 'istanbul turkey',
 11: 'phuket thailand',
 12: 'ho chi minh city vietnam',
 13: 'belgrade serbia',
 14: 'tbilisi georgia',
 15: 'lisbon portugal',
 16: 'kuala lumpur malaysia',
 17: 'sarajevo bosnia',
 18: 'da nang vietnam',
 19: 'warsaw poland',
 20: 'krakow poland',
 21: 'saint petersburg russia',
 22: 'valencia spain',
 23: 'ubud indonesia',
 24: 'hanoi vietnam',
 25: 'zagreb croatia',
 26: 'porto portugal',
 27: 'shanghai china',
 28: 'ko tao thailand',
 29: 'barcelona spain',
 30: 'rio de janeiro brazil',
 31: 'montreal canada',
 32: 'bucharest romania',
 33: 'sao paulo brazil',
 34: 'vilnius lithuania',
 35: 'cancun mexico',
 36: 'santiago chile',
 37: 'berlin germany',
 38: 'bansko bulgaria',
 39: 'pattaya thailand',
 40: 'antalya t

In [90]:
# city/visitor information: keep only the city name and its list of visitor handles
collab_df = df[['city', 'visitors']]
collab_df

Unnamed: 0,city,visitors
0,buenos-aires,"[/@krausefx, /@aczuleta, /@alexanderjoo, /@sil..."
1,bangkok,"[/@remyp, /@manas, /@timrael, /@dimqen, /@dani..."
2,mexico-city,"[/@rohit, /@nadiaronquillo, /@evelienal, /@bri..."
3,canggu,"[/@tris, /@joytravels, /@guar47, /@mariebriand..."
4,chiang-mai,"[/@meedamian, /@zapperen, /@john, /@kymellis, ..."
...,...,...
1005,northampton,"[/@iainc, /@john]"
1006,lincoln,"[/@cpitkin, /@residualdata, /@graham, /@brody,..."
1007,noumea,[/@shadiosta]
1008,ashland,"[/@camposped, /@adamnowek, /@mjmeidinger]"


In [91]:
# Turn each city's list of visitors into one 0/1 indicator ("dummy") column per user handle.
# This also removes duplicates: a handle repeated within a row is flagged once.
from sklearn.preprocessing import MultiLabelBinarizer

# one list of visitor handles per city row
split = collab_df['visitors']
mlb = MultiLabelBinarizer()
# rows = cities (same order as df), columns = user handles (mlb.classes_)
collab_df = pd.DataFrame(mlb.fit_transform(split),columns=mlb.classes_)

collab_df

Unnamed: 0,/@0x72,/@10kjuan,/@30andawakeup,/@9102180,/@_kp_,/@a7dc,/@a_malchenko,/@aaatelier_ejay,/@aabergkvist,/@aakashdhuna,...,/@zmontagu,/@zonorm,/@zot24,/@zoxel,/@zpeirce,/@zrueda,/@zsolt,/@zsoltee,/@ztargos,/@zwasham
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
# swap rows and columns so that each row is a user and each column is a city
# NOTE(review): collab_df is reassigned to its own transpose, so this cell is not
# idempotent — re-running it without re-running the previous cell flips the frame back
collab_df = collab_df.transpose()

# label the columns with the city names (same order as the rows of df)
collab_df.columns = cities

collab_df

city,buenos-aires,bangkok,mexico-city,canggu,chiang-mai,budapest,taipei,prague,medellin,sofia,...,amundsen-scott,saint-helier,shrewsbury,charlotte-amalie,napa,northampton,lincoln,noumea,ashland,monrovia
/@0x72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/@10kjuan,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
/@30andawakeup,0,1,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
/@9102180,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/@_kp_,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/@zrueda,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/@zsolt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/@zsoltee,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/@ztargos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# store the mostly-zero user-by-city DataFrame (collab_df) as a SciPy COO (coordinate) sparse matrix
coo = coo_matrix(collab_df)
coo

<3296x1010 sparse matrix of type '<class 'numpy.int64'>'
	with 35665 stored elements in COOrdinate format>

In [94]:
# Dimensionality reduction: project each user's city-visit row onto 100 latent components
# optimal n_components determined by looking at recommendations manually
collab_svd_model = TruncatedSVD(n_components=100, random_state=42)

# one 100-dimensional vector per user (row of coo)
collab_user_vec = collab_svd_model.fit_transform(coo)

In [95]:
# how each user fits into n_components
collab_user_vec.shape

(3296, 100)

In [96]:
# Build a user -> list of visited "city country" labels lookup from the
# user-by-city 0/1 matrix (collab_df).
user_visited_dict = {}

for user in collab_df.index:
    visited_flags = collab_df.loc[user].values
    # Keep the label wherever the user's flag is set. An explicit truth test
    # replaces the int * str repetition trick, which evaluated the product twice
    # per cell and would have repeated the label for any flag greater than 1.
    user_visited_dict[user] = [place for flag, place in zip(visited_flags, city_and_country)
                               if flag and place != '']

user_visited_dict

{'/@0x72': ['bratislava slovakia',
  'ljubljana slovenia',
  'madrid spain',
  'frankfurt germany',
  'granada spain',
  'bordeaux france',
  'florence italy',
  'paris france',
  'bilbao spain',
  'munich germany'],
 '/@10kjuan': ['budapest hungary',
  'medellin colombia',
  'belgrade serbia',
  'lisbon portugal',
  'sarajevo bosnia',
  'warsaw poland',
  'krakow poland',
  'zagreb croatia',
  'porto portugal',
  'barcelona spain',
  'rio de janeiro brazil',
  'bucharest romania',
  'sao paulo brazil',
  'berlin germany',
  'bratislava slovakia',
  'vienna austria',
  'brasov romania',
  'moscow russia',
  'fortaleza brazil',
  'minsk belarus',
  'montanita ecuador',
  'madrid spain',
  'guayaquil ecuador',
  'manaus brazil',
  'kotor montenegro',
  'mumbai india',
  'dubrovnik croatia',
  'split croatia',
  'paris france',
  'munich germany',
  'andorra la vella andorra',
  'new york city united states',
  'miami united states',
  'fort lauderdale united states',
  'key west united s

In [97]:
# positional index -> user handle, following the row order of collab_df
user_dict = {position: handle for position, handle in enumerate(collab_df.index)}
user_dict

{0: '/@0x72',
 1: '/@10kjuan',
 2: '/@30andawakeup',
 3: '/@9102180',
 4: '/@_kp_',
 5: '/@a7dc',
 6: '/@a_malchenko',
 7: '/@aaatelier_ejay',
 8: '/@aabergkvist',
 9: '/@aakashdhuna',
 10: '/@aakashg',
 11: '/@aaleksandar',
 12: '/@aaranmcguire',
 13: '/@aaronbailey',
 14: '/@aaronpyon',
 15: '/@abbsterrr',
 16: '/@abbygmcclain',
 17: '/@abhishek77in',
 18: '/@abhshkgpt93',
 19: '/@abigaillind',
 20: '/@abitharackal',
 21: '/@ablibereco',
 22: '/@aboundlessworld',
 23: '/@abstaa',
 24: '/@absyah',
 25: '/@acallaghan',
 26: '/@acharlop',
 27: '/@acormail',
 28: '/@acsaari',
 29: '/@aczuleta',
 30: '/@adambunch',
 31: '/@adamhalleyprinable',
 32: '/@adamloke',
 33: '/@adammo',
 34: '/@adamnowek',
 35: '/@adamturnbulluk',
 36: '/@adamwebb',
 37: '/@adboson',
 38: '/@adconk',
 39: '/@additionaltest',
 40: '/@adeal',
 41: '/@adimoldovan',
 42: '/@adityarao310',
 43: '/@adl5166',
 44: '/@adomasb',
 45: '/@adriaan',
 46: '/@adrianavecc',
 47: '/@adrianchinghc',
 48: '/@adrianlazarte',
 49: '

## Model

In [98]:
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 

# match the input string to the closest city_and_country available
def match_words(input_word):
    """Return the index of the ``city_country_dict`` entry closest to ``input_word``.

    Every entry is scored with ``fuzz.token_sort_ratio`` (the higher the score,
    the closer the two strings).  The index with the highest score is returned;
    when several entries share the best score the largest index wins.
    """
    def score_and_index(index):
        # (similarity, index) so that ties on similarity fall back to the index
        return fuzz.token_sort_ratio(input_word, city_country_dict[index]), index

    return max(range(0, len(city_country_dict)), key=score_and_index)

In [99]:
# This function creates TFIDF vectors for documents that have already been tokenized
# (normally a document has to be a single string of words, not a list of tokens)
def tfidf_for_tokens(tokenized_corpus):
    """Compute TF-IDF weights for documents that are already lists of tokens.

    Parameters
    ----------
    tokenized_corpus : iterable of list of str
        One token list per document.  Tokens are used exactly as given
        (no lowercasing, no stop-word removal, no further splitting).

    Returns
    -------
    tuple
        ``(dense_matrix, feature_names)``: the dense document-by-token TF-IDF
        matrix and the list of tokens labelling its columns.
    """
    def identity_tokenizer(text):
        # documents are already tokenized, so hand them back unchanged
        return text

    # token_pattern=None: the default regex is unused once a tokenizer is given,
    # and leaving it set makes scikit-learn warn about it
    tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, token_pattern=None,
                            stop_words=None, lowercase=False)
    rec_matrix = tfidf.fit_transform(tokenized_corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back for older installations. A list is returned either way.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = list(tfidf.get_feature_names_out())
    else:
        feature_names = tfidf.get_feature_names()

    return rec_matrix.todense(), feature_names

In [100]:
# user_input is the list of places the user has already been to
# each entry should be a city AND country to be more specific (some countries have cities with the same name)
def collab_user_user_recommender_tfidf(user_input, top_user_fraction=0.01, num_recs=10):
    """Recommend cities based on what the most similar users have visited.

    Each entry of ``user_input`` is fuzzy-matched to a known "city country"
    name.  The matches are encoded as a 0/1 visit vector, projected with the
    already-fitted ``collab_svd_model``, and the known users are ranked by
    cosine distance to that projection.  The places the closest users visited
    (minus the ones entered) are scored with TF-IDF and summed per city.

    Parameters
    ----------
    user_input : iterable of str
        Places the user has been to; spelling is corrected by ``match_words``.
    top_user_fraction : float, default 0.01
        Fraction of all users whose travel histories are collected.  Users
        with nothing new to contribute are skipped and do not count.
    num_recs : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``city_recommendation`` and ``score``, highest score first.
        Empty when no similar user contributes an unvisited place.
    """
    # visit vector for the new user: all zeros until their places are matched
    new_user_input = np.zeros(len(city_country_dict))

    # fix incorrect spelling and flag every matched city as visited
    # (one fuzzy match per input; the corrected name and its index come together)
    places = []
    for city in user_input:
        city_index = match_words(city)
        matched_place = city_country_dict[city_index]
        places.append(matched_place)
        new_user_input[city_index] = 1
        print('You entered: ', matched_place)

    # transform only, since the SVD model is already fitted
    new_user_vec = collab_svd_model.transform(coo_matrix(new_user_input))
    new_user_dist = pairwise_distances(new_user_vec, collab_user_vec, metric='cosine')

    # users ordered from most to least similar to the new user
    most_similar_users = [user_dict[index] for index in new_user_dist[0].argsort()]
    max_contributing_users = int(len(most_similar_users) * top_user_fraction)

    possible_recs = []
    for user in most_similar_users:
        # stop as soon as enough users have contributed instead of scanning them all
        if len(possible_recs) >= max_contributing_users:
            break
        rec = [place for place in user_visited_dict[user] if place not in places]
        # skip users who have been nowhere the new user has not
        if rec:
            possible_recs.append(rec)

    # nothing to score: TF-IDF cannot be fitted on an empty corpus
    if not possible_recs:
        return pd.DataFrame(columns=['city_recommendation', 'score'])

    # TF-IDF over the collected travel histories, summed per city across users
    tfidf_dense_matrix, column_names = tfidf_for_tokens(possible_recs)

    tfidf_df = pd.DataFrame(tfidf_dense_matrix)
    tfidf_df.columns = column_names
    recommendations = pd.DataFrame(tfidf_df.sum().sort_values(ascending=False)).reset_index()

    recommendations.columns = ['city_recommendation', 'score']

    return recommendations.iloc[:num_recs]

In [101]:
# The user can input their own "Travel History"
# i.e as many cities as they want 
collab_user_user_recommender_tfidf(['Tokyo Japan', 'miami united states'])

You entered:  tokyo japan
You entered:  miami united states


Unnamed: 0,city_recommendation,score
0,bangkok thailand,2.57177
1,medellin colombia,2.500577
2,los angeles united states,2.233932
3,ottawa canada,1.866526
4,kyoto japan,1.652586
5,helsinki finland,1.499709
6,boston united states,1.332476
7,mexico city mexico,1.042445
8,amsterdam netherlands,1.0
9,philadelphia united states,1.0


# Content Based Filtering

In [102]:
# Load the cleaned, tokenized wikivoyage description of each city
with open('tokenized_corpus_final2.pkl', 'rb') as corpus_file:
    tokenized_corpus_final = pickle.load(corpus_file)

In [103]:
tokenized_corpus_final[1]

['thai',
 'krung',
 'thep',
 'population',
 'inhabitant',
 'heavy',
 'traffic',
 'congestion',
 'intense',
 'heat',
 'naughty',
 'nightlife',
 'warm',
 'welcome',
 'impression',
 'mislead',
 'asia',
 'cosmopolitan',
 'magnificent',
 'palace',
 'authentic',
 'canal',
 'busy',
 'vibrant',
 'nightlife',
 'trading',
 'chao',
 'phraya',
 'river',
 'king',
 'rama',
 'monarch',
 'chakri',
 'dynasty',
 'siam',
 'burn',
 'ayutthaya',
 'burmese',
 'invader',
 'treasure',
 'function',
 'spiritual',
 'cultural',
 'commercial',
 'educational',
 'diplomatic',
 'modern',
 'hum',
 'nightlife',
 'fervor',
 'khet',
 'sub',
 'khwaeng',
 'visitor',
 'conceptual',
 'division',
 'useful',
 'province',
 'pathom',
 'nonthaburi',
 'northwest',
 'pathum',
 'chachoengsao',
 'samut',
 'prakan',
 'southeast',
 'samut',
 'sakhon',
 'southwest',
 'degree',
 'equator',
 'tropical',
 'metropolis',
 'traveller',
 'friendly',
 'asia',
 'furious',
 'assault',
 'visitor',
 'confront',
 'heat',
 'pollution',
 'unpleasant',

In [104]:
# collapse each city's token list into one space-separated block of text
corpus = list(map(' '.join, tokenized_corpus_final))

In [105]:
# TF-IDF Vectorizer
# max_df=0.5 ignores terms that appear in more than half of the documents;
# min_df=0.03 ignores terms that appear in fewer than 3% of them
tfidf = TfidfVectorizer(max_df=0.5, min_df=0.03)
tfidf_matrix = tfidf.fit_transform(corpus)

In [106]:
tfidf_matrix.shape

(1011, 2505)

In [107]:
# Dimensionality reduction: compress the TF-IDF term space to 75 latent components
wiki_svd_model = TruncatedSVD(n_components=75, random_state=42)

# one 75-dimensional vector per document in corpus
wiki_svd_city_vec = wiki_svd_model.fit_transform(tfidf_matrix)
wiki_svd_city_vec.shape

(1011, 75)

In [108]:
# cosine distance between every pair of city vectors (square matrix, one row and column per document)
wiki_svd_city_dist = pairwise_distances(wiki_svd_city_vec, metric='cosine')
wiki_svd_city_dist.shape

(1011, 1011)

## Model

In [109]:
def wiki_svd_recommender(input_city, num_recs=None):
    """Recommend the cities whose descriptions are closest to ``input_city``.

    ``input_city`` is fuzzy-matched to a known "city country" name, then the
    other cities are ranked by their precomputed cosine distance to it
    (``wiki_svd_city_dist``).

    Parameters
    ----------
    input_city : str
        City the user liked; spelling is corrected by ``match_words``.
    num_recs : int, optional
        Number of cities to return.  Defaults to ``total_num_recs - 1``.

    Returns
    -------
    list of str
        "city country" names, closest first, never including the input city.
    """
    # NOTE(review): the notebook outputs show a 1011-row distance matrix but a
    # city lookup built from 1010 rows — confirm that row i of the corpus really
    # corresponds to city_country_dict[i].
    city_index = match_words(input_city)
    print('You entered: ', city_country_dict[city_index])

    if num_recs is None:
        num_recs = total_num_recs - 1

    # Exclude the query city by index rather than assuming it sorts first:
    # when another city is also at distance 0, argsort may not put it on top.
    ranked = [index for index in wiki_svd_city_dist[city_index].argsort()
              if index != city_index]

    return [city_country_dict[index] for index in ranked[:num_recs]]

In [110]:
wiki_svd_recommender('new york city')

You entered:  new york city united states


['philadelphia united states',
 'chicago united states',
 'southampton united kingdom',
 'buffalo united states',
 'atlanta united states',
 'taipei taiwan',
 'rio de janeiro brazil',
 'san francisco united states',
 'tel aviv israel',
 'new orleans united states',
 'sao paulo brazil',
 'charlotte united states',
 'louisville united states',
 'copenhagen denmark',
 'seattle united states',
 'rochester mn united states',
 'minneapolis united states',
 'aarhus denmark',
 'salvador brazil',
 'beijing china']

# Hybrid Recommendations

In [115]:
collab_user_user_recommender_tfidf(['Tokyo', 'Miami United States'])

You entered:  tokyo japan
You entered:  miami united states


Unnamed: 0,city_recommendation,score
0,bangkok thailand,2.57177
1,medellin colombia,2.500577
2,los angeles united states,2.233932
3,ottawa canada,1.866526
4,kyoto japan,1.652586
5,helsinki finland,1.499709
6,boston united states,1.332476
7,mexico city mexico,1.042445
8,amsterdam netherlands,1.0
9,philadelphia united states,1.0


In [113]:
wiki_svd_recommender('Tokyo')

You entered:  tokyo japan


['kyoto japan',
 'osaka japan',
 'daejeon south korea',
 'taipei taiwan',
 'fukuoka japan',
 'kobe japan',
 'sapporo japan',
 'beijing china',
 'taichung taiwan',
 'singapore singapore',
 'hiroshima japan',
 'shanghai china',
 'dalian china',
 'nanjing china',
 'sendai japan',
 'seoul south korea',
 'new york city united states',
 'gwangju south korea',
 'tainan taiwan',
 'zhengzhou china']

In [117]:
wiki_svd_recommender('Miami United States')

You entered:  miami united states


['orlando united states',
 'pensacola united states',
 'tampa united states',
 'fort lauderdale united states',
 'daytona beach united states',
 'sarasota united states',
 'jacksonville united states',
 'santa barbara united states',
 'port st lucie united states',
 'melbourne fl united states',
 'buenos aires argentina',
 'fort myers united states',
 'buffalo united states',
 'birmingham al united states',
 'zurich switzerland',
 'atlanta united states',
 'seattle united states',
 'chicago united states',
 'phoenix united states',
 'raleigh united states']

By combining the Collaborative and Content-Based models, a user gets more robust recommendations than by simply looking at the most popular places to visit in the world.