In [1]:
import ast
import json
import re

from collections import Counter

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.neighbors import NearestNeighbors

In [2]:
def _parse_list_literal(value):
    """Parse one serialized list written with either JSON or Python-repr quoting."""
    try:
        return json.loads(value)
    except ValueError:
        pass
    try:
        # Python reprs mix quote styles (e.g. ["Cote d'Ivoire", 'Spain']), so a
        # blind quote swap would corrupt any value containing an apostrophe.
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Last resort: the legacy single-to-double quote swap, which still
        # handles single-quoted text that uses JSON literals such as null.
        return json.loads(value.replace('\'', '\"'))


def series_string_to_list(series):
    """Decode a Series of serialized lists into actual Python lists.

    Missing values are treated as the empty list ``'[]'``. Each remaining
    value is parsed as JSON first, then as a Python literal, and finally with
    the legacy quote-swapping strategy.

    Parameters
    ----------
    series : pandas.Series
        Series whose non-missing values are strings holding a list.

    Returns
    -------
    pandas.Series
        Series of the same index containing the parsed objects.

    Raises
    ------
    ValueError
        If a value cannot be parsed by any of the strategies.
    """
    return series.fillna('[]').map(_parse_list_literal)

# Shared text-processing helpers used by the tokenizing functions.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the CSV indexed by its 'id' column, then decode the two columns that
# store lists as strings.
df = pd.read_csv('data/output_combined_30k_locations_plus.csv', index_col='id')
for list_column in ('countries', 'cities'):
    df[list_column] = series_string_to_list(df[list_column])

In [3]:
# Country -> place names (as they appear in df['cities']) that should also
# count as a mention of that country.
# Each country key must appear exactly once: a repeated key in a dict literal
# silently replaces the earlier list, dropping its entries from the mapping.
# NOTE(review): 'York' presumably catches "New York" -- confirm against the
# city extraction, since it would also match York in England.
custom_city_to_country_substitutions = {
    'Indonesia': ['Bali'],
    'England': ['London', 'Oxford', 'Liverpool', 'Manchester', 'Southampton',
                'Plymouth'],
    'United States of America': ['York', 'Florida', 'Miami', 'Chicago', 'Denver',
                                 'Colorado', 'Atlanta', 'Seattle', 'Washington'],
    'Mexico': ['Mexico'],
    'France': ['Paris'],
    'Canada': ['Toronto', 'Ontario'],
    'Italy': ['Venice', 'Florence'],
    'Ireland': ['Dublin', 'Ireland'],
    'Austria': ['Vienna'],
    'India': ['Manali', 'Goa', 'Mumbai', 'Delhi', 'Shimla', 'Udaipur'],
    'Spain': ['Barcelona'],
    'Germany': ['Berlin'],
    'Netherlands': ['Amsterdam'],
    'Portugal': ['Lisbon'],
    'Japan': ['Tokyo'],
    'Czechia': ['Prague'],
    'United Arab Emirates': ['Dubai', 'Ajman'],
}

# Inverted lookup: place name -> country.
inv_custom_city_to_country_substitutions = {
    word: root
    for root, words in custom_city_to_country_substitutions.items()
    for word in words
}

# Append the mapped country for every recognised city. The lists stored in
# df['countries'] are extended in place, so no reassignment is needed; note
# that re-running this cell appends the same countries again.
for countries, cities in zip(df['countries'], df['cities']):
    countries.extend(
        inv_custom_city_to_country_substitutions[city]
        for city in cities
        if city in inv_custom_city_to_country_substitutions
    )

In [4]:
# Tokens that are discarded when counting words (see wordCount).
drop_words = {'com', 'twitter', 'travel', 'vacation', 'holiday',
              'destination', 'flight', 'deal', '00', 'u',
              'english', 'bali', 'weekend', 'thanksgiving', 'christmas',
              'trip', 'best', 'new', 'news', 'london',
              'day', 'sale', 'traveller', 'book'}

# Root word -> synonyms that are collapsed onto it.
# A synonym may belong to only one root: the inverted map below can hold a
# single root per word, so a word listed twice would silently resolve to the
# later root. 'beautiful' and 'island' are therefore listed only under the
# root they effectively resolved to ('nature' and 'beach' respectively).
same_map = {
    'photo': ['pic', 'travelblogger', 'blog', 'photography', 'travelgram',
              'travelphotography', 'photographytour'],
    'private': ['getaway', 'escape'],
    'nature': ['experience', 'adventure', 'explore', 'outdoor', 'skiing',
               'beautiful', 'paradise', 'stunning', 'waterfall',
               'scenic', 'garden'],
    'beach': ['cruise', 'island', 'sunset', 'beachvacation'],
    'luxury': ['luxurytravel', 'resort', 'luxurypic', 'luxurious', 'spa', 'hospitality'],
    'hotel': ['hoteldeals'],
    'tour': ['guide'],
    'family': ['disney', 'familytimepic', 'familytrip', 'familypic', 'familytime', 'familytravel'],
    'romantic': ['forher', 'forhim', 'engagement', 'lovepic', 'weddingplanner',
                 'weddinginspiration', 'weddingstationery', 'lovetravel', 'lover', 'romance',
                 'dateideashttps', 'dateideas', 'dateideashttp', 'wedding'],
}

# Inverted lookup: synonym -> root word.
same_inv_map = {word: root for root, words in same_map.items() for word in words}

# Guard against reintroducing a synonym under two roots.
assert len(same_inv_map) == sum(len(words) for words in same_map.values()), \
    'a synonym is listed under more than one root in same_map'

# Matches, in order of preference at each position: an @mention, any single
# character that is not an ASCII letter, digit, space or tab, or a
# scheme://... URL. Written as a raw string so the regex escapes (\w, \S, \/)
# are not treated as invalid Python string escapes.
_NOISE_PATTERN = re.compile(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)')


def splitter(data):
    """Lower-case ``data``, blank out noise, and return lemmatized tokens.

    Parameters
    ----------
    data : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Whitespace-separated tokens of the cleaned text, each passed through
        the module-level ``lemmatizer``.
    """
    words = _NOISE_PATTERN.sub(' ', data.lower()).split()
    return [lemmatizer.lemmatize(word) for word in words]

def convert_same(text):
    """Tokenize ``text`` and swap every token found in ``same_inv_map`` for its root word.

    Tokens without an entry in ``same_inv_map`` are kept unchanged.
    """
    canonical_tokens = []
    for token in splitter(text):
        canonical_tokens.append(same_inv_map.get(token, token))
    return canonical_tokens

def wordCount(data):
    """Count canonical tokens across all of ``data['text']``.

    All texts are joined with spaces, tokenized through ``convert_same``, and
    tokens present in ``stop_words`` or ``drop_words`` are ignored.

    Parameters
    ----------
    data : mapping or pandas.DataFrame
        Object whose ``'text'`` entry is an iterable of strings.

    Returns
    -------
    list of (str, int)
        Token/count pairs ordered from most to least common.
    """
    # Build the exclusion set once instead of re-creating it for every token.
    ignored = stop_words | drop_words
    tokens = convert_same(' '.join(data['text']))
    return Counter(token for token in tokens if token not in ignored).most_common()

# wordCount(df)[:10]

In [5]:
# Canonical token list for every text.
df['tokens'] = df['text'].map(convert_same)

# Country -> lower-case tokens that indicate the country when they appear in
# the tokenized text.
# NOTE(review): tokens have been through the lemmatizer, so plural-looking
# names such as 'wales' may have been reduced to another form before this
# lookup -- confirm that every token listed here can actually occur.
missed_countries = {
    'Scotland': ['scotland'],
    'Czechia': ['czech', 'czechia'],
    'Wales': ['wales'],
    'England': ['england'],
    'United States of America': ['usa']
}

# Inverted lookup: token -> country.
inv_missed_countries = {word: root for root, words in missed_countries.items() for word in words}

# Append the mapped country for every matching token. The lists stored in
# df['countries'] are extended in place; re-running this cell appends again.
for countries, tokens in zip(df['countries'], df['tokens']):
    countries.extend(
        inv_missed_countries[token]
        for token in tokens
        if token in inv_missed_countries
    )

In [6]:
# Collect, for every country, the set of ids of the rows that mention it.
tweets_by_country = {}
for tweet_id, countries in df['countries'].items():
    for country in countries:
        tweets_by_country.setdefault(country, set()).add(tweet_id)

# One row per country, indexed by country name. The names are sorted so the
# row order is deterministic between runs; building the frame from an
# unordered set is also rejected by current pandas versions.
country_names = sorted(tweets_by_country)
data = pd.DataFrame(
    {'tweets': [tweets_by_country[country] for country in country_names]},
    index=pd.Index(country_names, name='countries'),
)

# Root words that form the dimensions of each feature vector, in order.
feature_words = ['photo', 'private', 'city', 'nature', 'beach', 'tour', 'family', 'romantic', 'hotel', 'luxury']

# Feature word -> position of that word in the feature vector.
feature_pos_di = {word: position for position, word in enumerate(feature_words)}


def vectorize_tokens(tokens):
    """Count how often each feature word occurs in ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to count; tokens that are not feature words are ignored.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(feature_words)`` whose i-th entry is the
        number of occurrences of ``feature_words[i]``.
    """
    vector = np.zeros(len(feature_words))
    # Single pass with a dictionary lookup instead of comparing every token
    # against every feature word.
    for token in tokens:
        position = feature_pos_di.get(token)
        if position is not None:
            vector[position] += 1
    return vector

def aggregate(tweets):
    """Add up the entries of ``tweet_ids_to_tokens`` for every id in ``tweets``."""
    per_tweet = [tweet_ids_to_tokens[tweet_id] for tweet_id in tweets]
    return sum(per_tweet)

# Feature-count vector of every row of df, keyed by the row's index label.
token_vectors = df['tokens'].map(vectorize_tokens)
tweet_ids_to_tokens = token_vectors.to_dict()

# Combined feature vector per country.
data['vectors'] = data['tweets'].map(aggregate)

# Ten countries with the largest total of feature-word counts.
feature_totals = data['vectors'].map(sum)
feature_totals.nlargest(10)

countries
England                     4702.0
Scotland                    2259.0
United States of America    1233.0
India                        705.0
Indonesia                    538.0
China                        399.0
Italy                        334.0
Canada                       328.0
Spain                        278.0
France                       271.0
Name: vectors, dtype: float64

In [7]:
def recommend(series, query, n_recommendations=5,
              excluded=('China', 'India', 'United States of America', 'England')):
    """Return the labels whose vectors are closest to the query's by cosine distance.

    Parameters
    ----------
    series : pandas.Series
        Feature vectors indexed by label (here: country name).
    query : label
        Index label of ``series`` to find neighbours for.
    n_recommendations : int, default 5
        Maximum number of labels to return.
    excluded : iterable of labels, optional
        Labels that are never recommended. Labels missing from ``series`` are
        ignored.
        NOTE(review): the default list presumably removes countries that
        dominate the data set -- confirm the intent.

    Returns
    -------
    list
        Up to ``n_recommendations`` labels, nearest first. The query itself is
        never included.

    Raises
    ------
    KeyError
        If ``query`` is not in ``series``.
    """
    query_vector = series.loc[query]

    # Remove the query by label rather than discarding the first neighbour:
    # the query is not guaranteed to be ranked first (ties), and it is not in
    # the candidate set at all when it is one of the excluded labels.
    candidates = (series
                  .drop(list(excluded), errors='ignore')
                  .drop([query], errors='ignore'))

    n_neighbors = min(n_recommendations, len(candidates))
    if n_neighbors < 1:
        return []

    matrix = pd.DataFrame(candidates.tolist()).values

    nn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='cosine').fit(matrix)
    neighbor_positions = nn.kneighbors([query_vector], return_distance=False)[0]

    return candidates.iloc[neighbor_positions].index.tolist()

In [8]:
# One row per country in `data`, holding the labels returned by recommend().
recommended_rows = [recommend(data['vectors'], country) for country in data.index]
recommendations = pd.DataFrame(recommended_rows, index=data.index)

In [9]:
# Spot-check the recommendations for a handful of countries.
sample_countries = ['Mauritius', 'Italy', 'Haiti', 'Malaysia', 'Spain']
recommendations.loc[sample_countries]

Unnamed: 0_level_0,0,1,2,3,4
countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mauritius,Seychelles,Cyprus,Barbados,Greece,Cambodia
Italy,Georgia,Bolivia,Australia,Mozambique,Madagascar
Haiti,Palau,Brazil,Malta,Greenland,Macao
Malaysia,Thailand,Myanmar,Seychelles,Israel,Cambodia
Spain,Luxembourg,Denmark,Cuba,Norway,Netherlands


In [10]:
# Write the full recommendation table to disk.
output_path = 'data/recommendations.csv'
recommendations.to_csv(output_path)