In [1]:
import re
import json

import numpy as np
import pandas as pd

from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.neighbors import NearestNeighbors

In [2]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) 

df = pd.read_csv('output_combined_30k_locations.csv', index_col='id')
df['countries'] = (df['countries']
                   .fillna('[]')
                   .map(lambda v: v.replace('\'', '\"'))
                   .map(lambda v: json.loads(v)))

In [3]:
drop_words = {'com', 'twitter', 'travel', 'vacation', 'holiday', 
              'destination', 'flight', 'deal', '00', 'u', 
              'english', 'bali', 'weekend', 'thanksgiving', 'christmas', 
              'trip', 'best', 'new', 'news', 'london', 
              'day', 'sale', 'traveller', 'book'}

same_map = {
    'photo': ['pic', 'travelblogger', 'blog', 'photography', 'travelgram', 
              'travelphotography', 'beautiful', 'photographytour'],
    'private': ['getaway', 'escape'],
    'nature': ['experience', 'adventure', 'explore', 'outdoor'],
    'beach': ['cruise', 'resort', 'island', 'sunset'],
    'city': ['hotel', 'hoteldeals', 'luxury'],
    'tour': ['guide']
}

same_inv_map = {word: root for root, words in same_map.items() for word in words}

def splitter(data):
    words = re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ', data.lower()).split()
    return [lemmatizer.lemmatize(word) for word in words]

def convert_same(text):
    return [(same_inv_map[word] if word in same_inv_map else word) for word in splitter(text)]

def wordCount(data):
    filtered = [w for w in convert_same(' '.join(data['text'])) if w not in stop_words|drop_words]
    return Counter(filtered).most_common()

wordCount(df)[:10]

[('photo', 13537),
 ('private', 7865),
 ('beach', 6644),
 ('nature', 6610),
 ('city', 6456),
 ('tour', 2981),
 ('family', 2196),
 ('gift', 1808),
 ('happy', 1679),
 ('time', 1640)]

In [4]:
data = pd.DataFrame(set(df['countries'].sum()), columns=['countries']).set_index('countries')
data['tweets'] = 'temp-value'
data['tweets'] = data['tweets'].map(lambda _: set())

for tweet_id, row in df.iterrows():
    for country in row['countries']:
        data.loc[country]['tweets'].add(tweet_id)

feature_words = ['photo', 'private', 'city', 'nature', 'beach', 'tour', 'family']

feature_pos_di = dict(zip(feature_words, list(range(len(feature_words)))))

def vectorize_tokens(tokens):
    vector = np.zeros(len(feature_words))
    for index, feature_word in enumerate(feature_words):
        for token in tokens:
            if token == feature_word:
                vector[index] += 1
    return vector

def aggregate(tweets): return sum([tweet_ids_to_tokens[tweet] for tweet in tweets])

tweet_ids_to_tokens = df['text'].map(convert_same).map(vectorize_tokens).to_dict()

data['vectors'] = data['tweets'].map(aggregate)

In [5]:
data.drop(['China', 'India'], inplace=True)

data['vectors'].map(sum).nlargest(10)

countries
Italy        373.0
Australia    320.0
Spain        307.0
Thailand     278.0
Canada       266.0
Mexico       265.0
France       237.0
Jamaica      206.0
Turkey       173.0
Malaysia     158.0
Name: vectors, dtype: float64

In [6]:
def recommend(series, query):
    query_vector = series.loc[query]
    matrix = pd.DataFrame(series.tolist()).values

    nn = NearestNeighbors(n_neighbors=1+5, algorithm='auto', metric='cosine').fit(matrix)
    distances, similar_items = nn.kneighbors([query_vector])

    similar_item_names = series.iloc[similar_items[0]].index.tolist()
    
    return similar_item_names[1:]

In [7]:
recommend(data['vectors'], 'Australia')

['Japan', 'Morocco', 'Italy', 'Romania', 'Georgia']