In [31]:
import json
import numpy as np
import nltk
from pprint import pprint
from nltk.corpus import wordnet
from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 
import gensim 
from gensim.models import Word2Vec 
import os

nltk.download("wordnet")
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


[nltk_data] Downloading package wordnet to /home/vatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/vatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
from glove import Corpus, Glove
corpus = Corpus() 

In [33]:
glob_corpora = []
top100 = ['#travel', '#wanderlust', '#nature', '#travelling', '#traveling', '#traveller', '#photography', '#traveler', '#trip', '#travels', '#vacation', '#love', '#travelers', '#adventure', '#tourist', '#landscape', '#travellers', '#holiday', '#explore', '#beautiful', '#tourism', '#hiking', '#beach', '#photo', '#sunset', '#photographer', '#mountains', '#globetrotter', '#summer', '#art', '#sky', '#treking', '#europe', '#view', '#architecture', '#sea', '#fun', '#happy', '#city', '#sun', '#amazing', '#lifestyle', '#backpacking', '#wanderer', '#italy', '#follow', '#life', '#visiting', '#fashion', '#autumn', '#ocean', '#outdoors', '#explorer', '#world', '#india', '#beauty', '#mountain', '#spain', '#style', '#backpacker', '#like', '#clouds', '#france', '#exploring', '#trekking', '#asia', '#me', '#friends', '#canon', '#usa', '#blogger', '#ig', '#happiness', '#sunrise', '#smile', '#holidays', '#girl', '#wander', '#germany', '#island', '#paradise', '#Travel', '#flowers', '#discover', '#voyage', '#turkey', '#sightseeing', '#landscapes', '#italia', '#outdoor', '#cute', '#indonesia', '#history', '#food', '#pic', '#forest', '#beaches', '#inspiration', '#green', '#memories']
top100 = [r.replace('#', '') for r in top100]

In [34]:
top100_stemmed = [ps.stem(word) for word in top100]
top100_mapping = {}
for i in range(0,100):
    top100_mapping[top100_stemmed[i]]=top100[i]

In [8]:
def strip_punc(s):
    new_str = ""
    for c in s:
        if c in punctuation:
            new_str += " "
        else:
            new_str += c
    return new_str

In [9]:
for file in os.listdir("./jsons/"):
    print("./jsons/" + file)
    file_ptr = open("./jsons/" + file, "r")
    dic = json.load(file_ptr)
    for post in dic:
        text = strip_punc(dic[post]['text_des'].lower())
        toks = word_tokenize(text)
        toks_ = []
        for tok in toks:
            if not wordnet.synsets(tok) or len(tok) < 3:
                continue
            tok = ps.stem(tok)
            if tok not in stop_words:
                toks_.append(tok)
        if toks_:
            glob_corpora.append(toks_)


./jsons/Luxarytravel.json
./jsons/travelbook.json
./jsons/Traveldeeper.json
./jsons/Hiking.json
./jsons/travelquotes.json
./jsons/travelstoke.json
./jsons/travel.json
./jsons/traveladdict.json
./jsons/travellersnotebook.json
./jsons/travelguide.json
./jsons/Travelabout.json
./jsons/trip.json
./jsons/solotravel.json
./jsons/Travelphotography.json
./jsons/travelbug.json
./jsons/travelpic.json
./jsons/travelgram.json
./jsons/tourist.json
./jsons/travelislife.json
./jsons/beachvibes.json
./jsons/treking.json
./jsons/Travelawesome.json
./jsons/traveltheglobe.json
./jsons/travelworld.json
./jsons/worldtraveller.json
./jsons/Travellove.json
./jsons/nature.json
./jsons/travelcaptures.json
./jsons/Citytravel.json


In [10]:
print(len(glob_corpora))

25481


In [11]:
# Using word2vec for training

glob_model = Word2Vec(glob_corpora, min_count = 1)

In [13]:
print(glob_model.wv.most_similar(positive = 'grass', topn = 20))

[('monsoon', 0.9906405210494995), ('leaf', 0.9840993881225586), ('snowi', 0.9808374047279358), ('dramat', 0.9807693362236023), ('butterfli', 0.9797319173812866), ('mist', 0.9773508310317993), ('plant', 0.9765117168426514), ('mane', 0.9760971069335938), ('fog', 0.9753571152687073), ('lite', 0.9749687910079956), ('lush', 0.9747658371925354), ('baum', 0.9730554223060608), ('dusk', 0.9719358682632446), ('branch', 0.9707391262054443), ('foggi', 0.9700635671615601), ('bamboo', 0.9697462916374207), ('wood', 0.9692462682723999), ('stream', 0.968997597694397), ('clad', 0.9688808917999268), ('redwood', 0.9674018621444702)]


In [14]:
# Using Glove for training

corpus.fit(glob_corpora, window = 10)
glove = Glove(no_components = 100, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
glove.add_dictionary(corpus.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [27]:
# Preprocessing of input text

input_sentence = "#forest#green#wanderlust#tour"
input_sentence = "I enjoy an easy jog. In the mountains, or the forest with my own thoughts, I can escape from the world for a bit. #travel#travellife"
text = strip_punc(input_sentence.lower())
toks = word_tokenize(text)
toks_ = []
for tok in toks:
    if wordnet.synsets(tok):
        tok = ps.stem(tok)
        if tok not in stop_words and len(tok)>2:
            toks_.append(tok)

input_words = toks_
print(input_words)

['enjoy', 'easi', 'jog', 'mountain', 'forest', 'thought', 'escap', 'world', 'bit', 'travel']


In [28]:
# Calculating sentence embedding using word2vec

sent = []
for word in input_words:
    if word in glob_model.wv.vocab:
        sent.append(glob_model.wv[word])
    
print(len(sent))
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

10
(10, 100)
(100,)


In [29]:
# Finding similarity of top 100 hashtags using word2vec

ans = []
for hasht in top100_mapping:
    try:
        v1 = glob_model.wv[hasht]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans.append((simi, top100_mapping[hasht]))
    except:
        pass

ans = sorted(ans, reverse = True)
print(ans[:20])

[(0.8708082, 'ig'), (0.80178446, 'beauty'), (0.7823005, 'love'), (0.7746443, 'adventure'), (0.77142674, 'exploring'), (0.76981014, 'world'), (0.7670982, 'like'), (0.76384693, 'nature'), (0.7550086, 'green'), (0.75313425, 'view'), (0.7530867, 'inspiration'), (0.74956316, 'life'), (0.7438802, 'amazing'), (0.74356675, 'outdoor'), (0.7383999, 'landscapes'), (0.7364143, 'memories'), (0.72931117, 'mountain'), (0.72159785, 'wanderlust'), (0.71721, 'forest'), (0.7161728, 'friends')]


In [30]:
# Finally top k for word2vec

k = 10
topk_word2vec = [h[1] for h in ans[:k]]
print(topk_word2vec)

['ig', 'beauty', 'love', 'adventure', 'exploring', 'world', 'like', 'nature', 'green', 'view']


In [87]:
# Calculating sentence embedding using glove

sent = []
print(input_words)
for word in input_words:
    if word in glove.dictionary:
        sent.append(glove.word_vectors[glove.dictionary[word]])
    
sent = np.array(sent)
print(sent.shape)
embed = np.average(sent, axis = 0)
print(embed.shape)

['forest', 'green', 'wanderlust', 'tour']
(4, 100)
(100,)


In [88]:
# Finding similarity using glove embeddings

ans2 = []
for hasht in top100_mapping:
    try:
        v1 = glove.word_vectors[glove.dictionary[hasht]]
        simi = np.dot(v1, embed) / (np.linalg.norm(v1) * np.linalg.norm(embed))
        ans2.append((simi, top100_mapping[hasht]))
    except:
        pass

ans2 = sorted(ans2, reverse = True)
print(ans2[:20])

[(0.8380648704679269, 'wanderlust'), (0.8145650633528564, 'adventure'), (0.8074920271173722, 'green'), (0.8073567039677635, 'wander'), (0.796642199847806, 'outdoor'), (0.7964946952104364, 'exploring'), (0.7501473393295741, 'mountain'), (0.7222836597743094, 'hiking'), (0.7202703288855951, 'forest'), (0.6950989619387951, 'photography'), (0.6895518054964919, 'landscapes'), (0.6802330189692215, 'trip'), (0.6801947216846282, 'backpacker'), (0.6774100972886504, 'nature'), (0.676163261342647, 'trekking'), (0.6698956183488703, 'tourist'), (0.6670689142296702, 'flowers'), (0.6656441327919346, 'beauty'), (0.6638979517838771, 'sky'), (0.6500933784545571, 'vacation')]


In [90]:
# Finally topk for glove
k = 10

topk_glove = [h[1] for h in ans2[:k]]
print(topk_glove)

['wanderlust', 'adventure', 'green', 'wander', 'outdoor', 'exploring', 'mountain', 'hiking', 'forest', 'photography']


In [91]:
print(topk_glove, topk_word2vec)

['wanderlust', 'adventure', 'green', 'wander', 'outdoor', 'exploring', 'mountain', 'hiking', 'forest', 'photography'] ['outdoor', 'wanderlust', 'usa', 'sightseeing', 'exploring', 'ig', 'backpacker', 'wander', 'landscapes', 'asia']
