In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

# Data Load

In [2]:
# Read the tour text table, then keep a single row per tour_id
# (drop_duplicates retains the first occurrence by default).
tour_text_path = "../data/tour-text.parquet"
text = pd.read_parquet(tour_text_path).drop_duplicates(subset='tour_id')

In [3]:
# Preview only the first rows rather than rendering the entire frame.
text.head(10)

Unnamed: 0,tour_id,sub_category_label,category_type,product_type,poi_type_name,country_name,continent_name,main_topic_id,tokens
0,853,guided_walking_tour,Within city,Guided Walking Tour,Castles & Palaces,Austria,Europe,23,"[salzburg, city, tour, with, tickets, to, moza..."
1,6557,day_trip,Outside city,Day Trip,Castles & Palaces,Romania,Europe,13,"[peles, castle, &, wine, tasting, tour, -, ful..."
2,14812,guided_experience,Within city,Guided Experience,Others,Others,Others,6,"[adrena, line:, zip, line, adventure, in, sain..."
3,15216,guided_walking_tour,Within city,Guided Walking Tour,Others,Others,Others,23,"[szentendre:, half-day, tour, with, marzipan, ..."
4,15584,day_trip,Outside city,Day Trip,Others,Others,Others,23,"[madeira, walks, -, rabaçal, and, the, 25, fou..."
5,16257,guided_experience,Within city,Guided Experience,Others,Others,Others,2,"[algarve, dolphin, watching, &, marine, life, ..."
6,17904,guided_motorized_tour,Within city,Guided Motorized Tour,District or Neighborhood,Spain,Europe,13,"[barcelona, by, vespa, scooter:, 4-hour, guide..."
7,19063,day_trip,Outside city,Day Trip,Others,Others,Others,22,"[private, shore, excursion, in, berlin, enjoy,..."
8,21306,day_trip,Outside city,Day Trip,Amusement & Activity Parks,Mexico,North America,13,"[from, cancun:, xcaret, park, full-day, trip, ..."
9,22003,guided_walking_tour,Within city,Guided Walking Tour,Museums,Germany,Europe,23,"[berlin, historical, highlights, walking, tour..."


In [4]:
# Read the per-tour vectors and index them by tour_id so a tour can be
# looked up directly with .loc[tour_id].
tour_vectors_path = "../data/tour-vectors.parquet"
vectors = pd.read_parquet(tour_vectors_path).set_index('tour_id')

In [5]:
# Spot-check: show the row stored for one tour_id
# (853 is the tour_id of the first row in the text preview).
sample_tour_id = 853
vectors.loc[sample_tour_id]

vector    [-0.05225854367017746, -0.732638955116272, -0....
Name: 853, dtype: object

# Run Clustering

In [6]:
# Alternative backend, kept for reference: from tsnecuda import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE

# Perplexities to sweep. All floats, so every value formats the same way below.
perplexities = [3.0, 5.0, 10.0, 30.0, 40.0]

# Stack the per-tour vector lists into one array up front; the input is the
# same for every run, so there is no need to rebuild it inside the loop.
tsne_input = np.array(vectors.vector.values.tolist())

# Collected as (perplexity, fit_transform output) pairs, in sweep order.
tsne_articles = []
for perp in perplexities:
    # Fixed-point format: the previous ':.2' spec raised ValueError on the
    # integer 40 and printed 10.0 / 30.0 as '1e+01' / '3e+01'.
    print(f"running model for perplexity {perp:.1f}\n")
    tsne_model = TSNE(n_components=2, verbose=2, perplexity=perp, learning_rate=50.0,
                      early_exaggeration=45, n_iter=10000, metric='cosine', n_jobs=8)
    tsne_articles.append((perp, tsne_model.fit_transform(tsne_input)))

running model for perplexity 3.0

running model for perplexity 5.0



KeyboardInterrupt: 

In [None]:
import pickle

# Save the collected t-SNE results to disk so the sweep does not have to be
# rerun just to redraw the plots.
results_path = 'tsne_w2v_3_5_10_30_40.pcl'
with open(results_path, "wb") as sink:
    pickle.Pickler(sink).dump(tsne_articles)

# Plot Clusters

In [None]:
from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

# One stacked subplot per t-SNE run, titled with the perplexity that produced it.
# ':.1f' is used for the title because the earlier ':.2' spec raises ValueError
# on integer perplexities and prints 10.0 as '1e+01'.
fig = tools.make_subplots(rows=len(tsne_articles), cols=1,
                          subplot_titles=tuple(f"Perplexity: {perp:.1f}" for (perp, _) in tsne_articles))

for (i, (perp, tsne_results)) in enumerate(tsne_articles):
    trace = go.Scattergl(
        # Index the result array itself; tsne_articles[i] is the
        # (perplexity, array) tuple and cannot be sliced with [:, 0].
        x=tsne_results[:, 0],
        y=tsne_results[:, 1],
        mode='markers',
        # Hover text is the tour_id index of the vectors frame.
        text=vectors.index.values
    )
    # Subplot rows are 1-based, hence i + 1.
    fig.append_trace(trace, i + 1, 1)

fig['layout'].update(height=600, width=800, title='tSNE Results for several perplexities')
# Render the assembled figure (the previous call passed an undefined name, `data`).
py.iplot(fig, filename='tsne_w2v')