In [10]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline

# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the classic-notebook container so wide plots and tables fit.
display(HTML("<style>.container { width:78% !important; }</style>"))

# --- Configuration ---------------------------------------------------------
# When True, the sample and the t-SNE embeddings are recomputed and the cache
# files are overwritten; when False, the cached pickles are loaded instead.
run_cluster = False
# NOTE(review): `cuda` is not referenced by any cell visible here — confirm
# whether it is still needed.
cuda = False
# Pickle file holding the sampled rows of `vectors` used for plotting.
points_data = 'points_data.pcl'

# Data Load

In [2]:
# Load the tour text table and keep a single row per tour_id
# (drop_duplicates keeps the first occurrence by default).
text = (
    pd.read_parquet("../data/tour-text.parquet")
    .drop_duplicates(subset='tour_id')
)

In [3]:
# Preview only the first rows rather than rendering the whole frame.
text.head(10)

Unnamed: 0,tour_id,sub_category_label,category_type,product_type,poi_type_name,country_name,continent_name,main_topic_id,tokens
0,853,guided_walking_tour,Within city,Guided Walking Tour,Castles & Palaces,Austria,Europe,23,"[salzburg, city, tour, with, tickets, to, moza..."
1,6557,day_trip,Outside city,Day Trip,Castles & Palaces,Romania,Europe,13,"[peles, castle, &, wine, tasting, tour, -, ful..."
2,14812,guided_experience,Within city,Guided Experience,Others,Others,Others,6,"[adrena, line:, zip, line, adventure, in, sain..."
3,15216,guided_walking_tour,Within city,Guided Walking Tour,Others,Others,Others,23,"[szentendre:, half-day, tour, with, marzipan, ..."
4,15584,day_trip,Outside city,Day Trip,Others,Others,Others,23,"[madeira, walks, -, rabaçal, and, the, 25, fou..."
5,16257,guided_experience,Within city,Guided Experience,Others,Others,Others,2,"[algarve, dolphin, watching, &, marine, life, ..."
6,17904,guided_motorized_tour,Within city,Guided Motorized Tour,District or Neighborhood,Spain,Europe,13,"[barcelona, by, vespa, scooter:, 4-hour, guide..."
7,19063,day_trip,Outside city,Day Trip,Others,Others,Others,22,"[private, shore, excursion, in, berlin, enjoy,..."
8,21306,day_trip,Outside city,Day Trip,Amusement & Activity Parks,Mexico,North America,13,"[from, cancun:, xcaret, park, full-day, trip, ..."
9,22003,guided_walking_tour,Within city,Guided Walking Tour,Museums,Germany,Europe,23,"[berlin, historical, highlights, walking, tour..."


In [4]:
# Load the per-tour vectors and index them by tour_id so a single tour can
# be looked up with `vectors.loc[<tour_id>]`.
vectors = (
    pd.read_parquet("../data/tour-vectors.parquet")
    .set_index('tour_id')
)

In [5]:
# Spot-check one tour's vector. 853 is the tour_id of the first row shown in
# the `text` preview; the lookup works because `vectors` is indexed by tour_id.
vectors.loc[853]

vector    [-0.05225854367017746, -0.732638955116272, -0....
Name: 853, dtype: object

# Run Clustering (t-SNE embedding)

In [7]:
from numpy.random import choice

# Number of tour vectors to embed, and a fixed seed so that re-running with
# run_cluster=True draws the same sample every time.
sample_size = 5000
sample_seed = 0

if run_cluster:
    # Sample rows without replacement. Cap the request at the number of
    # available rows so a small `vectors` frame does not raise.
    sample_size = min(sample_size, len(vectors))
    plot_data = vectors.sample(n=sample_size, random_state=sample_seed)
    # Persist the sample so the cached-embedding path can reload these rows.
    plot_data.to_pickle(points_data)
else:
    # Reuse the previously saved sample.
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    plot_data = pd.read_pickle(points_data)
    sample_size = plot_data.shape[0]

In [8]:
# Cache file for the list of (perplexity, embedding) pairs; the name encodes
# the perplexities used below.
tsne_cache = 'tsne_w2v_10_12_20.pcl'
tsne_articles = []

if run_cluster:
    # Imported only on the recompute path, so loading cached results does not
    # require tsnecuda to be installed.
    from tsnecuda import TSNE
    # NOTE(review): a CPU alternative was previously tried here
    # (`from MulticoreTSNE import MulticoreTSNE as TSNE`); its constructor
    # arguments may differ from the ones used below — verify before swapping.
    perplexities = [10.0, 12.5, 20.0]

    # Stack the per-tour vectors into one 2-D array once; the input is the
    # same for every perplexity, so there is no need to rebuild it per run.
    tsne_input = np.array(plot_data.vector.values.tolist())

    for perp in perplexities:
        print(f"running model for perplexity {perp:.2f}\n")
        tsne_model = TSNE(n_components=2, verbose = 2, perplexity=perp, learning_rate=300.0, init='random',
                          early_exaggeration=50.0, n_iter=15000, theta=0.1, metric='euclidean')
        tsne_articles.append((perp, tsne_model.fit_transform(tsne_input)))

    # Save all embeddings so later runs can skip the expensive fit.
    with open(tsne_cache, "wb") as dest:
        pickle.dump(tsne_articles, dest)
else:
    # NOTE: pickle.load can execute arbitrary code — only load trusted files.
    with open(tsne_cache, "rb") as source:
        tsne_articles = pickle.load(source)

In [9]:
# Inspect the first cached entry: a (perplexity, 2-D embedding array) pair.
tsne_articles[0]

(10.0, array([[ -0.703296  ,  -1.4601798 ],
        [-67.59611   ,  45.54607   ],
        [ -0.70357925,  -1.4588956 ],
        ...,
        [  0.14335412,  64.98401   ],
        [ -0.70258754,  -1.4566941 ],
        [ 77.07234   ,  36.47841   ]], dtype=float32))

# Plot Clusters

In [11]:
from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

py.init_notebook_mode(connected=True)

# One vertically stacked subplot per perplexity, titled with its value.
fig = tools.make_subplots(rows=len(tsne_articles), cols=1,
                          subplot_titles=tuple(f"Perplexity: {perp:6.2f}" for (perp, _) in tsne_articles))

for (i, (perp, tsne_results)) in enumerate(tsne_articles):
    # Use the unpacked embedding directly: column 0 is x, column 1 is y.
    trace = go.Scattergl(
        x = tsne_results[:, 0],
        y = tsne_results[:, 1],
        mode = 'markers',
        marker = dict(
            size = 5,
            # Mostly transparent fill so dense regions show up darker.
            color = 'rgba(50, 150, 75, .2)',
            line = dict(
                width = 1,
                color = 'rgb(0, 0, 0)'
            )
        )
    )
    # Subplot rows are 1-based, while `i` from enumerate is 0-based.
    fig.append_trace(trace, i + 1, 1)

fig['layout'].update(height=4000, width=1200, title='tSNE Results for several perplexities')
py.iplot(fig, filename='tsne_w2v')

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]

