## Evaluating the Dns2Vec model

This notebook loads the Dns2Vec model as specified in the constant defined at the start and then evaluates the model by doing the following:

1. Plot model training loss.

1. Find similar words to some common domain names.

2. Test and identify some interesting domain name analogies.

2. Perform t-SNE on the DNS vectors and visualize the result using plotly.

3. Define an efficiency score which is the percentage of domains for which at least one of the top 3 closest domains identified by Dns2Vec lies in the same category (news/media, education, shopping etc) as the input domain name itself.

Based on the results obtained it seems that the **Dns2Vec model with 128 dimensional embedding, learning rate of 0.01, window size of 3, negative sampling sample size of 20 and trained for 100 epochs provides the best results** both in terms of a manual inspection of the domains as well as the efficiency metric.

The following sections of the notebook cover the code and discuss the items listed above.

### Code setup

In [186]:
# all the imports at the start of the notebook
import os

# pandas and numpy
import pandas as pd
import numpy as np

# misc
from datetime import datetime
from operator import itemgetter
from collections import Counter
import json

# Gensim for Word2Vec
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# plotly for charting
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.io as pio
import plotly

# multicore version of TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE

In [187]:
# global constants
# application and dataset identification; DATASET_VER is also a directory level in every data path below
APP_NAME = "dnswrangler"
APP_VER = "1.0.0.0"
DATASET_VER = "1.0.0.0"
# root directories: RAW_DATA_DIR is where the domain categorization file is read from,
# DATA_DIR is where the per-run model files and results are read from / written to
RAW_DATA_DIR = "raw_data"
DATA_DIR = "data"
DNS_DATASET_FILE_NAME = "dns_queries.csv"
# appended to the base name of DNS_DATASET_FILE_NAME to form the model file name
MODEL_FILE_SUFFIX = "_dns_word2vec.model"
# number of nearest neighbours listed per query domain in the "similar words" table
TOP_N_SIMILAR_WORDS = 15
# efficiency metric: number of neighbours inspected per domain, and the minimum number
# of them that must share the query domain's category for the domain to count as a hit
TOP_N_SIMILAR_WORDS_FOR_MODEL_ACCURACY_METRIC = 3
MIN_SIMILAR_WORDS_W_MATCHING_CATEGORY_FOR_MODEL_ACCURACY_METRIC = 1

DOMAIN_CATEGORIZATION_FILE_NAME = "website_categories_k9.csv"
# a category must label at least this many domains to be listed as "most frequent"
DOMAIN_CATEGORY_COUNT_THRESHOLD = 100
# pick one run, the model is loaded from the run specific directory
# (exactly one RUN_NAME line should be left uncommented)
# RUN_NAME = "run_1_epoch"
# RUN_NAME = "run_15_epochs"
RUN_NAME = "run_150_epochs"
# RUN_NAME = "run_10_epochs"
# RUN_NAME = "run_20_epochs"
# RUN_NAME = "run_100_epochs"
# file names inside the run directory (inputs produced by training, outputs written by this notebook)
DNS_VECTORS_FILE_NAME = "dnsvectors"
DNS_VECTOR_METADATA_FILE_NAME = "metadata"
MODEL_TRAINING_LOSS_FILE_NAME = "training_loss_per_epoch.csv"
MODEL_TRAINING_LOSS_PLOT_FILE_NAME = "training_loss_per_epoch_plot.html"
W2V_PARAMS_FILE_NAME = "word2vec_params.json"
SIMILIAR_DOMAINS_FILE_NAME = "similiar_domains_from_nb.csv"
ANALOGIES_FILE_NAME = "analogies.txt"
MODEL_EFFICIENCY_FILE_NAME = "efficiency.csv"
SIMILAR_DOMAINS_WHOLE_VOCAB_TIDY = "similar_domains_whole_vocab_tidy.csv"
TSNE_DOMAIN_ANALOGIES_PLOT_FILE_NAME = "domain_analogies_plot.html"

# t-SNE related
# these are passed to the TSNE constructor as perplexity, n_components, n_iter,
# random_state and n_jobs respectively
TSNE_PERPLEXITY = 32
TSNE_DIMENSIONS = 2
TSNE_ITERATIONS = 5000
TSNE_RANDOM_STATE = 1603
TSNE_NUM_WORKERS = 42
TSNE_PLOT_FILE_NAME = "tsne.html"


class EpochLogger(CallbackAny2Vec):
    '''Training callback that prints progress for every epoch.

    A line is printed when an epoch starts and when it ends; the end of an
    epoch additionally reports ``model.get_latest_training_loss()``.
    '''

    def __init__(self):
        # index of the epoch currently being reported, counted from 0
        self.epoch = 0

    def on_epoch_begin(self, model):
        '''Print the "start" line for the current epoch.'''
        start_line = "Epoch #{} start".format(self.epoch)
        print(start_line)

    def on_epoch_end(self, model):
        '''Print the "end" line and the latest loss, then move to the next epoch.'''
        end_line = "Epoch #{} end".format(self.epoch)
        print(end_line)
        latest_loss = model.get_latest_training_loss()
        print("latest training loss is {}".format(latest_loss))

        # advance the counter so the next epoch is reported with the next index
        self.epoch = self.epoch + 1

### Model

In [188]:
# Load the trained Dns2Vec (gensim Word2Vec) model of the selected run.
# The model file name is the base name of the dataset file plus MODEL_FILE_SUFFIX.
model_file_name = DNS_DATASET_FILE_NAME.split('.')[0] + MODEL_FILE_SUFFIX
file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, model_file_name)
print("loading model from file {}".format(file_path))
dns_model = Word2Vec.load(file_path)

# Also show the parameters stored as JSON in the same run directory.
file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, W2V_PARAMS_FILE_NAME)
with open(file_path, 'r') as params_file:
    params = json.load(params_file)
print("model parameters {}".format(params))

loading model from file data/1.0.0.0/run_150_epochs/dns_queries_dns_word2vec.model
model parameters {'run_name': 'run_150_epochs', 'num_sentences': 9238066, 'embedding_size': 128, 'window_size': 7, 'min_count': 5, 'negative': 5, 'max_vocab_size': 40000, 'sample': 1e-05, 'ns_exponent': -1, 'num_workers': 42, 'sg': 1, 'epochs': 150, 'seed': 1603}


#### Training loss
Gensim does not report the training loss correctly (see https://github.com/RaRe-Technologies/gensim/pull/2135). Instead we plot the increase of the reported training loss from one epoch to the next and check that this increase gets smaller as training progresses, as explained in this Stack Overflow answer: https://stackoverflow.com/questions/52038651/loss-does-not-decrease-during-training-word2vec-gensim.

In [189]:
# Read the training loss recorded per epoch (header-less, single column).
training_loss_file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, MODEL_TRAINING_LOSS_FILE_NAME)
df_training_loss = pd.read_csv(training_loss_file_path, header=None)
df_training_loss.columns = ['loss']
df_training_loss['iteration'] = range(len(df_training_loss))
# NOTE(review): the loss is taken from the second '.'-separated piece of each raw value;
# this presumes a specific format of the loss file - confirm against the training script
df_training_loss['loss'] = df_training_loss['loss'].map(lambda x: float(x.split('.')[1]))
display(df_training_loss.head())

# Change of the reported loss between consecutive epochs. The first row has no
# predecessor (its diff is NaN), so it is dropped and the rows are renumbered from 0.
df_training_loss_delta = df_training_loss.diff().dropna()
df_training_loss_delta['iteration'] = range(len(df_training_loss_delta))

# plotly is imported and initialised once in the imports cell at the top of the
# notebook, so it is not imported again here
fig = {
    'data': [
        {
            'x': df_training_loss_delta['iteration'],
            'y': df_training_loss_delta['loss'],
        }
    ],
    'layout': {
        'xaxis': {'title': 'epoch'},
        'yaxis': {'title': "training loss delta"}
    }
}

# IPython notebook
iplot(fig, filename='training loss delta')
# also save the plot as a standalone html file in the run directory
training_loss_plot_file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, MODEL_TRAINING_LOSS_PLOT_FILE_NAME)
plotly.offline.plot(fig, filename=training_loss_plot_file_path)
# pio.write_image(fig, training_loss_plot_file_path) # weird orca errors, leaving this for now

Unnamed: 0,loss,iteration
0,817788.0,0
1,1467746.0,1
2,2029725.0,2
3,2538130.0,3
4,3081326.0,4


'file:///home/ec2-user/dnswrangler/data/1.0.0.0/run_150_epochs/training_loss_per_epoch_plot.html'

### Find similar words

In [190]:
# find similar words (domains)
# note the words should be written only using the last 2 or last 3 (in case the domain ends in
# a country TLD like nsit.ac.in) pieces of the domain, for example mail.yahoo.com would not work
# (each query appears only once, so that the table and the csv have no duplicate rows)
queries = ["elle.com", "uber.com", "webmd.com", "match.com", "glassdoor.com", "ixl.com",
           "yelp.com", "food.com", "walmart.com", "foxnews.com", "pornhub.com",
           "salliemae.com", "54.207", "newmexico.gov", "outbrain.com", "norton.com", "vzw.com", "ebay.com",
           "office.com", "apartments.com", "23andme.com", "ups.com"]

# show the complete list of similar domains instead of a truncated cell
# NOTE(review): newer pandas releases expect None instead of -1 here - confirm against the installed version
pd.set_option('display.max_colwidth', -1)


def _similar_domains_as_text(q):
    """Comma separated names of the TOP_N_SIMILAR_WORDS domains closest to ``q``."""
    neighbours = dns_model.wv.most_similar(q, topn=TOP_N_SIMILAR_WORDS)
    return ', '.join([d for d, _ in neighbours])


df = pd.DataFrame([(q, _similar_domains_as_text(q)) for q in queries])
df.columns = ['Query Name', 'Similar Domains']

display(df)
# keep a copy of the table in the run directory
file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, SIMILIAR_DOMAINS_FILE_NAME)
df.to_csv(file_path, index=False)


Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



Unnamed: 0,Query Name,Similar Domains
0,elle.com,"getpocket.com, economist.com, hbr.org, nextadvisor.com, theatlantic.com, theringer.com, newyorker.com, wired.com, vox.com, mozilla.net, bloomberg.com, fivethirtyeight.com, thepointsguy.com, prevention.com, mozaws.net"
1,uber.com,"ubr.to, bttn.io, adj.st, bnc.lt, test-app.link, fb.gg, m.me, ig.me, onelink.me, amazon.it, app.link, amazon.es, linkedinmobileapp.com, etsy.me, lyft.com"
2,webmd.com,"honcode.ch, tagsrvcs.com, surveywriter.net, ibclick.stream, medscape.com, medicinenet.com, healthline.com, medscapestatic.com, sfvwe.com, emedicinehealth.com, mayoclinic.org, everydayhealth.com, rxlist.com, medicalnewstoday.com, agoramedia.com"
3,match.com,"peoplemedia.com, hexagon-analytics.com, ourtime.com, siftscience.com, blackpeoplemeet.com, zoosk.com, pof.com, okccdn.com, chowhound.com, gotinder.com, birchlane.com, tindersparks.com, ibmlivenationapi.com, wdrimg.com, okcupid.com"
4,glassdoor.com,"ziprecruiter.com, indeed.com, glassdoor.de, filter.to, appcast.io, careerbuilder.com, jobcase.com, jobs2careers.com, milwpc.com, myjobhelper.com, talentbrew.com, j2c.com, uber.com, snagajob.com, dominos.com"
5,ixl.com,"teacherspayteachers.com, sfdr-cisd.org, sugarandcotton.com, fullsail.edu, smiledirectclub.com, i-ready.com, arteza.com, familyminded.com, screencastify.com, romper.com, elledecor.com, privacypop.com, learnosity.com, farandwide.com, snorgtees.com"
6,yelp.com,"yelpcdn.com, braze.com, foursquare.com, cuebiq.com, wfxtriggers.com, w-x.co, android.com, ipv4only.arpa, mapbox.com, tripadvisor.com, wunderground.com, whatsapp.net, waze.com, att.net, mparticle.com"
7,food.com,"geniuskitchen.com, whisk.com, snidigital.com, foodnetwork.com, sndimg.com, addapinch.com, mediavine.com, ahalogy.com, filestackapi.com, sni-dat.com, adthrive.com, tastykitchen.com, allrecipes.com, unicornengine.net, spendwithpennies.com"
8,walmart.com,"wal.co, walmartimages.com, wmt.co, hlserve.com, radar.io, schema.org, lowes.com, wayfair.com, target.com, samsclub.com, webcollage.net, chatidcdn.com, myvisualiq.net, bestbuy.com, mparticle.com"
9,foxnews.com,"fncstatic.com, foxbusiness.com, edgekey.net, h-cdn.com, admantx.com, pxltopxl.com, spots.im, securetve.com, foxnewsinsider.com, rpxnow.com, sercheshub-hearing-aids.com, segment.com, chartbeat.net, segment.io, spot.im"


### Word analogies with Domain names

In [191]:
def get_analogies(pos, neg, result_index=0, mode="a"):
    """Solve a domain name analogy with the loaded model, print it and log it to a file.

    Args:
        pos: list of domain names that contribute positively to the analogy.
        neg: list of domain names that contribute negatively (may be empty).
        result_index: rank (0 = best) of the candidate to report.
        mode: mode used to open the analogies file; the default "a" appends,
            "w" starts the file afresh.

    The line that is printed is also written to ANALOGIES_FILE_NAME in the
    directory of the current run. Nothing is returned.
    """
    # ask for enough candidates for result_index to be in range; the default
    # number of results (10) is kept as the minimum
    candidates = dns_model.wv.most_similar_cosmul(positive=pos, negative=neg,
                                                  topn=max(10, result_index + 1))
    # build the line once and use it for both the notebook output and the file
    message = "positive {}, negative {}, result \"{}\"".format(pos, neg, candidates[result_index][0])
    print(message)
    file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, ANALOGIES_FILE_NAME)
    with open(file_path, mode) as analogies_file:
        analogies_file.write(message + "\n")

"""

# for future experiments
# encodes a sense of geography
get_analogies(['yelp.fr', 'glassdoor.com'], ['yelp.com'])

# find an image site, but basically it is finding the first closest one because
# difference between walmart and walmartimages is very small
get_analogies(['walmart.com', 'target.com'], ['walmartimages.com'])

#w = dns_model.wv.most_similar(positive=['dealsofamerica.com', 'bestblackfriday.com'], negative=['walmart.com'])
#print("result \"{}\"".format(w[0][0]))

# competitor
#w = dns_model.wv.most_similar(positive=['epicgames.com', 'glassdoor.com'], negative=['infinityward.com'])
#print("result \"{}\"".format(w[0][0]))

# same as the walmart example


get_analogies(['delta.com', 'uber.com'], ['hilton.com'])



get_analogies(['delta.com', 'hilton.com'], ['lyft.com'], result_index=0)
"""

# good
# the first call passes mode="w", so the analogies file is started afresh every time
# this cell runs; the following calls use the default mode="a" and append to it
get_analogies(['overstock.com', 'wfcdn.com'], ['wayfair.com'], mode="w")
# result_index=1 reports the second ranked candidate instead of the top one
get_analogies(['delta.com', 'hilton.com'], [], result_index=1)
get_analogies(['lyft.com', 'tripadvisor.com'], ['delta.com'], result_index=0)


positive ['overstock.com', 'wfcdn.com'], negative ['wayfair.com'], result "ostkcdn.com"
positive ['delta.com', 'hilton.com'], negative [], result "tripadvisor.com"
positive ['lyft.com', 'tripadvisor.com'], negative ['delta.com'], result "nps.gov"


## Metric for finding out embedding efficiency
We first set up some utility functions. They search a database, built by a separate piece of code, that maps a domain name to a category describing the type of website (for example sports, news/media or entertainment).

In [192]:
def get_website_category(q):
    """Look up the category of domain ``q`` in ``website_categories``.

    Returns:
        The category of the first row whose ``domain`` equals ``q``, or the
        string 'ERROR' when no row matches.

    Only the "domain not found" case is mapped to 'ERROR'; any other problem
    (for example a missing column) is raised so that it is not hidden.
    """
    matches = website_categories.loc[website_categories['domain'] == q, 'category']
    if matches.empty:
        return 'ERROR'
    return matches.iloc[0]
        
# Read the domain -> category mapping (header-less csv with two columns).
file_path = os.path.join(RAW_DATA_DIR, DATASET_VER, DOMAIN_CATEGORIZATION_FILE_NAME)
website_categories = pd.read_csv(file_path, header=None)
website_categories.columns = ['domain', 'category']
display(website_categories.head())

# Categories that label at least DOMAIN_CATEGORY_COUNT_THRESHOLD domains,
# most frequent first (value_counts sorts by count).
website_categories_vc = website_categories['category'].value_counts()
is_frequent = website_categories_vc >= DOMAIN_CATEGORY_COUNT_THRESHOLD
most_frequent_domain_categories = website_categories_vc[is_frequent].index.tolist()
print(most_frequent_domain_categories)

Unnamed: 0,domain,category
0,,Uncategorized
1,taboola.com,Web Ads/Analytics
2,dotomi.com,Web Ads/Analytics
3,krxd.net,Web Ads/Analytics
4,addthis.com,Technology/Internet


['Uncategorized', 'Business/Economy', 'Technology/Internet', 'Suspicious', 'Pornography', 'Shopping', 'Health', 'Education', 'Entertainment', 'News/Media', 'Financial Services', 'Games', 'Restaurants/Dining/Food', 'Web Ads/Analytics', 'Sports/Recreation', 'Vehicles', 'Society/Daily Living', 'Travel', 'Reference', 'Government/Legal', 'Malicious Sources/Malnets', 'Religion', 'Placeholders', 'Malicious Outbound Data/Botnets', 'Content Servers', 'Real Estate', 'Personal Sites', 'Mixed Content/Potentially Adult', 'Political/Social Advocacy', 'Military', 'Adult/Mature Content', 'Search Engines/Portals', 'Hacking', 'Weapons', 'Job Search/Careers', 'Charitable Organizations', 'Web Hosting', 'Audio/Video Clips', 'Newsgroups/Forums', 'File Storage/Sharing', 'Scam/Questionable/Illegal', 'Gambling', 'Office/Business Applications', 'Software Downloads', 'Social Networking', 'Phishing', 'Proxy Avoidance', 'Brokerage/Trading', 'Personals/Dating', 'Email', 'Chat (IM)/SMS', 'Alternative Spirituality/Be

In [193]:
def get_similar_domains_tidy(q, n):
    """Return the ``n`` domains most similar to ``q`` in tidy (long) format.

    The result has one row per similar domain and two columns: ``similar``
    holds the similar domain and ``domain`` repeats the input domain ``q``
    on every row (so 5 similar domains give 5 rows and 2 columns).
    """
    neighbours = dns_model.wv.most_similar(q, topn=n)
    # the similarity score of each neighbour is not needed, only its name
    tidy = pd.DataFrame({'similar': [name for name, _score in neighbours]})
    return tidy.assign(domain=q)
    

def find_similar_domains(queries, n):
    """Find the ``n`` most similar domains for every domain in ``queries``.

    The tidy frame built with get_similar_domains_tidy (one row per similar
    domain) is enriched with the website category of both domains.

    Args:
        queries: collection of domain names to look up.
        n: number of similar domains to fetch per query.

    Returns:
        DataFrame with the columns ``similar``, ``domain``, ``domain_category``,
        ``similar_domain_category`` and ``match``. ``match`` is True when both
        categories are equal and False otherwise; a missing category (domain
        not present in ``website_categories``) never counts as a match.
    """
    # NOTE: this changes a global pandas display option as a side effect
    pd.set_option('display.max_colwidth', -1)
    print("going to get {} similar domains for {} domains".format(n, len(queries)))
    s = datetime.now()
    frames = [get_similar_domains_tidy(q, n) for q in queries]
    if frames:
        df_similar_domains = pd.concat(frames)
    else:
        # pd.concat raises on an empty list; use an empty frame with the same columns instead
        df_similar_domains = pd.DataFrame(columns=['similar', 'domain'], dtype=object)

    # category of the query domain
    df_similar_domains = pd.merge(df_similar_domains,
                                  website_categories,
                                  how='left',
                                  on=["domain"])
    e = datetime.now()
    print("finished getting similar domains in {}...".format(e-s))
    df_similar_domains = df_similar_domains.rename(columns={"category": "domain_category"})

    # category of the similar domain: same lookup table, joined on the 'similar' column
    df_similar_domains = pd.merge(df_similar_domains,
                                  website_categories.rename(columns={"domain": "similar"}),
                                  how='left',
                                  on=["similar"])
    # index=str is kept here so that the returned frame has string index labels as before
    df_similar_domains = df_similar_domains.rename(index=str, columns={"category": "similar_domain_category"})

    # column-wise comparison instead of a row-by-row apply
    df_similar_domains['match'] = (df_similar_domains['domain_category']
                                   == df_similar_domains['similar_domain_category'])
    return df_similar_domains

#### Load the domain vectors and metadata file (domain name), to prepare for t-SNE

In [194]:
# Locate the exported domain vectors and their metadata file inside the run directory.
run_dir = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME)
vectors_file_path = os.path.join(run_dir, DNS_VECTORS_FILE_NAME)
vectors_metadata_file_path = os.path.join(run_dir, DNS_VECTOR_METADATA_FILE_NAME)

# The vectors are read as a numeric array; the metadata is a header-less file
# whose single column is named 'domain' here.
vectors = np.loadtxt(vectors_file_path)
vectors_metadata = pd.read_csv(vectors_metadata_file_path, header=None)
vectors_metadata.columns = ['domain']

#### Embedding efficiency metric

In [195]:
# every non-null domain from the metadata file is used as a query
queries = vectors_metadata['domain'].dropna()
n = TOP_N_SIMILAR_WORDS_FOR_MODEL_ACCURACY_METRIC
# queries = ['match.com', 'ixl.com']
df_domains_w_results = find_similar_domains(queries, n)
display(df_domains_w_results.sample(TOP_N_SIMILAR_WORDS))
file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, SIMILAR_DOMAINS_WHOLE_VOCAB_TIDY)
df_domains_w_results.to_csv(file_path, index=False)

# number of similar domains per query domain that share the query domain's category
df_summarized_results = df_domains_w_results.groupby(['domain'])[['match']].sum()

# a domain counts as a hit (1) when it has at least the required number of matches, else 0
df_summarized_results['match'] = df_summarized_results['match'].map(lambda x: 1 if x >= MIN_SIMILAR_WORDS_W_MATCHING_CATEGORY_FOR_MODEL_ACCURACY_METRIC else 0 )

# mean of the 0/1 column = fraction of domains that are hits. The mean is taken on the
# column so that effi is a single number; taking it on the whole frame gives a
# one-element Series, which gets printed and written to the csv together with its
# label and dtype
effi = df_summarized_results['match'].mean()
print("Efficiency of the Dns embeddings is {}".format(effi))
df_effi = pd.DataFrame({'efficiency': [effi]})
file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, MODEL_EFFICIENCY_FILE_NAME)
df_effi.to_csv(file_path, index=False)

going to get 3 similar domains for 16129 domains



Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.



finished getting similar domains in 0:00:29.124546...


Unnamed: 0,similar,domain,domain_category,similar_domain_category,match
32112,fdcollab.com,scribesoft.com,Technology/Internet,Web Ads/Analytics,False
42639,mysmithfield.com,smgord.loc,Uncategorized,Uncategorized,True
3795,truekey.com,tkassets.com,Business/Economy,Technology/Internet,False
38558,gannett-cdn.com,demingheadlight.com,News/Media,News/Media,True
46559,89.72,89.68,Uncategorized,Uncategorized,True
32829,myinnovage.com,celebros.com,Technology/Internet,Business/Economy,False
41523,apiforpush.com,appcaptcha.com,Uncategorized,Uncategorized,True
36861,llnw.net,bjupressonline.com,Education,Content Servers,False
48376,bigfishgames.com,jetdogs.com,Games,Games,True
28322,returnpath.net,fragrancenet.com,Shopping,Business/Economy,False


Efficiency of the Dns embeddings is match    0.554715
dtype: float64


### t-SNE on DNS Vectors

#### Prepare the data for t-SNE, fit the model and plot the result

In [196]:
# prepare a dataframe specifically for t-SNE
# one row per domain vector, plus the domain name taken from the metadata file
tsne_df = pd.DataFrame(vectors)
tsne_df['domain'] = vectors_metadata['domain']

# the left join keeps every row of tsne_df and adds the 'category' column from website_categories
%time tsne_df = pd.merge(tsne_df, website_categories, how='left', on=["domain"])
# NOTE(review): this is not the last expression of the cell, so its result is not displayed
tsne_df['category'].value_counts()
# number of rows after the merge
len(tsne_df)


CPU times: user 82.6 ms, sys: 0 ns, total: 82.6 ms
Wall time: 81.6 ms


16130

In [197]:
# Creates and TSNE model and plots 
NUM_WORKERS = 42
"""
categories_that_cluster_well = ['Health', 'Shopping', 'Pornography', 'Government/Legal',
                               'Military', 'Hacking',
                               'Adult/Mature Content', 'Resturant/Dining/Food', 
                               'Personal Sites', 'Malicious Outbound Data/Botnets']
tsne_df_subset = tsne_df[tsne_df['category'].isin(categories_that_cluster_well)]
"""
tsne_df_subset = tsne_df # in case we need to subset, see abovee
tsne_model = TSNE(perplexity=TSNE_PERPLEXITY,
                  n_components=TSNE_DIMENSIONS,
                  n_iter=TSNE_ITERATIONS,
                  random_state=TSNE_RANDOM_STATE,
                  verbose=True, n_jobs=TSNE_NUM_WORKERS)

%time fit = tsne_model.fit_transform(tsne_df_subset.drop(['domain', 'category'], axis=1).as_matrix())


Method .as_matrix will be removed in a future version. Use .values instead.



CPU times: user 4h 9min 52s, sys: 5min 46s, total: 4h 15min 38s
Wall time: 6min 7s


In [198]:
# Put the t-SNE coordinates next to the category and name of each domain.
tsne_fit = pd.DataFrame(fit)
tsne_fit.columns = ['x', 'y']
# copy by position (.values): row i of `fit` belongs to row i of tsne_df_subset, whatever
# index labels tsne_df_subset carries (they are no longer 0..n-1 once it has been filtered)
tsne_fit['category'] = tsne_df_subset['category'].values
tsne_fit['domain'] = tsne_df_subset['domain'].values
print(tsne_fit.head())
# categories to plot; a missing category (NaN) is left out because comparing with NaN
# selects no rows and would only add an empty trace to the plot
unique_categories = tsne_fit['category'].dropna().unique()
tsne_fit['category'].value_counts()

           x          y             category       domain
0 -24.951360  9.002411   Uncategorized        NaN        
1 -21.392659  16.388566  Web Ads/Analytics    taboola.com
2 -24.188924  8.949873   Web Ads/Analytics    dotomi.com 
3 -25.737096  8.882002   Web Ads/Analytics    krxd.net   
4 -26.047689 -6.269995   Technology/Internet  addthis.com


Uncategorized                      3269
Technology/Internet                2564
Business/Economy                   1764
Suspicious                         1119
Web Ads/Analytics                  881 
Shopping                           596 
News/Media                         454 
Games                              417 
Financial Services                 410 
Entertainment                      367 
Education                          361 
Content Servers                    324 
Health                             302 
Pornography                        246 
Malicious Sources/Malnets          228 
Search Engines/Portals             211 
Placeholders                       177 
Travel                             172 
Reference                          153 
Government/Legal                   153 
Restaurants/Dining/Food            125 
Vehicles                           124 
Sports/Recreation                  123 
Audio/Video Clips                  112 
Mixed Content/Potentially Adult    111 


In [199]:
"""
Technology/Internet                480
Shopping                           465
Business/Economy                   443
News/Media                         435
Travel                             258
Search Engines/Portals             251
Entertainment                      241
Financial Services                 233
Pornography                        190
Education                          162
Health                             156
Government/Legal                   141
Sports/Recreation                  131
Reference                          120
Uncategorized                      119
Web Ads/Analytics                  109
Games                              102
Society/Daily Living               102
"""

# To plot only a few categories, replace unique_categories with a hand picked list, e.g.
# unique_categories = ['Content Servers', 'Malicious Sources/Malnets', 'Sports/Recreation', 'Health', 'Education', 'Technology/Internet', 'Shopping', 'Business/Economy', 'News/Media', 'Travel', 'Entertainment']
# (most_frequent_domain_categories is another candidate list)

# one scatter trace per category, so that categories can be toggled in the legend
traces = []
for category in unique_categories:
    # select the points of this category once and reuse them for x, y and the hover text
    points = tsne_fit[tsne_fit['category'] == category]
    traces.append({
        'x': points['x'],
        'y': points['y'],
        'text': points['domain'],
        'name': category, 'mode': 'markers',
        #'visible': 'legendonly',
    })

fig = {
    'data': traces,
    'layout': {
        'xaxis': {'title': ''},
        'yaxis': {'title': ""}
    }
}

# IPython notebook
iplot(fig, filename='t-SNE plot of domain name vectors')
# also save the plot as a standalone html file in the run directory
tsne_file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, TSNE_PLOT_FILE_NAME)
plotly.offline.plot(fig, filename=tsne_file_path)


'file:///home/ec2-user/dnswrangler/data/1.0.0.0/run_150_epochs/tsne.html'

### Plotting domain analogies
Here we attempt to plot analogies similar to the famous king-man+woman=queen analogy in the original word2vec paper. We use the t-SNE reduced form to visualize the domain vectors.

In [200]:
"""
positive ['overstock.com', 'wfcdn.com'], negative ['wayfair.com'], result "ostkcdn.com"
positive ['delta.com', 'hilton.com'], negative [], result "tripadvisor.com"
positive ['lyft.com', 'tripadvisor.com'], negative ['delta.com'], result "nps.gov"
"""

def _analogy_pair_trace(domains, textposition):
    """Scatter trace that joins the t-SNE points of ``domains`` with a labelled line."""
    pair = tsne_fit[tsne_fit['domain'].isin(domains)]
    return go.Scatter(
        x=pair['x'],
        y=pair['y'],
        mode='lines+text+markers',
        text=pair['domain'],
        textposition=textposition
    )


# one line per (site, cdn of that site) pair taken from the first analogy above
data = [
    _analogy_pair_trace(['overstock.com', 'ostkcdn.com'], 'bottom right'),
    _analogy_pair_trace(['wayfair.com', 'wfcdn.com'], 'bottom left'),
]

layout = go.Layout(showlegend=False)

fig = go.Figure(data=data, layout=layout)

# IPython notebook
iplot(fig, filename='t-SNE plot of domain name analogies')
# also save the plot as a standalone html file in the run directory
tsne_file_path = os.path.join(DATA_DIR, DATASET_VER, RUN_NAME, TSNE_DOMAIN_ANALOGIES_PLOT_FILE_NAME)
plotly.offline.plot(fig, filename=tsne_file_path)

'file:///home/ec2-user/dnswrangler/data/1.0.0.0/run_150_epochs/domain_analogies_plot.html'