# Visualising word embeddings

We are going to use the Google News embeddings (300d). **3GB+ of RAM are required**

In [None]:
import os

import numpy as np
import pandas as pd

from gensim.models.keyedvectors import KeyedVectors

In [None]:
# I WILL EXECUTE THIS ONE ONLY ONCE!
# The notebook states that 3GB+ of RAM are needed for these vectors, so avoid re-running this cell.
# The file location can be overridden through the GOOGLE_VECTORS environment
# variable; the fallback is the original hard-coded path.
GOOGLE_VECTORS = os.environ.get(
    "GOOGLE_VECTORS",
    "/Users/albarron/corpora/embeddings/GoogleNews/GoogleNews-vectors-negative300.bin.gz",
)
# Set to an integer (e.g. 400000) to read only that many vectors and save RAM;
# None reads the whole file.
VECTOR_LIMIT = None
wv = KeyedVectors.load_word2vec_format(GOOGLE_VECTORS, binary=True, limit=VECTOR_LIMIT)

In [None]:
# Number of entries in the vocabulary of the loaded embeddings.
# NOTE(review): `.vocab` is the gensim 3.x attribute; gensim 4 replaced it with
# `key_to_index` -- confirm the installed gensim version.
len(wv.vocab)

## Screening the vocabulary

In [None]:
# One Series entry per vocabulary item, so the vocabulary can be sliced by position
vocab = pd.Series(wv.vocab)
print(vocab.head(100))

In [None]:
# Notice that in the book there is one 0 missing
# The slice is meant to show six consecutive entries starting at position
# 1,000,000, so both bounds need seven digits. (It comes back empty if fewer
# vectors than that were loaded.)
print(vocab.iloc[1000000:1000006])

In [None]:
# Label-based lookup of a single (multi-word) vocabulary entry
print(vocab.loc['New_York'])

Back to the slides

## Computing the distance between two words

In [None]:
# Fetch both word vectors in one go (the names are kept only for readability)
v1, v2 = wv['Illinois'], wv['Illini']

In [None]:
# Euclidean distance: norm of the difference vector
np.linalg.norm(np.subtract(v1, v2))

In [None]:
# Cosine "distance": one minus the cosine similarity of the two vectors
norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
cos_similarity = np.dot(v1, v2) / norm_product
1 - cos_similarity

Back to the slides

In [None]:
# Ten vocabulary entries most similar to the combination of the query words
query_words = ['city', 'cities', 'us']
wv.most_similar(positive=query_words, topn=10)

## Distance between cities

1. Getting a list of cities from a fixed list (Geocities)

In [None]:
# Loading a dataset from nlpia.
# The loader call below is kept for reference only; it is disabled because
# this downloader from nlpia doesn't work:
#from nlpia.data.loaders import get_data
#cities = get_data('cities')
# Instead, download the file from
# https://www.dropbox.com/s/tcri5eyzpabhnyy/cities.csv.gz?dl=1
# and save it in the same folder as this notebook.
cities = pd.read_csv('cities.csv.gz')

In [None]:
# First record, transposed so that each column is shown as a row
cities.iloc[:1].T

In [None]:
# Restrict to US rows that have a first-level admin code
# (used below as the state abbreviation)
in_us = cities.country_code == 'US'
has_admin1 = cities.admin1_code.notnull()
us = cities[in_us & has_admin1].copy()

# Abbreviation -> full state name lookup, read from an external CSV
states = pd.read_csv('http://www.fonz.net/blog/wp-content/uploads/2008/04/states.csv')
states = dict(zip(states['Abbreviation'], states['State']))

# Add city / st / state columns and preview the last three columns
us['city'] = us['name'].copy()
us['st'] = us['admin1_code'].copy()
us['state'] = us['st'].map(states)
us[us.columns[-3:]].head()

In [None]:
# Are these cities in our vocabulary?

# Candidate words: city names, state abbreviations and full state names.
# np.concatenate is called directly (the pd.np alias is deprecated and absent
# from pandas 2.x), and membership is tested on the KeyedVectors object itself
# rather than through its deprecated .wv self-alias.
vocab = np.concatenate([us.city, us.st, us.state])
vocab = np.array([word for word in vocab if word in wv])
vocab[:5]

In [None]:
# How many candidate words survived the vocabulary filter
vocab.shape[0]

In [None]:
# *Adding* state info to the game (as there are homonyms in different states)
# A set gives hashed membership tests instead of scanning the whole array
# once (or twice) per city.
known_words = set(vocab)
city_plus_state = []
for c, state, st in zip(us.city, us.state, us.st):
    if c not in known_words:
        continue
    # Prefer the full state name; fall back to the abbreviation otherwise
    region = state if state in known_words else st
    city_plus_state.append(wv[c] + wv[region])
# One row per retained city, one column per embedding dimension
us_300D = pd.DataFrame(city_plus_state)
us_300D
# back to the slides

In [None]:
from sklearn.decomposition import PCA
# Project the city vectors onto their first two principal components.
# random_state is fixed because scikit-learn may select a randomized SVD solver
# depending on the input size, which would make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)
# us_300D = get_data('cities_us_wordvectors')
us_2D = pca.fit_transform(us_300D.iloc[:, :300])

In [None]:
# Number of projected cities, followed by their 2D coordinates
print(us_2D.shape[0])
us_2D

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from nlpia.data.loaders import get_data

# Ready-made frame from nlpia; judging by its name it holds the 2D PCA
# projection of the US city vectors plus metadata columns
df = get_data('cities_us_wordvectors_pca2_meta')

# population to numbers: entries that cannot be parsed become NaN
df['population'] = pd.to_numeric(df['population'], errors='coerce')

# TODO: cleaning the time zone. Attempts to derive a usable label from
# df['timezone'] (e.g. .str.extract(r"/(\w+)") or .str.split("\t", expand=True))
# in order to colour the dots per time zone have not worked so far.


This is the original example from the book.
It needs tk (and it has been impossible to install it on a Mac). So, let's just see a snapshot.

In [None]:
# Original plotting example from the book, left commented out on purpose:
# nothing in this cell is executed.
# import seaborn
# from matplotlib import pyplot as plt
# from nlpia.plots import offline_plotly_scatter_bubble
# from nlpia.data.loaders import get_data
# df = get_data('cities_us_wordvectors_pca2_meta')
# html = offline_plotly_scatter_bubble(
#     df.sort_values('population', ascending=False)[:350].copy().sort_values('population'),
#     filename='plotly_scatter_bubble.html',
#     x='x', y='y',
#     size_col='population', text_col='name', category_col='timezone',
#     xscale=None, yscale=None, # 'log' or None
#     layout={}, marker={'sizeref': 3000})
# # {'sizemode': 'area', 'sizeref': 3000}

We can do a similar example, just plotting the dataframe. 
It is not as fancy, but we see similar patterns.

Notice that there is [an issue](https://github.com/pandas-dev/pandas/issues/32904) in the implementation of the scatter plot and one has to add the size of the dots "explicitly"

I tried plenty of ways to use the colors, but this dataframe is odd

In [None]:

# Preview the first rows with the notebook's rich display instead of printing
# the whole frame as plain text.
# (The book example keeps only the 350 most populated cities:
#  df.sort_values('population', ascending=False)[:350].copy().sort_values('population'))
df.head(10)

In [None]:
# Dot size follows the population, divided by 10000
dot_sizes = df['population'] / 10000
df.plot.scatter(x='x', y='y', s=dot_sizes)
# (Per-time-zone subsets and centred axes were tried here and dropped.)
plt.show()

# Doc2vec

In [None]:
# Finding out the number of cores available.
# The value is passed as the `workers` argument when the Doc2Vec model is created.
import multiprocessing
num_cores = multiprocessing.cpu_count()
num_cores

In [None]:
# Importing dependencies
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# gensim crude tokenizer that ignores one-letter words and punctuation
from gensim.utils import simple_preprocess

In [None]:
# The corpus is just an iterable of raw document strings
corpus = [
    "The faster Harry got to the store, the faster and faster Harry would get home.",
    "Harry is hairy and faster than Jill.",
    "Jill is not as hairy as Harry.",
]
# Tokenise every document and tag it with its position in the corpus
training_corpus = [
    TaggedDocument(simple_preprocess(text), [i])
    for i, text in enumerate(corpus)
]
print(training_corpus)

In [None]:
# Instantiating the object
# (vector_size / epochs are the current keyword names; size / iter are
#  deprecated gensim 3.x spellings that gensim 4 no longer accepts)
model = Doc2Vec(vector_size=100,   # dimensions of the vectors
                min_count=2,       # min frequency for the tokens
                workers=num_cores,
                epochs=10)         # number of iterations
# Compiling the vocabulary
model.build_vocab(training_corpus)

# training the model, reusing the epoch count configured above
model.train(training_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Inferring a vector for a new document

In [None]:
# This is not a static model. It has to be trained (10 iterations in this case).
# `epochs` is used instead of the deprecated `steps` keyword.
model.infer_vector(simple_preprocess('Indeed Jill is the fastest'), epochs=10)