# Requirements for this notebook:
1. Internet connection (to download corpora and tokenizer data with calls to nltk.download())
2. Download the pre-trained GoogleNews word2vec file and keep track of the path where it goes (it is needed for GOOGLE_EMBEDDINGS_PATH below):
  1. https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
3. The following packages:
  1. nltk (Anaconda or PIP command line install : pip install -U nltk OR conda install nltk)
  2. gensim (pip install -U gensim)
  3. scikit-learn v0.18.1 (pip install -U scikit-learn)
  4. matplotlib (pip install -U matplotlib)
  5. numpy (pip install -U numpy)

# Objectives of this notebook are to illustrate how we can load and explore pre-trained word embeddings
1. Loading pre-trained embeddings with GenSim
2. Explore embedding vectors

In [1]:
import time

In [2]:
# Import NLTK and display the installed version so the environment of this run is recorded
import nltk
nltk.__version__

'3.2'

# Gensim page : https://radimrehurek.com/gensim/index.html

In [3]:
# Import gensim (used further down to load the pre-trained embeddings) and display the installed version
import gensim
gensim.__version__



'0.12.4'

# Scikit-learn page : http://scikit-learn.org/stable/

In [4]:
# Import scikit-learn together with the TSNE class from sklearn.manifold, then display the installed version
import sklearn
from sklearn.manifold import TSNE
sklearn.__version__

'0.17.1'

In [5]:
# Import NumPy under its conventional alias and display the installed version
import numpy as np
np.__version__

'1.10.4'

In [6]:
import matplotlib.pyplot as plt

In [7]:
# Render matplotlib figures inline, directly in the notebook output
%matplotlib inline

In [8]:
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

# Before we begin, let's talk about some sources of pre-trained embeddings:
1.  Non-clinical Word2Vec
  1. GoogleNews (download : https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing more info : https://code.google.com/archive/p/word2vec/)
2. Clinical Word Vectors
  1. PubMed and PubMed Wikipedia from http://bio.nlplab.org/ (download : http://evexdb.org/pmresources/vec-space-models/)
  2. BioASQ trained PubMed embeddings (http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts) 
3. Clinical Data (Codes, Events)
  1. http://clinicalml.org/ (from this paper : https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5001761/)
4. GloVe embeddings (similar to word2vec but uses global word statistics)
  1. While these are not compatible with GenSim out-of-the-box, there are scripts to make this work : (https://radimrehurek.com/gensim/scripts/glove2word2vec.html)
  2. Download and more info : http://nlp.stanford.edu/projects/glove/

# Let's load our pre-trained embeddings from Google:

In [9]:
import os

# Location of the pre-trained GoogleNews word2vec binary.
# Set the GOOGLE_EMBEDDINGS_PATH environment variable to point at your own copy
# of the file; when it is not set, the original default location is used.
GOOGLE_EMBEDDINGS_PATH = os.environ.get(
    'GOOGLE_EMBEDDINGS_PATH',
    'C:/temp-embeddings/GoogleNews-vectors-negative300.bin'
)

In [10]:
print('Loading Google News model from : ' + GOOGLE_EMBEDDINGS_PATH)

# Newer gensim releases expose the word2vec-format loader on KeyedVectors and no
# longer support calling it on Word2Vec. Prefer KeyedVectors when this gensim
# provides it, and fall back to Word2Vec on older releases (such as the 0.12.x
# this notebook was originally run with) so behaviour there is unchanged.
try:
    from gensim.models import KeyedVectors as _w2v_format_loader
except ImportError:
    _w2v_format_loader = Word2Vec

# binary=True : the GoogleNews file is stored in the C binary word2vec format
google_w2v_model = _w2v_format_loader.load_word2vec_format(GOOGLE_EMBEDDINGS_PATH, binary=True)

In [11]:
# Build a single string before printing: under Python 2, print('label', value)
# prints a tuple such as ('label', 3000000) instead of a readable message.
print('GoogleNews Model vocab size : %d' % len(google_w2v_model.vocab))

('GoogleNews Model vocab size : ', 3000000)


# Let's also load our NLTK resources so that we can try to re-classify

In [12]:
# Fetch the 'brown' package with NLTK's downloader; %time reports how long the call took
%time nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\u0061995\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
Wall time: 1.95 s


True

In [13]:
# Fetch the 'movie_reviews' package with NLTK's downloader; %time reports how long the call took
%time nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\u0061995\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Wall time: 754 ms


True

In [14]:
from nltk.corpus import brown
from nltk.corpus import movie_reviews

# Let's start inspecting our pre-trained vectors

In [15]:
# Now let's inspect what some of these vectors look like.
# Uncomment the line below to print the embedding stored for the word 'business'.
#print(google_w2v_model['business'])

In [16]:
# Score how similar the model considers a pair of closely related words
TERM_SIMILARITY_1, TERM_SIMILARITY_2 = 'movie', 'film'

movie_film_similarity = google_w2v_model.similarity(TERM_SIMILARITY_1, TERM_SIMILARITY_2)
print(movie_film_similarity)

0.867676976388


In [17]:
# Repeat the similarity check with a second pair of words
TERM_SIMILARITY_3, TERM_SIMILARITY_4 = 'computer', 'life'

computer_life_similarity = google_w2v_model.similarity(TERM_SIMILARITY_3, TERM_SIMILARITY_4)
print(computer_life_similarity)

0.162694340463


In [18]:
# Attempt to reproduce the well-known analogy "King - Man + Woman ~~ Queen" from this paper:
# http://www.aclweb.org/anthology/N13-1#page=784
#
# In the (commented-out) query at the bottom of this cell, RELATIONSHIP_WORD_1 and
# RELATIONSHIP_WORD_2 are passed as the positive terms and RELATIONSHIP_WORD_3 as
# the negative term.
RELATIONSHIP_WORD_2 = 'king'
RELATIONSHIP_WORD_3 = 'man'
RELATIONSHIP_WORD_1 = 'woman'

# Alternative triple: gives what we might expect from the MOVIE corpus
#RELATIONSHIP_WORD_1 = 'films'
#RELATIONSHIP_WORD_2 = 'movie'
#RELATIONSHIP_WORD_3 = 'film'

# Alternative triple: gives what we might expect from the BROWN corpus
#RELATIONSHIP_WORD_1 = 'families'
#RELATIONSHIP_WORD_2 = 'city'
#RELATIONSHIP_WORD_3 = 'family'

# Left disabled because it can take a very long time (minutes) with the GoogleNews model
#print(google_w2v_model.most_similar(positive=[RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2], negative=[RELATIONSHIP_WORD_3]))