# Requirements for this notebook:
1. Internet connection (to download corpora and tokenizer data with calls to nltk.download())
2. The following packages:
  1. nltk (install from the command line with pip or Anaconda: pip install -U nltk OR conda install nltk)
  2. gensim (pip install -U gensim)
  3. scikit-learn v0.18.1 (pip install -U scikit-learn)
  4. matplotlib (pip install -U matplotlib)
  5. numpy (pip install -U numpy)

# Objectives of this notebook are to illustrate how we can do the following with word embeddings:
1. Train from scratch
2. Explore embeddings vectors
3. Use these for an NLP task

In [15]:
import os
import time
import logging
import pickle
from collections import defaultdict

# NLTK page : http://www.nltk.org/

In [3]:
import nltk
# Echo the installed NLTK version as the cell output.
nltk.__version__

'3.2.3'

# Gensim page : https://radimrehurek.com/gensim/index.html

In [4]:
import gensim
# Echo the installed gensim version as the cell output.
gensim.__version__



'3.4.0'

# Scikit-learn page : http://scikit-learn.org/stable/

In [5]:
import sklearn
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# train_test_split is provided by sklearn.model_selection since scikit-learn 0.18.
# The former sklearn.cross_validation module was deprecated in 0.18 and removed
# in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Echo the installed scikit-learn version as the cell output.
sklearn.__version__



'0.19.1'

In [6]:
import numpy as np
# Echo the installed NumPy version as the cell output.
np.__version__

'1.14.0'

In [7]:
import matplotlib.pyplot as plt

In [8]:
# Select the inline matplotlib backend so figures are drawn in the notebook output.
%matplotlib inline

In [9]:
from gensim.models import Word2Vec

In [10]:
# Download the 'brown' corpus package via NLTK (requires an internet connection);
# %time reports how long the call took.
%time nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
Wall time: 1.36 s


True

In [11]:
# Download the 'movie_reviews' corpus package via NLTK (requires an internet
# connection); %time reports how long the call took.
%time nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
Wall time: 8.03 s


True

In [12]:
# Download the 'treebank' corpus package via NLTK (requires an internet
# connection); %time reports how long the call took.
%time nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.
Wall time: 3.88 s


True

In [13]:
# Download the Punkt tokenizer data first so that we can tokenize text into
# sentences and words; %time reports how long the call took.
%time nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Wall time: 49 ms


True

In [18]:
from nltk.corpus import movie_reviews

In [19]:
# Load the existing (precomputed) word clusters from a pickle file.
#
# The directory defaults to the original author's local path, but it can be
# overridden with the WORD_CLUSTERS_DIR environment variable so the notebook
# runs on other machines without editing this cell.
CLUSTERS_BASE_DIR = os.environ.get(
    'WORD_CLUSTERS_DIR',
    r'D:\Dropbox\Datasets\Word_Embeddings_Clusters\Wikipedia_PubMed')
# Kept as a string: it is only used to build the file name below.
CLUSTER_SIZE = '10000'
CLUSTER_FILE_PATH = os.path.join(CLUSTERS_BASE_DIR, 'WordClusters_K{0}_BatchKmeans_wikipedia-pubmed-and-PMC-w2v.pickle'.format(CLUSTER_SIZE))

print('Loading clusters from path : {}'.format(CLUSTER_FILE_PATH))

# Reset first so that a failed load cannot leave a stale map from a previous
# run of this cell in the kernel.
word_cluster_map = None
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point CLUSTER_FILE_PATH at a file from a trusted source.
with open(CLUSTER_FILE_PATH, 'rb') as f:
    # presumably a mapping from word to cluster id; verify against the file's producer
    word_cluster_map = pickle.load(f)

print('Size of word-cluster map {}'.format(len(word_cluster_map)))

Loading clusters from path : D:\Dropbox\Datasets\Word_Embeddings_Clusters\Wikipedia_PubMed\WordClusters_K10000_BatchKmeans_wikipedia-pubmed-and-PMC-w2v.pickle
Size of word-cluster map 5443656


In [22]:
# Restrict the word-cluster map to the tokens that occur in the movie_reviews
# corpus. Each token is looked up once: the first time it is seen.
corpus_cluster_map = {}
all_corpus_tokens = (tok for sent in movie_reviews.sents() for tok in sent)
for token in all_corpus_tokens:
    if token in corpus_cluster_map:
        # Already recorded on an earlier occurrence.
        continue
    if token in word_cluster_map:
        corpus_cluster_map[token] = word_cluster_map[token]

print('Size of corpus cluster map : {}'.format(len(corpus_cluster_map)))

Size of corpus cluster map : 32666


In [24]:
# Persist the corpus-restricted cluster map to a pickle file in the current
# working directory, using the highest pickle protocol available.
corpus_cluster_map_path = 'corpus_cluster_map.pickle'
with open(corpus_cluster_map_path, mode='wb') as pickle_file:
    pickle.dump(corpus_cluster_map, pickle_file, pickle.HIGHEST_PROTOCOL)
    print('Saved corpus cluster map to {}'.format(corpus_cluster_map_path))

Saved corpus cluster map to corpus_cluster_map.pickle
