## Word2Vec using TensorFlow

In [1]:
# Import Dependencies
import os
import math
import collections
from collections import Counter
import errno
import random
import zipfile
import numpy as np
from six.moves import urllib, xrange
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
# Load the Data
def fetch_words(url, words_data):
    """Download (once) a zipped text corpus and return it as a list of words.

    The archive is cached as ``words.zip`` inside ``words_data``. When that
    file already exists it is reused and ``url`` is never contacted.

    Args:
        url: Location of the zip archive, fetched only when no cached copy
            exists.
        words_data: Directory holding the cached archive; created if missing.

    Returns:
        list[str]: Whitespace-separated tokens of the first member of the
        archive, decoded as ASCII.

    Raises:
        IndexError: If the archive contains no members.
        UnicodeDecodeError: If the first member contains non-ASCII bytes.
        zipfile.BadZipFile: If the cached file is not a valid zip archive.
    """
    # Make the directory if it does not exist
    os.makedirs(words_data, exist_ok=True)

    # Path to the cached zip file
    file_path = os.path.join(words_data, 'words.zip')

    # If the zip file is not present, download it
    if not os.path.exists(file_path):
        print('Downloading Data ...')
        # Download to a side file and rename it into place only on success.
        # Writing straight to file_path would leave a truncated archive behind
        # after an interrupted download, and every later call would then skip
        # the download and fail while opening the corrupt zip.
        partial_path = file_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # Only still present when the download or the rename failed
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Read the first member of the zip file
    with zipfile.ZipFile(file_path) as f:
        print('Loading Data from Zip File ...')
        data = f.read(f.namelist()[0])

    # Return a list of all words in the data source
    return data.decode('ascii').split()

In [3]:
# Fetch the zipped corpus (cached under saved_data) and split it into words
saved_data = './word2vec_data/words/'
data_url = 'http://mattmahoney.net/dc/text8.zip'
words = fetch_words(data_url, saved_data)

Loading Data from Zip File ...


In [4]:
# Peek at the first ten tokens returned by fetch_words
words[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [5]:
# Total number of tokens in the corpus (repeated words are counted each time)
len(words)

17005207

In [6]:
# Show forty consecutive corpus words on one line, each followed by a space
for w in words[9000:9040]:
    print(f'{w} ', end='')

feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children 

In [7]:
# Word Count
def word_count(vocab_size, corpus=None, unk_token=None):
    """Encode a tokenized corpus as integer codes ranked by word frequency.

    Args:
        vocab_size: Number of entries in the returned vocabulary. ``None``
            keeps every distinct word.
        corpus: Sequence of word strings to encode. Defaults to the notebook's
            global ``words`` list, so existing ``word_count(vocab_size=...)``
            calls keep working.
        unk_token: Optional placeholder string for out-of-vocabulary words.
            When given, it occupies code 0 and only ``vocab_size - 1`` real
            words are kept. When ``None`` (default), the original behaviour
            is preserved: out-of-vocabulary words are mapped to code 0, which
            is ALSO the code of the most frequent word, so the two cannot be
            told apart in the encoded data.

    Returns:
        tuple: ``(data, vocab)`` where ``data`` is a NumPy array holding one
        integer code per corpus word and ``vocab`` is a NumPy array of words
        such that ``vocab[code]`` is the word for that code.
    """
    if corpus is None:
        # Backward-compatible fallback to the notebook-level word list
        corpus = words

    counts = Counter(corpus)
    if unk_token is None:
        # Most common words (counts dropped), most frequent first
        vocab_words = [word for word, _ in counts.most_common(vocab_size)]
    else:
        # Keep the placeholder unique: a literal occurrence of unk_token in
        # the corpus is treated as unknown as well.
        counts.pop(unk_token, None)
        limit = None if vocab_size is None else max(vocab_size - 1, 0)
        vocab_words = [unk_token] + [word for word, _ in counts.most_common(limit)]

    # Numpy array of the vocabulary (without counts)
    vocab = np.array(vocab_words)
    # Map each word to its position in the vocabulary
    dictionary = {word: code for code, word in enumerate(vocab_words)}
    # Encode the corpus; words outside the vocabulary fall back to code 0
    data = np.array([dictionary.get(word, 0) for word in corpus])
    return data, vocab

In [8]:
# Encode the whole corpus with a 50,000-word vocabulary; `data` and
# `vocabulary` are inspected in the cells that follow
data, vocabulary = word_count(vocab_size=50000)

In [9]:
# First ten integer codes of the encoded corpus
data[:10]

array([5235, 3083,   11,    5,  194,    1, 3136,   45,   58,  155])

In [10]:
# The ten raw words those codes were produced from, for comparison
words[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [11]:
# Shape of the encoded corpus: word_count emits one code per entry of `words`
data.shape

(17005207,)

In [12]:
# First ten vocabulary entries; word_count orders them via Counter.most_common
vocabulary[:10]

array(['the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero', 'nine', 'two'],
      dtype='<U28')

In [13]:
# Number of entries in the vocabulary array returned by word_count
vocabulary.shape

(50000,)

In [14]:
# Raw word at position 1000 of the corpus
words[1000]

'american'

In [15]:
# Integer code stored for that same position (index 1000) in the encoded corpus
data[1000]

63