# Deeplearning
CBOW word2vec

In [1]:
import zipfile
import tensorflow as tf
import numpy as np
import random
import math
import collections

from matplotlib import pylab

In [2]:
# global variable
data_index = 0 # 采样时从文本的第一个词开始采样

def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
    
def build_dataset(vocabulary_size,words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
    
def generate_batch(data,batch_size,num_skips,skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    context_size = 2 * skip_window
    labels = np.ndarray(shape=(batch_size,1),dtype=np.int32)
    batchs = np.ndarray(shape=(context_size,batch_size),dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index+1)%(len(data))
    
    for i in range(batch_size//num_skips):
        target = skip_window
        for j in range(num_skips):
            labels[i * num_skips + j,0] = buffer[target]
            met_target = False
            for k in range(context_size):
                if k == target:
                    met_target = True
                if met_target == True:
                    batchs[k,i * num_skips + j] = buffer[k+1]
                else:
                    batchs[k,i * num_skips + j] = buffer[k]
        buffer.append(data[data_index])
        data_index = (data_index + 1)%len(data)
    return batchs,labels

In [7]:
filename="text8.zip"
vocabulary_size=50000
batch_size=8

print('...... Reading data from zip file......')
words = read_data(filename)
print('Data Size = {0}'.format(len(words)))
    
print('...... Transfer word data to word index,dictionary ......')
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary_size,words)
    
for num_skips,skip_window in [(1,1),(2,1),(4,2)]:
    data_index = 0
    batchs,labels = generate_batch(data,
                                  batch_size=batch_size,
                                  num_skips=num_skips,
                                  skip_window=skip_window
                                 )
    # TODO
    print('\nWith num_skips = {0} and skip_window = {1}'.format(num_skips,skip_window))
    print('    batch:',[reverse_dictionary[bi] for bi in batch[0]])
    print('    labels:',[reverse_dictionary[li] for li in labels.reshape(8)])
    for i in range(8):
        print(batch[0][i],'->',labels[i,0],'<===>',
            reverse_dictionary[batch[0][i]],'->',reverse_dictionary[labels[i,0]])

...... Reading data from zip file......
Data Size = 17005207
...... Transfer word data to word index,dictionary ......

With num_skips = 1 and skip_window = 1
('    batch:', ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first'])
('    labels:', ['originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used'])
(5239, '->', 3084, '<===>', 'anarchism', '->', 'originated')
(3084, '->', 12, '<===>', 'originated', '->', 'as')
(12, '->', 6, '<===>', 'as', '->', 'a')
(6, '->', 195, '<===>', 'a', '->', 'term')
(195, '->', 2, '<===>', 'term', '->', 'of')
(2, '->', 3137, '<===>', 'of', '->', 'abuse')
(3137, '->', 46, '<===>', 'abuse', '->', 'first')
(46, '->', 59, '<===>', 'first', '->', 'used')

With num_skips = 2 and skip_window = 1
('    batch:', ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first'])
('    labels:', ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term'])
(5239, '->', 3084, '<===>', 'anarchism', '->', 'originated')
(3084, '->', 308