# Create toy CBoW and Skip-Gram datasets from a sample text

In [None]:
import h5py
import numpy as np
import string
from nltk.tokenize import word_tokenize

In [None]:
# Sample text passage from here: http://www.tnewfields.info/Articles/sum1.htm
# This short passage about auction formats is the raw corpus: it is lowercased,
# tokenised and encoded below to build the CBoW and Skip-Gram datasets.

sample_text = "There are basically two types of auctions: ascending-bid auctions and descending-bid auctions. Ascending-bid auctions start out with a low bid for an object. The price of the object is gradually raised until only one bidder remains. By contrast, descending-bid auctions start out with a high bid and the price is progressively lowered until a customer expresses a willingness to purchase the object. Both procedures have a number of variants. For example, in some types of auctions a professional auctioneer declares the suggested bids. In other types of auctions, however, the customers make their own bids. Another variant, used at places such as eBay or Yahoo Auction, is called a 'buyout option'. A high price for an item is declared. Anyone willing to pay that price is guaranteed a purchase. This variant seems to appeal consumers who dislike uncertainty: for a fixed price they are guaranteed an object. 'Buyout options' are most commonly used if the seller has a stock of several copies of the same item." 

In [None]:
# Some elementary text pre-processing: lowercase, tokenise, drop punctuation.

text = sample_text.lower()  # All lowercase

text = word_tokenize(text)  # Tokenise (NLTK) into word and punctuation tokens

# Remove punctuation.  `token not in string.punctuation` would be a *substring*
# test against the punctuation string, so multi-character punctuation tokens
# such as "''", "``", "--" or "..." would not be removed.  Instead, drop every
# token that consists solely of punctuation characters; tokens that merely
# contain punctuation (e.g. "ascending-bid") are kept.
punctuation_chars = set(string.punctuation)
text = [
    token
    for token in text
    if not all(char in punctuation_chars for char in token)
]

In [None]:
# Create vocabulary and encode the passage as integer word ids.

# Distinct tokens in sorted order, so a word's id is its rank in the vocabulary.
vocab = sorted(set(text))

# Lookup table from each vocabulary word to its index in `vocab`.
word_to_idx = dict(zip(vocab, range(len(vocab))))

# The token sequence re-expressed as a list of vocabulary indices.
essay = [word_to_idx[token] for token in text]

### Create CBoW

In [None]:
# Continuous Bag of Words: each sample is the `window_size` words on either
# side of a target word (3 by default, i.e. 6 context words), and the label is
# the target word itself.

window_size = 3  # number of context words taken from each side of the target

dat = []  # one row of 2 * window_size context word ids per sample
lab = []  # the target (centre) word id matching each row of `dat`

# Only positions with a full window on both sides are used, so the first and
# last `window_size` words of the essay never appear as targets.
for i in range(window_size, len(essay) - window_size):
    left_context = essay[i - window_size:i]
    right_context = essay[i + 1:i + window_size + 1]
    dat.append([*left_context, *right_context])
    lab.append(essay[i])

dat_np = np.asarray(dat)
lab_np = np.asarray(lab)

In [None]:
# Save the CBoW dataset to a HDF5 database

hdf_trn_file = "CBoW.hdf5"
hdf_list_trn_file = "CBoW_hdf5.txt"

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed.
with h5py.File(hdf_trn_file, "w") as f:
    f.create_dataset("data", data=dat_np)    # context windows
    f.create_dataset("label", data=lab_np)   # target words

# Companion text file holding the name of the HDF5 file written above.
with open(hdf_list_trn_file, "w") as f:
    f.write(hdf_trn_file)

### Create Skip-Gram

In [None]:
# Skip-Gram (reproduced from this GitHub gist by Mateusz Bednarski: https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0 ).
# Every word is paired with each other word lying at most `window_size`
# positions before or after it; each pair is (centre word id, context word id).

window_size = 3

essay_len = len(essay)
word_pairs = []
for centre_pos in range(essay_len):
    # Clamp the window to the bounds of the essay instead of skipping
    # out-of-range positions one by one.
    first_pos = max(0, centre_pos - window_size)
    stop_pos = min(essay_len, centre_pos + window_size + 1)
    for context_pos in range(first_pos, stop_pos):
        if context_pos != centre_pos:  # a word is not its own context
            word_pairs.append((essay[centre_pos], essay[context_pos]))

word_pairs = np.array(word_pairs)


In [None]:
# Save the Skip-Gram pairs to a HDF5 database

# Shuffle the (centre, context) rows in place before writing them out.
np.random.shuffle(word_pairs)

hdf_trn_file = "skpgrm.hdf5"
hdf_list_trn_file = "skpgrm_hdf5.txt"

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed.
with h5py.File(hdf_trn_file, "w") as f:
    f.create_dataset("data", data=word_pairs[:,0])    # centre words
    f.create_dataset("label", data=word_pairs[:,1])   # context words

# Companion text file holding the name of the HDF5 file written above.
with open(hdf_list_trn_file, "w") as f:
    f.write(hdf_trn_file)

### Create skip-gram with negative sampling

The method below draws one negative sample for every positive (centre, context) pair, which yields a large number of negative samples; select as many as you see fit. Alternatively, you may wish to use the `skipgrams()` function in the `keras.preprocessing.sequence` module.

In [None]:
# Get unigram frequencies: the distinct word ids in the essay and how many
# times each one occurs.
unique, counts = np.unique(essay, return_counts=True)

In [None]:
# Calculate sampling probabilities using Mikolov's equation: raise each unigram
# count to the 3/4 power, then normalise so the probabilities sum to 1.
smoothed_counts = np.power(counts, 3 / 4)
uni_probs = smoothed_counts / smoothed_counts.sum()

In [None]:
# Obtain negative samples: draw one word id per positive pair, with
# replacement, according to the smoothed unigram probabilities.
neg_samp = np.random.choice(unique, size=len(word_pairs), p=uni_probs)

In [None]:
# Create negative sampling dataset: column 0 is the centre word of each
# positive pair, column 1 is the randomly drawn word.
centre_words = word_pairs[:, 0]
neg_dat = np.column_stack((centre_words, neg_samp))

In [None]:
# Filter out any positive samples created.
# Comparing row n of neg_dat only with row n of word_pairs is not enough: the
# centre words of the two rows are equal by construction, so that check only
# catches a draw equal to that row's own context word and misses sampled pairs
# that occur as positives in other rows.  Test each sampled pair against the
# full set of positive pairs instead.
positive_pairs = set(map(tuple, word_pairs.tolist()))
is_negative = np.array(
    [tuple(pair) not in positive_pairs for pair in neg_dat.tolist()],
    dtype=bool,
)
# Boolean-mask indexing keeps `ns` two-dimensional even when no row survives,
# so it can still be concatenated with the positive pairs afterwards.
ns = neg_dat[is_negative]

In [None]:
# Create label of '1' for positive samples (one label per row of word_pairs).
pos_labs = np.ones(word_pairs.shape[0])

In [None]:
# Create label of '0' for negative samples (one label per row of ns).
neg_labs = np.zeros(ns.shape[0])

In [None]:
# Concatenate positive and negative samples: positive rows first, negative
# rows stacked underneath.
all_dat = np.vstack((word_pairs, ns))

In [None]:
# Join positive and negative labels, in the same order as the stacked samples.
all_labs = np.concatenate((pos_labs, neg_labs))

In [None]:
# Add labels to dataset as a third column, so each row keeps its own label
# when the rows are shuffled.
all_dat = np.column_stack((all_dat, all_labs))

In [None]:
# Shuffle the rows of all_dat in place; each row carries its label in the last
# column, so samples and labels stay aligned.
np.random.shuffle(all_dat) # Shuffle the data before saving to database

In [None]:
# Save the negative-sampling dataset to a HDF5 database

hdf_trn_file = "trn_skpgrm_neg_samp.hdf5"
hdf_list_trn_file = "trn_skpgrm_neg_samp_hdf5_list.txt"

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed.
with h5py.File(hdf_trn_file, "w") as f:
    f.create_dataset("data", data=all_dat[:,[0,1]])   # (centre word, other word) columns
    f.create_dataset("label", data=all_dat[:,2])      # 1 = positive pair, 0 = negative pair

# Companion text file holding the name of the HDF5 file written above.
with open(hdf_list_trn_file, "w") as f:
    f.write(hdf_trn_file)