# Script to generate the embeddings for the vocabulary involved in the learning task
-------------------------------------------------------------------------------------------------------------------
# Technology used: TensorFlow

I start with the usual utility cells: the imports and a small helper for running shell commands.

In [1]:
# packages used for processing: 
import cPickle as pickle # for reading the data
import matplotlib.pyplot as plt # for visualization
import numpy as np

# for counting of words
import collections

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# The god of machine learning frameworks:
import tensorflow as tf# Input data files are available in the "../Data/" directory.


In [2]:
def exec_command(cmd):
    '''
        run a shell command and echo whatever it wrote to stdout
        in the python console
        @params
        cmd = the command to run, followed by its arguments
              ex: ['ls', '../input']
    '''
    raw_output = check_output(cmd)  # bytes captured from the command's stdout
    print(raw_output.decode("utf8"))

# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [3]:
# list the parent directory to check the structure of the project directory
exec_command(['ls', '..'])

Data
Models
Scripts



In [4]:
''' Set the constants for the script '''

# --- file-system locations ---
# root folder holding the WikiSQL files
data_path = "../Data/WikiSQL/data"

# pickle with the processed data, stored next to the raw files
processed_data_file_path = os.path.join(data_path, "processed.pickle")

# training files: the questions file and the tables file
train_files = dict(
    questions=os.path.join(data_path, "train.jsonl"),
    tables=os.path.join(data_path, "train.tables.jsonl"),
)

# base path of the models folder
base_model_path = '../Models'

# --- constants governing the program ---
# number of values in every word embedding vector
embeddings_size = 512

In [5]:
# list the files available under the data path
exec_command(['ls', data_path])

dev.db
dev.jsonl
dev.tables.jsonl
processed.pickle
test.db
test.jsonl
test.tables.jsonl
train.db
train.jsonl
train.tables.jsonl



### Load the pickled data and display a few examples from it

In [6]:
# read the processed data back from disk (binary mode, as pickle requires)
# NOTE(review): unpickling can execute arbitrary code, so this should only be
# pointed at a file produced by this project itself
with open(processed_data_file_path, "rb") as pickle_file:
    data_dict = pickle.load(pickle_file)

In [7]:
# pull the two collections out of the loaded dictionary; they are indexed
# together below, so entry i of one belongs to entry i of the other
questions, queries = data_dict["questions"], data_dict["queries"]

In [8]:
# pick one example at random and show the question next to its query
random_index = np.random.randint(len(questions))

# print(...) with a single argument produces the same output as the print
# statement and matches the call style already used in exec_command
print("English question: " + questions[random_index])
print("SQL query:        " + queries[random_index])

English question: Where was game number 5 played?
SQL query:        SELECT Location Attendance FROM <TABLE> WHERE Game = 5


In [9]:
# collect all the words needed to form the vocabulary
temp_words = []

# words of the english questions (lower-cased, split on whitespace)
for question in questions:
    temp_words.extend(question.lower().split())

# words of the sql queries, handled the same way
for query in queries:
    temp_words.extend(query.lower().split())

In [10]:
# remove duplicate words if there are any; the order of the resulting list is
# whatever order the set iterates in, not the order of first appearance
words = list(set(temp_words))

In [11]:
# Total number of distinct words in the vocabulary:
len(words)

71863

In [12]:
# peek at a small slice of the vocabulary
# (print(...) with a single argument prints the same text as the print
# statement and matches the call style already used in exec_command)
for word in words[1000:1010]:
    print(word)

altitude
2013's
screens?
symphony
impossible,
az?
rapes
dnq
1600?
693-004?


In [13]:
# assign an integer to every word and create a dictionary for this mapping
# function to build a formatted dataset from the list of words
# so that the context of the words can be taken into consideration

def build_dataset(words):

    """ build an integer-encoded dataset from the list of words

        every distinct word gets an id; ids are handed out in the order
        returned by Counter.most_common, so more frequent words get smaller
        ids. An "<other>" entry is always present in the dictionary.

        input: list of words
        output: the formatted data (the id of every entry of words, in order),
                count (list of (word, frequency) pairs),
                dictionary (word -> id),
                reverse_dictionary (id -> word)

    """

    vocabulary_size = len(words)  # upper bound on the number of distinct words

    # (word, frequency) pairs for every distinct word, most frequent first
    count = collections.Counter(words).most_common(vocabulary_size)

    # form the dictionary: word -> id
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    # reserve an id for "<other>". setdefault keeps the id already assigned when
    # "<other>" itself occurs in words: overwriting it would leave a hole in the
    # id range and hand out an id equal to len(dictionary)
    dictionary.setdefault("<other>", len(dictionary))

    # encode the input: one id per entry of words
    data = [dictionary[word] for word in words]

    # a reverse dictionary for mapping every unique id back to its word
    reverse_dictionary = {index: word for word, index in dictionary.items()}

    return data, count, dictionary, reverse_dictionary

In [18]:
# encode the vocabulary: the id of every word plus the lookup tables in both directions
data, count, dictionary, reverse_dictionary = build_dataset(words)

# Now train the embeddings on the acquired vocabulary:
## For this, I will use a TensorFlow interactive session to build and execute graphs on the fly

In [15]:
# clear the default graph so that re-running the cells below does not collide
# with nodes created by an earlier run
tf.reset_default_graph()
sess = tf.InteractiveSession() # create an interactive session (it installs itself as the default session)

In [19]:
# make sure the id reserved for "<other>" is part of the ids to look up.
# the membership check keeps this cell idempotent: running it again does not
# append the id a second time
other_id = dictionary["<other>"]
if other_id not in data:
    data = data + [other_id]

In [22]:
# create the embedding matrix: one row of embeddings_size values per dictionary entry
# (embeddings are discussed here: https://www.tensorflow.org/programmers_guide/embedding)
word_embeddings = tf.get_variable("word_embeddings", [len(dictionary), embeddings_size])

In [23]:
# look up the embedding row of every id in data
embedded_word_ids = tf.gather(word_embeddings, data)

In [24]:
# initialize all the variables in the graph
# initialize all the variables in the graph (runs in the interactive session created earlier)
tf.global_variables_initializer().run()

In [25]:
# start training the embeddings
# evaluate the looked-up embeddings into a numpy array
# NOTE(review): no loss or optimizer is defined in the cells shown so far, so these
# look like the freshly initialized values rather than trained embeddings -- confirm
embeddings = embedded_word_ids.eval()

In [26]:
# sanity check: the number of embedding rows next to the number of dictionary entries
len(embeddings), len(dictionary)

(71864, 71864)

# Let's try to visualize these embeddings using the TensorBoard embeddings visualizer.