## The notebook for building an RNN from the chat data.

In [None]:
# Machine Learning modules for building the module
import keras 

# general modules for preprocessing and other stuff
import os # for os related calls
from subprocess import check_output

# for visualization stuff
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# for pickling the data
from six.moves import cPickle as pickle

%matplotlib inline

Using TensorFlow backend.


In [None]:
def exec_command(cmd):
    '''
        function to execute a shell command and see its
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']

        The command's stdout is decoded as UTF-8 and printed; nothing is
        returned. A non-zero exit status makes check_output raise
        subprocess.CalledProcessError.
    '''
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # and matches the call style used by the pickling cell further down.
    print(check_output(cmd).decode("utf8"))

In [None]:
# define the important paths for the Task
data_path = "../Data/cornell_movie_dialogs_corpus"
train_file = os.path.join(data_path, "movie_lines.txt")

# define a few important constant --variables.
# field_separator is the token that delimits the fields of one corpus line
field_separator = "+++$+++"

# Start with some preprocessing of the input data

In [None]:
# read the corpus file into a list, one whitespace-trimmed line per entry
data = []
with open(train_file, "r") as data_file:
    data.extend(line.strip() for line in data_file)

# show the first few raw lines:
data[:3]

In [None]:
# strip non-required information: split each line on the field separator
# and keep everything from the fourth field onwards.
# A list comprehension (rather than map) guarantees a real list, which the
# slicing below and the len()/index access in later cells rely on.
data = [line.split(field_separator)[3:] for line in data]

# print some data fields:
data[: 3]

In [None]:
# now write a small loop to bring together adjacent dialogues into a single sentence:
# Consecutive rows whose first field (presumably the speaker's name -- verify
# against the corpus layout) is equal are folded into the first row of the run.
#
# NOTE: processed.append(data[i]) stores the *same* list object that lives in
# `data`, so the `+=` below also rewrites data[i][1] in place. Any later use
# of `data` therefore sees the merged text for the first row of each run.
i = 0
processed = []
while(i < len(data)):
    processed.append(data[i])
    current = data[i][0] 
    
    # j scans forward over the rows that share the same first field
    j = i + 1
    while(j < len(data) and data[j][0] == current):
        # rows that do not have exactly two fields are skipped, not merged
        if(len(data[j]) == 2):
            processed[-1][1] += data[j][1]
        j += 1
    
    # resume at the first row that starts a new run
    i = j

In [None]:
# peek at the first two merged records
processed[:2]

In [None]:
# report every record that does not consist of exactly two fields;
# such rows would break the 2-column array conversion done later
for record in processed:
    if len(record) != 2:
        print(record)

### So, there are around 2.7 lakh (roughly 270,000) spoken sentences in this conversational dataset.

# Generate the embeddings for the vocabulary of the words in the dataset

In [None]:
# convert the list into a numpy array:
# `np.str` was only an alias of the builtin `str`; the alias is deprecated
# and no longer exists in recent NumPy releases, so use the builtin directly.
processed = np.array(processed, dtype=str)
processed.shape

In [None]:
print(type(processed)) # check if it has been converted into a numpy array
processed[3: 10] # print a few entries in the data

In [None]:
# no need to keep the characters who spoke the sentences around anymore
# keep only the second column of `processed`
sentences = processed[:, 1]
# NOTE(review): bare expression whose value is discarded here; it is
# probably not shown because it is not the last expression of the cell.
sentences.shape
# display a small slice of the sentences
sentences[45: 50]

In [None]:
# function to build a formatted dataset from the list of words 
# so that the context of the words are taken into consideration

def build_dataset(words):
    """ build a dataset from the list of words

        Ids are assigned in descending order of frequency, so the most
        common word receives id 0.

        input: list of words
        output: the formatted data (one id per input word, in order),
                count (list of (word, frequency) pairs, most frequent first),
                dictionary (word -> id),
                reverse_dictionary (id -> word)
    """
    # kept function-local, exactly as in the original cell
    import collections

    # Count every distinct word. The previous limit of len(words) - 1
    # dropped one word whenever all tokens were unique, which then made
    # the id lookup below fail with a KeyError.
    count = collections.Counter(words).most_common()

    # word -> id, following the frequency ranking
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # encode the input sequence with the ids
    data = [dictionary[word] for word in words]

    # id -> word, the inverse of `dictionary`
    reverse_dictionary = {index: word for word, index in dictionary.items()}

    return data, count, dictionary, reverse_dictionary

In [None]:
# build a vocabulary of words. Use the above helper function for it.
'''
***********************************************************************
WARNING! WARNING! WARNING! 
This cell may take some time to execute on some low end systems
***********************************************************************
'''

# map and reduce is taking too long. So, I am trying to write a loop for it.
# Each record's second field is split on whitespace and its tokens are
# appended to one flat list.
words = list() # empty list
for record in data:
    words.extend(record[1].split())

print(len(words)) # print the length of the words
words[100: 110] # print some arbitrary words from this list

In [None]:
# Now, use the words to build the dataset.
# NOTE: this rebinds `data` from the raw records to the list of word ids.
data , count, dictionary, reverse_dictionary = build_dataset(words)

# add the special characters to the list manually:
size = len(dictionary)

# A special token only gets a fresh id when it is not already a vocabulary
# word. Unconditionally assigning `size + 1` to "other" would, whenever the
# corpus itself contains the word "other", overwrite that word's id with a
# value >= vocab_size and leave the ids already stored in `data` stale.
for special in (" ", "other"): # blank space character, and the catch-all token
    if special not in dictionary:
        special_id = len(dictionary)
        dictionary[special] = special_id
        reverse_dictionary[special_id] = special

vocab_size = len(dictionary) # size of the vocabulary
print("Vocabulary_size: " + str(vocab_size))

# print a few items from all of these
print("DATA               : " + str(data[:10]))
print("COUNT              : " + str(count[:10]))
# list(...) keeps the slicing valid where keys()/values() return views
print("DICTIONARY         : " + str((list(dictionary.keys())[:10], list(dictionary.values())[: 10])))

In [None]:
# Now transform the sentences into formatted sequences for the chatbot
# Each sentence becomes a list of whitespace-separated tokens. A list
# comprehension guarantees a real list, which the len()/index access in the
# following cell needs.
sentences = [sentence.split() for sentence in sentences]

In [None]:
'''
***********************************************************************
WARNING! WARNING! WARNING! 
This cell may take some time to execute on some low end systems
***********************************************************************
'''

# set the input Dim constant here:
inputDim = 35

# loop through the sentences to make their lengths equal to the inputDim:
#   * shorter sentences are padded on the right with the blank token " "
#   * longer (or equal) sentences are cut into consecutive chunks of
#     inputDim tokens, and only the last chunk is padded

fixed_length_input = list()
for sentence in sentences:
    if len(sentence) < inputDim:
        # pad in place (the token list inside `sentences` is extended too,
        # as it was with the original append loop)
        sentence.extend([" "] * (inputDim - len(sentence)))
        fixed_length_input.append(sentence)

    else:
        # The length is greater than or equal to inputDim
        splits = [sentence[start: start + inputDim]
                  for start in range(0, len(sentence), inputDim)]

        # pad the last list with the appropriate blanks
        splits[-1].extend([" "] * (inputDim - len(splits[-1])))

        # extend in place: rebuilding the list with `+` copied every
        # previously collected row for each long sentence
        fixed_length_input.extend(splits)

In [None]:
# length checking: every row must hold exactly inputDim tokens
lengths = [len(row) for row in fixed_length_input]

for row, length in zip(fixed_length_input, lengths):
    if length != inputDim:
        print(row)

# The check has passed since this did not print anything

In [None]:
# allocate an uninitialised int32 array with one row per fixed-length
# sequence and inputDim columns; the values are filled in afterwards
Data = np.empty((len(fixed_length_input), inputDim), dtype=np.int32)
Data.shape

In [None]:
# now fill the Data using the dictionary mapping:
# every token of every fixed-length row is replaced by its integer id
for i, tokens in enumerate(fixed_length_input):
    for j in range(inputDim):
        Data[i, j] = dictionary[tokens[j]]

# print a few values from the Data
print(Data[5: 10, :])

# print a random sentence
# decode row 100 back into words; str.join replaces the builtin reduce,
# which is not available as a builtin on Python 3
" ".join(reverse_dictionary[word_id] for word_id in Data[100])

In [None]:
# Bundle everything that has to be persisted into one dictionary:
# the encoded array plus both directions of the word/id mapping
Processed_Data = {}
Processed_Data["data"] = Data
Processed_Data["mapping"] = dictionary
Processed_Data["rev_mapping"] = reverse_dictionary

In [None]:
# Write the Processed_Data dictionary to disk, unless a pickle file is
# already present at the target location (an existing file is never
# overwritten)
save_path = os.path.join(data_path, "Data_final.pickle")

if os.path.isfile(save_path):
    print("data is already pickled")

else:
    with open(save_path, "wb") as pickle_file:
        pickle.dump(Processed_Data, pickle_file, pickle.HIGHEST_PROTOCOL)
        print("Pickling complete")

In [None]:

# naive checking of the integrity of the pickled data
# The file is read back and only the number of top-level entries is shown;
# the contents are not compared against Processed_Data.

with open(save_path, "rb") as pickle_file:
    my_dict = pickle.load(pickle_file)
    
# number of entries in the loaded object
len(my_dict)

# The data has been processed and pickled, so the next part can be run directly the next time I come back to this notebook.