## The notebook for building an RNN from the chat data.

In [1]:
# Machine Learning modules for building the module
import keras 

# general modules for preprocessing and other stuff
import os # for os related calls
from subprocess import check_output

# for visualization stuff
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# for pickling the data
from six.moves import cPickle as pickle

%matplotlib inline

Using TensorFlow backend.


In [2]:
def exec_command(cmd):
    '''
        function to execute a shell command and see it's 
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']
    '''
    print check_output(cmd).decode("utf8")

In [3]:
''' define the important paths for the Task '''
data_path = "../Data/cornell_movie_dialogs_corpus"
train_file = os.path.join(data_path, "movie_lines.txt")

''' define a few important constant --variables. '''
field_separator = "+++$+++"

# Start with some preporcessing of the input data

In [28]:
# load the file data into a list
data = []
with open(train_file, "r") as data_file:
    for line in data_file:
        data.append(line.strip())
        
# print a few data elements:
data[:3]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.']

In [29]:
# strip non-required information:
data = map(lambda x: list(x.split(field_separator))[3:], data)

# print some data fields:
data[: 3]

[[' BIANCA ', ' They do not!'],
 [' CAMERON ', ' They do to!'],
 [' BIANCA ', ' I hope so.']]

In [30]:
# now write a small loop to bring together adjacent dialogues into a single sentence:
i = 0
processed = []
while(i < len(data)):
    processed.append(data[i])
    current = data[i][0] 
    
    j = i + 1
    while(j < len(data) and data[j][0] == current):
        if(len(data[j]) == 2):
            processed[-1][1] += data[j][1]
        j += 1
    
    i = j

In [31]:
processed[:2]

[[' BIANCA ', ' They do not!'], [' CAMERON ', ' They do to!']]

In [32]:
for i in range(len(processed)):
    if(len(processed[i]) != 2):
        print processed[i]

### So, there are around 2.7 lakhs of spoken sentences in this conversational dataset.

# Generate a BAG OF WORDS for the vocabulary of the dataset

In [33]:
# convert the list into a numpy array:
processed = np.array(processed, dtype=np.str)
processed.shape

(269955, 2)

In [34]:
print type(processed) # check if it has been converted into a numpy array
processed[3: 10] # print a few entries in the data

<type 'numpy.ndarray'>


array([[' CAMERON ', ' She okay?'],
       [' BIANCA ', " Let's go."],
       [' CAMERON ', ' Wow'],
       [' BIANCA ', " Okay -- you're gonna need to learn how to lie."],
       [' CAMERON ', ' No'],
       [' BIANCA ',
        ' I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit? Like my fear of wearing pastels?'],
       [' CAMERON ', ' The "real you".']],
      dtype='|S3089')

In [35]:
# no need to keep the characters who spoke the sentences around anymore
sentences = processed[:, 1]
sentences.shape
sentences[45: 50]

array([" Right.  See?  You're ready for the quiz.",
       " C'esc ma tete. This is my head", ' Let me see what I can do.',
       ' Gosh, if only we could find Kat a boyfriend...',
       " That's a shame."],
      dtype='|S3089')

In [42]:
# function to build a formatted dataset from the list of words 
# so that the context of the words are taken into consideration

def build_dataset(words):
    import collections
    
    """ build a dataset from the list of words 
    
        input: list of words
        output: the formatted data, count, dictionary (map), reverse_dictionary
        
    """
    
    vocabulary_size = 100000  # size of the dictionary to be formed
    
    # form the dictionary
    count = [] 
    count.extend(collections.Counter(words).most_common(vocabulary_size))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    
    # add some details here:
    dictionary[" "] = size # blank space character.
    dictionary["<other>"] = size + 1
    
        
    # create a list from the dictionary
    data = list()
    for word in words:
        try:
            index = dictionary[word]
        except KeyError:
            index = dictionary['<other>']
        data.append(index)
  
    # a reverse dictionary for mapping the words to their unique id
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    
    return data, count, dictionary, reverse_dictionary

In [43]:
# build a vocabulary of words. Use the above helper function for it.
'''
***********************************************************************
WARNING! WARNING! WARNING! 
This cell may take some time to execute on some low end systems
***********************************************************************
'''

# map and reduce is taking too long. So, I am trying to write a loop for it.
words = list() # empty list
for i in range(len(data)):
    words += map(lambda x: x.lower(), data[i][1].split())
    
print len(words) # print the length of the words All the possible words
# words = list(set(words)) # remove all the duplicate words from the set
# print len(words)
words[100: 110] # print some arbitrary words from this list

3605023


['babble.',
 "i'm",
 'like,',
 'boring',
 'myself.',
 'what',
 'crap?',
 'do',
 'you',
 'listen']

In [46]:
# Now, use the words to build the dataset.
print len(words)
data , count, dictionary, reverse_dictionary = build_dataset(words)

# add the special characters to the list manually:
size = len(dictionary)

vocab_size = len(dictionary) # size of the vocabulary
print "Vocabulary_size: " + str(vocab_size)

# print a few items from all of these
print "DATA               : " + str(data[:10])
print "COUNT              : " + str(count[:10])
print "DICTIONARY         : " + str((dictionary.keys()[:10], dictionary.values()[: 10]))

3605023
Vocabulary_size: 100002
DATA               : [40, 23, 2106, 40, 23, 3818, 1, 376, 387, 54]
COUNT              : [('you', 118207), ('i', 111794), ('the', 110671), ('to', 88125), ('a', 79057), ('and', 50048), ('of', 43704), ('in', 36122), ('it', 32991), ('that', 30057)]
DICTIONARY         : (['fawn', 'considered,', 'petra,', 'considered.', 'body-guard.', 'simple-icity', 'this--"look', '$750.00', 'petra?', 'considered?'], [73158, 47074, 36554, 73159, 73160, 90655, 73161, 84699, 73164, 73165])


In [52]:
reverse_dictionary[100002]

' '

In [53]:
# Now transform the sentences into formatted sequences for the chatbot
sentences = map(lambda x: x.lower().split(), sentences)

In [54]:
'''
***********************************************************************
WARNING! WARNING! WARNING! 
This cell may take some time to execute on some low end systems
***********************************************************************
'''

# set the input Dim constant here:
inputDim = 35

# loop through the sentences to make their lengths equal to the inputDim

fixed_length_input = list()
for i in range(len(sentences)):
    sentence = sentences[i] # extract the sentence
    
    if(len(sentence) < inputDim):
        while(len(sentence) != inputDim):
            sentence.append(" ") # append the blank character
        fixed_length_input.append(sentence)
        
    else:
        # The length is greater than or equal to 35
        splits = list()
        for j in range(len(sentence)):
            if(j % inputDim == 0):
                splits.append(list()) # append an empty list
                
            splits[-1].append(sentence[j])

        # pad the last list with the appropriate blanks
        while(len(splits[-1]) != inputDim):
            splits[-1].append(" ")

        # concat the sentences and the splits
        fixed_length_input = fixed_length_input + splits

In [55]:
# length checking:
lengths = map(lambda x: len(x), fixed_length_input)

for i in range(len(lengths)):
    if len(fixed_length_input[i]) != inputDim:
        print fixed_length_input[i]
        
# The check has passed since this didnot print anything

In [80]:
# create an empty ndArray
Data = np.ndarray((len(fixed_length_input), inputDim), dtype = np.int32)
print Data.shape

(286730, 35)


In [81]:
# now fill the Data using the dictionary mapping:
for i in range(len(fixed_length_input)):
    for j in range(inputDim):
        try: 
            Data[i, j] = dictionary[fixed_length_input[i][j]]
        except KeyError:
            Data[i, j] = dictionary["<other>"]
        
# print a few values from the Data
# print Data[5: 10, :]

# print a random sentence 
reduce(lambda x, y: x + " " + y, map(lambda x: reverse_dictionary[x], Data[100]))

"i'm not stupid enough to repeat your mistakes.                                                      "

In [89]:
# print a random sentence 
for i in range(10):
    print "Random Sentence: " + str(i + 1) + ": " + reduce(
        lambda x, y: x + " " + y, map(lambda x: reverse_dictionary[x], Data[np.random.randint(len(Data))])) + "\n"

Random Sentence: 1: no, they wouldn't -- they'd gain one! and i guarantee that they'll graduate with highest <other>                                      

Random Sentence: 2: what's that?                                                                  

Random Sentence: 3: hey -- so i've noticed.                                                            

Random Sentence: 4: we had time to get to know each other.                                                    

Random Sentence: 5: the same reason everyone does. you hear your name on <other> and you realize you're a skeleton in someone's closet and they're coming to bury you.                  

Random Sentence: 6: eighteen.                                                                    

Random Sentence: 7: you can't be serious.                                                              

Random Sentence: 8: remember the time we broke into the d.a.'s office, and copied <other> <other> diary?                                 

In [90]:
# Create a dictionary of the saved Data and pickle this processed Data
Processed_Data = {
    "data": Data,
    "mapping": dictionary,
    "rev_mapping": reverse_dictionary
}

In [91]:
# Now pickle the Processed_Data Dictionary
save_path = os.path.join(data_path, "Data_final.pickle")

if(not os.path.isfile(save_path)):
    with open(save_path, "wb") as pickle_file:
        pickle.dump(Processed_Data, pickle_file, pickle.HIGHEST_PROTOCOL)
        print("Pickling complete")
        
else:
    print("data is already pickled")

data is already pickled


In [92]:

# naive checking of the integrity of the pickled data

with open(save_path, "rb") as pickle_file:
    my_dict = pickle.load(pickle_file)
    
len(my_dict)

3

# The data has been processed and pickled so that the Next part can be directly run the next time I come here.