In [6]:
!ls *.json | pbcopy

In [22]:
import json
from pprint import pprint

def read_in_corpus(target_corpus_filename):
    '''
    alternative options to read-in the json would've been pandas 
    but that would be very unassuming to size of the file.
    '''
    
    corpus_json = json.load(open(target_corpus_filename))
    Issues_json = corpus_json['Issues']
    all_customer_service_lines = []
    
    for issue_dialogue in Issues_json:
        customer_service_lines = [message['Text'] for message in issue_dialogue['Messages'] \
         if not message['IsFromCustomer']]
        
        all_customer_service_lines.extend(customer_service_lines)
    
    return all_customer_service_lines
    
corp = read_in_corpus('sample_conversations.json')
corp

[u'Hello Werner how may I help you today?',
 u'Sure I can help you with that? Could you please provide me with your new address?',
 u'Let me update that information on our system',
 u'OK Wernzio, I have updated your address to the system',
 u'Ok let me go ahead and request a work order for a new installation. Give me a moment...',
 u'OK a installation order has been places. Seems the earilest we will be able to help you  is from the 20th February onwards',
 u'does that suite you?',
 u'good, I have scheduled. A operator will be contacted you one day prior to lock down a time slot.',
 u'Is there anything else I can help you with?',
 u'Great, was my pleasure helping you. Have a great day Wernzio',
 u'Hey Wernzio, What problems are you experiencing?',
 u'Interesting, let me quickly have a look what the status for your neighborhood is.',
 u'Give me a moment',
 u'It seems currently we are experiencing technical problems in near your vicinity.',
 u'We have sent out a technician and expect to 

In [34]:
import re
test= "good, I have scheduled. A operator will be contacted you're one day prior to lock down a time slot."
print '\n'.join(re.findall(r"[\w' ]+", test))
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print '\n'.join(sent_detector.tokenize(test.strip().lower()))

good
 I have scheduled
 A operator will be contacted you're one day prior to lock down a time slot
good, i have scheduled.
a operator will be contacted you're one day prior to lock down a time slot.


In [38]:
abreviations_dict = {
    "'m":' am',
    "'ve":' have',
    "'ll":" will",
    "'d":" would",
    "'s":" is",
    "'re":" are",
    "  ":" ",
    "' s": " is",
}

def multiple_replace(text, adict):
    rx = re.compile('|'.join(map(re.escape, adict)))
    def one_xlat(match):
        return adict[match.group(0)]
    return rx.sub(one_xlat, text)

str_ = "you're bothering me"
multiple_replace("you're bothering me", abreviations_dict)

str_.replace()


<function replace>

In [187]:
import spell_checker
import nltk.data
import re
import json
from concurrent.futures import ThreadPoolExecutor

def create_key_stroke_to_cust_line_table(customer_service_line_key, 
                                         number_of_chars_in_ngram, 
                                         corpus_ctr_dict, master_lookup={}):
    #make a new dictionary per key stroke of n gram sequence
    '''
    what this does is it walks through the n gram string character-by-character and creates a dictionary 
    of all the possible lines this sequence of keystrokes could be leading to
    e.g.

        g {u'great was my pleasure helping you': 1}
        gr {u'great was my pleasure helping you': 1}
        ...
        great was my {u'great was my pleasure helping you': 1}
    '''
    
    
    #change to key_stroke_sequence
    for index in range(1, number_of_chars_in_ngram + 1):
        key_stroke_gram = customer_service_line_key[:index]
        count_of_this_particular_line = corpus_ctr_dict[customer_service_line_key]
        
        if key_stroke_gram not in master_lookup:
            
            master_lookup[key_stroke_gram] = {}
            master_lookup[key_stroke_gram][customer_service_line_key] = count_of_this_particular_line 
        
        # if the customer service line not in the master_lookup
        elif customer_service_line_key not in master_lookup[key_stroke_gram]:
            master_lookup[key_stroke_gram][customer_service_line_key] = count_of_this_particular_line
    
    return master_lookup

def find_num_chars_in_n_gram(line, number_of_grams):
    '''
    calculates the number of characters of a 
    the beginning ngram in a string
    
    I: string, ngram size (int)
    O: length (int)
    '''
    return len(' '.join(line.split(' ')[:number_of_grams])) 

def format_suggestions_properly(list_of):
    '''
    I:list of string
    O:properly formatted list of strings
    
    to dos
    ------
    * grammar corrections for punctuations
    '''
    output = []
    for str_ in list_of:
        str_ = str_.capitalize()
        str_ = str_.replace(' i ', ' I ')
        
        #need to think about how to add grammar
        str_
        output.append(str_)
    return output

def retrieve_suggestions(key_strokes, look_up_table, top_x_lines):
    try:
        sub_dict_of_suggestions = look_up_table[key_strokes]

        #grabs the top X number of lines sorted by count (most popular)
        suggestions = sorted(sub_dict_of_suggestions.items(), key=lambda x:x[1])[::-1][:top_x_lines]

        return [tuple_[0] for tuple_ in suggestions]
    except KeyError:
        return None

def abreviation_expander(target_str):
    
    # can add to this list
    abreviations_dict = {
        "'m":' am',
        "'ve":' have',
        "'ll":" will",
        "'d":" would",
        "'s":" is",
        "' s": " is",
        "'re":" are",
        "  ":" ",
    }
    
    for abreviated_form, full_form in abreviations_dict.iteritems():
        target_str.replace(abreviated_form, full_form)
    return target_str

def multithread_map(fn, work_list, num_workers=50):
    '''
    spawns a threadpool and assigns num_workers to some 
    list, array, or any other container. Motivation behind 
    this was for functions that involve scraping.
    '''

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        return list(executor.map(fn, work_list))

class Suggestion_Generator(object):
    '''
    This class builds a retrieval based chatbot 
    and it's based off a frequency of n-grams in 
    a target corpus of text. It uses the reverse of 
    zipf's law to predict sentences by each key 
    stroke.

    Methods
    --------
    train
    predict
    read_in_corpus
        I: file name
        O: list
    print_corpus
        I: 
        O: print to text file or screen
    '''
    
    corpus = []
    
    def train(self, 
              target_corpus_filename, 
              filename_for_storage='n_gram_frequencies_dict.pkl'
             ):
        
        #read
        self.corpus = self.read_in_corpus(target_corpus_filename)

        #preprocess
        self.corpus = self.preprocess(self.corpus)
        
        #make counter dict
        self.key_stroke_lookup_table = self.create_frequency_dict()
    
    def create_frequency_dict(self, filename='key_stroke_lookup_table.pkl'):
        key_stroke_lookup_table = {}

        #change: line_frequency_table
        corpus_ctr_dict = Counter(self.corpus)

        for customer_service_line_key in corpus_ctr_dict.keys():

            number_of_chars_in_ngram = find_num_chars_in_n_gram(customer_service_line_key, 3)
            key_stroke_lookup_table = create_key_stroke_to_cust_line_table(customer_service_line_key, number_of_chars_in_ngram, corpus_ctr_dict, key_stroke_lookup_table)

        #saving lookup table
        cPickle.dump(key_stroke_lookup_table, open(filename,'wb'))

        return key_stroke_lookup_table

    
    def read_in_corpus(self, target_corpus_filename):
        '''
        alternative options to read-in the json would've been pandas 
        but that would be very unassuming to size of the file.
        '''

        corpus_json = json.load(open(target_corpus_filename))
        Issues_json = corpus_json['Issues']
        all_customer_service_lines = []

        for issue_dialogue in Issues_json:
            customer_service_lines = [message['Text'] for message in issue_dialogue['Messages'] \
             if not message['IsFromCustomer']]

            all_customer_service_lines.extend(customer_service_lines)

        return all_customer_service_lines

    def find_suggestions(self, key_stroke_sequence_str, top_x_lines=5):
        '''
        I: key stroke sequence e.g 'what th' (string), max number of suggestions (int) 
        O: suggestions that attempt to accurately complete the key stroke sequence (list of strings)
        '''

        print key_stroke_sequence_str.lower()
        look_this_up = key_stroke_sequence_str.lower()
        number_of_words = len(key_stroke_sequence_str.split())

        if number_of_words <= 2:

            #pull up the most frequently occuring line
            most_frequent_lines = retrieve_suggestions(look_this_up, key_stroke_lookup_table, top_x_lines)

        else:   

            #truncate the target sequence of key strokes to the first tri-gram
            #perhaps we could have something similar in our lookup table
            len_of_key_strokes = find_num_chars_in_n_gram(look_this_up, 3)
            truncated_key_strokes = look_this_up[:len_of_key_strokes - 1]
            most_frequent_lines = retrieve_suggestions(truncated_key_strokes, key_stroke_lookup_table, top_x_lines)

        return most_frequent_lines
    
    def preprocess(self, corpus):
        '''
        This iterates through the corpus line by line tokenizing, 
        spellchecking, normalizing abbreviations, etc.

        I: list of text strings
        O: preprocessed list of text strings

        It's debatable how you want to tokenize, I chose to do it by sentence
        but it could be done with regex like so "[\w' ]+" which would give you 
        smaller phrases.

        to dos:
        -------
        * fix grammar
        * remove first names
        * think about how to deal with punctuation
        * find nltk package to replace abbreviations
        * removing infrequent terms potentially
        * multithread the spellchecker
        * iterate through with a string fuzzy matching algo to dedupe typos
        * unit tests
        '''
        
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        
                                    #lowercase and tokenize by sentence
        corpus_formatted = [tokenized_line for line in corpus \
                            for tokenized_line in sent_detector.tokenize(line.strip().lower())]

        #expanding the abreviations
        corpus_formatted_expanded = [abreviation_expander(line) for line in corpus_formatted]

        #spell check
        #note this function takes quite a bit of time
        corpus_formatted_expanded_correct = [' '.join([spell_checker.correct(word) for word in line.strip().split()]) 
                                             for line in corpus_formatted_expanded]
 
        
        return corpus_formatted_expanded_correct
    
    def load_from_pickle(self, filename='key_stroke_lookup_table.pkl'):
        print 'loading: ', filename, '...',
        self.key_stroke_lookup_table = cPickle.load(open(filename,'rb'))
        print 'loaded.'

In [188]:
model = Suggestion_Generator()
model.load_from_pickle()
model.find_suggestions('what')

loading:  key_stroke_lookup_table.pkl ... loaded.
what


[u'what your account number',
 u'what your order number',
 u'what is your account number',
 u'what your new address',
 u'what the order number']

In [106]:
from collections import Counter
model = Suggestion_Generator()
model.train('sample_conversations.json')
model.create_frequency_dict()

{u'or black': {u'or black shelf': 1},
 u'jose is th': {u'jose is there anything else i can assist you with today': 1},
 u'expect the': {u'expect the service on your cell phone to resume in the next of minutes or so': 1},
 u'expect tha': {u'expect that shelf and the book later this weeks': 1},
 u'sending ': {u'sending a check would take a long time to clear': 1},
 u'not offer re': {u'not offer refund on online orders': 1},
 u'i could pro': {u'i could provide you with a signal bolster for your house': 1},
 u'hey same genera': {u'hey same generally we have a no refund policy': 1},
 u'ok al': {u'ok all transaction over the past two weeks have been refused to your account': 1},
 u'sweet happy': {u'sweet happy we located your account': 1},
 u'ok an': {u'ok and find the input button on your remote and make sure it is set to him please': 1,
  u'ok and the new address you need service to update our accounts': 1,
  u'ok and what airport are you at now': 1,
  u'ok and what is the new address you 

In [172]:
def retrieve_suggestions(key_strokes, look_up_table, top_x_lines):
    try:
        sub_dict_of_suggestions = look_up_table[key_strokes]

        #grabs the top X number of lines sorted by count (most popular)
        suggestions = sorted(sub_dict_of_suggestions.items(), key=lambda x:x[1])[::-1][:top_x_lines]

        return [tuple_[0] for tuple_ in suggestions]
    except KeyError:
        return None

retrieve_suggestions('bitches', key_stroke_lookup_table, 5)

In [169]:
[item for item in model.corpus if '?' in item]

[u'i see you purchase two sets of ticket to watch goldfish?',
 u'hello werner?',
 u'jeramey?',
 u'could you please look on the delivery box if this was not maybe a mistake made by fedex?',
 u'what time suits you for the pickup?',
 u'could you either describe the table which arrived or read me the barcode?',
 u'what time can i schedule for the pickup?',
 u'ok i will schedule the picked to occur friday at 3:30pm?',
 u'what i can do is make a re-bill?',
 u'what i can do is try and order a re-bill?',
 u'before i try that would you like me to try and make a re-bill?',
 u'will someone be home between 9-12am?',
 u'what i can do is try and authorise a re-bill?',
 u'ok great can i schedule it for 14:00?',
 u'sure i can authorized a re-bill?',
 u'what is our username?',
 u'what is your username?',
 u'regarding your question what is an aldis?',
 u'is it the oak a shelf bookshelf?',
 u'is it from the oak a shelf bookshelf?',
 u'could you please specify the telephone number on the invoice?',
 u'wha

In [167]:
def format_suggestions_properly(list_of):
    '''
    I:list of string
    O:properly formatted list of strings
    
    to dos
    ------
    * grammar corrections for punctuations
    '''
    output = []
    for str_ in list_of:
        str_ = str_.capitalize()
        str_ = str_.replace(' i ', ' I ')
        
        #need to think about how to add grammar
        str_
        output.append(str_)
    return output

suggest = retrieve_suggestions('where', key_stroke_lookup_table, 5)

format_suggestions_properly(suggest)

[u'Where was the destination',
 u'Where you not notified of this',
 u'Where these tickets purchased under your name']

In [171]:
key_stroke_lookup_table = model.key_stroke_lookup_table

def find_suggestions(key_stroke_sequence_str, top_x_lines=5):
    '''
    I: key stroke sequence e.g 'what th' (string), max number of suggestions (int) 
    O: suggestions that attempt to accurately complete the key stroke sequence (list of strings)
    '''
    
    look_this_up = key_stroke_sequence_str.lower()
    number_of_words = len(key_stroke_sequence_str.split())
    
    if number_of_words <= 2:
        
        #pull up the most frequently occuring line
        most_frequent_lines = retrieve_suggestions(look_this_up, key_stroke_lookup_table, top_x_lines)
    
    else:   
        
        #truncate the target sequence of key strokes to the first tri-gram
        #perhaps we could have something similar in our lookup table
        len_of_key_strokes = find_num_chars_in_n_gram(look_this_up, 3)
        truncated_key_strokes = look_this_up[:len_of_key_strokes - 1]
        most_frequent_lines = retrieve_suggestions(truncated_key_strokes, key_stroke_lookup_table, top_x_lines)
            
    return most_frequent_lines

find_suggestions('this')        

[u'this matches the address on file',
 u'this address matches the one in our system',
 u'this is true from acme',
 u'this is jaw from acme',
 u'this normally takes up to of hours']

In [54]:
def find_num_chars_in_n_gram(line, number_of_grams):
    '''
    calculates the number of characters of a 
    the beginning ngram in a string
    
    I: string, ngram size (int)
    O: length (int)
    '''
    return len(' '.join(line.split(' ')[:number_of_grams])) 

num_chars_in_n_gram('great was my pleasure helping you', 3)

12

In [137]:
def create_key_stroke_to_cust_line_table(customer_service_line_key, 
                                         number_of_chars_in_ngram, 
                                         corpus_ctr_dict, master_lookup={}):
    #make a new dictionary per key stroke of n gram sequence
    '''
    what this does is it walks through the n gram string character-by-character and creates a dictionary 
    of all the possible lines this sequence of keystrokes could be leading to
    e.g.

        g {u'great was my pleasure helping you': 1}
        gr {u'great was my pleasure helping you': 1}
        ...
        great was my {u'great was my pleasure helping you': 1}
    '''
    
    
    #change to key_stroke_sequence
    for index in range(1, number_of_chars_in_ngram + 1):
        key_stroke_gram = customer_service_line_key[:index]
        count_of_this_particular_line = corpus_ctr_dict[customer_service_line_key]
        
        if key_stroke_gram not in master_lookup:
            
            master_lookup[key_stroke_gram] = {}
            master_lookup[key_stroke_gram][customer_service_line_key] = count_of_this_particular_line 
        
        # if the customer service line not in the master_lookup
        elif customer_service_line_key not in master_lookup[key_stroke_gram]:
            master_lookup[key_stroke_gram][customer_service_line_key] = count_of_this_particular_line
    
    return master_lookup