In [6]:
!ls *.json | pbcopy

In [22]:
import json
from pprint import pprint

def read_in_corpus(target_corpus_filename):
    '''
    alternative options to read-in the json would've been pandas 
    but that would be very unassuming to size of the file.
    '''
    
    corpus_json = json.load(open(target_corpus_filename))
    Issues_json = corpus_json['Issues']
    all_customer_service_lines = []
    
    for issue_dialogue in Issues_json:
        customer_service_lines = [message['Text'] for message in issue_dialogue['Messages'] \
         if not message['IsFromCustomer']]
        
        all_customer_service_lines.extend(customer_service_lines)
    
    return all_customer_service_lines
    
corp = read_in_corpus('sample_conversations.json')
corp

[u'Hello Werner how may I help you today?',
 u'Sure I can help you with that? Could you please provide me with your new address?',
 u'Let me update that information on our system',
 u'OK Wernzio, I have updated your address to the system',
 u'Ok let me go ahead and request a work order for a new installation. Give me a moment...',
 u'OK a installation order has been places. Seems the earilest we will be able to help you  is from the 20th February onwards',
 u'does that suite you?',
 u'good, I have scheduled. A operator will be contacted you one day prior to lock down a time slot.',
 u'Is there anything else I can help you with?',
 u'Great, was my pleasure helping you. Have a great day Wernzio',
 u'Hey Wernzio, What problems are you experiencing?',
 u'Interesting, let me quickly have a look what the status for your neighborhood is.',
 u'Give me a moment',
 u'It seems currently we are experiencing technical problems in near your vicinity.',
 u'We have sent out a technician and expect to 

In [34]:
import re
test= "good, I have scheduled. A operator will be contacted you're one day prior to lock down a time slot."
print '\n'.join(re.findall(r"[\w' ]+", test))
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print '\n'.join(sent_detector.tokenize(test.strip().lower()))

good
 I have scheduled
 A operator will be contacted you're one day prior to lock down a time slot
good, i have scheduled.
a operator will be contacted you're one day prior to lock down a time slot.


In [38]:
abreviations_dict = {
    "'m":' am',
    "'ve":' have',
    "'ll":" will",
    "'d":" would",
    "'s":" is",
    "'re":" are",
    "  ":" ",
    "' s": " is",
}

def multiple_replace(text, adict):
    rx = re.compile('|'.join(map(re.escape, adict)))
    def one_xlat(match):
        return adict[match.group(0)]
    return rx.sub(one_xlat, text)

str_ = "you're bothering me"
multiple_replace("you're bothering me", abreviations_dict)

str_.replace()


<function replace>

In [26]:
import spell_checker
import nltk.data
import re
import json
from concurrent.futures import ThreadPoolExecutor

def abreviation_expander(target_str):
    
    # can add to this list
    abreviations_dict = {
        "'m":' am',
        "'ve":' have',
        "'ll":" will",
        "'d":" would",
        "'s":" is",
        "'re":" are",
        "  ":" ",
        "' s": " is",
    }
    
    for abreviated_form, full_form in abreviations_dict.iteritems():
        target_str.replace(abreviated_form, full_form)
    return target_str

def multithread_map(fn, work_list, num_workers=50):
    '''
    spawns a threadpool and assigns num_workers to some 
    list, array, or any other container. Motivation behind 
    this was for functions that involve scraping.
    '''

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        return list(executor.map(fn, work_list))

class Suggestion_Generator(object):
    '''
    This class builds a retrieval based chatbot 
    and it's based off a frequency of n-grams in 
    a target corpus of text. It uses the reverse of 
    zipf's law to predict sentences by each key 
    stroke.

    Methods
    --------
    train
    predict
    read_in_corpus
        I: file name
        O: list
    print_corpus
        I: 
        O: print to text file or screen
    '''
    
    corpus = []
    
    def train(self, 
              target_corpus_filename, 
              filename_for_storage='n_gram_frequencies_dict.pkl'
             ):
        
        #read
        self.corpus = self.read_in_corpus(target_corpus_filename)[:20]

        #preprocess
        self.corpus = self.preprocess(self.corpus)
        
        #make counter dict
#         self.corpus = self.create_frequency_dict(self.corpus)
    
    def create_frequency_dict(self, corpus):
        pass
    
    def read_in_corpus(self, target_corpus_filename):
        '''
        alternative options to read-in the json would've been pandas 
        but that would be very unassuming to size of the file.
        '''

        corpus_json = json.load(open(target_corpus_filename))
        Issues_json = corpus_json['Issues']
        all_customer_service_lines = []

        for issue_dialogue in Issues_json:
            customer_service_lines = [message['Text'] for message in issue_dialogue['Messages'] \
             if not message['IsFromCustomer']]

            all_customer_service_lines.extend(customer_service_lines)

        return all_customer_service_lines

    def preprocess(self, corpus):
        '''
        This iterates through the corpus line by line tokenizing, 
        spellchecking, normalizing abbreviations, etc.

        I: list of text strings
        O: preprocessed list of text strings

        It's debatable how you want to tokenize, I chose to do it by sentence
        but it could be done with regex like so "[\w' ]+" which would give you 
        smaller phrases.

        to dos:
        -------
        * fix grammar
        * remove first names
        * think about how to deal with punctuation
        * find nltk package to replace abbreviations
        * removing infrequent terms potentially
        * multithread the spellchecker
        '''
        
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        
                                    #lowercase and tokenize by sentence
        corpus_formatted = [tokenized_line for line in corpus \
                            for tokenized_line in sent_detector.tokenize(line.strip().lower())]

        #expanding the abreviations
        corpus_formatted_expanded = [abreviation_expander(line) for line in corpus_formatted]

        #spell check
        #note this function takes quite a bit of time
        corpus_formatted_expanded_correct = [' '.join([spell_checker.correct(word) for word in line.strip().split()]) 
                                             for line in corpus_formatted_expanded]
 
        
        return corpus_formatted_expanded_correct


In [28]:
model = Suggestion_Generator()
model.train('sample_conversations.json')
from collections import Counter




In [30]:
master_lookup = {}
corpus_ctr_dict = Counter(model.corpus)
for line in corpus_ctr_dict.keys():
    checker_ctr = 0
    length_of_line_ctr = 0
    
    for char in customer_service_line_key:
        if checker_ctr < 3:
            if char == ' ':
                checker_ctr += 1
            length_of_line_ctr += 1
    
    for index in range(1, length_of_line_ctr):
        slice_ = customer_service_line_key[:index]
        if customer_service_line_key[:index] not in master_lookup:
            
                

g
r
e
a
t
 
w
a
s
 
m
y
 
p
l
e
a
s
u
r
e
 
h
e
l
p
i
n
g
 
y
o
u
i
 
a
m
 
s
o
r
r
y
 
f
o
r
 
t
h
e
 
i
n
c
o
n
v
e
n
i
e
n
c
e
o
k
 
w
e
r
n
z
i
o
,
 
i
 
h
a
v
e
 
u
p
d
a
t
e
d
 
y
o
u
r
 
a
d
d
r
e
s
s
 
t
o
 
t
h
e
 
s
y
s
t
e
m
h
a
v
e
 
a
 
g
r
e
a
t
 
d
a
y
 
p
e
r
n
i
o
g
i
v
e
 
o
r
 
t
a
k
e
 
a
 
f
e
w
 
m
i
n
u
t
e
s
i
n
t
e
r
e
s
t
i
n
g
 
l
e
t
 
m
e
 
q
u
i
c
k
l
y
 
h
a
v
e
 
a
 
l
o
o
k
 
w
h
a
t
 
t
h
e
 
s
t
a
t
u
s
 
f
o
r
 
y
o
u
r
 
n
e
i
g
h
b
o
r
h
o
o
d
 
i
s
s
e
e
m
s
 
t
h
e
 
e
a
r
l
i
e
s
t
 
w
e
 
w
i
l
l
 
b
e
 
a
b
l
e
 
t
o
 
h
e
l
p
 
y
o
u
 
i
s
 
f
r
o
m
 
t
h
e
 
w
i
t
h
 
f
e
b
r
u
a
r
y
 
o
n
w
a
r
d
s
c
o
u
l
d
 
y
o
u
 
p
l
e
a
s
e
 
s
p
e
c
i
f
y
 
y
o
u
 
a
d
d
r
e
s
s
h
e
y
 
w
e
r
n
z
i
o
,
 
w
h
a
t
 
p
r
o
b
l
e
m
s
 
a
r
e
 
y
o
u
 
e
x
p
e
r
i
e
n
c
i
n
g
h
e
y
 
s
n
o
o
p
y
,
 
i
 
a
m
 
a
b
l
e
 
t
o
 
h
e
l
p
 
y
o
u
o
k
 
a
 
i
n
s
t
a
l
l
a
t
i
o
n
 
o
r
d
e
r
 
h
a
s
 
b
e
e
n
 
p
l
a
c
e
s
i
n
 
t
h
a
t
 
c
a
s
e
 
h
a
v
e
 
a
