
### Purpose
Currently being used to shortcut creating data for training.

*However:* A more rigerous method for cleaning data and make a pipeline where changes can produce better results in training.

Use as reference: https://www.safaribooksonline.com/library/view/hands-on-automated-machine/9781788629898/ccac8d45-a703-42b9-992c-d82eafafe94d.xhtml

* use as example for text utilities: https://github.com/openai/finetune-transformer-lm/blob/master/text_utils.py



### Pipeline considerations:
* 1) Cleaning
    * Stop words
    * newline, tab 
    * Lowercasing
* 2) Normalizing
    * Splitting stemmed words into stem and removed portion
    * Restricting the length of sentences (query and/or response)

In [6]:
import string
import pickle
punctuations = string.punctuation

import spacy
from spacy.lang.en import English
parser = English()

## Spacy pipeline

In [88]:
nlp_pipeline_sentencize = spacy.load('en')
nlp = spacy.load('en')

import spacy
import re
class QueryResponseNormalizer:
    """
    Able to initialize parameters for cleansing and normalizing both query and response.
    """
    
    # Pass optional parameters to 
    def __init__(self,
                remove_stop=False,
                lowercase=True,
                stem = 'stem_split',
                num_sent_words = 100):
        self.remove_stop = remove_stop
        # optionally lowercase all words
        self.lowercase = lowercase
        # options are: None (don't do anything), stem (stem but remove extra) and stem_split (stem and add extra)
        self.stem = stem
        # limits the number of words (not tokens) in a query
        self.num_sent_words = num_sent_words
    
    def newline_replace(self,txt_blob):
        """
        Recursive function deals with multiple newline replacement.
        """
        txt_blob = txt_blob.replace('\n\n','\n')
        if '\n\n' in txt_blob:
            self.newline_replace(txt_blob)
        else:
            txt_blob = txt_blob.replace('\n',' ')
            return txt_blob  
        
    def newline_join(self,doc):
        """
        Function deals with issue of having a hyphenatic word connected to a word after a newline.
        """
        new_doc = []
        for token in doc.split(' '):
            # strip whitespace
            token = token.strip()
            # join words if necessary else simply remove newline.
            if '\n' and '-' in token:
                token = token.replace('\n','').replace('-','')
            else:
                token = token.replace('\n',' ')
            if len(token) > 0:
                new_doc.append(token)

        return ' '.join(new_doc).strip()

    def normalize_text(self,txt_blob):
        txt_blob = self.newline_join(txt_blob)
        txt_blob = self.newline_replace(txt_blob)
        regex = r"[\W_]"
        txt_blob = re.sub(regex, " ", txt_blob, 0)
        txt_blob = ' '.join(txt_blob.split())
        spcy_txt = nlp(txt_blob)
        
        spcy_txt_new = []
        for token in spcy_txt:
            if not(self.remove_stop and token.is_punct):
                spcy_txt_new.append(token.lemma_)
        return spcy_txt_new


In [102]:
test = qa[35][1]
test

'an infection could have spread from the bladder to the kidneys and then the antibiotics may have cured it (or possibly) hidden it, perhaps in the prostate. they are notoriously'

In [103]:
# test text
normalizer = QueryResponseNormalizer()
' '.join(normalizer.normalize_text(test))


'an infection could have spread from the bladder to the kidney and then the antibiotic may have cure -PRON- or possibly hide -PRON- perhaps in the prostate -PRON- be notoriously'

In [133]:
def shorten_sentence(txt,length=30):
    """
    Shortens to 20 words.
    
    Input:
        txt: a string
    Output:
        a string
    """
    clean = ' '.join(str(txt).lower().replace("\n","").split(' ')[:length])
    
    return ' '.join(clean.split())

def only_original_interactions(query_response):
    """
    Removes a query/response pair if the query has been seen before.
    Input:
        query_response: a list of tuples
    Output:
        A list of tuples
    """
    query_response_new = []
    original_question = set()
    for tup in zip(*query_response):
        if tup[0] not in original_question:
            query_response_new.append(tup)
            original_question.update(tup[0])
    print('Originally {} chats. Removed {} chats from input.'.format(len(query_response),
                                                                (len(query_response)-len(query_response_new))))

    return query_response_new

def retrieve_corpora(path_to_pickle):
    data = pickle.load(open(path_to_pickle,'rb'))
    qa = []
    for i in list(data):
        qa.append((i[0]['utterance'],i[1]['utterance']))
    return qa

data = retrieve_corpora('../data/all_responses_equal.p')

In [134]:
d = only_original_interactions(data)


108825 chats total. Removed 108823 chats from input.


In [14]:
qa_new = []
original_question = set()
for line in qa:
    if line[0] not in original_question:
        qa_new.append(line)
        original_question.add(line[0])
print('Num original pairs:',len(qa_new))

Num original pairs: 29722


In [25]:
pickle.dump(qa_new,open('seq2seq_examples/fra-eng/all_responses_equal.p','wb'))

In [24]:
with open('seq2seq_examples/fra-eng/all_responses_equal.txt', 'w') as fp:
    fp.write('\n'.join('%s -----+----- %s' % x for x in qa_new))