In [340]:
import os
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(data_dir=data_dir)

In [25]:
import pandas as pd
import numpy as np
import re
import sys
import cPickle

#Helper functions

In [30]:
def scrub_description(desc):
    """Puts final touches on cleaning the document to aid vectorization"""
    
    # This comes down to a series of regular expression cleanups.
    
    def reduce_mult_ent(desc):
        def sub_fun(ent_pattern):
            if ent_pattern.groups()[0] == ent_pattern.groups()[1]:
                return ent_pattern.groups()[0]
            else:
                return ent_pattern.group()
        desc = re.sub('\(?(ent\d\d\d)\)?s*\(?(ent\d\d\d)\)?',sub_fun,desc)
        desc = re.sub('\(?(ent\d\d\d)\)?\s*\(?(ent\d\d\d)\)?',sub_fun,desc)
        desc = re.sub('\(?(bEnt\d\d\d)\)?\s*\(?(bEnt\d\d\d)\)?',sub_fun,desc)
        desc = re.sub('\(?(bEnt\d\d\d)\)?\s*\(?(bEnt\d\d\d)\)?',sub_fun,desc)
        return desc
    
    # Missing spaces after punctuation:
    def fix_punc_after(text,punc):
        search_pattern = '(?<=\S)' + re.escape(punc) + '(?=[^\'\s])'
        replace_with = punc + ' '
        return re.sub( search_pattern, replace_with, text)
    
    # Missing spaces before punctuation:
    def fix_punc_before(text,punc):
        search_pattern = '(?<=[^\'\s])' + re.escape(punc) + '(?=(\w|\d|\())'
        replace_with = ' ' + punc
        return re.sub( search_pattern, replace_with, text)
    
    # Missing spaces before and after punctuation:
    def fix_punc_both(text,punc):
        search_pattern = '(?<=(\w|\d|\)|\())' + re.escape(punc) + '(?=(\w|\d|\)|\())'
        replace_with = ' ' + punc + ' '
        return re.sub( search_pattern, replace_with, text)
    
    # Fix lonely punctuation
    def lonely_punc_after(text,punc):
        search_pattern = '\s*' + re.escape(punc)
        replace_with = punc
        return re.sub( search_pattern, replace_with, text)
    def lonely_punc_before(text,punc):
        search_pattern = re.escape(punc) + '\s*'
        replace_with = punc
        return re.sub( search_pattern, replace_with, text)
    def lonely_punc_both(text,punc):
        search_pattern = '\s*' + re.escape(punc) + '\s*'
        replace_with = punc
        return re.sub( search_pattern, replace_with, text)
    
    # Punctuation to quotation without spacation
    def punc_to_quote(text,punc):
        search_pattern = re.escape(punc+"'") + '(?=\S)'
        replace_with = punc + "' "
        return re.sub( search_pattern, replace_with, text)
    
    
    desc = reduce_mult_ent(desc)
    
    desc = lonely_punc_after(desc,'.')
    desc = lonely_punc_after(desc,',')
    desc = lonely_punc_after(desc,'!')
    desc = lonely_punc_after(desc,'?')
    desc = lonely_punc_after(desc,')')
    
    desc = lonely_punc_before(desc,'(')
    
    desc = punc_to_quote(desc,'.')
    desc = punc_to_quote(desc,',')
    desc = punc_to_quote(desc,'!')
    desc = punc_to_quote(desc,'?')
    desc = punc_to_quote(desc,')')
    
    desc = fix_punc_after(desc,'.')
    desc = fix_punc_after(desc,',')
    desc = fix_punc_after(desc,'!')
    desc = fix_punc_after(desc,'?')
    desc = fix_punc_after(desc,')')
    
    desc = fix_punc_before(desc,'(')
    
    desc = fix_punc_both(desc,'/')
    desc = fix_punc_both(desc,'+')
    desc = fix_punc_both(desc,'=')
    desc = fix_punc_both(desc,'&')
    
    
    desc = re.sub(re.escape('+'),'plus',desc)
    desc = re.sub(re.escape('='),'equals',desc)
    desc = re.sub(re.escape('&'),'and',desc)
    desc = re.sub(re.escape('%'),'percent',desc)
    desc = re.sub(re.escape('?'),'.',desc)
    desc = re.sub(re.escape(':'),',',desc)
    desc = re.sub('\.+','.', desc)
    
    desc = re.sub(re.escape('\.'),'.',desc)
    desc = re.sub(re.escape('\,'),',',desc)
    
    desc = re.sub(re.escape('Dr. '),'Doctor ',desc)
    # spacy doesn't know these. Just remove them.
    desc = re.sub(re.escape('Mr.'),'',desc)
    desc = re.sub(re.escape('Mrs.'),'',desc)
    
    
    return desc

In [31]:
# I want training examples to be accessible through a query list and document dictionary
# Each document has a key to its own sub-dictionary
#   Each sub-dictionary has the vectorized document, and a tag for each point in the sequence
#   Tags say what to do at load time
# The query list is a list of dictionaries where each element is a query
#   Each dictionary has the vectorized query, tags for each point, the name of the reference document, and the answer
# There must also be access to the vectors that are decided on by not spacy

def vectorize_and_tag(text):
    # Easy part: final scrub, tokenize, vectorize
    clean_text = scrub_description(text)
    tokenized_text = nlp(text)
    vec_text = np.array( [x.vector for x in tokenized_text] )
    
    # Tags should correspond to one of the following:
    #   Word spacy knows        -- no further action required
    #   Word spacy doesn't know -- supply appropriate vector
    #   Entity #X               -- give permuted good entity
    #   Bad entity #X           -- give permuted bad entity
    #   Query start tag         -- supply appropriate vector
    #   Query blank tag         -- supply appropriate vector
    tags = []
    for t in tokenized_text:
        
        if t.vector_norm != 0.0:
            # This is something spacy knew.
            tags += [None]
            
        elif re.match('ent\d\d\d',str(t)):
            # This is a good entity. Supply that tag, and its ent#
            entNum = int(str(t)[3:6])
            tags += [(1,entNum)]
            
        elif re.match('bEnt\d\d\d',str(t)):
            # This is a bad entity. Supply that tag, and its bEnt#
            bEntNum = int(str(t)[4:7])
            tags += [(2,bEntNum)]
            
        elif re.match('QUERYSTART',str(t)):
            tags += [(3,None)]
            
        elif re.match('XXXXXX',str(t)):
            # Blank tag
            tags += [(4,None)]
            
        else:
            # Must be just some unknown word
            tags += [(0,None)]
    
    return clean_text, vec_text, tags

In [32]:
def name_like_vecs(n,return_real_vecs=False):
    """Returns random vectors that inhabit the vector space of real names"""
    
    # Use these names to estimate a distribution describing where in vector space names live
    string_of_names =\
        u'Alex Alice Brian Betty Carl Cindy David Dorothy Emilio Elizabeth France Francine Greg Gina' +\
        u'Harold Helen Ian Isabella Joshua Jane Kaleb Kimberly Leonard Lisa Mark Mindy Noah Nancy' +\
        u'Owen Olivia Paul Patsy Quinn Ronald Regina Steven Sally Teddy Tina Victor Vivian'
    
    # Process these names into their learned vectors
    ndoc = nlp(string_of_names)
    name_vecs = np.array([x.vector for x in ndoc if x.vector_norm != 0.0])
    
    # Generate names from the same empirical distribution
    fake_vecs = np.random.multivariate_normal(
        np.mean(name_vecs,axis=0),
        np.cov(name_vecs.T)+.000001*np.eye(300), # (lambda*I provides full rank)
        size=(n))
    # Normalize them
    fake_vecs = (fake_vecs.T / np.sqrt( np.sum( fake_vecs**2, axis=1 ) )).T
    if return_real_vecs:
        return fake_vecs, name_vecs
    else:
        return fake_vecs

#Main function:

In [871]:
from IPython.html.widgets import FloatProgress
from IPython.display import display
def build_trainables(DF,num_ent_vecs,num_bEnt_vecs,vec_store_path):
    doc_dic = {}
    query_list = []
    
    doc_counter = 0
    query_counter = 0
    
    # Generate vector lookups for good entities
    ent_vecs = name_like_vecs(num_ent_vecs+1) # Add one because ent "0" is the blank
    # Ensure that the closest 2 names are not too close
    while np.max(( np.dot(ent_vecs,ent_vecs.T) - np.eye(num_ent_vecs+1) ).flatten()) > .9:
        ent_vecs = name_like_vecs(num_ent_vecs+1)
    
    # Do the same thing for bad entity lookups
    bEnt_vecs = name_like_vecs(num_bEnt_vecs)
    # Ensure that the closest 2 names are not too close
    while np.max(( np.dot(bEnt_vecs,bEnt_vecs.T) - np.eye(num_bEnt_vecs) ).flatten()) > .9:
        bEnt_vecs = name_like_vecs(num+bEnt_vecs)
    
    # Step through each book and build the trainables
    f = FloatProgress(min=0, max=DF.index[-1])
    display(f)
    for book in DF.index:
        book_name = DF['title'][book]
        
        # Try each summary source
        for S in [0,1]:
            
            if S==0: # Try source "ii"
                if DF['anon_sum_ii'][book]:
                    found = True
                    # This summary exists. Create stuff for it.
                    summ_name = re.sub('\s','_',book_name) + '_ii'
                    summ = DF['anon_sum_ii'][book]
                    SQ = DF['solo_queries_ii'][book]
                    MQ = DF['mult_queries_ii'][book]
                else:
                    found = False
            
            else: # Try source "iiii"
                if DF['anon_sum_iiii'][book]:
                    found = True
                    # This summary exists. Create stuff for it.
                    summ_name = re.sub('\s','_',book_name) + '_iiii'
                    summ = DF['anon_sum_iiii'][book]
                    SQ = DF['solo_queries_iiii'][book]
                    MQ = DF['mult_queries_iiii'][book]
                else:
                    found = False
            
            if found: # Data got.
                # Tag whether the query is a solo or multiple
                s_tag = ['s' for x in range(len(SQ[0]))]
                m_tag = ['m' for x in range(len(MQ[0]))]
                
                # Merge the solo/mult tags, the queries, and the answers into their own lists
                all_T = s_tag + m_tag
                all_Q = SQ[0] + MQ[0]
                all_A = SQ[1] + MQ[1]

                # Vectors and tags for the summary
                doc_dic[summ_name] = {}
                doc_dic[summ_name]['text'], vecs, doc_dic[summ_name]['tags'] =\
                    vectorize_and_tag( summ )
                
                # NOTE: vecs do not go right into the dictionary.
                # They get written to the hard drive and their address is stored as loc in the dictionary
                doc_dic[summ_name]['loc'] = 'doc_%07d'%doc_counter + '.pkl'
                fo = open(doc_dic[summ_name]['loc'],'wb')
                cPickle.dump(vecs, fo)
                fo.close()
                doc_counter += 1
                
                # To leave in the option of pre-loading, have a key for vecs
                doc_dic[summ_name]['vecs'] = None


                # Step through each query and add it to the list
                for t,q,a in zip(all_T,all_Q,all_A):
                    # Create a dictionary for this query
                    q_dic = {}
                    q_dic['sm'] = t 
                    q_dic['a'] = a
                    q_dic['doc'] = summ_name

                    # Incorporate vectored query and token tags
                    q_dic['text'], vecs, q_dic['tags'] = vectorize_and_tag( u'QUERYSTART ' + q )
                    
                    # NOTE: vecs do not go right into the dictionary.
                    # They get written to the hard drive and their address is stored as loc in the dictionary
                    q_dic['loc'] = 'query_%07d'%query_counter + '.pkl'
                    fo = open(q_dic['loc'],'wb')
                    cPickle.dump(vecs, fo)
                    fo.close()
                    query_counter += 1
                    
                    # To leave in the option of pre-loading, have a key for vecs
                    q_dic['vecs'] = None
                    
                    # Add this query to the full query list
                    query_list += [q_dic]
    
        f.value=book
    
    return doc_dic, query_list, ent_vecs, bEnt_vecs

#Convert database into trainables

In [28]:
# Read in the database
df = pd.read_pickle('./data/database_with_queries.pd')

In [33]:
# This tells us what the largest numbered entity is
max_ent = 'ent000'
for AS in df['anon_sum_ii']:
    if AS:
        these_ents = re.findall('ent\d\d\d',str(scrub_description(AS)))
        if these_ents:
            this_max = max(these_ents)
        else:
            this_max = 'ent000'
        if this_max > max_ent:
            max_ent = this_max
for AS in df['anon_sum_iiii']:
    if AS:
        these_ents = re.findall('ent\d\d\d',str(scrub_description(AS)))
        if these_ents:
            this_max = max(these_ents)
        else:
            this_max = 'ent000'
        if this_max > max_ent:
            max_ent = this_max

# This tells us what the largest numbered BAD entity is
max_bEnt = 'bEnt000'
for AS in df['anon_sum_ii']:
    if AS:
        these_ents = re.findall('bEnt\d\d\d',str(scrub_description(AS)))
        if these_ents:
            this_max = max(these_ents)
        else:
            this_max = 'bEnt000'
        if this_max > max_bEnt:
            max_bEnt = this_max
for AS in df['anon_sum_iiii']:
    if AS:
        these_ents = re.findall('bEnt\d\d\d',str(scrub_description(AS)))
        if these_ents:
            this_max = max(these_ents)
        else:
            this_max = 'bEnt000'
        if this_max > max_bEnt:
            max_bEnt = this_max

            
max_ent  = int(max_ent[3:6])
max_bEnt = int(max_bEnt[4:7])
print 'Largest-numbered entity: {}'.format(max_ent)
print 'Largest-numbered BAD entity: {}'.format(max_bEnt)

Largest-numbered entity: 150
Largest-numbered BAD entity: 10


In [None]:
# Construct it!
save_path = '/Users/alex/Desktop/data manager/'
doc_dic, query_list, ent_vecs, bEnt_vecs = build_trainables(
    df,
    max_ent,
    max_bEnt*2,
    save_path)

# Save everything
with open(save_path + 'document_dictionary.pkl','wb') as f:
    cPickle.dump(doc_dic,f)
with open(save_path + 'query_list.pkl','wb') as f:
    cPickle.dump(query_list,f)
with open(save_path + 'entity_vectors.pkl','wb') as f:
    cPickle.dump(ent_vecs,f)
with open(save_path + 'bad_entity_vectors.pkl','wb') as f:
    cPickle.dump(bEnt_vecs,f)

In [877]:
# Sizes of the resulting things
D = 0
for k,v in doc_dic.iteritems():
    for k2,v2 in v.iteritems():
        try:
            nB = v2.nbytes
        except:
            nB = sys.getsizeof(v2)
        D += nB
Q = 0
for q in query_list:
    for k,v in q.iteritems():
        try:
            nB = v.nbytes
        except:
            nB = sys.getsizeof(v)
        Q += nB

print 'Size of document dictionary: {} MB'.format(D/1e6)
print 'Size of query list:          {} MB'.format(Q/1e6)

Size of document dictionary: 65.234588 MB
Size of query list:          70.675218 MB
