In [1]:
import json
import pandas as pd
import re
from nltk import word_tokenize as w_toke
from nltk import sent_tokenize as s_toke

SUPPORT CODE:

In [2]:
### PIPELINE for ENTITY DETECTION / ANONYMIZATION
#
# (1) Create entries for all known entities
# (2) Identify possibly ambiguous sub names
# (3) Track possible sources of ambiguous sub names
# (4) Create cleaned full text
# (5) Replace known, unambiguous entities 
# (6) Replace ambiguous entities

In [3]:
# (1) Create entries for all known entities
# (2) Identify possibly ambiguous sub names
# (3) Track possible sources of ambiguous sub names
def init_entity_dict(char_dict):
    
    def clean_name(charName):
        # Get rid of ...
        # ... nickname indicator things
        nuesance = ("(?<= )'","'(?= )",'(?<= )\"','\"(?= )','\(','\)')
        # ... seperators you sometimes see
        nuesance = (",(?= )","(?<= )and(?= )","(?<= )&(?= )","(?<= )of(?= )") + nuesance
        # ... Common abbreviations for titles
        nuesance = ('Mr\.? ','Mrs\.? ','Ms\.? ','Dr\.? ','Prof\.? ') + nuesance
        # ... Common titles
        nuesance = ('The ','the ','Miss ','Master ','Mister ','Misses ','Doctor ','Professor ','Judge ') + nuesance
        # ... Religious titles
        nuesance = ('Father ','Bishop ','Saint ','Sister ') + nuesance
        # ... Royalty
        nuesance = ('King ','Queen ','Prince ','Princess ','Duke ','Dutchess ','Count ','Countess ','Baron ','Knight ','Lord ','Lady ') + nuesance
        # ... Military rank
        nuesance = ('Private ','Private First Class ','Corporal ','Sergeant ','Lieutenant ','Captain ','Major ','Colonel ','General ','Commodore ','Officer ') + nuesance

        # Turn this into a regular expression search pattern
        p = "("
        for i,x in enumerate(nuesance):
            if i==len(nuesance)-1: p = p+x+")"
            else: p = p+x+"|"
        # Apply this first cleaning
        charName = re.sub( p,'',charName)

        # Address surrnames that show up as 2 words
        earlyParts = ('De','de','Del','del','Von','von')
        for ep in earlyParts:
            # Look for a match and replace the gap with an underscore
            charName = re.sub(ep+" \w+",lambda x:re.sub(' ','_',x.group(0)),charName)

        return charName 
    
    def enmrate_namings(charName):
        """Get all the possible name usages, assuming that order is preserved"""
        charName = re.sub('\s*$','',charName)
        # Apply whatever cleaning function you will use and convert to tokens
        # Note: the original name does not get "cleaned"
        subNames = w_toke(clean_name(charName))
        # Our underscored connections (de_Jesus) survive tokenization, now we can get rid of them
        subNames = [re.sub('_',' ',re.sub(' ','',str(x))) for x in list(subNames)]
        
        # Pull out all (however unlikely) name combos (within reason) that preserve name order
        def buildNames(subNames,idxPairs):
            return [ subNames[x[0]]+' '+subNames[x[1]] for x in idxPairs]
        
        if len(subNames) == 1:
            indNames = [str(charName)]
        if len(subNames) == 2:
            indNames = [str(charName)] + subNames
        if len(subNames) == 3:
            indNames = [str(charName)] + subNames + buildNames(subNames,( (0,1),(0,2),(1,2) ))
        if len(subNames) == 4:
            indNames = [str(charName)] + subNames + buildNames(subNames,( (0,1),(0,2),(0,3),(1,2),(1,3),(2,3) ))
        if len(subNames) >  4:
            indNames = [str(charName)] + subNames # (fuck it)
        return indNames

    
    # Intialize a dictionary for storing the entities.
    eDict = {}
    knownCount = 0
    badCount = 0
    for c in char_dict:
        # Break apart any and all possible name usages, assuming that order is preserved and "nicknames" are never more than 3 sub names
        allNames = enmrate_namings(c['name'])
        # Step through each of these new names
        origName = allNames[0]
        for n in allNames:
            # Does an entry already exist?
            if eDict.has_key(n):
                # We have found a conflict!
                # Don't need to create a new entry, but update the list of source conflicts
                eDict[n]['sources']  = eDict[n]['sources']  + [origName]
                eDict[n]['members'] += 1
                if n == origName:
                    # Uh oh. A character's original name should lead to a new entry
                    eDict[n]['orig_conflicts'] += 1
                    if not eDict[n]['has_orig']:
                        # I guess we should throw these in there for consistency
                        knownCount += 1
                        eDict[n]['has_orig'] = True
                        eDict[n]['ent'] = 'ent%03d' % knownCount
                        eDict[n]['last'] = 0
                    # We also want to distinguish bad entities
                    if not eDict[n].has_key('bEnt'):
                        # Looks like we found a new one...
                        badCount += 1
                        eDict[n]['bEnt'] = 'bEnt%03d' % badCount
                    
                
            else:
                # This is a new name
                # Create a new entity if it's an original
                if n==origName:
                    # It is an original
                    knownCount += 1
                    eDict[n] = {}
                    eDict[n]['has_orig'] = True
                    eDict[n]['ent'] = 'ent%03d' % knownCount
                    eDict[n]['last'] = -1
                else:
                    # It is a possible subname, do not devote an entity
                    eDict[n] = {}
                    eDict[n]['has_orig'] = False
                # These will be the same in either case
                eDict[n]['sources'] = [origName]
                eDict[n]['members'] = 1
                # We need to flag/count catestrophic confusions
                eDict[n]['orig_conflicts'] = 0
        
    # Make it easy to compare the level of entity badness
    eCounts = {}
    eCounts['TOT_ORIG_CONF'] = sum([eDict[x]['orig_conflicts'] for x in eDict.keys()])
    eCounts['COUNTKNOWN'] = knownCount
    eCounts['COUNTBAD']   = badCount
        
    return eDict, eCounts

In [4]:
# The co-reference/substitution procedure
def choose_entity_sub(mtchobj):
    """This function is called during a re.sub() routine as the replacement function"""
    global eDict
    name = mtchobj.group(0)
    loc  = mtchobj.start(0)
    # Resolve the root source (if it isn't resolved already)
    if not eDict[name]['has_orig']:
        # Not an original, but it might be unique
        if eDict[name]['members'] == 1:
            # Unique indeed. The source is our name.
            source = eDict[name]['sources'][0]
        else:
            # It isn't unique, we need to resolve the name further...
            # Assume that it refers to the most recently mentioned one.
            # Sort the possible sources by most recent appearance
            srtdSources = sorted( eDict[name]['sources'], key = lambda x: -eDict[x]['last'] )
            # Take the source from the top of the list!
            source = srtdSources[0]
    else:
        # It's an original name. So, there's your source
        source = name

    # We have resolved our source
    if not eDict[source]['orig_conflicts']:
        # We're good! Update that we saw this entity and return its code
        eDict[source]['last'] = loc
        return eDict[source]['ent']
    else:
        # This is too tricky to figure out. Return a bad-entity code
        return eDict[source]['bEnt']

In [5]:
# (4) Create cleaned full text
# (5) Replace known, unambiguous entities 
# (6) Replace ambiguous entities
def sub_known_entities(scraped_info,eDict,info_type):
    
    # Refresh the 'last' field
    for x in eDict.keys():
        if eDict[x].has_key('last'): eDict[x]['last'] = -1
    
    # List out all the possible known entities, sorted by length
    E = ['(?<=\W)'+re.escape(x)+'(?=\W)' for x in sorted(eDict.keys(), key = lambda x: -len(x))]
    # Create a greedy pattern lookup
    p = "("
    for i,x in enumerate(E):
        if i==len(E)-1: p = p+x+")"
        else: p = p+x+"|"
            
    def clean_desc_start(thisChar):
        # These first sentences of these seem to follow one of these formats:
        #   Character Name, blah blah blah ...
        #   Ethnic Character Name (PRO-nun-see-AY-shun), blah blah blah ...
        # Turn it into:
        #   Character Name is blah blah blah ...
        thisName = re.sub('\s*$','',thisChar['name']) # Remove lagging space
        thisChar['description'] = re.sub(thisName+'\s?(\s*\(.*\))?,',thisName+' is',thisChar['description'])
        return thisChar
            
    if info_type=='chars':
        # Get the full character description
        cleanChars = [ clean_desc_start(x) for x in scraped_info ]
        # Pad the full description with spaces to make entities findable everywhere
        fulldesc = ' '
        for x in cleanChars: fulldesc = fulldesc + ' ' + x['description']
        txt = fulldesc + ' '
    elif info_type=='summary':
        # Pad the summary with spaces to make entities findable everywhere
        txt = ' ' + scraped_info + ' '

    # Detect/replace the entities we know about
    known_subbed_text = re.sub( p, choose_entity_sub, txt)
    
    # Return stuff
    return known_subbed_text, txt

In [6]:
### THE QUERY GENERATION PIPELINE:
#
# For each book:
#    Entity sub descriptions
#    Store the dictionary used
# For each available summary:
#    Entity sub the summary
#    Store anonymized summary
#    Generate possible queries
#    Prune crappy queries
#    Enumerate queries
#    Store them

In [18]:
def query_maker(ks_desc,ks_summ):
    """Takes the known-substituted descriptions/summary and returns all the queries you can get from their combination"""
    # Our quantum query is a sentence from the full description. Atomize it.
    sents = s_toke(ks_desc)
    # For each sentence, get a list of the unique entities mentioned
    ents_mentioned = [set(re.findall('ent\d\d\d',x)) for x in s_toke(ks_desc)]
    # A key variable is how many entities are in each sentence
    num_ents_in_sent = [len(x) for x in ents_mentioned]

    # Simplify the ents_mentioned info by representing each entity with their ID number
    def ent_to_id(entString):
        return int(entString[3:])
    def set_to_ids(ent_set):
        return [ent_to_id(x) for x in list(ent_set)]
    def mentions_to_id(ent_mentions):
        return [set_to_ids(x) for x in ent_mentions]

    # List of entities (by ID) mentioned in each sentence of the char descriptions
    ids_mentioned = mentions_to_id( ents_mentioned )
    # Count total mentions of each known entity in the summary
    largest_ent = ent_to_id(max(set( re.findall('ent\d\d\d',ks_desc) )))
    tot_mentions = []
    for e in range(largest_ent+1):
        this_ent = 'ent%03d' % e
        tot_mentions = tot_mentions + [len( re.findall( this_ent, ks_summ) )]

    #For each sentence, how represented (in the summary) is the least represented entity
    min_rep = []
    for s in ids_mentioned:
        mentions_of_relevance = [tot_mentions[x] for x in s]
        if mentions_of_relevance: min_rep = min_rep + [min(mentions_of_relevance)]
        else: min_rep = min_rep + [0]


    # Now we have something to work with
    possible_queries = zip(num_ents_in_sent,ids_mentioned,min_rep,sents)

    # Cull shit we can't use
    # Minimum representation (only keep queries if each entity is mentioned at least 3 times in the summary)
    possible_queries = [x for x in possible_queries if x[2] >= 3]
    
    # Sort by representation density
    possible_queries = sorted(possible_queries, key = lambda x: x[2])
    
    # Queries involving just 1 entity (likely to focus on plot)
    possible_solo_queries = [x for x in possible_queries if x[0] == 1]
    # Queries involving multiple entities
    possible_mult_queries = [x for x in possible_queries if x[0] >= 2 and x[0] <= 4]

    def make_queries(possible_queries):
        # Now, we need to create the set of sentences that act as training queries for this summary
        # Each sentence will contribute x times, where x is the number of unique entities it has
        queries = []
        answers = []
        for p in possible_queries:
            # Work with this sentence
            this_sent = p[3]
            # Step through each unique entity in the sentence
            for e in p[1]:
                # Put this guy in the answers
                answers = answers + [e]
                # Articulate the character representation
                estr = 'ent%03d' % e
                # Generate a query by replacing this entity with some marker in our sentence
                this_query = re.sub(estr,'XXXXXX',this_sent)
                # Add it to our list of queries
                queries = queries + [this_query]
        return queries, answers
            
    return make_queries(possible_solo_queries), make_queries(possible_mult_queries), possible_queries

CREATE QUERIES:

In [19]:
# Read in the pandas dataframe with the complete raw data
df = pd.read_pickle('complete_database.pd')
# Shorthand the character descriptions and plot summaries
C = df['characters']
S2 = df['masterplots_ii_summary']
S4 = df['masterplots_fourth_edition_summary']
# Add the necessary fields to the table
df['eDict'] = None
df['anon_sum_ii'] = None
df['anon_sum_iiii'] = None
df['mult_queries_ii'] = None
df['solo_queries_ii'] = None
df['possible_queries_ii'] = None
df['mult_queries_iiii'] = None
df['solo_queries_iiii'] = None
df['possible_queries_iiii'] = None

In [20]:
# Put the whole fucking pipeline in action!!!
for BOOK in df.index:

    # Initialize the dictionary
    eDict, eCounts = init_entity_dict(C[BOOK])
    # Store it
    df['eDict'][BOOK] = (eDict, eCounts)
    
    # Get the full known-substituted and original character description summaries.
    ks_desc,trsh = sub_known_entities( C[BOOK],eDict,'chars')
    
    # Try the first summary
    if type(S2[BOOK])==float: # Silly, ari...
        # Make this a more reasonable type. It's currently a NaN
        S2[BOOK] = None
    elif S2[BOOK]: # Probably good to go
        # Get the full known-substituted and original summaries for MP2
        ks_summ,trsh = sub_known_entities(S2[BOOK],eDict,'summary')
        # Store it
        df['anon_sum_ii'][BOOK] = ks_summ
        # Generate the queries
        solo_queries, mult_queries, possible_queries = query_maker(ks_desc,ks_summ)
        # Store them
        df['solo_queries_ii'][BOOK] = solo_queries
        df['mult_queries_ii'][BOOK] = mult_queries
        df['possible_queries_ii'][BOOK] = possible_queries
        
    # Try the second summary
    if type(S4[BOOK])==float: # Silly, ari...
        # Make this a more reasonable type. It's currently a NaN
        S4[BOOK] = None
    elif S4[BOOK]: # Probably good to go
        # Get the full known-substituted and original summaries for MP2
        ks_summ,trsh = sub_known_entities(S4[BOOK],eDict,'summary')
        # Store it
        df['anon_sum_iiii'][BOOK] = ks_summ
        # Generate the queries
        solo_queries, mult_queries, possible_queries = query_maker(ks_desc,ks_summ)
        # Store them
        df['solo_queries_iiii'][BOOK] = solo_queries
        df['mult_queries_iiii'][BOOK] = mult_queries
        df['possible_queries_iiii'][BOOK] = possible_queries

In [21]:
# Boom.

all_solo_queries_ii   = [x[0] for x in df['solo_queries_ii']   if x]
all_solo_queries_iiii = [x[0] for x in df['solo_queries_iiii'] if x]
n_solo = 0
for q in all_solo_queries_ii:   n_solo = n_solo + len(q)
for q in all_solo_queries_iiii: n_solo = n_solo + len(q)
    
all_mult_queries_ii   = [x[0] for x in df['mult_queries_ii']   if x]
all_mult_queries_iiii = [x[0] for x in df['mult_queries_iiii'] if x]
n_mult = 0
for q in all_mult_queries_ii:   n_mult = n_mult + len(q)
for q in all_mult_queries_iiii: n_mult = n_mult + len(q)


print 'Total number of   single-entity queries: {}'.format(n_solo)
print 'Total number of multiple-entity queries: {}'.format(n_mult)
print 'Total number of queries:                 {}'.format(n_solo+n_mult)

Total number of   single-entity queries: 31030
Total number of multiple-entity queries: 36903
Total number of queries:                 67933


In [22]:
pd.to_pickle(df,'database_with_queries.pd')