In [1]:
import pickle
import en_coref_md
import en_coref_sm
import os
import re
import spacy
from termcolor import colored
cur = '/Users/apple/Desktop/钻/Projects/mlab-intuit-fa18/CNNDatasetAnalysis/'

In [2]:
fullTextTexts = pickle.load(open(cur+'processedData/Texts.pkl','rb'))
fullTextSentences = pickle.load(open(cur+'processedData/textSentences.pkl','rb'))

## Assume data is taken in as list of list of words (lsls)
## Since spacy takes the whole string as resolution input, so first need to merge lsls into one document text

In [42]:
## hard coding for some examples of exception 
period_ls = {'Mr.': 'Mr', 'Mrs.': 'Mrs', 'Dr.': 'Dr', 'Gov.':'Gov'}

def isPunctuation(word):
    return False if re.match("^[a-zA-Z0-9_\-]*$", word) else True

# merge documents from a list of list of list of words to one document str
def mergeDoc(lslsls):
    docs = []
    for doc in lslsls:
        doc_str = ""
        for sen in doc:    
            for word in sen:  
                ## prevent leading spaces
                if isPunctuation(word) or doc_str == "": 
                    doc_str += word
                ## remove new line
                elif word != '\n':   
                    doc_str += " " + word                
        docs.append(doc_str)
    return docs

# preprocess the document string (removing unusual periods: U.S.A, Dr.Who)
def preprocess(doc_str):

    ## Remove . in acronyms 
    ## NOTE: not distinguishing if the acronym ends the sentence (where period should be kept)
    doc = re.sub(r'(?<!\w)([A-Z])\.', r'\1', doc_str)
    ## TODO: use Regex not for-loop
    for k, v in period_ls.items():
        doc = doc.replace(k, v)
    return doc

In [44]:
fullText = [preprocess(x) for x in mergeDoc(fullTextSentences)]

In [46]:
fullText = pickle.load( open( "fullText.p", "rb" ) )
#pickle.dump(fullText, open("fullText.p", "wb"))

## Applying spacy's coreference resolution to obtain coreference clusters

In [49]:
nlp = en_coref_md.load()

## Functions to help locate coreference clusters and their mentions

Since spacy gives the arbitrary position of each mention (sometimes that does not align with the word positions we have depending on how they break their sentences/words/punctuations), building a dictionary that stores both sentence start and ending positions to help search flexibly in a range.

Note: funtions applies to a single document 

In [54]:
## input: list of list of words
## return: a dictionary (key: sentence positions; value: [start word pos, end word pos])
def labelPositions(lsls):
    sen_pos = 0
    word_pos = 0
    label_dic = {}
    for sen in lsls:
        start = word_pos
        end = word_pos + len(sen) - 1
        label_dic[sen_pos] = [start,end]
        word_pos = end + 1
        sen_pos += 1
    return label_dic

## Input: word position
## Return: the sentence position in which the word is located
def findSen(dic, word_pos):
    for k,v in dic.items():
        if(word_pos >= v[0] and word_pos <= v[1]):
            return k
    raise IndexError("word position is out of bound")
    return


## Find the sublist of ls that is an exact match to the pattern
## return the first set of index of sublist
def subfinderFirst(ls, pattern):
    match_index = []
    for wi in (range(len(ls))):
        for pi in (range(len(pattern))):
            if wi+pi >= len(ls) or ls[wi+pi].lower() != pattern[pi].lower():
                break
            if pi == len(pattern) - 1:
                match_index += list(range(wi,wi+len(pattern)))
                return match_index

## (Currently not used)            
## return all indices for elements of sublist
def subfinder(ls, pattern):
    match_index = []
    for wi in (range(len(ls))):
        for pi in (range(len(pattern))):
            if wi+pi >= len(ls) or ls[wi+pi].lower() != pattern[pi].lower():
                break
            if pi == len(pattern) - 1:
                match_index += list(range(wi,wi+len(pattern)))
    return match_index

## Resolution
Rules:
1. If the identity is in the same sentence, do not resolve;
2. Similarly, do not resolve more than once for the same identity in the same sentence;
3. When there are multiple choices (after passing rule 1 and 2), resolve the first reference. 
(**Issue: may replace the instance from another cluster in the same sentence**) 
4. Only replace references that come after the identity

Corner cases:
1. Replace possessive pronouns with identity + 's
2. Replace 're with identity + 'are'

In [163]:
POSSESSIVE_PRONOUNS = ['their', 'its', 'his', 'her', 'hers', 'theirs', 'my', 'mine', 'your', 'yours']
FUZZY_SEN = 6

# doc: a single document in a str (from FullText)
# dic: labeled word pos dictionary for a single doc
# mod: spacy model for a single doc
# lsls: list of list of words (from FullTextSentences)
def resolution(mod, dic, doc, lsls):
    
    for cluster in mod._.coref_clusters:
        identity = str(cluster.main)
        identity_sen = findSen(dic, cluster.main.start)  ## TODO: or use mention[0]
        replaced_sen = []
        for ref in cluster.mentions:
            if str(ref).lower() != str(cluster.main).lower(): ## e.g. Cluster like ["He", "he"] is ignored
                sen_index = findSen(dic,ref.start)
                
                ## Note: only replace references that come after the identity
                if sen_index > identity_sen and sen_index not in replaced_sen: 
                    possible_indices = []
                    possible_sen = []
                    for i in range(-1,FUZZY_SEN):
                        if identity_sen < sen_index+i < len(lsls):
                            possible_indices.append(sen_index+i)
                            possible_sen.append(lsls[sen_index+i])
                    
                    
                    ref_sen, id = selectReplace(possible_sen, identity, str(ref), dic)
                    if id is not None and ref_sen is not None:
                        ref_sen_index = possible_indices[id]
                        replaced_sen.append(ref_sen_index)
                    
                        ## mutate actual sentence
                        lsls[ref_sen_index] = ref_sen
    return

# sentences: a list of candidate sentence that may contain the reference
# identity: word str
# ref: word str
# dic: to update dictionary after each resolve
# return: resolved sentence as a word list, sentence index (in the list), change of word count to update dic
def selectReplace(sentences, identity, ref, dic): ## FOR DEBUGGING
    replace_str = identity
    
    ##1. Deal with corner cases:
    # sync capitalization between ref and identity
    if ref[0].isupper() and replace_str[0].islower():
        replace_str = replace_str[0].upper() + replace_str[1:]
        print(replace_str)
    
    if ref[0].islower() and replace_str[0].isupper():
        replace_str = replace_str[0].lower() + replace_str[1:]
        print(replace_str)
        
    # replacing possessive pronouns
    if ref.lower() in POSSESSIVE_PRONOUNS:
        if replace_str[-2:] != "'s": ## if 's is not already in str
            replace_str = replace_str+"'s" 
    
    ##2. Locate reference to be replaced within a fuzzy range:
    identity_ls = replace_str.split(" ")
    ref_ls = ref.split(" ")
    replace_index = subfinderFirst(sentences[0], ref_ls)
    sentence = sentences[0]
    sentence_id = 0
        
    for i in range(FUZZY_SEN): ##TODO: while-loop
        if replace_index is not None:
            sentence_id = i
            break
        if replace_index is None:
            replace_index = subfinderFirst(sentences[i], ref_ls)
            sentence = sentences[i]

    ## DEBUGGING
    if replace_index is None:
        if DEBUG:
            #print("ref.start: ", start)
            #s = findSen(dic, start)
            print("ref word: ", ref)
            print("in range: ", dic[s])
            for i in range(5):
                print("earlier: ", fullTextSentences[TEST_INDEX][s-i])
                print("later: ", fullTextSentences[TEST_INDEX][s+i])
        return
    
    if PRINT:
        print("###\nBefore: ") 
        mark_sen(sentence, replace_index)

    ## messy corner cases:
    ## "they're" --> "[identity] are"
    if sentence[replace_index[-1]+1] == '\'re':
        sentence[replace_index[-1]+1] = 'are'
    
    ##3. Resolve
    for j in replace_index:
        if j > len(sentence):
            print("j ", j)
            print("len", len(sentence))
        del sentence[j]
    identity_ls.reverse()
    for word in identity_ls:
        sentence.insert(replace_index[0], word)
    replace_pos = range(replace_index[0], replace_index[0]+len(identity_ls))   
    
    if PRINT:
        print("After: ")   
        mark_sen(sentence, replace_pos)
    
    
    return " ".join(sentence), sentence_id

def mark_sen(sen, highlight_index):
    for word, index in zip(sen,range(len(sen))):
        if(index in highlight_index):
            print(" ", colored(word, 'red'), end = "")
        elif(isPunctuation(word)):
            print(word, end= "")
        else:  
            print(" ", word, end= "")
    print("\n")
    return

## (currently not used)
def update_dic(dic, sen_pos, end_change):
    start = dic[sen_pos][0]
    end = dic[sen_pos][1]
    new_pos = [start, end+end_change]
    dic[sen_pos] = new_pos
    for k, v in dic.items():
        if k > sen_pos:
            v[0] += end_change
            v[1] += end_change
    return
    

In [162]:
DEBUG = 0
PRINT = 1
TEST_INDEX = 4
dic  = labelPositions(fullTextSentences[TEST_INDEX]) ## using fullTextSentences here
fullTextSentences = pickle.load(open(cur+'processedData/textSentences.pkl','rb')) ## replacement mutates the str list
mod = nlp(fullText[TEST_INDEX]) ## using fullText here
resolution(mod, dic, fullText[TEST_INDEX], fullTextSentences[TEST_INDEX])

louisiana Gov Bobby Jindal
###
Before: 
  Bobby  Jindal  on  Monday  stood  by  [31mhis[0m  criticism  of  so-called``  no-go''  zones  in  Europe,  where  sovereign  nations  allegedly  cede  authority  to  Muslim  immigrants,  a  controversial  idea  that  many  critics  say  is  overblown.

After: 
  Bobby  Jindal  on  Monday  stood  by  [31mlouisiana[0m  [31mGov[0m  [31mBobby[0m  [31mJindal's[0m  criticism  of  so-called``  no-go''  zones  in  Europe,  where  sovereign  nations  allegedly  cede  authority  to  Muslim  immigrants,  a  controversial  idea  that  many  critics  say  is  overblown.

louisiana Gov Bobby Jindal
###
Before: 
  And  the  potential  2016  Republican  presidential  candidate  decried  what  [31mhe[0m  called  immigrants'  insistence  on``  non-assimilation,  the  fact  that``  you've  got  people  who  want  to  come  to  our  country  but  not  adopt  our  values,''  which  he  called``  dangerous.

After: 
  And  the  potential  2016  Republica

TypeError: 'NoneType' object is not iterable

## Apply to all documents

In [None]:
nlp = en_coref_md.load()
def resolve_all(lslsls):
    for lsls in lslsls:
        dic  = labelPositions(lsls) 
        full = [preprocess(mergeDoc(lsls)[0])]
        mod = nlp(full)
        resolution(mod, dic, full, lsls) ## TODO: Does this mutate in place
    return lslsls