# Coreference task

This notebook processes text from biographies to execute a coreference task 
 
Pre-requirements:
- Text organised in sentences

Problem: missing entities in sentences, mainly due to pronoumns, reference to entities in previous paragraphs
This applies for people, places and time references
    
Our implementation:

    - Step 1
    - Step 2
    - ...
        
 
 
Input

Output

Requirements
- Coreferee (spacyDevelopment): https://github.com/richardpaulhudson/coreferee
    - !pip3 install coreferee
    - !python3 -m pip3 install coreferee
    - !python3 -m coreferee install en
    - !python -m spacy download en_core_web_trf
    - !python -m spacy download en_core_web_lg

In [1]:
#!pip3 install coreferee
#!python3 -m pip3 install coreferee
#!python3 -m coreferee install en
#!python -m spacy download en_core_web_trf
# !python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
     |████████████████████████████████| 587.7 MB 3.6 kB/s             ��████████▉   | 528.5 MB 35.0 MB/s eta 0:00:02
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [1]:
import spacy
from spacy.tokens import Doc
from spacy import displacy
# Coreference object
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x7f1e66a169a0>

In [2]:
import os
import pandas as pd
import re
import numpy as np

### Identify complete and incomplete meetup sentences

In [3]:
# reading every CSV with indexed sentences
# return a list object of files in the given folder
files_list = [f for f in os.listdir('indexedSentences') if not f.startswith('.')]
# parse to dataframe
df_files = pd.DataFrame(files_list, columns=['file_name'])
# df_files.to_csv('totalBiographiesEntities.csv',index=False)

df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33309 entries, 0 to 33308
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  33309 non-null  object
dtypes: object(1)
memory usage: 260.4+ KB


Unnamed: 0,file_name
0,10002116.csv
1,10004137.csv
2,10006387.csv
3,1000684.csv
4,10009278.csv


In [4]:
## testing, using one biography
df_files = df_files.query("file_name=='2444917.csv'")
df_files.to_csv('coreferenceBiographiesList.csv',index=False)
print(len(df_files))

1


### Identify if the sentence can be a meetup

In [3]:
def evaluateCandidateMeetupsEntities(biography_df):
    column_entities = ["meetup","people","place","timeExpression"]
    entities_df = pd.DataFrame(columns=column_entities)
    
    for sentence_row in biography_df.itertuples():
        # print(sentence_row.paragraphIndex)
        # print(sentence_row.sentenceIndex)
        # read people and places entities
        meetupCandidatePP_df = pd.read_csv('extractedEntitiesPersonPlaceOnly/'+file_name_item.file_name.replace(".txt",".csv"))
        
        tempCandidatePP_df = meetupCandidatePP_df.query("paragraphIndex=={} & sentenceIndex=={}".format(sentence_row.paragraphIndex,
                                                                                                     sentence_row.sentenceIndex))
        
        # if count > 0 then, we found some entities, this is a meetup candidate
        # meetup: Y - YES, N - NO, C - Candidate
        meetup = "N"
        entPeople = 0
        entPlaces = 0
        entTimeExpressions = 0

        if len(tempCandidatePP_df) > 0:
            # Next, evaluate if the candidate is complete or incomplete
            # tag as COM INC
            entPP = pd.unique(tempCandidatePP_df['entType'])

            for i in entPP:
                if i == "person":
                    entPeople = len(tempCandidatePP_df[tempCandidatePP_df.entType == i])
                elif i == "place":
                    entPlaces = len(tempCandidatePP_df[tempCandidatePP_df.entType == i])
         
        # read time entities
        meetupCandidateTime_df = pd.read_csv('extractedTimeExpressions/'+file_name_item.file_name.replace(".txt",".csv"))
        tempCandidateTime_df = meetupCandidateTime_df.query("paragraphIndex=={} & sentenceIndex=={}".format(sentence_row.paragraphIndex,
                                                                                                     sentence_row.sentenceIndex))
        if len(tempCandidateTime_df) > 0:
            entTimeExpressions = len(tempCandidateTime_df)

        if entPeople == 0 and entPlaces == 0 and entTimeExpressions == 0:
            meetup = "N"
            datarow = pd.Series(data={'meetup':meetup,'people':entPeople,'place':entPlaces,'timeExpression':entTimeExpressions})
            # dfnotfound = dfnotfound.append(datarow, ignore_index=True)
            entities_df = entities_df.append(datarow, ignore_index=True)
            continue
            
        if entPeople > 0 and entPlaces > 0 and entTimeExpressions > 0:
            meetup = "Y"
        else:
            meetup = "C"
          
        datarow = pd.Series(data={'meetup':meetup,'people':entPeople,'place':entPlaces,'timeExpression':entTimeExpressions})
        # dfnotfound = dfnotfound.append(datarow, ignore_index=True)
        entities_df = entities_df.append(datarow, ignore_index=True)

    # merge with data to identify the paragraph and sentences index information
    sentencesEntities_df = biography_df.copy(deep=True)
    sentencesEntities_df = sentencesEntities_df.join(entities_df)
    return sentencesEntities_df

### Execute task to identify current entities numbers

In [7]:
# Reading the biographies in chunks of 50
# for chunk in pd.read_csv('coreferenceBiographiesList.csv', chunksize=50):
for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=50):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    for file_name_item in df_file_name.itertuples():
        file_exists = os.path.isfile('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
        if file_exists:
            print(file_name_item.file_name.replace(".txt",".csv"))
            # read the biography organised by sentences
            biography_df = pd.read_csv('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
            # Read the file with all the identified entities
            # meetupEntitiesCandidate_df = pd.read_csv('extractedEntities/'+file_name_item.file_name.replace(".txt",".csv"))
            
            # entities_df = biography_df.copy(deep=True)
            
            # for each sentence in the biography, identify if they can be a 
            # meetup: Y - YES all entities present, N - NO zero entities, C - Candidate some entities
            # meetupsEval_df = evaluateCandidateMeetupsEntities(biography_df,meetupEntitiesCandidate_df)
            meetupsEval_df = evaluateCandidateMeetupsEntities(biography_df)
            
            # store results
            meetupsEval_df.to_csv('meetupsCandidatesBefCoref/'+file_name_item.file_name.replace(".txt",".csv"),index=False)

608845.csv
2232977.csv
409969.csv
1551347.csv
579599.csv
1174545.csv
2898019.csv
3450382.csv
2553865.csv
2320846.csv
144624.csv
827409.csv
1396921.csv
2815597.csv
858538.csv
2269540.csv
701860.csv
2334176.csv
8716.csv
576282.csv
113049.csv
529161.csv
167975.csv
3126224.csv
78231.csv
1022191.csv
223497.csv
3081864.csv
63747.csv
739770.csv
1566844.csv
439467.csv
181985.csv
51560453.csv
3236079.csv
720273.csv
419012.csv
994118.csv
341837.csv
3818460.csv
2657736.csv
623861.csv
489381.csv
1113259.csv
1433271.csv
756836.csv
165726.csv
165113.csv
313835.csv
67892.csv
2704521.csv
2771033.csv
526281.csv
450629.csv
2416191.csv
173225.csv
632683.csv
991714.csv
1023303.csv
481738.csv
1566892.csv
2659909.csv
12945.csv
3141790.csv
838629.csv
492026.csv
512518.csv
1429918.csv
58067.csv
1213916.csv
155965.csv
1253793.csv
49906.csv
5033.csv
100273.csv
442318.csv
1098118.csv
1321577.csv
553740.csv
523339.csv
20405.csv
238175.csv
1599087.csv
1786533.csv
797572.csv
655613.csv
379324.csv
1070521.csv
262463

## Coreference task

In [82]:
# executes coreference task. Input: text. Output: doc object
def executeCoref(text):
    doc = nlp(text)
    return doc

# Obtains the list of identified entities and their indexes in the text
# Input: doc object. Output: a list of lists. E.g., list([entity, indexes])
def indexCorefEntities(doc):
    indexes_coref = []
    for chain in doc._.coref_chains:
        indexes_temp = []
        resolve = False
        for mention in chain:
            for i in mention:
                indexes_temp.append(i)
        # indexes_temp = [doc._.coref_chains.resolve(doc[i])[0]] + indexes_temp 
            if not resolve:
                res1 = [doc._.coref_chains.resolve(doc[j]) for j in mention]
                res = list(filter((None).__ne__, res1))
                if len(res)>0:
                    resolve=True
        indexes_temp = res[0] + indexes_temp
        indexes_coref.append(indexes_temp)
    # print(indexes_coref)
    return indexes_coref

# coref_list format: [[Leipzig, 298, 338], [Elgar, 721, 729, 763, ]]
# output format:
#              token tokenIndex charIndex entType sentIndex corefEntityValue
# 0            Elgar          0         0  PERSON         0            Elgar
# 1               's          1         5                 0              NaN
# 2           mother          2         8                 0           mother
def parseDocToDf(doc, sentencesLength_list, coref_list):
    
    coref_details_df = pd.DataFrame(columns=['token','tokenIndex','charIndex','entType',
                                             'sentIndex'])
    sentIndex=0
    sentCharCount=0
    for token in doc:
        # the sentencesLength_list gives you a list with a number of string char count
        if (sentencesLength_list[sentIndex]-token.idx) == 0:
            sentCharCount += token.idx-sentCharCount
            sentIndex +=1
        #     print("change sentence",sentIndex)
        # print(token.idx, sentCharCount, token.idx-sentCharCount,sentencesLength_list[sentIndex])
        
        # add a line to restart the character index everytime it changes the sentence index
        
        # tokenIndex: index of the token in the paragraph
        # charIndex: index of the first token's character in the string
        datarow = pd.Series(data={'token':token.text,'tokenIndex':token.i,
                                  'charIndex':token.idx-sentCharCount,'entType':token.ent_type_,
                                 'sentIndex':sentIndex})
        coref_details_df = coref_details_df.append(datarow, ignore_index=True)
        # print(datarow)
    # Add a column to specify coref results: 
    coref_details_df['corefEntityValue'] = np.NaN
    for coref_item in coref_list:
        entityValue = ""
        for i, indexVal in enumerate(coref_item):
            # print(indexVal)
            if i == 0:
                entityValue = indexVal
                continue
            else:
                coref_details_df.at[indexVal,'corefEntityValue'] = entityValue
    return coref_details_df

# store coref results by biography
def storeCorefResults(doc_df, file_name):
    if os.path.isfile('cacheCoreferenceOutput/'+file_name.replace(".txt",".csv")):
        results = pd.read_csv('cacheCoreferenceOutput/'+file_name.replace(".txt",".csv"))
        results = results.append(doc_df, ignore_index=True)
        results.to_csv('cacheCoreferenceOutput/'+file_name.replace(".txt",".csv"),index=False)
    else:
        doc_df.to_csv('cacheCoreferenceOutput/'+file_name,index=False)
        
# store coref results by biography
# folder: cacheCoreferenceOutput/
def storeResults(doc_df, file_name, folder):
    if os.path.isfile(folder+file_name.replace(".txt",".csv")):
        results = pd.read_csv(folder+file_name.replace(".txt",".csv"))
        results = results.append(doc_df, ignore_index=True)
        results.to_csv(folder+file_name.replace(".txt",".csv"),index=False)
    else:
        doc_df.to_csv(folder+file_name,index=False)

# store doc objects results by biography, format: file_name_paragraphIndex - 10085_1
def storeCorefDocResults(doc, file_name,paragraph):
    doc.to_disk("cacheCoreferenceOutput/"+file_name.replace(".csv","")+"_"+str(paragraph)+".coref")
    
# retrieve doc              
def retriveCorefDocResults(file_name, paragraph):
    doc = Doc(nlp.vocab).from_disk("cacheCoreferenceOutput/"+file_name.replace(".csv","")+"_"+str(paragraph)+".coref")
    return doc

def validateCachedCorefObject(file_name, paragraph):
    if os.path.isfile('cacheCoreferenceOutput/'+file_name.replace(".csv","")+"_"+str(paragraph)+".coref"):
        return True
    else:
        return False
    
### Procesing text for coreference
# Concat all the sentences of a given paragraph
# Input: a df with all the sentences of a paragraph
# output: 1) a string concatenating all the sentences
# 2) a list with the lenght of each sentence, to be used to 
# identify the number of sentence in the paragraph
def obtainText_df(sentencesParagraph_df):
    text = ""
    sentencesLength_list = []
    for row_sent in sentencesParagraph_df.itertuples():
        text += row_sent.sentences + " "
        # print(len(text))
        sentencesLength_list.append(len(text))
    return text, sentencesLength_list

spacy.explain('NORP')
# NORP: Nationalities or religious or political groups
# GPE: Countries, cities, states
# LOC: Non-GPE locations, mountain ranges, bodies of water'
# PERSON: People, including fictional
# ORG: 'Companies, agencies, institutions, etc.'
# EVENT: Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART: Titles of books, songs, etc.
# FAC: Buildings, airports, highways, bridges, etc.

# df = listTypeEntities(doc_coreferences)
# print(df)
# print(findTypeEntity(df,"ten"))
# print(findByTypeEntity(df,"DATE"))
    
# # doc = nlp("When Elgar was 29, he took on a new pupil, Caroline Alice Roberts, known as Alice, daughter of the late Major-General Sir Henry Roberts, and published author of verse and prose fiction. Eight years older than Elgar, Alice became his wife three years later.")
    
# temp_df = parseDocToDf(doc_coreferences,sentencesIndex_list,coreferences_index)
# # print(temp_df)
# # temp_df.info()
# print(indexCorefEntities(doc_coreferences))
# # print(parseDocToDf(doc_coreferences,sentencesIndex_list))
# # temp_df.tail(30)

# temp_df=temp_df.dropna(subset=['corefEntityValue'])
# temp_df.to_csv('cacheCoreferenceOutput/resultstest.csv',index=False)                

'Nationalities or religious or political groups'

In [53]:

# Obtains a list with the indexes of the tokens in the doc object
# Input: doc object. Output: list(token indexes), starts in zero
# def indexTextTokens(doc):
#     tok_list = list(token.text for token in doc)
#     # print(tok_list)
#     # print(tok_list[1])
#     return tok_list

# # Output: df
# def filterCorefByEntity(coref_df, value):
#     coref_df = coref_df.dropna(subset=['corefEntityValue'])
#     coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)
#     return coref_df[coref_df['corefEntityValue'].str.contains(value)] 

# def validateEntityValue(coref_index,elementsCoref_df):
#     del_list = []
#     for ref in coref_index:
#         # ref[1] is the first index in coref results. E.g, [Elgar, 2,45,98] is 2
#         row_elementCoref = elementsCoref_df.iloc[[ref[1]]]
#         entType = str(row_elementCoref.entType)
        
#         if "PERSON" in entType or "DATE" in entType or "GPE" in entType:
#             pass
#         else:
#             del_list.append(ref)
#     for i in del_list:
#         coref_index.remove(i)
            
#     return coref_index

# # List the types of entities found in the text. Input: doc. Output: df [entity, entType]
# def listTypeEntities(doc):
#     # E.g., 
#     # W. H. "Billy" Reed PERSON
#     # Elgar PERSON
#     # English NORP
#     # Elgar PERSON
#     # about ten DATE
#     # forty years later DATE
#     # The Wand of Youth WORK_OF_ART
#     listEnts = []
#     # Find named entities, phrases and concepts
#     for entity in doc.ents:
#         element = [entity.text, entity.label_]
#         listEnts.append(element)
#         # print(entity.text, entity.label_)
#     return pd.DataFrame(listEnts, columns=['entity','entType']).drop_duplicates(keep='first')
        
# def findByEntity(entList_df, entity_string):
#     return entList_df[entList_df['entity'].str.contains(entity_string)] 

# def findByTypeEntity(entList_df, entType_string):
#     return entList_df[entList_df['entType'].str.contains(entType_string)] 


In [80]:
# Use coreference results to add entities to its corresponding sentence
# corefItems_df: each row is a token, the df with all the indexes and tokens
def completionCorefSentence(corefItems_df, coref_index, paragraph_index, sentences_partial_df):
    coref_df = corefItems_df.dropna(subset=['corefEntityValue'])
    coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)
    # print(coref_df)
    # loop coref results, using the entity found in coreference as guidance
    # [Elgar, 1, 13, 25]
    for coref_item in coref_index:
        # print(coref_item)
        # [Elgar, 1, 13, 25, 27, 47, 59, 83, 88, 93, 97, 111, 118, 124, 135, 149, 200]
        # filter and verify that entType contains one or more of the types of interest
        # if "PERSON" in entType or "DATE" in entType or "GPE" in entType:
        entityValue = str(coref_item[0])
        # print(entityValue)
        coref_temp_df = coref_df.loc[coref_df['corefEntityValue'].isin([entityValue])]
        # print(coref_temp_df)
        # # list unique values 
        # entTypes_list = coref_temp_df.entType.unique()
        # # print(entTypes_list)
        # Part 1: dedicated to People
        # Validate PERSON entType
        coref_person_df = coref_temp_df.loc[coref_temp_df['entType'].isin(['PERSON'])]
        # print(coref_person_df)
        if not coref_person_df.empty:
            # print("PERSON")
            executeCorefCompletionLowLevel(coref_item,coref_df,paragraph_index,entityValue,coref_person_df,"person",sentences_partial_df)
         
        # search for places
        coref_place_df = coref_temp_df.loc[coref_temp_df['entType'].isin(['GPE'])]
        if not coref_place_df.empty:
            # print("GPE")
            executeCorefCompletionLowLevel(coref_item,coref_df,paragraph_index,entityValue,coref_place_df,"place",sentences_partial_df)
                
            # Second case: first entity is always the one in index [1] of coref results and has the exact or similar entity value as found in coref
            
    # is this a linked entity? if is not a linked entity I may not be able to linked so for now if it is not in the coref results
    # then there is not more actions to take

    # Part 2: dedicated to time expressions
    # use listTypeEntities(doc_coreferences) to validate DATE entTypes

# # Extra step to validate entities
# def validateEntitiesType():
#     return 1

def executeCorefCompletionLowLevel(coref_item,coref_df,paragraph_index,entityValue,coref_temp_df,entType, sentences_partial_df):
    coref_strings  = [str(i) for i in coref_item]
    # token_list = coref_strings.remove(entityValue)
    token_list = coref_strings[1:len(coref_strings)]
    # print(token_list)
    # validation for entities that have more than one entity
    try:
        token_list = list(map(int, token_list))
    except Exception as e: 
        print(e)
        return
    # print(token_list)
    # filter only the indexes in the coref results
    #     [Clara, 24, 46]
    #    tokenIndex  token charIndex entType sentIndex corefEntityValue
    # 0          24  Clara       112  PERSON         0            Clara
    # 1          46  Clara         0  PERSON         2            Clara       
    coref_temp_df = coref_df.set_index(['tokenIndex'])
    coref_temp_df = coref_temp_df.loc[coref_temp_df.index.isin(token_list)]
    # now we have a df with all the tokens of the coref
    coref_temp_df = coref_temp_df.reset_index()
    # [tokenIndex, token, charIndex, entType, sentIndex, corefEntityValue]

    # Search for the linked entity in the extracted Person and Places entities
    linkedEntities_df = pd.read_csv('extractedEntitiesPersonPlaceOnly/'+file_name_item.file_name.replace(".txt",".csv"))

    # print("Filter:",paragraph_index,coref_temp_df.iloc[0]['sentIndex'],coref_temp_df.iloc[0]['charIndex'],coref_temp_df.iloc[0]['corefEntityValue'])
    # First case: first entity is always the one in index [0] of coref results and starts at the same offset and charIndex
    entity_row = linkedEntities_df.loc[(linkedEntities_df['paragraphIndex'] == paragraph_index) & 
                                       (linkedEntities_df['sentenceIndex'] == coref_temp_df.iloc[0]['sentIndex']) & 
                                       (linkedEntities_df['offset'] == coref_temp_df.iloc[0]['charIndex'])]
    # print(coref_temp_df,entity_row)
    if entity_row.empty:
        entity_row = linkedEntities_df.loc[(linkedEntities_df['paragraphIndex'] == paragraph_index) & 
                                       (linkedEntities_df['sentenceIndex'] == coref_temp_df.iloc[0]['sentIndex'])]
        entity_row = entity_row[entity_row['entity'].str.contains(coref_temp_df.iloc[0]['corefEntityValue'])]
        entity_row.sort_values(by=['paragraphIndex','sentenceIndex'],ascending=True,inplace=True)
        # print("EMPTY ROW",entity_row)
    if entity_row.empty:
        entity_row = linkedEntities_df.loc[(linkedEntities_df['paragraphIndex'] == paragraph_index)]
        entity_row = entity_row[entity_row['entity'].str.contains(coref_temp_df.iloc[0]['corefEntityValue'])]
        entity_row.sort_values(by=['paragraphIndex','sentenceIndex'],ascending=True,inplace=True)
        
    # if len(entity_row) > 1:
    #     entity_row = entity_row.iloc[0]

    # print("ENTIRY ROW",entity_row)
    # if len(entity_row) == 1:
    if len(entity_row) > 0:
        # print("coref_temp_df",coref_temp_df)
        # Extract only the rows that have the same sentence and paragraph
        linkedEntities_temp_df = linkedEntities_df.loc[(linkedEntities_df['paragraphIndex'] == paragraph_index)]
        linkedEntities_temp_df = linkedEntities_temp_df.set_index(['sentenceIndex'])
        linkedEntities_temp_df = linkedEntities_temp_df.loc[linkedEntities_temp_df.index.isin(coref_temp_df['sentIndex'].drop_duplicates().to_list())]
        linkedEntities_temp_df = linkedEntities_temp_df.reset_index()
        # print("linkedEntities_temp_df",linkedEntities_temp_df)
        # linkedEntities_temp_df = linkedEntities_temp_df.loc[(linkedEntities_temp_df['URI'] == entity_row.iloc[0]['URI'])]
        list_sentences_temp = linkedEntities_temp_df.loc[(linkedEntities_temp_df['URI'] == entity_row.iloc[0]['URI'])].drop_duplicates(subset=['sentenceIndex']).sentenceIndex.unique()
        list_sentences_coref = coref_temp_df.drop_duplicates(subset=['sentIndex']).sentIndex.unique()
        list_sentences = list(set(list_sentences_coref)-set(list_sentences_temp))

        # print("==1, DFs")
        # print(list_sentences_coref,list_sentences_temp,list_sentences)

        # repeat row with entity in each sentence
        if len(list_sentences) > 0:
            coref_temp_df = coref_temp_df.set_index(['sentIndex'])
            coref_temp_df = coref_temp_df.loc[coref_temp_df.index.isin(list_sentences)]
            coref_temp_df = coref_temp_df.reset_index()
            coref_temp_df.sort_values(by=['tokenIndex'],ascending=True,inplace=True)
            coref_temp_df.drop_duplicates(subset=['sentIndex'],inplace=True)
            coref_temp_df.rename(columns={'sentIndex': 'sentenceIndex'}, inplace=True)
            # print("coref_temp_df",coref_temp_df)
            
            new_linkedEntities_df = sentences_partial_df.set_index(['sentenceIndex'])
            new_linkedEntities_df = new_linkedEntities_df.loc[new_linkedEntities_df.index.isin(list_sentences)]
            new_linkedEntities_df = new_linkedEntities_df.reset_index()
            new_linkedEntities_df.sort_values(by=['sentenceIndex'],ascending=True,inplace=True)
            new_linkedEntities_df.drop_duplicates(subset=['sentenceIndex'],inplace=True)
            
            # print("new_linkedEntities_df",new_linkedEntities_df)
            new_linkedEntities_df = sentences_partial_df.merge(coref_temp_df, on='sentenceIndex',how="right")

            # new_linkedEntities_df.drop(columns=['entity','offset','entType_y','tokenIndex','token'], inplace=True)
            new_linkedEntities_df.drop(columns=['tokenIndex','token'], inplace=True)
            new_linkedEntities_df.rename(columns={'wikiId':'wikiPageID','corefEntityValue':'entity','charIndex': 'offset'}, inplace=True)

            new_linkedEntities_df['URI'] = entity_row.iloc[0]['URI']
            new_linkedEntities_df['support'] = np.NaN
            new_linkedEntities_df['types'] = entity_row.iloc[0]['types']
            new_linkedEntities_df['similarityScore'] = np.NaN
            new_linkedEntities_df['percentageOfSecondRank'] = np.NaN
            new_linkedEntities_df['entType']= entType
            # print(new_linkedEntities_df)

            storeResults(new_linkedEntities_df, file_name_item.file_name.replace(".txt",".csv"), 'meetupsCorefOutputPP/')
    

In [81]:
# Now move to run the coreference task
# Reading the biographies in chunks of 50
for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=50):
    df_file_name = pd.DataFrame()
    df_file_name['file_name'] = chunk['file_name']
    # for each file
    for file_name_item in df_file_name.itertuples():
        file_exists = os.path.isfile('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
        if file_exists:
            # print(file_name_item.file_name.replace(".txt",".csv"))
            
            sentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
            # obtain list of paragraphs
            paragraphIndexes_list = sentences_df.paragraphIndex.unique()
            # iterate the paragraphs
            for p_index in paragraphIndexes_list:
                # obtain a df with all the sentences that belong to a paragraph
                sentences_partial_df = sentences_df.query('paragraphIndex == {}'.format(p_index))
                # sentences_partial_df = sentences_df.query('paragraphIndex == 1')
                
                # format text
                text, sentencesIndex_list = obtainText_df(sentences_partial_df)
                # print(text)
                # execute coreference task
                # Validate first if already executed
                if validateCachedCorefObject(file_name_item.file_name, p_index):
                    # retrieve object
                    doc_coreferences = retriveCorefDocResults(file_name_item.file_name, p_index)
                    # print("retrieved")
                else:
                    doc_coreferences = executeCoref(text)
                    # store doc object
                    storeCorefDocResults(doc_coreferences, file_name_item.file_name,p_index)
                # format results: [entityCoref, [word(s) index]
                coreferences_index = indexCorefEntities(doc_coreferences)
                # print(coreferences_index)
                
                # obtain token and coref indexes
                elementsCoreference_df = parseDocToDf(doc_coreferences,sentencesIndex_list,coreferences_index)
                # store results, send the df and the file_name
                storeCorefResults(elementsCoreference_df, file_name_item.file_name)
                
                # before executing coreftask verify if it is cached and retrive all the data
                completionCorefSentence(elementsCoreference_df, coreferences_index,p_index,sentences_partial_df)
                # completionCorefSentence(elementsCoreference_df, coreferences_index,1)

                # store corefResults
                storeResults(elementsCoreference_df, file_name_item.file_name, 'cacheCoreferenceOutput/')

2444917.csv
retrieved
[[Butt, 3, 34, 66]]


  res = list(filter((None).__ne__, res1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)
  res = list(filter((None).__ne__, res1))


[Butt, 3, 34, 66]
PERSON
Filter: 0 0 17 Butt
ENTIRY ROW Empty DataFrame
Columns: [URI, support, types, entity, offset, similarityScore, percentageOfSecondRank, sentence, sentenceIndex, paragraphIndex, section, entType, wikiPageID]
Index: []
retrieved
[[Her, 0, 11], [recitalist, 6, 38, 46], [Gluck, 21, 29, 31]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)


[Her, 0, 11]
[recitalist, 6, 38, 46]
[Gluck, 21, 29, 31]
PERSON
Filter: 1 1 55 Gluck
ENTIRY ROW                                                    URI  support  \
89   http://dbpedia.org/resource/Christoph_Willibal...     1290   
109  http://dbpedia.org/resource/Christoph_Willibal...     1290   

                                    types entity  offset  similarityScore  \
89   DBpedia:Person,DBpedia:MusicalArtist  Gluck      55              1.0   
109  DBpedia:Person,DBpedia:MusicalArtist  Gluck      55              1.0   

     percentageOfSecondRank  \
89             2.928648e-12   
109            2.928648e-12   

                                              sentence  sentenceIndex  \
89   She appeared in only two operatic productions,...              1   
109  She appeared in only two operatic productions,...              1   

     paragraphIndex section entType  wikiPageID  
89                1     NaN  person     2444917  
109               1     NaN  person     2444917  
coref_

  res = list(filter((None).__ne__, res1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)


[Sussex, 7, 37]
GPE
Filter: 3 0 34 Sussex
ENTIRY ROW                                    URI  support  \
38  http://dbpedia.org/resource/Sussex    12070   
61  http://dbpedia.org/resource/Sussex    12070   

                                                types  entity  offset  \
38  Wikidata:Q3455524,Schema:Place,Schema:Administ...  Sussex      34   
61  Wikidata:Q3455524,Schema:Place,Schema:Administ...  Sussex      34   

    similarityScore  percentageOfSecondRank  \
38         0.952048                0.050315   
61         0.952048                0.050315   

                                             sentence  sentenceIndex  \
38  Clara Butt was born in Southwick, Sussex, the ...              0   
61  Clara Butt was born in Southwick, Sussex, the ...              0   

    paragraphIndex                      section entType  wikiPageID  
38               3  == Early life and career ==   place     2444917  
61               3  == Early life and career ==   place     2444917  
core

  res = list(filter((None).__ne__, res1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)


[She, 0, 2]
[London, 15, 27]
GPE
Filter: 4 0 79 London
ENTIRY ROW                                    URI  support  \
46  http://dbpedia.org/resource/London   251270   
69  http://dbpedia.org/resource/London   251270   

                                                types  entity  offset  \
46  Wikidata:Q515,Wikidata:Q486972,Schema:Place,Sc...  London      79   
69  Wikidata:Q515,Wikidata:Q486972,Schema:Place,Sc...  London      79   

    similarityScore  percentageOfSecondRank  \
46         0.999472                0.000296   
69         0.999472                0.000296   

                                             sentence  sentenceIndex  \
46  She made her professional debut on 7 December ...              0   
69  She made her professional debut on 7 December ...              0   

    paragraphIndex                      section entType  wikiPageID  
46               4  == Early life and career ==   place     2444917  
69               4  == Early life and career ==   place     2

  res = list(filter((None).__ne__, res1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)


[Butt, 4, 37, 54, 82, 119, 149, 169, 177, 193, 213, 231]
PERSON
Filter: 6 0 16 Butt
ENTIRY ROW                                        URI  support  \
13  http://dbpedia.org/resource/Clara_Butt      148   
29  http://dbpedia.org/resource/Clara_Butt      148   
34  http://dbpedia.org/resource/Clara_Butt      148   
37  http://dbpedia.org/resource/Clara_Butt      148   

                                                types      entity  offset  \
13  Http://xmlns.com/foaf/0.1/Person,Wikidata:Q729...  Clara Butt       0   
29  Http://xmlns.com/foaf/0.1/Person,Wikidata:Q729...  Clara Butt       0   
34  Http://xmlns.com/foaf/0.1/Person,Wikidata:Q729...  Clara Butt       0   
37  Http://xmlns.com/foaf/0.1/Person,Wikidata:Q729...  Clara Butt       0   

    similarityScore  percentageOfSecondRank  \
13              1.0                     0.0   
29              1.0                     0.0   
34              1.0                     0.0   
37              1.0                     0.0   

       

  res = list(filter((None).__ne__, res1))


[She, 0, 6, 12, 15, 21, 31, 45, 58]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coref_df['corefEntityValue'] = coref_df['corefEntityValue'].astype(str)


In [None]:
# http://dbpedia.org/resource/Composer
# http://dbpedia.org/resource/Solicitor
# http://dbpedia.org/resource/Conducting
# http://dbpedia.org/resource/Contralto
# http://dbpedia.org/resource/Tempo

dct:subject dbc:Occupations_in_music
<http://purl.org/dc/terms/subject> https://dbpedia.org/page/Category:Occupations_in_music
rdf:type dbo:PersonFunction
# <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/PersonFunction>

### Run the COREFERENCE task

In [48]:
# def corefText(doc):
#     doc._.coref_chains.print()
#     tok_list = list(token.text for token in doc)
#     c = 0
#     for chain in doc._.coref_chains:
#         for mention in chain:
#             res1 = [doc._.coref_chains.resolve(doc[i]) for i in mention]
#             res = list(filter((None).__ne__, res1))
#             if len(res) != 0:
#                 if len(res[0]) == 1:
#                     tok_list[mention[0] + c] = str(res[0][0])
#                 elif len(res[0]) > 1:
#                     tok_list[mention[0] + c] = str(res[0][0])
#                     for j in range(1, len(res[0])):
#                         tok_list.insert(mention[0] + c + j, str(res[0][j]))
#                         c = c + 1
#     textres = " ".join(tok_list)
#     print(textres)
#     return textres

# def executeCoreferenceParagraph(paragraph_df,file_name_item):
#     for paragraph_row in paragraph_df.itertuples():
#         doc = nlp(paragraph_row.paragraph)
#         print(doc)
#         break

# def executeCoreference(sentencesCandidates_df,meetupCandidate_df,file_name_item):
#     # filter only INCOMPLETE meetups
#     # to_check_df = sentencesCandidates_df.query("meetup!='Y'")
    
#     # paragraphSentences_df = meetupCandidate_df.query("paragraphIndex=={}".format(sentence_row.paragraphIndex))
#     paragraphSentences_df = pd.read_csv('indexedSentences/'+file_name_item.file_name.replace(".txt",".csv"))
    
#     count = 0
#      # iterate all the paragraphs
#     for paragraph_row in paragraphSentences_df.itertuples():
        
#         count+=1
#         print(paragraph_row.paragraphIndex)
#         doc = nlp(paragraph_row.paragraph)
        
#         for entity in doc.ents:
#             print(entity.text, entity.label_)
            
#             if entity.label_ == "PERSON" or entity.label_ == "GPE":
#                 continue_next_bool = False
#         crText_string = corefText(doc)
        
#         if count == 3:
#             break
# #     # iterate all the sentences
# #     for sentence_row in to_check_df.itertuples():
# #         print(sentence_row.paragraphIndex)
# #         print(sentence_row.sentenceIndex)                                           
# #         doc = nlp(sentence_row.sentences)
# #         continue_next_bool = True
        
# #         # run a process to double check if there are not entities, use corereference library if there is any entity: type PERSON, GPE, DATE
# #         for entity in doc.ents:
# #             print(entity.text, entity.label_)
            
# #             if entity.label_ == "PERSON" or entity.label_ == "GPE":
# #                 crText_string = corefText(doc)
# #                 continue_next_bool = False
                
#     # return
    

# def obtainEntitiesTypes(doc):
#     corefEnts_df = pd.DataFrame(columns=["type","entity","index"])
    
#     # print("Ents: ")
#     for entity in doc.ents:
#         # print(entity)
#         for i in entity:
#             # print(entity)
#             print(entity.text, entity.label_)
#         if "PERSON" in entity.label_ or "DATE" in entity.label_ or "GPE" in entity.label_:
#             # print("store")
#             datarow = pd.Series(data={'type':entity.label_,'entity':entity.text,'index':0})
#             corefEnts_df = corefEnts_df.append(datarow, ignore_index=True)

# def obtainCoref(doc):
#     coref = []
#     coref_ents = []
#     print("Chains:")
#     doc._.coref_chains.print()
#     n_tokens_int = 0
#     for chain in doc._.coref_chains:
#         indexes_coref = []
#         # indexes = [doc._.coref_chains.resolve(doc[i]) for i in mention]
        
#         for mention in chain:
#             print(mention)
#             indexes_coref.append(mention)
#             print(indexes_coref)
#             # store the indexes of the entities
#             # for i in mention:
#             #     value = doc._.coref_chains.resolve(doc[i])
#             #     # print(type(value))
#             #     # print(i)# position
#             #     # print(value)# the named entity
#             #     indexes_coref.append(i)
#         value.append(indexes_coref)
#         # print(value)
#         coref_ents.append(value)
#     print(coref_ents)
            

    
    # for sent in doc.sents:
    #     print(sent)
    #     # sent.string.strip()
    #     # for token in sent:
    #         # print(token, token.idx)
    #     n_tokens_int = n_tokens_int + len(sent)
    #     print(len(sent))
    #     print(n_tokens_int)
# doc = nlp("Sir Edward William Elgar, 1st Baronet,  ( (listen); 2 June 1857 – 23 February 1934) was an English composer, many of whose works have entered the British and international classical concert repertoire. Among his best-known compositions are orchestral works including the Enigma Variations, the Pomp and Circumstance Marches, concertos for violin and cello, and two symphonies. He also composed choral works, including The Dream of Gerontius, chamber music and songs. He was appointed Master of the King's Musick in 1924.")

# doc = nlp("Although Elgar is often regarded as a typically English composer, most of his musical influences were not from England but from continental Europe. He felt himself to be an outsider, not only musically, but socially. In musical circles dominated by academics, he was a self-taught composer; in Protestant Britain, his Roman Catholicism was regarded with suspicion in some quarters; and in the class-conscious society of Victorian and Edwardian Britain, he was acutely sensitive about his humble origins even after he achieved recognition. He nevertheless married the daughter of a senior British Army officer. She inspired him both musically and socially, but he struggled to achieve success until his forties, when after a series of moderately successful works his Enigma Variations (1899) became immediately popular in Britain and overseas. He followed the Variations with a choral work, The Dream of Gerontius (1900), based on a Roman Catholic text that caused some disquiet in the Anglican establishment in Britain, but it became, and has remained, a core repertory work in Britain and elsewhere. His later full-length religious choral works were well received but have not entered the regular repertory.")


# for i, token in enumerate(doc):
#     print(i, token)
obtainCoref(doc)

# for chain in doc._.coref_chains:
#     print("Chain: ")
#     print(chain)
#     for mention in chain:
#         print("Mention: ")
#         print(mention)
#         for i in mention:
#             print("i: ")            
#             print(i)
#             value = doc._.coref_chains.resolve(doc[i])
#             print(value)
#     print("----")
# obtainEntitiesCoref(doc)



Chains:
0: Elgar(1), he(5), Elgar(44), his(48)
1: Alice(18), Alice(46)
[1]
[[1]]
[5]
[[1], 1, [5]]
[44]
[[1], 1, [5], 5, [44]]
[48]
[[1], 1, [5], 5, [44], 44, [48]]
[18]
[[18]]
[46]
[[18], 18, [46]]
[[Elgar, [[1], 1, [5], 5, [44], 44, [48], 48]], [Alice, [[18], 18, [46], 46]]]


In [12]:
# doc = nlp('Sir Edward William Elgar, 1st Baronet,  listen; 2 June 1857 – 23 February 1934) was an English composer, ')
# many of whose works have entered the British and international classical concert repertoire.
# doc = nlp("Although Elgar is often regarded as a typically English composer, most of his musical influences were not from England but from continental Europe.")
doc = nlp("Sir Edward William Elgar, 1st Baronet,  ( (listen); 2 June 1857 – 23 February 1934) was an English composer, many of whose works have entered the British and international classical concert repertoire. Among his best-known compositions are orchestral works including the Enigma Variations, the Pomp and Circumstance Marches, concertos for violin and cello, and two symphonies. He also composed choral works, including The Dream of Gerontius, chamber music and songs. He was appointed Master of the King's Musick in 1924.")


In [13]:
print("Chains:")
doc._.coref_chains.print()
print("[]:")
doc[25]._.coref_chains.print()
print("resolve:")
value = doc._.coref_chains.resolve(doc[25])
print(value)
print("Ents: ")
for entity in doc.ents:
    print(entity.text, entity.label_)
print("====")
# print("Print tokens: ")
# for i in doc:
#     print(i.text, i.label_)
for chain in doc._.coref_chains:
    print("Chain: ")
    print(chain)
    for mention in chain:
        print("Mention: ")
        print(mention)
        for i in mention:
            print("i: ")            
            print(i)
            value = doc._.coref_chains.resolve(doc[i])
            print(value)
    print("----")

Chains:
0: composer(25), his(42), He(71), He(88)
[]:
0: composer(25), his(42), He(71), He(88)
resolve:
None
Ents: 
Edward William Elgar PERSON
1st ORDINAL
2 June 1857 – 23 February 1934 DATE
English NORP
British NORP
the Enigma Variations WORK_OF_ART
the Pomp and Circumstance Marches WORK_OF_ART
two CARDINAL
The Dream of Gerontius WORK_OF_ART
1924 DATE
====
Chain: 
0: [25], [42], [71], [88]
Mention: 
[25]
i: 
25
None
Mention: 
[42]
i: 
42
[composer]
Mention: 
[71]
i: 
71
[composer]
Mention: 
[88]
i: 
88
[composer]
----


In [189]:
print("Chains:")
doc._.coref_chains.print()
n_tokens_int = 0
for sent in doc.sents:
    print(sent)
    # sent.string.strip()
    # for token in sent:
        # print(token, token.idx)
    n_tokens_int = n_tokens_int + len(sent)
    print(len(sent))
    print(n_tokens_int)

Chains:
0: Elgar(1), his(13), He(25), himself(27), he(47), his(59), he(83), his(88), he(93), He(97), him(111), he(118), his(124), his(135), He(149), His(200)
1: Britain(57), Britain(81), Britain(145), Britain(181), Britain(196)
2: daughter(101), She(109)
3: Variations(137), Variations(152)
4: disquiet(175), it(184)
Although Elgar is often regarded as a typically English composer, most of his musical influences were not from England but from continental Europe.
25
25
He felt himself to be an outsider, not only musically, but socially.
15
40
In musical circles dominated by academics, he was a self-taught composer; in Protestant Britain, his Roman Catholicism was regarded with suspicion in some quarters; and in the class-conscious society of Victorian and Edwardian Britain, he was acutely sensitive about his humble origins even after he achieved recognition.
57
97
He nevertheless married the daughter of a senior British Army officer.
12
109
She inspired him both musically and socially, bu

In [171]:
doc = nlp("Although Elgar is often regarded as a typically English composer, most of his musical influences were not from England but from continental Europe. He felt himself to be an outsider, not only musically, but socially. In musical circles dominated by academics, he was a self-taught composer; in Protestant Britain, his Roman Catholicism was regarded with suspicion in some quarters; and in the class-conscious society of Victorian and Edwardian Britain, he was acutely sensitive about his humble origins even after he achieved recognition. He nevertheless married the daughter of a senior British Army officer. She inspired him both musically and socially, but he struggled to achieve success until his forties, when after a series of moderately successful works his Enigma Variations (1899) became immediately popular in Britain and overseas. He followed the Variations with a choral work, The Dream of Gerontius (1900), based on a Roman Catholic text that caused some disquiet in the Anglican establishment in Britain, but it became, and has remained, a core repertory work in Britain and elsewhere. His later full-length religious choral works were well received but have not entered the regular repertory.")




In [172]:
print("Chains:")
doc._.coref_chains.print()
print("[]:")
doc[25]._.coref_chains.print()
print("resolve:")
value = doc._.coref_chains.resolve(doc[25])
print(value)
print("====")
# print("Print tokens: ")
# for i in doc:
#     print(i.text, i.label_)
for chain in doc._.coref_chains:
    print("Chain: ")
    print(chain)
    for mention in chain:
        print("Mention: ")
        print(mention)
        for i in mention:
            print("i: ")            
            print(i)
            value = doc._.coref_chains.resolve(doc[i])
            print(value)
    print("----")

Chains:
0: Elgar(1), his(13), He(25), himself(27), he(47), his(59), he(83), his(88), he(93), He(97), him(111), he(118), his(124), his(135), He(149), His(200)
1: Britain(57), Britain(81), Britain(145), Britain(181), Britain(196)
2: daughter(101), She(109)
3: Variations(137), Variations(152)
4: disquiet(175), it(184)
[]:
0: Elgar(1), his(13), He(25), himself(27), he(47), his(59), he(83), his(88), he(93), He(97), him(111), he(118), his(124), his(135), He(149), His(200)
resolve:
[Elgar]
====
Chain: 
0: [1], [13], [25], [27], [47], [59], [83], [88], [93], [97], [111], [118], [124], [135], [149], [200]
Mention: 
[1]
i: 
1
None
Mention: 
[13]
i: 
13
[Elgar]
Mention: 
[25]
i: 
25
[Elgar]
Mention: 
[27]
i: 
27
[Elgar]
Mention: 
[47]
i: 
47
[Elgar]
Mention: 
[59]
i: 
59
[Elgar]
Mention: 
[83]
i: 
83
[Elgar]
Mention: 
[88]
i: 
88
[Elgar]
Mention: 
[93]
i: 
93
[Elgar]
Mention: 
[97]
i: 
97
[Elgar]
Mention: 
[111]
i: 
111
[Elgar]
Mention: 
[118]
i: 
118
[Elgar]
Mention: 
[124]
i: 
124
[Elgar]
Menti

In [70]:
doc._.coref_chains.print()

0: he(7), his(19)
1: composer(13), he(43), his(48), he(53)
2: Britain(17), Britain(41)
In musical circles dominated by academics , he was a self - taught composer ; in Protestant Britain , he Roman Catholicism was regarded with suspicion in some quarters ; and in the class - conscious society of Victorian and Edwardian Britain , composer was acutely sensitive about composer humble origins even after composer achieved recognition .


  res = list(filter((None).__ne__, res1))


In [67]:
doc._.coref_chains.resolve(doc[1])

In [71]:
# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['musical circles', 'academics', 'he', 'a self-taught composer', 'Protestant Britain', 'his Roman Catholicism', 'suspicion', 'some quarters', 'the class-conscious society', 'Victorian and Edwardian Britain', 'he', 'his humble origins', 'he', 'recognition']
Verbs: ['dominate', 'teach', 'regard', 'achieve']
Protestant NORP
Britain GPE
Roman Catholicism NORP
Victorian DATE
Edwardian NORP
Britain GPE


In [48]:
print(doc.ents)
for entity in doc.ents:
    print(entity)
    
doc_nouns = list(doc.noun_chunks)
print(doc_nouns)

(Elgar, English, England, Europe)
Elgar
English
England
Europe
[Elgar, a typically English composer, his musical influences, England, continental Europe]


In [7]:
[(i, i.ent_type_, i.is_stop) for i in doc]

[(Sir, '', False),
 (Edward, 'PERSON', False),
 (William, 'PERSON', False),
 (Elgar, 'PERSON', False),
 (,, '', False),
 (1st, 'ORDINAL', False),
 (Baronet, '', False),
 (,, '', False),
 ( , '', False),
 ((, '', False),
 ((, '', False),
 (listen, '', False),
 (), '', False),
 (;, '', False),
 (2, 'DATE', False),
 (June, 'DATE', False),
 (1857, 'DATE', False),
 (–, 'DATE', False),
 (23, 'DATE', False),
 (February, 'DATE', False),
 (1934, 'DATE', False),
 (), '', False),
 (was, '', True),
 (an, '', True),
 (English, 'NORP', False),
 (composer, '', False),
 (,, '', False),
 (many, '', True),
 (of, '', True),
 (whose, '', True),
 (works, '', False),
 (have, '', True),
 (entered, '', False),
 (the, '', True),
 (British, 'NORP', False),
 (and, '', True),
 (international, '', False),
 (classical, '', False),
 (concert, '', False),
 (repertoire, '', False),
 (., '', False)]

In [34]:
doc.to_json()

{'text': 'Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.',
 'ents': [{'start': 41, 'end': 46, 'label': 'PERSON'},
  {'start': 134, 'end': 139, 'label': 'GPE'}],
 'sents': [{'start': 0, 'end': 68},
  {'start': 69, 'end': 115},
  {'start': 116, 'end': 181}],
 'tokens': [{'id': 0,
   'start': 0,
   'end': 8,
   'tag': 'IN',
   'pos': 'SCONJ',
   'morph': '',
   'lemma': 'although',
   'dep': 'mark',
   'head': 2},
  {'id': 1,
   'start': 9,
   'end': 11,
   'tag': 'PRP',
   'pos': 'PRON',
   'morph': 'Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs',
   'lemma': 'he',
   'dep': 'nsubj',
   'head': 2},
  {'id': 2,
   'start': 12,
   'end': 15,
   'tag': 'VBD',
   'pos': 'AUX',
   'morph': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin',
   'lemma': 'be',
   'dep': 'advcl',
   'head': 11},
  {'id': 3,
   'start': 16,
   'end': 20,
   'tag': 'R

In [18]:
# Nice display of sentence organisation
#displacy.serve(doc, style="dep")

In [8]:
# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['he', 'his work', 'Peter', 'it', 'He', 'his wife', 'they', 'a holiday', 'They', 'Spain', 'they', 'the country']
Verbs: ['have', 'decide', 'need', 'travel', 'love']
Peter PERSON
Spain GPE


In [23]:
doc1 = nlp('My sister has a dog. She loves him.')
doc1._.coref_chains.print()
print(doc1._.coref_chains.resolve(doc1[6]))

0: sister(1), She(6)
1: dog(4), him(8)
[sister]


In [43]:
tok_list = list(token.text for token in doc1)
c = 0
for chain in doc1._.coref_chains:
    for mention in chain:
        res1 = [doc1._.coref_chains.resolve(doc1[i]) for i in mention]
        res = list(filter((None).__ne__, res1))
        if len(res) != 0:
            if len(res[0]) == 1:
                tok_list[mention[0] + c] = str(res[0][0])
            elif len(res[0]) > 1:
                tok_list[mention[0] + c] = str(res[0][0])
                for j in range(1, len(res[0])):
                    tok_list.insert(mention[0] + c + j, str(res[0][j]))
                    c = c + 1
textres = " ".join(tok_list)
print(textres)

My sister has a dog . sister loves dog .


  res = list(filter((None).__ne__, res1))
