### Identify PPE: Places, Persons, Events

### 1. Identify Places, Persons, Events using DBpedia Spotlight
### 2. Identify time entities using NLTK

In [27]:
# For DBpedia spotlight, PPE entities
import requests
import pycurl
from urllib.request import urlopen
from urllib.parse import quote
import json
import os
import pandas as pd
from _datetime import date
import time

In [28]:
# For nltk time entities
# time entity
import nltk.tokenize as nt
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import Tree

# if not installed
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('punkt')

In [29]:
# Collect the names of the indexed-sentence CSVs, skipping hidden entries
# (anything whose name starts with a dot)
files_list = list(filter(lambda name: not name.startswith('.'), os.listdir('indexedSentences')))
# One row per file name
df_files = pd.DataFrame(files_list, columns=['file_name'])
# Restrict the run to a single biography file
df_files = df_files[df_files['file_name'] == '10085.csv']

df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 3 to 3
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1 non-null      object
dtypes: object(1)
memory usage: 16.0+ bytes


Unnamed: 0,file_name
3,10085.csv


In [9]:
# Keep only the files that have no output yet in the 'extractedEntities' folder
files_list = [entry for entry in os.listdir('extractedEntities') if not entry.startswith('.')]
# Names of the files that were already processed
df_query = pd.DataFrame(files_list, columns=['file_name'])
# Boolean mask: True where the input file already has an output file
already_extracted = df_files['file_name'].isin(df_query['file_name'])
df_result = df_files.loc[~already_extracted]
df_files = df_result
df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  0 non-null      object
dtypes: object(1)
memory usage: 0.0+ bytes


Unnamed: 0,file_name


## 1. Identify PPE using DBpedia Spotlight

In [5]:
# Endpoint of the DBpedia Spotlight annotation service used to search for entities.
# To target a local installation of DBpedia Spotlight, use this URL instead:
# urlAnnotation = 'http://dbpedia-spotlight.en:80/rest/annotate/'
urlAnnotation = 'https://whise.kmi.open.ac.uk/rest/annotate'

# Headers and parameters for an annotation request without a DBpedia type filter
def setDbPediaAnnotationServiceParameters(text):
    """Build the headers and parameters for a DBpedia Spotlight annotation call.

    Args:
        text: text to be analysed.

    Returns:
        A ``(headers, params)`` tuple of dicts. ``headers`` asks for a JSON
        answer and declares a form-urlencoded content type; ``params`` holds
        the text and a confidence value of '0.4'.
    """
    request_params = dict(confidence='0.4', text=text)
    request_headers = {
        'Accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
    }
    return request_headers, request_params

# Headers and parameters for an annotation request filtered by DBpedia types
def setDbPediaAnnotationServiceParametersTypes(text,types):
    """Build the headers and parameters for a type-filtered Spotlight call.

    Args:
        text: text to be analysed.
        types: categories of entities, passed through as the 'types' parameter.

    Returns:
        A ``(headers, params)`` tuple of dicts. ``headers`` asks for a JSON
        answer and declares a form-urlencoded content type; ``params`` holds
        the type filter, a confidence value of '0.35' and the text.
    """
    request_params = dict(types=types, confidence='0.35', text=text)
    request_headers = {
        'Accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
    }
    return request_headers, request_params

# return response in JSON format
def queryDBPediaAnnotation(url,header,params,timeout=60):
    """POST an annotation request and return the decoded JSON body.

    Args:
        url: address of the annotation service.
        header: dict of HTTP headers sent with the request.
        params: dict of request parameters (sent as URL query parameters).
        timeout: seconds to wait for the server before giving up, so that one
            stalled request cannot block a whole batch run. Pass None to wait
            indefinitely.

    Returns:
        The response body parsed as JSON.

    Raises:
        Exception: whatever ``requests`` or the JSON decoder raised. The error
            is printed and then re-raised unchanged, keeping its original type
            and traceback.
    """
    # NOTE(review): the text travels in the query string although the header
    # announces a form-urlencoded body; very long sentences may hit URL length
    # limits. Confirm with the service before switching to `data=`.
    try:
        response = requests.post(url, headers=header, params=params, timeout=timeout)
        return response.json()
    except Exception as ex:
        print(ex)
        raise

def executeQueryDbpedia(q, f='application/json', timeout=60):
    """Send a SPARQL query to the public DBpedia endpoint.

    Args:
        q: SPARQL query text.
        f: value of the ``Accept`` header, i.e. the requested result format.
        timeout: seconds to wait for the server before giving up. Pass None to
            wait indefinitely.

    Returns:
        The ``requests`` response object; callers decode it themselves
        (for example with ``.json()``).

    Raises:
        Exception: whatever ``requests`` raised. The error is printed and then
            re-raised unchanged.
    """
    epr = "http://dbpedia.org/sparql"
    try:
        return requests.get(epr, params={'query': q}, headers={'Accept': f}, timeout=timeout)
    except Exception as e:
        # print() already writes to stdout; the previous `file=sys.stdout`
        # relied on a `sys` import that does not exist and raised NameError.
        print(e)
        raise

## 1. Extract entities: People, places

In [7]:
# 1. Extract People and Places entities
entitiyTypes = ['DBpedia:Person','DBpedia:MusicalArtist','DBpedia:Place','DBpedia:SocietalEvent']

# Value written to the `entType` column for each Spotlight type label
entTypeByLabel = {
    'DBpedia:Person': 'person',
    'DBpedia:MusicalArtist': 'person',
    'DBpedia:Place': 'place',
    'DBpedia:SocietalEvent': 'event',
    'DBpedia:Event': 'event',
}
# Same idea for the rdf:type URIs returned by the SPARQL fallback
entTypeByOntologyUri = {
    'http://dbpedia.org/ontology/Place': 'place',
    'http://dbpedia.org/ontology/Person': 'person',
}

# `count` is the number of files read so far; it drives the long pauses below
for count, file_name in enumerate(df_files.itertuples(), start=1):
    print(file_name.file_name)
    # Read file with segmented sentences; the code below reads the columns
    # sentences, sentenceIndex, paragraphIndex, section and wikiId
    biography_df = pd.read_csv('indexedSentences/'+file_name.file_name)
    cache_path = 'cacheSpotlightResponse/'+file_name.file_name
    # One dataframe per sentence that produced entities; concatenated once at
    # the end instead of growing a dataframe row by row
    entity_frames = []

    # for each sentence in each biography
    for sentence_row in biography_df.itertuples():
        # send sentence text and return params for query
        hdrs, prms = setDbPediaAnnotationServiceParameters(sentence_row.sentences)
        try:
            # obtain response using DBpedia spotlight
            responseJSON = queryDBPediaAnnotation(urlAnnotation,hdrs,prms)
        except Exception as ex:
            print("****")
            print(ex)
            # Skip this sentence: falling through would reuse the response of
            # the previous sentence (or hit an undefined name on the first one)
            continue

        if 'Resources' not in responseJSON:
            continue

        df_resources = pd.DataFrame.from_dict(responseJSON['Resources'])

        # Save the raw Spotlight response; a failure here must not stop the
        # extraction itself
        try:
            df_cache = df_resources.copy()
            df_cache['sentence'] = sentence_row.sentences
            df_cache['sentenceIndex'] = sentence_row.sentenceIndex
            df_cache['paragraphIndex'] = sentence_row.paragraphIndex
            df_cache['section'] = sentence_row.section
            # The header is written only when the cache file is first created
            df_cache.to_csv(cache_path, mode='a', index=False,
                            header=not os.path.isfile(cache_path))
        except Exception as ex:
            print("****")
            print(ex)

        df_resources = df_resources.rename(columns={
            '@URI':'URI','@types':'types','@surfaceForm':'surfaceForm','@support':'support',
            '@offset':'offset','@similarityScore':'similarityScore',
            '@percentageOfSecondRank':'percentageOfSecondRank'})

        sentence_frames = []

        # Resources that came back with a category: tag them by entity type
        df_typed = df_resources[df_resources['types'].notna()]
        if not df_typed.empty:
            for label in entitiyTypes:
                df_temp = df_typed[df_typed['types'].str.contains(label)].copy()
                if df_temp.empty:
                    continue
                # NOTE(review): a resource whose types list several of the
                # labels above is emitted once per matching label, as before.
                df_temp['entType'] = entTypeByLabel.get(label)
                sentence_frames.append(df_temp)

        # Resources without a category: ask the SPARQL endpoint for their types
        fallback_rows = []
        df_untyped = df_resources.loc[df_resources['types'] == '']
        for resource in df_untyped.itertuples():
            uri = resource.URI
            query_text = "SELECT * WHERE { <" + uri + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>  ?o }"
            try:
                results = executeQueryDbpedia(query_text).json()
            except Exception as ex:
                # One failing lookup should not abort the whole run
                print("****")
                print(ex)
                continue

            if 'results' not in results:
                continue
            # keep only the Place and Person types
            for binding in results['results']['bindings']:
                type_uri = binding['o']['value']
                ent_type = entTypeByOntologyUri.get(type_uri)
                if ent_type is not None:
                    fallback_rows.append({'entType': ent_type, 'URI': uri, 'types': type_uri})

        if fallback_rows:
            sentence_frames.append(pd.DataFrame(fallback_rows))
        if not sentence_frames:
            continue

        df_result = pd.concat(sentence_frames, ignore_index=True, sort=False)
        df_result['sentence'] = sentence_row.sentences
        df_result['sentenceIndex'] = sentence_row.sentenceIndex
        df_result['paragraphIndex'] = sentence_row.paragraphIndex
        df_result['section'] = sentence_row.section
        df_result = df_result.rename(columns={'surfaceForm':'entity'})
        entity_frames.append(df_result)

    if entity_frames:
        df_entities = pd.concat(entity_frames, ignore_index=True, sort=False)
    else:
        df_entities = pd.DataFrame()

    # Page id of the last sentence row, read from the dataframe itself so an
    # empty file cannot pick up a value left over from the previous file
    wiki_id = biography_df['wikiId'].iloc[-1] if not biography_df.empty else None
    df_entities['wikiPageID'] = wiki_id
    df_entities.to_csv('extractedEntitiesPersonPlaceOnly/'+file_name.file_name,index=False)

    # Short pause after every file, long pause after every 50 files
    time.sleep(1)
    if (count % 50) == 0:
        time.sleep(120)

905775.csv
90698.csv
907187.csv
909222.csv
910077.csv
916486.csv
918090.csv
922045.csv
92450.csv
926691.csv
933559.csv
936912.csv
939726.csv
948305.csv
950070.csv
952194.csv
965420.csv
966302.csv
9700.csv
97205.csv
973172.csv
975107.csv
980182.csv
981787.csv
98217.csv
984106.csv
984155.csv
984900.csv
984982.csv
98513.csv
991714.csv
994118.csv
997083.csv


In [281]:
"""
+ = match 1 or more
? = match 0 or 1 repetitions.
* = match 0 or MORE repetitions	  
. = Any character except a new line
CD	cardinal digit
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
IN	preposition/subordinating conjunction
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
"""
def set_pattern_time():
    """Return the chunk grammar used to find time expressions.

    The grammar is a multi-line string with four labelled sections
    (DT, HO, RN, REF), each followed by its tag patterns. Callers pass it
    to ``nltk.RegexpParser``.
    """
    return r"""DT: 
    {<CD?><NNP|CD?><CD?>} #complete dates Eg. 23 January 1983
    {<NNP?><CD?><,?><CD?>} # December 13, 2000
    {<CD?></><CD?></><CD?>} #complete dates 23/02/2021
    {<CD?><-><CD?><-><CD?>} #complete dates 23-02-2021
    HO:
    {<CD>+<NN>+} # hour only
    RN: 
    {<IN><CD>+<IN|TO|CC>+<CD>+} # between YYYY and <> YYYY, from 1938 to 1939
    {<IN>+<DT>?<\d>} # "in XXXX" (year)
    <\W?>{<CD>}<\W?> # year in between special characters
    {<NNP><CD>} #incomplete date January 2003, Fall 1994
    {<NN>?<IN|DT><CD>} # years dt the 1990s, leukemia in 1996, age of 43,the 1990s
    {<CD><IN>} # 1984 novel,1954–58
    REF:
    {<IN>+<NN>+<CD>+} # by age 43
    {<NN><IN><CD>} # Eg. fall/winter of 1345, age of 43
    """

## 2. Extract time expressions

In [136]:
pattern = set_pattern_time()
# The grammar never changes, so the chunk parser is built once for all files
cp = nltk.RegexpParser(pattern)
# Column order of the time-entity rows written to the output CSV
timeEntityColumns = ['entity', 'sentence', 'sentenceIndex', 'paragraphIndex',
                     'section', 'entType', 'wikiPageID']

for file_name in df_files.itertuples():
    print(file_name.file_name)
    # Read file with segmented sentences
    biography_df = pd.read_csv('indexedSentences/'+file_name.file_name)
    # One dict per time expression found; turned into a dataframe once per file
    time_rows = []

    # for each sentence in each biography
    for sentence_row in biography_df.itertuples():
        # Empty CSV cells are read as NaN (a float), which the tokenizer
        # cannot handle
        if not isinstance(sentence_row.sentences, str):
            continue

        tokenized_sent = nt.word_tokenize(sentence_row.sentences)
        pos_sentences = nltk.pos_tag(tokenized_sent)
        cs = cp.parse(pos_sentences)

        # Chunks are subtrees and carry a label; plain (word, tag) tuples do not
        for ne in cs:
            if not hasattr(ne, "label"):
                continue
            # leaves() also flattens a chunk that contains another chunk
            res = " ".join(word for word, _tag in ne.leaves()).strip()
            time_rows.append({
                'entity': res,
                'sentence': sentence_row.sentences,
                'sentenceIndex': sentence_row.sentenceIndex,
                'paragraphIndex': sentence_row.paragraphIndex,
                'section': sentence_row.section,
                'entType': 'time',
                'wikiPageID': sentence_row.wikiId,
            })

    df_time_ent = pd.DataFrame(time_rows, columns=timeEntityColumns)

    # Append the time entities to the person/place entities of the same file
    df_entities = pd.read_csv('extractedEntitiesPersonPlaceOnly/'+file_name.file_name)
    if time_rows:
        df_all = pd.concat([df_entities, df_time_ent], sort=False)
    else:
        df_all = df_entities
    df_all.to_csv('extractedEntities/'+file_name.file_name,index=False)

10085.csv


In [None]:
### TEST

In [283]:
# Sample text covering the date formats the grammar is meant to catch
sentence = "In 1957, Auchincloss married Adele Burden Lawrence (1931–1991), the daughter of Florence Irvin (née Burden) Lawrence and Blake Leigh Lawrence. Martin was diagnosed with leukemia in 1996, and died on April 12, 1999 in Branson, Missouri at the age of 67. He applied to join the Naval Reserve as an intelligence specialist on December 4, 1940 and was appointed as a lieutenant on December 1, 1942. After taking a break to pursue full-time writing, Auchincloss returned to working as a lawyer, first as an associate (1954–58) and then as a partner (1958–86) at Hawkins, Delafield and Wood in New York City as a wills and trusts attorney, while writing at the rate of a book a year. Auchincloss was an associate at Sullivan & Cromwell from 1941 to 1951 (with an interruption for war service from 1942 to 1945 in the United States Navy during World War II, which might have inspired his 1947 novel The Indifferent Children). SourcesGeorge Plimpton (Fall 1994). One Evening in Chicago (1983). In 1957, Auchincloss married Adele Burden Lawrence (1931–1991), the daughter of Florence Irvin (née Burden) Lawrence and Blake Leigh Lawrence towards immigrants during the 1990s and . Las majas del bergantín ('81 and '86 versions)"
pattern = set_pattern_time()
tokenized_sent = nt.word_tokenize(sentence)
pos_sentences = nltk.pos_tag(tokenized_sent)
cp = nltk.RegexpParser(pattern)
cs = cp.parse(pos_sentences)
#print(cs)

# Start from an empty list so the cell runs on its own and does not add to
# whatever an earlier cell left in `timeEntityList`
timeEntityList = []

# Chunks are subtrees and carry a label; plain (word, tag) tuples do not
for ne in cs:
    if not hasattr(ne, "label"):
        continue
    print(ne.label(), ne[0:])

    # leaves() also flattens a chunk that contains another chunk
    res = " ".join(word for word, _tag in ne.leaves()).strip()
    print(res)
    timeEntityList.append(res)

RN [('In', 'IN'), ('1957', 'CD')]
In 1957
RN [('1931–1991', 'CD')]
1931–1991
RN [('leukemia', 'NN'), ('in', 'IN'), ('1996', 'CD')]
leukemia in 1996
DT [('April', 'NNP'), ('12', 'CD'), (',', ','), ('1999', 'CD')]
April 12 , 1999
RN [('age', 'NN'), ('of', 'IN'), ('67', 'CD')]
age of 67
DT [('December', 'NNP'), ('4', 'CD'), (',', ','), ('1940', 'CD')]
December 4 , 1940
DT [('December', 'NNP'), ('1', 'CD'), (',', ','), ('1942', 'CD')]
December 1 , 1942
RN [('1954–58', 'CD')]
1954–58
RN [('1958–86', 'CD')]
1958–86
RN [('from', 'IN'), ('1941', 'CD'), ('to', 'TO'), ('1951', 'CD')]
from 1941 to 1951
RN [('from', 'IN'), ('1942', 'CD'), ('to', 'TO'), ('1945', 'CD')]
from 1942 to 1945
RN [('1947', 'CD'), ('novel', 'IN')]
1947 novel
RN [('Fall', 'NNP'), ('1994', 'CD')]
Fall 1994
HO [('One', 'CD'), ('Evening', 'NN')]
One Evening
RN [('1983', 'CD')]
1983
RN [('In', 'IN'), ('1957', 'CD')]
In 1957
RN [('1931–1991', 'CD')]
1931–1991
RN [('the', 'DT'), ('1990s', 'CD')]
the 1990s
