### Identify PPE: Places, Persons, Events

### 1. Identify Places, Persons, Events using DBpedia Spotlight
### 2. Identify time entities using NLTK

In [1]:
# For DBpedia spotlight, PPE entities
import requests
import pycurl
from urllib.request import urlopen
from urllib.parse import quote
import json
import os
import pandas as pd
from _datetime import date
import time

In [2]:
# For nltk time entities
# time entity
import nltk.tokenize as nt
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import Tree

# if not installed
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('punkt')

In [177]:
# List the CSV files holding the indexed sentences, one file per biography.
# Entries whose name starts with '.' (hidden files) are ignored.
files_list = list(filter(lambda name: not name.startswith('.'),
                         os.listdir('indexedSentences')))
# One row per file name, so the list can be filtered with DataFrame operations
df_files = pd.DataFrame(files_list, columns=['file_name'])
# df_files = df_files.query("file_name=='10085.csv'")

df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1002 non-null   object
dtypes: object(1)
memory usage: 8.0+ KB


Unnamed: 0,file_name
0,1000228.csv
1,100273.csv
2,100487.csv
3,10085.csv
4,1009725.csv


In [4]:
# Keep only the input files that do not yet have an output in 'extractedEntities'
files_list = list(filter(lambda name: not name.startswith('.'),
                         os.listdir('extractedEntities')))
# Names of the files that were already processed
df_query = pd.DataFrame(files_list, columns=['file_name'])
# Boolean mask: True where the input file already has an output
already_extracted = df_files['file_name'].isin(df_query['file_name'])
df_result = df_files[~already_extracted]
df_files = df_result
df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  0 non-null      object
dtypes: object(1)
memory usage: 0.0+ bytes


Unnamed: 0,file_name


## 1. Identify PPE using DBpedia Spotlight

In [5]:
# DBpedia Spotlight annotation endpoint used to search for entities.
# URL for a local installation of DBpedia Spotlight (kept for reference):
# urlAnnotation = 'http://dbpedia-spotlight.en:80/rest/annotate/'
# Endpoint actually used by the requests below:
urlAnnotation = 'https://whise.kmi.open.ac.uk/rest/annotate'

# headers and parameters for an annotation request without a type filter
def setDbPediaAnnotationServiceParameters(text):
    """ Set the headers and parameters for querying DBpedia Spotlight
    args: text - text to be analysed
    return: headers - asks for a JSON answer, declares a form-encoded request
            params - confidence threshold '0.4' and the text itself
    """
    request_headers = dict([
        ('Accept', 'application/json'),
        ('content-type', 'application/x-www-form-urlencoded'),
    ])
    # key order is kept: confidence first, then text
    request_params = dict(confidence='0.4', text=text)
    return request_headers, request_params

# headers and parameters for an annotation request restricted to given types
def setDbPediaAnnotationServiceParametersTypes(text,types):
    """ Set the headers and parameters for querying DBpedia Spotlight with a type filter
    args: text - text to be analysed
          types - categories of entities to keep, passed through unchanged
    return: headers - asks for a JSON answer, declares a form-encoded request
            params - type filter, confidence threshold '0.35' and the text
    """
    request_headers = dict([
        ('Accept', 'application/json'),
        ('content-type', 'application/x-www-form-urlencoded'),
    ])
    # key order is kept: types, confidence, text
    request_params = dict(types=types, confidence='0.35', text=text)
    return request_headers, request_params

# return response in JSON format
def queryDBPediaAnnotation(url,header,params,timeout=None):
    """ POST an annotation request and return the decoded JSON answer
    args: url - annotation endpoint
          header - HTTP headers of the request
          params - request parameters (sent in the URL query string)
          timeout - optional number of seconds handed to requests; the default
                    None keeps the previous behaviour of waiting indefinitely
    return: the response body parsed from JSON
    raises: Exception wrapping whatever the request or the JSON decoding raised
    """
    # NOTE(review): the parameters travel in the query string although the
    # content-type announces a form body; long sentences may be better sent
    # with data= instead - confirm against the Spotlight endpoint before changing.
    try:
        return requests.post(url, headers=header, params=params, timeout=timeout).json()
    except Exception as ex:
        # Report the failure, then re-raise as a plain Exception (as before),
        # chained explicitly so the original error stays visible in the traceback.
        print(getattr(ex, 'message', ex))
        raise Exception(ex) from ex

def executeQueryDbpedia(q, f='application/json'):
    """ Run a SPARQL query against the public DBpedia endpoint
    args: q - SPARQL query text
          f - value of the Accept header (defaults to JSON)
    return: the requests response object (callers decode it, e.g. with .json())
    raises: re-raises any error raised while sending the request
    """
    epr = "http://dbpedia.org/sparql"
    try:
        return requests.get(epr, params={'query': q}, headers={'Accept': f})
    except Exception as e:
        # print() already writes to standard output. The previous
        # file=sys.stdout argument needed a `sys` import this file does not
        # have, so the handler itself failed with NameError and hid the error.
        print(e)
        raise

## 1. Extract entities: People, places

In [178]:
# 1. Extract People and Places entities
# Substrings looked for in the 'types' string of every annotated resource
entitiyTypes = ['DBpedia:Person','DBpedia:MusicalArtist','DBpedia:Place','DBpedia:SocietalEvent']

# Count the number of files read to include breaks
count = 0
for file_name in df_files.itertuples():
    count += 1
    print(file_name.file_name)
    # Read file with segmented sentences
    biography_df = pd.read_csv('indexedSentences/'+file_name.file_name)
    # One frame per sentence with entities; concatenated once after the loop
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic).
    entity_frames = []
    # wikiId of the last sentence read; stays None when the file has no rows
    # instead of silently reusing the loop variable of a previous file.
    wiki_id = None

    # for each sentence in each biography
    for sentence_row in biography_df.itertuples():
        wiki_id = sentence_row.wikiId
        # send sentence text and return params for query
        hdrs, prms = setDbPediaAnnotationServiceParameters(sentence_row.sentences)
        try:
            # obtain response using DBpedia spotlight
            responseJSON = queryDBPediaAnnotation(urlAnnotation,hdrs,prms)
        except Exception as ex:
            print("****")
            print(getattr(ex, 'message', ex))
            # Skip this sentence: falling through would reuse the response of
            # the previous sentence and attribute its entities to this one.
            continue

        # nothing was annotated in this sentence
        if 'Resources' not in responseJSON:
            continue

        df_raw = pd.DataFrame.from_dict(responseJSON['Resources'])

        # UPDATE: save responses from DBP Spotlight (raw column names, one
        # CSV per biography; the header is written only when the file is new)
        try:
            cache_path = 'cacheSpotlightResponse/'+file_name.file_name
            df_cache = df_raw.copy()
            df_cache['sentence'] = sentence_row.sentences
            df_cache['sentenceIndex'] = sentence_row.sentenceIndex
            df_cache['paragraphIndex'] = sentence_row.paragraphIndex
            df_cache['section'] = sentence_row.section
            df_cache.to_csv(cache_path, mode='a', index=False,
                            header=not os.path.isfile(cache_path))
        except Exception as ex:
            # a failed cache write is reported but does not stop the extraction
            print("****")
            print(getattr(ex, 'message', ex))

        # parse response to a dataframe with plain column names
        df_resources = df_raw.rename(columns={'@URI':'URI','@types':'types','@surfaceForm':'surfaceForm','@support':'support','@offset':'offset','@similarityScore':'similarityScore',
                                              '@percentageOfSecondRank':'percentageOfSecondRank'})

        # frames with the entities recognised in this sentence
        sentence_frames = []

        # filter only rows for entities with a category
        df = df_resources[~df_resources['types'].isna()]
        if not df.empty:
            # assign the type of entity found, according to the categories
            for entity in entitiyTypes:
                df_temp = df[df['types'].str.contains(entity)].copy()
                if df_temp.empty:
                    continue
                if entity == 'DBpedia:Person' or entity == 'DBpedia:MusicalArtist':
                    df_temp['entType'] = 'person'
                elif entity == 'DBpedia:Place':
                    df_temp['entType'] = 'place'
                elif entity == 'DBpedia:SocietalEvent' or entity == 'DBpedia:Event':
                    df_temp['entType'] = 'event'
                sentence_frames.append(df_temp)

        # UPDATE:
        # Resources returned without any type: ask the DBpedia SPARQL endpoint
        # for their rdf:type and keep the ones that are a Place or a Person.
        untyped = df_resources.loc[df_resources['types'] == '']
        for resource in untyped.itertuples():
            uri = resource.URI
            query_text = "SELECT * WHERE { <" + uri + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>  ?o }"
            # Execute query against sparql endpoint, query types
            results = executeQueryDbpedia(query_text).json()

            # if query returns a response
            if 'results' not in results:
                continue
            for binding in results['results']['bindings']:
                type_uri = binding['o']['value']
                if type_uri == 'http://dbpedia.org/ontology/Place':
                    ent_type = 'place'
                elif type_uri == 'http://dbpedia.org/ontology/Person':
                    ent_type = 'person'
                else:
                    continue
                # NOTE(review): as before, these rows carry no surfaceForm, so
                # their 'entity' column ends up empty - confirm this is intended.
                sentence_frames.append(pd.DataFrame([{'entType': ent_type, 'URI': uri, 'types': type_uri}]))

        if sentence_frames:
            df_result = pd.concat(sentence_frames, ignore_index=True, sort=False)
            df_result['sentence'] = sentence_row.sentences
            df_result['sentenceIndex'] = sentence_row.sentenceIndex
            df_result['paragraphIndex'] = sentence_row.paragraphIndex
            df_result['section'] = sentence_row.section
            df_result.rename(columns={'surfaceForm':'entity'}, inplace=True)
            entity_frames.append(df_result)

    # all entities of the biography; an empty frame when nothing was found
    if entity_frames:
        df_entities = pd.concat(entity_frames, ignore_index=True, sort=False)
    else:
        df_entities = pd.DataFrame()
    df_entities['wikiPageID'] = wiki_id
    df_entities.to_csv('extractedEntitiesPersonPlaceOnly/'+file_name.file_name,index=False)
    # short break after every file, longer break after every 50 files
    time.sleep(1)

    if (count % 50) == 0:
        time.sleep(120)

1000228.csv


NameError: name 'setDbPediaAnnotationServiceParameters' is not defined

In [175]:
"""
+ = match 1 or more
? = match 0 or 1 repetitions.
* = match 0 or MORE repetitions	  
. = Any character except a new line
CD	cardinal digit
DT	determiner
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
IN	preposition/subordinating conjunction
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
"""
def set_pattern_time():
    """Return the chunk grammar used to find time expressions.

    The grammar is a multi-stage tag-pattern string meant for
    nltk.RegexpParser; its stages are labelled DT1, HO, RN, DT2, DT3 and REF.
    """
    # NOTE(review): in NLTK tag patterns a quantifier written inside the angle
    # brackets (e.g. <CD?>) applies to the last character of the tag, not to
    # the whole tag; an optional cardinal would be <CD>? - confirm the intent.
    # NOTE(review): <\d> only matches a tag that is a single digit, which
    # presumably never occurs in the tagger output - confirm.
    return r"""DT1: #dates, time point
    {<CD?><NNP|CD?><CD?>} #complete dates Eg. 23 January 1983
    {<NNP?><CD?><,?><CD?>} # December 13, 2000
    {<CD?></><CD?></><CD?>} #complete dates 23/02/2021
    {<CD?><-><CD?><-><CD?>} #complete dates 23-02-2021
    HO: # HOURS
    {<CD>+<NN>+} # hour only
    RN: # range 
    {<IN><CD>+<IN|TO|CC>+<CD>+} # between YYYY and <> YYYY, from 1938 to 1939
    DT2: #date from explicit, to implicit DT2 [('each', 'DT'), ('one', 'CD')]
    {<IN>+<DT>?<\d>} # "in XXXX" (year)
    <\W?>{<CD>}<\W?> # year in between special characters
    {<NNP><CD>} #incomplete date January 2003, Fall 1994
    {<IN>+<DT>+<CD>+} # years dt the 1990s, leukemia in 1996, age of 43,the 1990s
    {<NN>+<IN|DT>+<CD>} # years dt the 1990s, leukemia in 1996, age of 43,the 1990s
    {<CD><IN>} # 1984 novel,1954–58
    DT3:
    <DT>+{<CD>}
    REF: # references, implicit
    {<IN>+<NN>+<CD>+} # by age 43
    {<NN><IN><CD>} # Eg. fall/winter of 1345, age of 43
    <NN>{<DT>+<CD>} # Eg. fall/winter of 1345, age of 43
    """

## 2. Extract time expressions

In [179]:
pattern = set_pattern_time()
# The grammar is the same for every sentence, so the chunk parser is built
# once here instead of once per sentence.
cp = nltk.RegexpParser(pattern)
count = 0
for file_name in df_files.itertuples():
    count += 1
    print(file_name.file_name)
    # Read file with segmented sentences
    biography_df = pd.read_csv('indexedSentences/'+file_name.file_name)
    # One frame per sentence that contains time expressions; concatenated once
    # at the end (DataFrame.append no longer exists in pandas 2.x).
    time_frames = []

    # for each sentence in each biography
    for sentence_row in biography_df.itertuples():
        timeEntityList = []
        # added to include timeEntityType
        timeEntityTypeList = []

        tokenized_sent = nt.word_tokenize(sentence_row.sentences)
        pos_sentences = nltk.pos_tag(tokenized_sent)
        cs = cp.parse(pos_sentences)

        # loop to search for the POS tag chunks related to TIME
        for ne in cs:
            # only chunks carry a label; plain (word, tag) tokens are skipped
            if not hasattr(ne, "label"):
                continue
            # text of the chunk: its words joined by single spaces
            res = " ".join(i[0] for i in ne[0:]).strip()

            # RN chunks and anything containing an en dash are ranges,
            # every other chunk is a time point
            if ne.label() == "RN" or '–' in res:
                time_type = "TimeRange"
            else:
                time_type = "TimePoint"

            timeEntityTypeList.append(time_type)
            timeEntityList.append(res)

        # if we have some time entities identified
        if timeEntityList:
            time_frames.append(pd.DataFrame({
                'entity': timeEntityList,
                'sentence': sentence_row.sentences,
                'sentenceIndex': sentence_row.sentenceIndex,
                'paragraphIndex': sentence_row.paragraphIndex,
                'section': sentence_row.section,
                'entType': 'time',
                'wikiPageID': sentence_row.wikiId,
                # added to include timeEntityType
                'timeEntityType': timeEntityTypeList,
            }))

    # append the time entities to the person/place entities extracted earlier
    df_entities = pd.read_csv('extractedEntitiesPersonPlaceOnly/'+file_name.file_name)
    pd.concat([df_entities] + time_frames, sort=False).to_csv('extractedEntities/'+file_name.file_name,index=False)

1000228.csv
100273.csv
100487.csv
10085.csv
1009725.csv
1010510.csv
1010943.csv
10120.csv
1013900.csv
1022191.csv
1023303.csv
1024347.csv
1028178.csv
103549.csv
103566.csv
1035724.csv
1043762.csv
1047779.csv
1048151.csv
1048172.csv
1049483.csv
1052490.csv
1056463.csv
105767.csv
1058567.csv
1059399.csv
106366.csv
1065532.csv
1065581.csv
10671.csv
1068160.csv
1070521.csv
1073691.csv
1077508.csv
1081839.csv
1084179.csv
1089533.csv
1092607.csv
1093129.csv
1097923.csv
1098118.csv
1103482.csv
1107893.csv
1111730.csv
1113259.csv
1113588.csv
1115155.csv
1118112.csv
1127222.csv
1129635.csv
113049.csv
113560.csv
1142145.csv
1147577.csv
1148248.csv
1150533.csv
1151374.csv
1153177.csv
1164879.csv
1174545.csv
1174834.csv
1175222.csv
1178548.csv
1181295.csv
1181499.csv
1186480.csv
1188821.csv
1189627.csv
1194878.csv
1195717.csv
1196793.csv
1205991.csv
1207899.csv
1209484.csv
1209685.csv
1213916.csv
1215145.csv
1228021.csv
1232492.csv
1234422.csv
1234606.csv
1236563.csv
1239372.csv
1244554.csv
124548

In [None]:
### TEST

In [173]:
"""
+ = match 1 or more
? = match 0 or 1 repetitions.
* = match 0 or MORE repetitions	  
. = Any character except a new line
CD	cardinal digit
DT	determiner
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
IN	preposition/subordinating conjunction
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
"""
def set_pattern_time():
    """Return the chunk grammar used to find time expressions.

    The grammar is a multi-stage tag-pattern string meant for
    nltk.RegexpParser; its stages are labelled DT1, HO, RN, DT2, DT3 and REF.
    """
    # NOTE(review): in NLTK tag patterns a quantifier written inside the angle
    # brackets (e.g. <CD?>) applies to the last character of the tag, not to
    # the whole tag; an optional cardinal would be <CD>? - confirm the intent.
    # NOTE(review): <\d> only matches a tag that is a single digit, which
    # presumably never occurs in the tagger output - confirm.
    return r"""DT1: #dates, time point
    {<CD?><NNP|CD?><CD?>} #complete dates Eg. 23 January 1983
    {<NNP?><CD?><,?><CD?>} # December 13, 2000
    {<CD?></><CD?></><CD?>} #complete dates 23/02/2021
    {<CD?><-><CD?><-><CD?>} #complete dates 23-02-2021
    HO: # HOURS
    {<CD>+<NN>+} # hour only
    RN: # range 
    {<IN><CD>+<IN|TO|CC>+<CD>+} # between YYYY and <> YYYY, from 1938 to 1939
    DT2: #date from explicit, to implicit DT2 [('each', 'DT'), ('one', 'CD')]
    {<IN>+<DT>?<\d>} # "in XXXX" (year)
    <\W?>{<CD>}<\W?> # year in between special characters
    {<NNP><CD>} #incomplete date January 2003, Fall 1994
    {<IN>+<DT>+<CD>+} # years dt the 1990s, leukemia in 1996, age of 43,the 1990s
    {<NN>+<IN|DT>+<CD>} # years dt the 1990s, leukemia in 1996, age of 43,the 1990s
    {<CD><IN>} # 1984 novel,1954–58
    DT3:
    <DT>+{<CD>}
    REF: # references, implicit
    {<IN>+<NN>+<CD>+} # by age 43
    {<NN><IN><CD>} # Eg. fall/winter of 1345, age of 43
    <NN>{<DT>+<CD>} # Eg. fall/winter of 1345, age of 43
    """

In [174]:
# Manual check of the time grammar on one hand-written paragraph:
# prints the parse tree, then every chunk with its text and its time type.
timeEntityList = []
timeEntityTypeList = []
sentence ="He is one of the leaders of musical art of modern times. She had a related UK single release as 'Jennifer' on London HLU 10278 in June 1969 with 'Let The Sunshine In' and 'Easy to Be Hard', licensed from the US Parrot label.	However, in the 1957 centenary symposium, several leading admirers of Elgar express reservations about one or both symphonies. By the age of eight, Elgar was taking piano and the 1990s, leukemia in 1996, age of 43, the 1990s violin lessons, and his father, who tuned the pianos at many grand houses in Worcestershire, would sometimes take him along, giving him the chance to display his skill to important local figures. The Variations have amused me because I've labelled them with the nicknames of my particular friends ... that is to say I've written the variations each one to represent the mood of the 'party' (the person) ... and have written what I think they would have written – if they were asses enough to compose'."
pattern = set_pattern_time()
tokenized_sent=nt.word_tokenize(sentence)
pos_sentences=nltk.pos_tag(tokenized_sent)
cp = nltk.RegexpParser(pattern)
cs = cp.parse(pos_sentences)
print(cs)

# loop to search for the POS tag chunks related to TIME
for ne in cs:
    # only chunks carry a label; plain (word, tag) tokens are skipped
    if not hasattr(ne, "label"):
        continue
    # ne[0:] is the list of (word, tag) pairs inside the chunk
    print(ne.label(), ne[0:])

    res = " ".join(i[0] for i in ne[0:]).strip()
    print(res)

    # RN chunks and anything containing an en dash are ranges,
    # every other chunk is a time point
    if ne.label() == "RN" or '–' in res:
        time_type = "TimeRange"
    else:
        time_type = "TimePoint"
    print(time_type)

    timeEntityList.append(res)
    # store the type, not the text: the text was appended here by mistake
    timeEntityTypeList.append(time_type)

"""
    Classification:
    Time FROM and TO: 
        RN
    Time point:
        DT1, DT2,HO, REF
"""

(S
  He/PRP
  is/VBZ
  (DT2 one/CD of/IN)
  the/DT
  leaders/NNS
  of/IN
  musical/JJ
  art/NN
  of/IN
  modern/JJ
  times/NNS
  ./.
  She/PRP
  had/VBD
  a/DT
  related/JJ
  UK/NNP
  single/JJ
  release/NN
  as/IN
  'Jennifer/NN
  '/''
  on/IN
  London/NNP
  (DT2 HLU/NNP 10278/CD)
  in/IN
  (DT2 June/NNP 1969/CD)
  with/IN
  'Let/PDT
  The/DT
  Sunshine/NNP
  In/IN
  '/POS
  and/CC
  'Easy/CD
  to/TO
  Be/VB
  Hard/NNP
  '/POS
  ,/,
  licensed/VBN
  from/IN
  the/DT
  US/NNP
  Parrot/NNP
  label/NN
  ./.
  However/RB
  ,/,
  (DT2 in/IN the/DT 1957/CD)
  centenary/JJ
  symposium/NN
  ,/,
  several/JJ
  leading/VBG
  admirers/NNS
  of/IN
  Elgar/NNP
  express/NN
  reservations/NNS
  about/IN
  one/CD
  or/CC
  both/DT
  symphonies/NNS
  ./.
  By/IN
  the/DT
  (DT2 age/NN of/IN eight/CD)
  ,/,
  Elgar/NNP
  was/VBD
  taking/VBG
  piano/NN
  and/CC
  the/DT
  (DT3 1990s/CD)
  ,/,
  (DT2 leukemia/NN in/IN 1996/CD)
  ,/,
  (DT2 age/NN of/IN 43/CD)
  ,/,
  the/DT
  (HO 1990s/CD violin/NN)
  

'\n    Classification:\n    Time FROM and TO: \n        RN\n    Time point:\n        DT1, DT2,HO, REF\n'