### identify PPE: Places, Persons, Events

### 1. Identify Places, Persons, Events using DBpedia Spotlight
### 2. Identify time entities using NLTK

In [2]:
# For DBpedia spotlight, PPE entities
import requests
import pycurl
from urllib.request import urlopen
from urllib.parse import quote
import json
import os
import pandas as pd
from _datetime import date
import time

In [3]:
# For nltk time entities
# time entity
import nltk.tokenize as nt
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import Tree

# if not installed
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('punkt')

In [4]:
# reading every CSV with indexed sentences
# return a list object of files in the given folder
files_list = [f for f in os.listdir('indexedSentences') if not f.startswith('.')]
# parse to dataframe
df_files = pd.DataFrame(files_list, columns=['file_name'])
# df_files = df_files.query("file_name=='10085.csv'")

df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1002 non-null   object
dtypes: object(1)
memory usage: 8.0+ KB


Unnamed: 0,file_name
0,1000228.csv
1,100273.csv
2,100487.csv
3,10085.csv
4,1009725.csv


In [5]:
# extract only the ones that do not exist in folder
files_list = [f for f in os.listdir('extractedEntities') if not f.startswith('.')]
# parse to dataframe
df_query = pd.DataFrame(files_list, columns=['file_name'])
df_result = df_files[~df_files['file_name'].isin(df_query['file_name'])]
df_files = df_result
df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 727 entries, 275 to 1001
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  727 non-null    object
dtypes: object(1)
memory usage: 11.4+ KB


Unnamed: 0,file_name
275,185202.csv
276,18571.csv
277,186175.csv
278,1869640.csv
279,1870977.csv


## 1. Identify PPE using DBpedia Spotlight

In [6]:
# using DBPedia spotlight to search for entities
# URL for local installation of DBpedia spotlight
# urlAnnotation = 'http://dbpedia-spotlight.en:80/rest/annotate/'
urlAnnotation = 'https://whise.kmi.open.ac.uk/rest/annotate'

# setting headers and parameters not using DBpedia categories
def setDbPediaAnnotationServiceParameters(text):
    """ Se parameters for querying Dbpedia
    args: text - text to be analysed
    return: headers
            params
    """
    headers = {
        'Accept':'application/json',
        "content-type":"application/x-www-form-urlencoded"
    }

    params = {
        'confidence': '0.4',
        "text":text,
    }
    return headers, params

# setting headers and parameters, filtering by categories
def setDbPediaAnnotationServiceParametersTypes(text,types):
    """ Se parameters for querying Dbpedia
    args: text - text to be analysed
        types: different categories of entities
    return: headers
            params
    """
    headers = {
        'Accept':'application/json',
        "content-type":"application/x-www-form-urlencoded"
    }

    params = {
        'types' : types,
        'confidence': '0.35',
        "text":text,
    }
    return headers, params

# return response in JSON format
def queryDBPediaAnnotation(url,header,params):
    try:
        response = requests.post(url,headers=header, params=params).json()
        
    except Exception as ex:
        if hasattr(ex, 'message'):
            print(ex.message)
        else:
            print(ex)
        raise Exception(ex)
    # finally:
        # print(response)

    return response

def executeQueryDbpedia(q, f='application/json'):
    epr = "http://dbpedia.org/sparql"
    try:
        params = {'query': q}
        resp = requests.get(epr, params=params, headers={'Accept': f})
    #    return resp.text
        return resp
    except Exception as e:
        print(e, file=sys.stdout)
        raise

In [7]:
def set_pattern_time():
    pattern = r"""TI: 
    {<CD?><NNP|CD?><CD?>} #complete dates Eg. 23 January 1983
    {<NNP?><CD?><,?><CD?>} # December 13, 2345
    {<CD?></><CD?></><CD?>} #complete dates 23/02/2021
    {<NNP>+<CD>+} #incomplete date
    {<\AGE>.<IN+><CD?>} # age of 43
    THO:
    {<CD>+<NN>+} # hour only
    TYS:
    {<NN><IN><CD>} # Eg. fall/winter of
    {<IN><CD>} # "in XXXX" (year)
    #TS:
    #{<NN><TYS>} # Eg. fall/winter of 
    TYO:
    {<CD>} #year only
    """
    return pattern

In [None]:
entitiyTypes = ['DBpedia:Person','DBpedia:MusicalArtist','DBpedia:Place','DBpedia:SocietalEvent']
pattern = set_pattern_time()

# to break in chunks
count = 0
for file_name in df_files.itertuples():
    count +=1
    # start = time.time()
    print(file_name.file_name)
    biography_df = pd.read_csv('indexedSentences/'+file_name.file_name)
    df_entities = pd.DataFrame()
    df_time_ent = pd.DataFrame()
    
    # for each sentence in one biography
    for sentence_row in biography_df.itertuples():
        ## send text and return params
        hdrs, prms = setDbPediaAnnotationServiceParameters(sentence_row.sentences)
        try:
            # obtain response using DBpedia spotlight
            responseJSON = queryDBPediaAnnotation(urlAnnotation,hdrs,prms)
            # UPDATE: save results in file
            # Adding line to save in cache the responses from DBP Spotlight
            df_resources = pd.DataFrame.from_dict(responseJSON['Resources'])
            df_resources.to_csv('cacheSpotlightResponse/'+file_name.file_name,mode='a',index=False,header=False)
            #
        except Exception as ex:
            if hasattr(ex, 'message'):
                print('Exception')
                print(ex.message)
            else:
                print('Exception')
                print(ex)

        # if entities People, places, events using spotlight
        if 'Resources' in responseJSON:
            # parse response to a dataframe
            df_resources = pd.DataFrame.from_dict(responseJSON['Resources'])
            df_resources.rename(columns={'@URI':'URI','@types':'types','@surfaceForm':'surfaceForm','@support':'support','@offset':'offset','@similarityScore':'similarityScore',
                                        '@percentageOfSecondRank':'percentageOfSecondRank'}, inplace=True)
            
            # filter only rows with entities with a category
            df = df_resources[~df_resources['types'].isna()].copy()

            # UPDATE: improving entity recognition
            df_result = pd.DataFrame()
            #

            if not df.empty:
                # df_result = pd.DataFrame()
                # assign the type of entity found, according to the categories
                for entity in entitiyTypes:
                    df_temp = df[df['types'].str.contains(entity)].copy()
                    
                    if not df_temp.empty:
                        # df_temp.head(2)
                        if entity == 'DBpedia:Person' or entity == 'DBpedia:MusicalArtist':
                            df_temp['entType'] = 'person'
                        elif entity == 'DBpedia:Place':
                            df_temp['entType'] = 'place'
                        elif entity == 'DBpedia:SocietalEvent' or entity == 'DBpedia:Event':
                            df_temp['entType'] = 'event'
                            
                        df_result = df_result.append(df_temp)
            
            # UPDATE:
            # Adding process to query entities without the type
            # Query again
            # filter only rows without category
            df = df_resources.loc[df_resources['types'] == ''] 
            if not df.empty:
                for item in df.itertuples():
                    uri = item.URI
                    query_text = "SELECT * WHERE { <" + uri + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>  ?o }"
                    results = executeQueryDbpedia(query_text).json()
                    if 'results' in results:
                        df_none = pd.DataFrame.from_dict(results['results']['bindings'])

                        for item in df_none.itertuples():
                            df_temp = pd.DataFrame()
                            if 'http://dbpedia.org/ontology/Place' == item.o['value']:
                                df_result = df_result.append({'entType':'place','URI':uri,'types':item.o['value']}, ignore_index=True)
                            elif 'http://dbpedia.org/ontology/Person' == item.o['value']:
                                df_result = df_result.append({'entType':'person','URI':uri,'types':item.o['value']}, ignore_index=True)
            #
            
            if not df_result.empty:
                df_result['sentence']=sentence_row.sentences
                df_result['sentenceIndex']=sentence_row.sentenceIndex
                df_result['paragraphIndex'] = sentence_row.paragraphIndex
                df_result['section'] = sentence_row.section
                df_result.rename(columns = {'surfaceForm':'entity'},inplace = True)
                
                df_entities = df_entities.append(df_result)
                
        # now use the same sentence to analyse if a time entity is present
        # for sentence_row in biography_df.itertuples():
        df_temp = pd.DataFrame()
        timeEntityList = []

        tokenized_sent=nt.word_tokenize(sentence_row.sentences)
        pos_sentences=nltk.pos_tag(tokenized_sent)
        cp = nltk.RegexpParser(pattern)
        cs = cp.parse(pos_sentences)

        # loop to search for the POST TAGs related to TIME
        for ne in cs:
            res = ""
            if hasattr(ne, "label"):
                # print(type(ne[0:]))
                # print(ne.label(), ne[0:])

                for i in ne[0:]:
                    res += i[0] + " "
                res = res.strip()
                # print(res)
                    # print(t)
                timeEntityList.append(res)

        # if we have some time entities indentified
        if timeEntityList:
            df_temp['entity']=timeEntityList
            
            df_temp['sentence']= sentence_row.sentences
            df_temp['sentenceIndex']=sentence_row.sentenceIndex
            df_temp['paragraphIndex'] = sentence_row.paragraphIndex
            df_temp['section'] = sentence_row.section
            df_temp['entType'] = 'time'

            df_time_ent = df_time_ent.append(df_temp)

    # append time
    df_entities = df_entities.append(df_time_ent)
    df_entities['wikiPageID'] = sentence_row.wikiId
    df_entities.to_csv('extractedEntities/'+file_name.file_name,index=False)
    # end = time.time()
    # print("The time of execution of above program is :", end-start)
    time.sleep(1)
    
    if (count % 50) == 0:
        time.sleep(90)

185202.csv
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'
Exception
'Resources'