### Capturing Places and Persons

In [None]:
### This notebook processes text from music personalities' biographies and extract historical meetups information 
#### Pre-requirements:
#### Entity annotations after executing "02_queryDbpedia.ipynb" Notebook

#### For each file with entity annotations
#### - Read files from cacheSpotlightResponse/
#### - Identify People entities:
####   - "http://dbpedia.org/ontology/Person"
####   - "http://dbpedia.org/ontology/MusicalArtist"
#### - Identify Place entities:
####   - 'http://dbpedia.org/ontology/Place'
#### - When an entity has an empty type
####   - Read corresponding emptyEntity response from DBpedia in cacheSpotlightResponse/ directory
#### - Store annotations in extractedEntitiesPersonPlaceOnly/

#### Directories information:
#### cacheSpotlightResponse/ : collection of biographies in CSV format. Each biography contains the list of entities identified using DBpedia Spotlight, each linked to its corresponding sentence
#### extractedEntitiesPersonPlaceOnly/ : response from DBpedia Spotlight entity annotation grouped by biography

In [2]:
import json
import os
import pandas as pd
from _datetime import date
import time
from operator import itemgetter

# For DBpedia spotlight, PPE entities
import requests
import pycurl
from urllib.request import urlopen
from urllib.parse import quote

In [3]:
# For nltk time entities
# time entity
import nltk.tokenize as nt
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import Tree

# if not installed
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('punkt')

In [177]:
# reading every CSV with indexed sentences
# return a list object of files in the given folder
files_list = [f for f in os.listdir('indexedSentences') if not f.startswith('.')]
# parse to dataframe
df_files = pd.DataFrame(files_list, columns=['file_name'])
# df_files = df_files.query("file_name=='10085.csv'")
df_files.to_csv('totalBiographiesEntities.csv',index=False)
df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  1002 non-null   object
dtypes: object(1)
memory usage: 8.0+ KB


Unnamed: 0,file_name
0,1000228.csv
1,100273.csv
2,100487.csv
3,10085.csv
4,1009725.csv


In [4]:
# extract only the ones that do not exist in folder
files_list = [f for f in os.listdir('extractedEntities') if not f.startswith('.')]
# parse to dataframe
df_query = pd.DataFrame(files_list, columns=['file_name'])
df_result = df_files[~df_files['file_name'].isin(df_query['file_name'])]
df_files = df_result
df_files.to_csv('totalBiographiesEntities.csv',index=False)
df_files.info()
df_files.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  0 non-null      object
dtypes: object(1)
memory usage: 0.0+ bytes


Unnamed: 0,file_name


In [4]:
# USING ONLY FOR SAMPLING - 1002
files_list = [f for f in os.listdir('extractedEntities') if not f.startswith('.')]
# parse to dataframe
df_query = pd.DataFrame(files_list, columns=['file_name'])
df_query.to_csv('testingDatasetFileNames.csv',index=False)

## 1. Process DBpedial Spotlight entity annotation: functions

In [5]:
def executeQueryDbpedia(q, f='application/json'):
    epr = "http://dbpedia.org/sparql"
    try:
        params = {'query': q}
        resp = requests.get(epr, params=params, headers={'Accept': f})
    #    return resp.text
        return resp
    except Exception as e:
        # print(e, file=sys.stdout)
        if hasattr(e, 'message'):
            print(e.message)
        else:
            print(e)
        raise
        
# retrieve entities information when they are not in cache
def queryEntityLeft(uri,item):
    # retrieve the next id to store the empty entitites
    df_master = pd.read_csv('cacheSpotlightResponse/emptyTypes_master.csv')
    df_master['id'] = df_master['file_name'].str.replace('emptyTypes_','')
    df_master['id'] = df_master['id'].str.replace('.csv','')
    df_master['id'] = df_master['id'].astype(str).astype(int)
    df_master = df_master.sort_values(by='id', ascending=False)
    last_file_id = df_master['id'].loc[df_master.index[0]]
    last_file_id = int(last_file_id)+1
    # last_file_id +=1
    
    df_results = pd.DataFrame()
    query_text = "SELECT * WHERE { <" + uri + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>  ?o }"
    try:
        # Execute query against sparql endpoint, query types
        results = executeQueryDbpedia(query_text).json()
        # print(results)
        # if query returns a response
        if 'results' in results:
            # to obtain the list of types
            res_1 = list(map(itemgetter('o'), results['results']['bindings']))
            res_2 = list(map(itemgetter('value'), res_1))

            df_results['types'] = res_2
            df_results['URI'] = item.URI
            df_results['entity'] = item.entity
            df_results['support'] = item.support
            df_results['offset'] = item.offset
            df_results['similarityScore'] = item.similarityScore
            df_results['percentageOfSecondRank'] = item.percentageOfSecondRank

            df_results['sentenceIndex']=item.sentenceIndex
            df_results['paragraphIndex'] = item.paragraphIndex
            df_results['section'] = item.section

            df_new_master_row = pd.DataFrame({'URI':[item.URI],'entity':[item.entity],
                                                          'file_name':['emptyTypes_{}.csv'.format(str(last_file_id))]})
            df_new_master_row.to_csv('cacheSpotlightResponse/emptyTypes_master.csv',mode='a',
                                 index=False,header=False)
            # print("Saved master file: " + str(last_file_id) + ". Len: " + str(len(df_new_master_row)))

            file_exists = os.path.isfile('cacheSpotlightResponse/emptyTypes_'+str(last_file_id)+'.csv')
            if not file_exists:
                df_results.to_csv('cacheSpotlightResponse/emptyTypes_'+str(last_file_id)+'.csv',index=False)
            else:
                df_results.to_csv('cacheSpotlightResponse/emptyTypes_'+str(last_file_id)+'.csv',mode='a',
                                 index=False,header=False)
    except Exception as ex:
        print("Blank type: ****")
        if hasattr(ex, 'message'):
            print(ex.message)
        else:
            print(ex)
        # print(ex, file=sys.stdout)
    return df_results

In [6]:
# Create error file for later review
df_error = pd.DataFrame(columns=['file_name'])
df_error.to_csv('temp_error.csv',index=False)

df_error.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  0 non-null      object
dtypes: object(1)
memory usage: 0.0+ bytes


## 1. Extract entities: People, places

In [None]:
def checkPersonLinkedEntity():
    object_list = ["rdf:type dbo:PersonFunction","dct:subject dbc:Musical_terminology"]
    # iterate over all the person entities found
    for entity_row in entityList_df.itertuples():
        role = False
        for item in object_list:
            # build query
            query_text = "ASK  { <{}> {}}".format(entity_row.URI)
            #read cache
            
            # check if already in cache
            # if in cache and response is true:
            # role = True
            # break
            # else:
            # query DBpedia
            response = executeQueryDbpedia(query_text)
            if response == 'true':
                role = True
                break
        if role:
            # delete the entity from the results

In [12]:
# 1. Extract People and Places entities
entitiyTypes = ['DBpedia:Person','DBpedia:MusicalArtist','DBpedia:Place','DBpedia:SocietalEvent']
# http://dbpedia.org/ontology/Person
df_entityType = pd.DataFrame({'id':[1,2],'types':['http://dbpedia.org/ontology/Person',
                                                      'http://dbpedia.org/ontology/MusicalArtist']})
# read master with index of entities
df_master_cache = pd.read_csv('cacheSpotlightResponse/emptyTypes_master.csv')

# use chunk to load a small number of files in memory
for chunk in pd.read_csv('totalBiographiesEntities.csv', chunksize=30):
    df_files = pd.DataFrame()
    df_files['file_name'] = chunk['file_name']
    
    for file_name_item in df_files.itertuples():
        try:
            # start = time.time()
            print(file_name_item.file_name)
            # check if the file exists
            file_exists = os.path.isfile('cacheSpotlightResponse/'+file_name_item.file_name)

            if file_exists:
                # read the cached results from the query
                df = pd.read_csv('cacheSpotlightResponse/'+file_name_item.file_name)
                df.rename(columns = {'surfaceForm':'entity'},inplace = True)
                # select all that are not NA
                df_ne = df.loc[~df['types'].isna()] 

                df_result = pd.DataFrame()
                if len(df_ne) >0:
                    for entity in entitiyTypes:
                        df_temp = df_ne[df_ne['types'].str.contains(entity)].copy()

                        if not df_temp.empty:
                            if entity == 'DBpedia:Person' or entity == 'DBpedia:MusicalArtist':
                                df_temp['entType'] = 'person'
                            elif entity == 'DBpedia:Place':
                                df_temp['entType'] = 'place'
                            elif entity == 'DBpedia:SocietalEvent' or entity == 'DBpedia:Event':
                                df_temp['entType'] = 'event'

                            df_result = df_result.append(df_temp)

                # now filter all the entities that are empty
                df_e = df.loc[df['types'].isna()] 

                if not df_e.empty:
                    # list of entities and the name of the files that store the results from the query
                    df_merge = df_master_cache.merge(df_e, on=['URI','entity'],how='right')
                    # df_merge.to_csv('extractedEntitiesPersonPlaceOnly/df_merge.csv',index=False)

                    # for all documents not in cache
                    df_cache = df_merge.loc[df_merge['file_name'].isna()]
                    print(len(df_cache))
                    if len(df_cache)>0:
                        for item_empty in df_cache.itertuples():
                            df_temp = queryEntityLeft(item_empty.URI,item_empty)
                        df_master_cache = pd.read_csv('cacheSpotlightResponse/emptyTypes_master.csv')
                        df_merge = df_master_cache.merge(df_e, on=['URI','entity'],how='right')
                    # df_merge.to_csv('extractedEntitiesPersonPlaceOnly/df_merge.csv',index=False)

                    df_cache = df_merge.loc[~df_merge['file_name'].isna()] 
                    for item_empty in df_cache.itertuples():
                        # print(item_empty.entity)
                        appendRow = False
                        typeEntity = ''

                        # read cache file
                        file_exists = os.path.isfile('cacheSpotlightResponse/'+item_empty.file_name)
                        if file_exists:
                            df_empty_type = pd.read_csv('cacheSpotlightResponse/'+item_empty.file_name)
                            df_empty_type.drop_duplicates(subset=['entity','URI','types'],keep='first')
                            try:
                                df_empty_type = df_empty_type.query("""entity == "{}" and URI=="{}" """.format(item_empty.entity,item_empty.URI))
                            except SyntaxError as ex:
                                # df_empty_type = df_empty_type.query("""entity == '{}' and URI=='{}'""".format(item_empty.entity,item_empty.URI))
                                df_empty_type = df_empty_type[df_empty_type['URI'].str.contains(item_empty.URI)]
                            except TokenError as te:
                                df_empty_type = df_empty_type.query("entity == '{}' and URI=='{}'".format(item_empty.entity,item_empty.URI))

                            # find types Place
                            df_temp = df_empty_type[df_empty_type['types'].str.contains('http://dbpedia.org/ontology/Place')].copy()
                            if not df_temp.empty:
                                types = df_temp['types'].loc[df_temp.index[0]]
                                entType = 'place'
                                appendRow = True
                            df_temp = df_empty_type.query('types == "http://dbpedia.org/ontology/Person" or types=="http://dbpedia.org/ontology/MusicalArtist"')
                            df_temp = df_temp.sort_values(by='types', ascending=False)
                            if not df_temp.empty:
                                types = df_temp['types'].loc[df_temp.index[0]]
                                entType = 'person'
                                appendRow = True
                            # else:
                            #     print(item_empty.URI)
                            #     print("Empty: " + item_empty.file_name)

                            if appendRow:
                                df_new_row = pd.DataFrame([[item_empty.URI, item_empty.support, types, item_empty.entity, item_empty.offset, item_empty.similarityScore,
                                                          item_empty.percentageOfSecondRank, item_empty.sentence, item_empty.sentenceIndex, item_empty.paragraphIndex, 
                                                           item_empty.section]],columns=list(df))
                                df_new_row['entType'] = entType
                                df_result = df_result.append(df_new_row,ignore_index=True)
                        else:
                            print(df_merge.file_name)
                            print(df_merge.URI)
                            break;

                    # print('No empty types')
            # IF file does not exists, then the 

            df_result['wikiPageID'] = file_name_item.file_name.replace('.csv','')
            df_result.to_csv('extractedEntitiesPersonPlaceOnly/'+file_name_item.file_name,index=False)
            # end = time.time()
            # print("The time of execution of above program is :", end-start)
        except (pd.errors.ParserError, KeyError) as pe:
            print('Error: ' + file_name_item.file_name)
            if hasattr(pe, 'message'):
                print(pe.message)
            else:
                print(pe)
            df_error = pd.DataFrame([[file_name_item.file_name]],columns=['file_name'])
            df_error.to_csv('temp_error.csv',mode='a',index=False,header=False)
    time.sleep(120)

1035724.csv
Error: 1035724.csv
Error tokenizing data. C error: Expected 7 fields in line 126, saw 11

1043762.csv
Error: 1043762.csv
Error tokenizing data. C error: Expected 7 fields in line 104, saw 11

1047779.csv
Error: 1047779.csv
Error tokenizing data. C error: Expected 7 fields in line 106, saw 11

1048151.csv
Error: 1048151.csv
Error tokenizing data. C error: Expected 7 fields in line 310, saw 11

1048172.csv
Error: 1048172.csv
Error tokenizing data. C error: Expected 7 fields in line 190, saw 11

1049483.csv
Error: 1049483.csv
Error tokenizing data. C error: Expected 7 fields in line 393, saw 11

1052490.csv
Error: 1052490.csv
Error tokenizing data. C error: Expected 7 fields in line 673, saw 11

1056463.csv
Error: 1056463.csv
Error tokenizing data. C error: Expected 7 fields in line 589, saw 11

105767.csv
Error: 105767.csv
Error tokenizing data. C error: Expected 7 fields in line 357, saw 11

1058567.csv
Error: 1058567.csv
Error tokenizing data. C error: Expected 7 fields in 

  df_master['id'] = df_master['id'].str.replace('.csv','')


9555834.csv
1
9559305.csv
1
9559601.csv
3
956078.csv
2
956082.csv
3
956147.csv
2
956166.csv
1
956338.csv
1
956432.csv
1
9565227.csv
0
956561.csv
1
9565871.csv
0
9566528.csv
0
9566772.csv
4
9572609.csv
0
957280.csv
0
957400.csv
0
9574007.csv
0
9577719.csv
2
9578639.csv
0
9579930.csv
0
9581533.csv
2
9582754.csv
0
9582791.csv
0
9583516.csv
0
95849.csv
6
9584945.csv
1
9585107.csv
1
958564.csv
1
9587658.csv
0
9587975.csv
3
9588366.csv
0
9588919.csv
0
9589287.csv
1
9589996.csv
1
9590626.csv
6
959183.csv
1
9592193.csv
0
9592833.csv
3
959309.csv
28
959635.csv
0
9596961.csv
0
9597238.csv
1
9598149.csv
2
959907.csv
1
9606071.csv
5
960698.csv
0
960719.csv
2
960995.csv
0


  df_empty_type = df_empty_type[df_empty_type['URI'].str.contains(item_empty.URI)]


9610231.csv
1
9610507.csv
0
9611501.csv
0
9611687.csv
0
9614249.csv
1
9615793.csv
0
961610.csv
1
9619552.csv
0
962005.csv
1
9620823.csv
0
9622272.csv
0
9624017.csv
2
9625304.csv
0
9625982.csv
0
962932.csv
0
962945.csv
8
9630296.csv
2
9630314.csv
0
9632534.csv
4
9632659.csv
0
9636031.csv
0
963637.csv
2
9637582.csv
0
9637998.csv
0
9639026.csv
1
963936.csv
4
964053.csv
4
9640637.csv
1
9640956.csv
1
9641694.csv
0
9642079.csv
0
964258.csv
2
9643503.csv
0
9647782.csv
1
9648733.csv
0
9650157.csv
0
965046.csv
0
9650786.csv
0
9650856.csv
0
965107.csv
1
965109.csv
1
9651675.csv
0
965420.csv
0
9654804.csv
1
9656399.csv
2
965761.csv
0
965824.csv
4
965919.csv
0
965956.csv
0
965995.csv
1
966302.csv
5
966347.csv
0
966488.csv
0
9665678.csv
2
9669378.csv
2
9669951.csv
0
967009.csv
1
96701.csv
2
9670175.csv
0
9671048.csv
0
9671420.csv
0
9672429.csv
2
967453.csv
1
968302.csv
3
968346.csv
3
9685734.csv
0
968617.csv
0
9688889.csv
1
9689709.csv
0
969048.csv
0
969120.csv
1
969231.csv
0
9692702.csv
0
9692767.