# Identify entities:
### Places and Persons using DBpedia

In [3]:
import requests
import pycurl
from urllib.request import urlopen
from urllib.parse import quote
import json
import os
import pandas as pd
from _datetime import date

In [2]:
# 1. give each paragraph an id
# 2. for each paragraph obtain the list of people entities
# 3. check whether each person found in step 2 is of type Music
# 4. identify places

In [4]:
# using DBPedia spotlight to search for entities
# Endpoint that every annotation request in this notebook is posted to.
# NOTE(review): 'dbpedia-spotlight.en' looks like a local/container host name - confirm it resolves in your environment
urlAnnotation = 'http://dbpedia-spotlight.en:80/rest/annotate/'
def setDbPediaAnnotationServiceParameters(text):
    """ Build the headers and parameters for an untyped DBpedia annotation query
    args: text - text to be analysed
    return: headers - ask for a JSON answer, declare a form-encoded request
            params - the text together with a fixed confidence of '0.4'
    """
    params = dict(confidence='0.4', text=text)
    headers = dict([
        ('Accept', 'application/json'),
        ("content-type", "application/x-www-form-urlencoded"),
    ])
    return headers, params

def setDbPediaAnnotationServiceParametersTypes(text,types):
    """ Build the headers and parameters for a type-filtered DBpedia annotation query
    args: text - text to be analysed
        types: different types of entities, passed through unchanged as 'types'
    return: headers - ask for a JSON answer, declare a form-encoded request
            params - types, the text and a fixed confidence of '0.35'
    """
    request_headers = {}
    request_headers['Accept'] = 'application/json'
    request_headers["content-type"] = "application/x-www-form-urlencoded"

    request_params = dict(types=types, confidence='0.35', text=text)
    return request_headers, request_params

def queryDBPediaAnnotation(url,header,params):
    """ POST a query to the annotation service and return response in JSON format
    args: url - address of the annotation endpoint
          header - HTTP headers for the request
          params - handed to requests as `params`
    return: the response body decoded from JSON
    """
    reply = requests.post(url, params=params, headers=header)
    return reply.json()

In [8]:
def listFiles(directory):
    """ List the entries of a directory, leaving out those whose name starts with '.'
    args: directory - folder to list
    return: list of entry names, in the order os.listdir reports them
    """
    visible = []
    for entry in os.listdir(directory):
        if entry.startswith('.'):
            continue
        visible.append(entry)
    return visible
def openTextFile(directory,filename):
    """ Read a text file and return its lines
    args: directory - folder prefix, concatenated as-is with filename
                      (so it has to end with the path separator)
          filename - name of the file inside directory
    return: list with one string per line, line endings kept
    """
    # the context manager closes the handle even when reading fails;
    # the previous version left the file open
    with open(directory+filename, "r") as fileObject:
        return fileObject.readlines()
def readCESVFile(directory, file):
    """ Load a CSV file into a DataFrame
    args: directory - folder prefix, concatenated as-is with file
          file - name of the CSV file
    return: the DataFrame produced by pandas.read_csv
    """
    csv_path = directory + file
    frame = pd.read_csv(csv_path)
    return frame

In [14]:
# For each biography, annotate every paragraph with DBpedia Spotlight and
# gather the entities found into dfsummary.
# Full set of biographies (wikiPageID -> artist name):
# people = {'64610':'Glenn_Miller','71932':'Andrés_Segovia','9039':'Django_Reinhardt','64966':'Maria_Callas','64963':'Édith_Piaf','15852':'John_Lennon'}
people_list = {'9039':'Django_Reinhardt'}

# files = listFiles('cleanText/')
summaryFrames = []
for file in people_list:
    print(people_list[file])
    # open file
    biography = openTextFile('cleanText/',file+'.txt')
    # start counter for paragraph identifier
    prgId = 0
    paragraphFrames = []

    for prg in biography:
        prgId += 1
        ## send text and return params
        hdrs, prms = setDbPediaAnnotationServiceParameters(prg)
        responseJSON = queryDBPediaAnnotation(urlAnnotation,hdrs,prms)

        # A response without 'Resources' has nothing to keep. Skipping it avoids
        # a NameError on the first paragraph and, later on, re-appending the
        # previous paragraph's entities under the wrong paragraph text.
        if 'Resources' not in responseJSON:
            continue

        df = pd.DataFrame.from_dict(responseJSON['Resources'])
        # NOTE(review): the info() output shows '@similarityScore' as dtype object,
        # so this sort compares the scores as text - confirm that is intended
        df.sort_values(by=['@similarityScore'],inplace=True,ascending=[False])
        df.drop_duplicates(subset = ['@URI','@similarityScore','@surfaceForm'],keep = "first", inplace = True)
        df['paragraph']=prg
        paragraphFrames.append(df)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    # pd.concat rejects an empty list, hence the fallback to an empty frame.
    dfparagraphs = pd.concat(paragraphFrames) if paragraphFrames else pd.DataFrame()
    # the dictionary key already is the bare wikiPageID (no '.txt' to strip)
    dfparagraphs['wikiPageID'] = file
    dfparagraphs['artistName'] = people_list[file]
    summaryFrames.append(dfparagraphs)

dfsummary = pd.concat(summaryFrames) if summaryFrames else pd.DataFrame()

Django_Reinhardt
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 18 to 11
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   @URI                     17 non-null     object
 1   @support                 17 non-null     object
 2   @types                   17 non-null     object
 3   @surfaceForm             17 non-null     object
 4   @offset                  17 non-null     object
 5   @similarityScore         17 non-null     object
 6   @percentageOfSecondRank  17 non-null     object
 7   paragraph                17 non-null     object
dtypes: object(8)
memory usage: 1.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 18 to 3
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   @URI                     29 non-null     object
 1   @support                 29 non-null     obj

In [7]:
# Inspect the collected entities: column overview first, then the first rows.
dfsummary.info()
dfsummary.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 848 entries, 14 to 2
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   @URI                     848 non-null    object
 1   @support                 848 non-null    object
 2   @types                   848 non-null    object
 3   @surfaceForm             848 non-null    object
 4   @offset                  848 non-null    object
 5   @similarityScore         848 non-null    object
 6   @percentageOfSecondRank  848 non-null    object
 7   paragraph                848 non-null    object
 8   wikiPageID               848 non-null    object
 9   artistName               848 non-null    object
dtypes: object(10)
memory usage: 72.9+ KB


Unnamed: 0,@URI,@support,@types,@surfaceForm,@offset,@similarityScore,@percentageOfSecondRank,paragraph,wikiPageID,artistName
14,http://dbpedia.org/resource/American_Patrol,29,"Wikidata:Q386724,Wikidata:Q2188189,Schema:Musi...",American Patrol,440,1.0,0.0,"Alton Glenn Miller (March 1, 1904 – disappeare...",64610,Glenn_Miller
10,http://dbpedia.org/resource/Moonlight_Serenade,106,"Wikidata:Q386724,Wikidata:Q2188189,Schema:Musi...",Moonlight Serenade,304,1.0,9.764469070529063e-55,"Alton Glenn Miller (March 1, 1904 – disappeare...",64610,Glenn_Miller
27,http://dbpedia.org/resource/English_Channel,7343,"Schema:Place,Schema:BodyOfWater,DBpedia:Place,...",English Channel,937,1.0,5.021093162121786e-22,"Alton Glenn Miller (March 1, 1904 – disappeare...",64610,Glenn_Miller
19,http://dbpedia.org/resource/Elvis_Presley,10377,"Http://xmlns.com/foaf/0.1/Person,Wikidata:Q5,W...",Elvis Presley,625,1.0,7.21480869762467e-26,"Alton Glenn Miller (March 1, 1904 – disappeare...",64610,Glenn_Miller
17,http://dbpedia.org/resource/Anvil_Chorus,38,,Anvil Chorus,517,1.0,0.0,"Alton Glenn Miller (March 1, 1904 – disappeare...",64610,Glenn_Miller


In [15]:
# Save the entity summary (without the index) under a name stamped with today's date.
dfsummary.to_csv(f"datasets/{date.today()}_9039_entities_filter.csv", index=False)

In [27]:
# files_list = listFiles ("cleanText/")
# files_list = {'9039':'Django_Reinhardt'}
files_list = {'9039_codedParagraphs.csv':'Django_Reinhardt'}

# Annotate every CSV biography and store the result under 'datasets/',
# named after the text that precedes the first '_' of the file name.
for biographyFileName in files_list:
    if not biographyFileName.endswith(".csv"):
        continue
    pageIdPrefix = biographyFileName[:biographyFileName.find('_')]
    print(pageIdPrefix)
    resultEntity_df = findPeoplePlacesEntities(biographyFileName)
    resultEntity_df.to_csv('datasets/' + pageIdPrefix + '_codedParagraphs.csv', index=False)

9039


In [25]:
def findPeoplePlacesEntities(file):
    """ Annotate every paragraph of a coded-paragraphs CSV with type-filtered DBpedia entities
    args: file - name of a CSV inside 'cleanText/'. The text before the first '_'
                 is taken as the wikiPageID; the CSV has to provide the columns
                 'paragraph', 'paragraphIndex' and 'section'
    return: DataFrame with one row per entity found, holding the fields of the
            service response plus 'paragraph', 'prgIndex', 'section',
            'wikiPageID' and 'artistName' (an empty frame with only the last
            two columns when no paragraph produced entities)
    raises: KeyError when the wikiPageID is not one of the known biographies
    """
    # NOTE(review): the three type names use different prefixes/casing
    # ('DBpedia:', 'DBPedia:', 'dbo:') - confirm the service matches all of them
    entitiyTypes = 'DBpedia:Person,DBPedia:Place,dbo:MusicalArtist'
    # known biographies: wikiPageID -> artist name
    people_list = {'64610':'Glenn_Miller','71932':'Andrés_Segovia','9039':'Django_Reinhardt','64966':'Maria_Callas','64963':'Édith_Piaf','15852':'John_Lennon'}

    # resolve the page first so an unknown file fails before any request is sent
    wikiPageID = file[0:file.find('_')]
    artistName = people_list[wikiPageID]

    biography_df = readCESVFile('cleanText/',file)
    paragraphFrames = []
    for prg in biography_df.itertuples():
        ## send text and return params
        hdrs, prms = setDbPediaAnnotationServiceParametersTypes(prg.paragraph,entitiyTypes)
        responseJSON = queryDBPediaAnnotation(urlAnnotation,hdrs,prms)

        # a response without 'Resources' has no entities to keep
        if 'Resources' not in responseJSON:
            continue

        df = pd.DataFrame.from_dict(responseJSON['Resources'])
        df.sort_values(by=['@similarityScore'],inplace=True,ascending=[False])
        df.drop_duplicates(subset = ['@URI','@similarityScore','@surfaceForm'],keep = "first", inplace = True)
        df['paragraph'] = prg.paragraph
        df['prgIndex'] = prg.paragraphIndex
        df['section'] = prg.section
        paragraphFrames.append(df)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    # pd.concat rejects an empty list, hence the fallback to an empty frame.
    dfparagraphs = pd.concat(paragraphFrames) if paragraphFrames else pd.DataFrame()
    dfparagraphs['wikiPageID'] = wikiPageID
    dfparagraphs['artistName'] = artistName

    return dfparagraphs

In [41]:
# import nltk.tokenize as nt
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# from nltk.chunk import ne_chun
from nltk import Tree
# from chunkers import sub_leaves

import pandas as pd
from _datetime import date
# Fetch the NLTK data packages this notebook asks for; when a package is
# already present the call only reports that it is up to date (see the log below).
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# import nltk_contrib
# import timex
# from nltk_contrib.timex import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Package words is already up-to-date!


ModuleNotFoundError: No module named 'nltk_contrib'

In [None]:
# obtain the entities for each sentence
# (only the '.csv' entries of files_list are processed)
for biographyFileName in files_list:
    if not biographyFileName.endswith(".csv"):
        continue
    print(biographyFileName[:biographyFileName.find('_')])
    time_entities = getTimeEntities(biographyFileName)

In [None]:
def getTimeEntities(file):
    """ Print the named-entity chunk subtrees of every paragraph in a biography CSV
    args: file - name of a CSV inside 'cleanText/' with a 'paragraph' column
    return: None - the subtrees are only printed, nothing is collected yet
    """
    paragraphs_df = readCESVFile('cleanText/', file)
    for row in paragraphs_df.itertuples():
        tagged_tokens = pos_tag(word_tokenize(row.paragraph))
        chunk_tree = nltk.ne_chunk(tagged_tokens)
        # TODO: extract 2 types of entities for now: PERSON, LOCATION
        # (at the moment every subtree is just printed)
        for subtree in chunk_tree.subtrees():
            print(subtree)

In [44]:
def setPattern(typeConfiguration):
    """ Choose the chunk grammar for a given configuration
    args: typeConfiguration - 'date' selects the date grammar (NPD);
                              any other value selects the noun-phrase grammar (NBAR / NP)
    return: the grammar as a raw string
    """
    if typeConfiguration != 'date':
        # default: noun-phrase grammar
        return r"""NBAR:
    
            {<NN.*>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
            NP:
            {<NBAR>} 
            {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
            """
    return r"""NPD: {<CD>?<NNP>*<CD>} #complete dates Eg. 23 January 
        """

In [49]:


# Chunk one sentence of the sample biography with the 'date' grammar.
# line = 'In the fall of 1919, he joined the high school American football team, the Maroons, which won the Northern Colorado American Football Conference in 1920.'
line = "Jean Reinhardt (23 January 1910 – 16 May 1953), known to all by his Romani nickname Django (French: [dʒãŋɡo ʁɛjnaʁt] or [dʒɑ̃ɡo ʁenɑʁt]), was a Belgian-born Romani-French jazz guitarist and composer. He was the first major jazz talent to emerge from Europe and remains the most significant.With violinist Stéphane Grappelli, Reinhardt formed the Paris-based Quintette du Hot Club de France in 1934. The group was among the first to play jazz that featured the guitar as a lead instrument. Reinhardt recorded in France with many visiting American musicians, including Coleman Hawkins and Benny Carter, and briefly toured the United States with Duke Ellington's orchestra in 1946. He died suddenly of a stroke in 1953 at the age of 43."
pattern = setPattern('date')
# The 'nt' alias is never imported in this notebook (its import is commented
# out), so the tokenizers are reached through the nltk package itself.
ss=nltk.sent_tokenize(line)
tokenized_sent=[nltk.word_tokenize(sent) for sent in ss]
pos_sentences=[nltk.pos_tag(sent) for sent in tokenized_sent]
print(pos_sentences)
cp = nltk.RegexpParser(pattern)
# index 4 = fifth sentence produced by the sentence tokenizer
cs = cp.parse(pos_sentences[4])
print(cs)

[[('Jean', 'JJ'), ('Reinhardt', 'NNP'), ('(', '('), ('23', 'CD'), ('January', 'NNP'), ('1910', 'CD'), ('–', 'NNP'), ('16', 'CD'), ('May', 'NNP'), ('1953', 'CD'), (')', ')'), (',', ','), ('known', 'VBN'), ('to', 'TO'), ('all', 'DT'), ('by', 'IN'), ('his', 'PRP$'), ('Romani', 'NNP'), ('nickname', 'NN'), ('Django', 'NNP'), ('(', '('), ('French', 'JJ'), (':', ':'), ('[', 'JJ'), ('dʒãŋɡo', 'NN'), ('ʁɛjnaʁt', 'NNP'), (']', 'NNP'), ('or', 'CC'), ('[', 'NNP'), ('dʒɑ̃ɡo', 'VBP'), ('ʁenɑʁt', 'NNP'), (']', 'NNP'), (')', ')'), (',', ','), ('was', 'VBD'), ('a', 'DT'), ('Belgian-born', 'JJ'), ('Romani-French', 'JJ'), ('jazz', 'NN'), ('guitarist', 'NN'), ('and', 'CC'), ('composer', 'NN'), ('.', '.')], [('He', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('first', 'JJ'), ('major', 'JJ'), ('jazz', 'NN'), ('talent', 'NN'), ('to', 'TO'), ('emerge', 'VB'), ('from', 'IN'), ('Europe', 'NNP'), ('and', 'CC'), ('remains', 'VBZ'), ('the', 'DT'), ('most', 'RBS'), ('significant.With', 'JJ'), ('violinist', 'NN'), ('Stép

In [30]:
# Chunk a sample sentence with the noun-phrase grammar (NBAR / NP).
pattern = r"""NBAR:
    
    {<NN.*>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
    NP:
    {<NBAR>} 
    {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
    """

# NP: {<DT>?<JJ>*<NN>}
line = 'In the fall of 1919, he joined the high school American football team, the Maroons, which won the Northern Colorado American Football Conference in 1920.'
# line = 	"In 1926, Miller toured with several groups, landing a good spot in Ben Pollack's group in Los Angeles."
# pattern = 'PERSON: {<NN.*>*<NN.*>}'
# text = readText(person)
# The 'nt' alias is never imported in this notebook (its import is commented
# out), so the tokenizers are reached through the nltk package itself.
ss=nltk.sent_tokenize(line)
tokenized_sent=[nltk.word_tokenize(sent) for sent in ss]
pos_sentences=[nltk.pos_tag(sent) for sent in tokenized_sent]
print(pos_sentences)
cp = nltk.RegexpParser(pattern)
# the sample is a single sentence, so only the first entry is parsed
cs = cp.parse(pos_sentences[0])
print(cs)

[[('In', 'IN'), ('the', 'DT'), ('fall', 'NN'), ('of', 'IN'), ('1919', 'CD'), (',', ','), ('he', 'PRP'), ('joined', 'VBD'), ('the', 'DT'), ('high', 'JJ'), ('school', 'NN'), ('American', 'NNP'), ('football', 'NN'), ('team', 'NN'), (',', ','), ('the', 'DT'), ('Maroons', 'NNP'), (',', ','), ('which', 'WDT'), ('won', 'VBD'), ('the', 'DT'), ('Northern', 'NNP'), ('Colorado', 'NNP'), ('American', 'NNP'), ('Football', 'NNP'), ('Conference', 'NNP'), ('in', 'IN'), ('1920', 'CD'), ('.', '.')]]
(S
  In/IN
  the/DT
  (NP (NBAR fall/NN))
  of/IN
  1919/CD
  ,/,
  he/PRP
  joined/VBD
  the/DT
  high/JJ
  (NP (NBAR school/NN American/NNP football/NN team/NN))
  ,/,
  the/DT
  (NP (NBAR Maroons/NNP))
  ,/,
  which/WDT
  won/VBD
  the/DT
  (NP
    (NBAR
      Northern/NNP
      Colorado/NNP
      American/NNP
      Football/NNP
      Conference/NNP))
  in/IN
  1920/CD
  ./.)
