# Information Extraction

In [1]:
#importing the required libraries
import nltk

#importing library for regex
import re
from statistics import mode
import json
from pyld import jsonld
import pandas as pd

#importing library for decoding non-english letters.
import unidecode

In [2]:
#reading in the text file 
inputfile='football_players.txt' #Location of the file
buf=open(inputfile, encoding="utf8")

#listing out the documents in the text file
list_of_doc = buf.read().split('\n')

#printing each document.
#Note: each document is seperated by a new line.
for i in list_of_doc:
    print(i)

Cristiano Ronaldo dos Santos Aveiro, ComM, GOIH (born 5 February 1985) is a Portuguese professional footballer who plays for Spanish club Real Madrid and the Portugal national team. He is a forward and serves as captain for Portugal. In 2008, he won his first Ballon d'Or and FIFA World Player of the Year awards. He then won the FIFA Ballon d'Or in 2013 and 2014. In 2015, Ronaldo scored his 500th senior career goal for club and country. Often ranked as the best player in the world, Ronaldo was named the best Portuguese player of all time by the Portuguese Football Federation, during its 100th anniversary celebrations in 2015. He is the only player to win four European Golden Shoe awards. One of the most marketable athletes in sport, in 2016 Forbes named Ronaldo the world's best paid athlete. In June 2016, ESPN ranked him the world's most famous athlete. Ronaldo began his club career playing for Sporting CP, before signing with Manchester United at age 18 in 2003. After winning his first

# Pre-Process Text
Write a function that takes each document and performs:
1) sentence segmentation 2) tokenization 3) part-of-speech tagging

Please keep in mind that the expected output is a list within a list as shown below.


In [3]:
#defining a function for the first task.
def ie_preprocess(document): 
    
    #performing sentence segmentation
    seg_sentence = ''.join(nltk.sent_tokenize(document)) #step 1
    
    #performing Tokenization
    tokens = nltk.word_tokenize(seg_sentence)
    
    #performing parts of speech tagging
    pos_sent = nltk.pos_tag(tokens)
    
    #returning POS tags from the function
    return pos_sent

In [4]:
#extrcating the first document in the text file and passing it as an argument to the function defined above
first_doc=list_of_doc[0]

#storing the value returned from the function into a variable
pos_sent=ie_preprocess(first_doc)

#Printing the pos tags
pos_sent

[('Cristiano', 'NNP'),
 ('Ronaldo', 'NNP'),
 ('dos', 'NN'),
 ('Santos', 'NNP'),
 ('Aveiro', 'NNP'),
 (',', ','),
 ('ComM', 'NNP'),
 (',', ','),
 ('GOIH', 'NNP'),
 ('(', '('),
 ('born', 'VBN'),
 ('5', 'CD'),
 ('February', 'NNP'),
 ('1985', 'CD'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('Portuguese', 'JJ'),
 ('professional', 'JJ'),
 ('footballer', 'NN'),
 ('who', 'WP'),
 ('plays', 'VBZ'),
 ('for', 'IN'),
 ('Spanish', 'JJ'),
 ('club', 'NN'),
 ('Real', 'NNP'),
 ('Madrid', 'NNP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('Portugal', 'NNP'),
 ('national', 'JJ'),
 ('team.He', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('forward', 'NN'),
 ('and', 'CC'),
 ('serves', 'NNS'),
 ('as', 'IN'),
 ('captain', 'NN'),
 ('for', 'IN'),
 ('Portugal.In', 'NNP'),
 ('2008', 'CD'),
 (',', ','),
 ('he', 'PRP'),
 ('won', 'VBD'),
 ('his', 'PRP$'),
 ('first', 'JJ'),
 ('Ballon', 'NNP'),
 ("d'Or", 'NN'),
 ('and', 'CC'),
 ('FIFA', 'NNP'),
 ('World', 'NNP'),
 ('Player', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Year', 'NNP'),

Run the following code to check your result for the first document (Ronaldo).

Expected output
 [...[('He', 'PRP'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('forward', 'NN'),
  ('and', 'CC'),
  ('serves', 'NNS'),
  ('as', 'IN'),
  ('captain', 'NN'),
  ('for', 'IN'),
  ('Portugal', 'NNP'),
  ('.', '.')], ...]

# Create NE
Write a function that will take the list of tokens with POS tags for each sentence and returns the named entities (NE). 

Use binary=True while calling NE chunk function

In [5]:
#defining the named entity function. This function takes in the POS tags generated from the previous function and display the named entities.
def named_entity_finding(pos_tags):
    #ne_chunk returns a nested NLTK tree of the pos tags.
    tree = nltk.ne_chunk(pos_tags, binary=True)
    #declaring an empty list to store the named entities
    named_entities = []
    #iterating over the trees to access the subtress
    for subtree in tree.subtrees():
#         print(subtree)
        #extracting those subtrees which has NE label
        if subtree.label() == 'NE':
            #declaring a variable to store the entities
            entity = ""
            #iterating over the leaves in a subtree
            for leaf in subtree.leaves():
                #storing the entities in the variable
                entity = entity + leaf[0] + " "
            #appending the values in entities to the named entities list.
            named_entities.append(entity.strip())
    #returning named entities
    return named_entities



In [9]:
#calling the function with pos_sent argument. pos_sent contains the POS tags for the first document in the list.
output=named_entity_finding(pos_sent)
print(list(set(output)))

['European', 'England', 'GOIH', 'Santos Aveiro', 'Real Madrid', 'Michel Platini', 'European Golden Shoe', 'Cristiano Ronaldo', 'FIFA', 'Portuguese Football Federation', 'ESPN', 'Spanish', 'Spain', 'FIFA World Cups', 'United', 'Ronaldo', 'UEFA Champions League', 'Portugal', 'FIFA Club', 'FIFA Ballon', 'Lionel Messi', 'Silver Boot', 'Portuguese', 'ComM', 'UEFA European', 'France', 'Madrid', 'La Liga', 'Manchester United', 'Ballon']


Expected output ['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal']

# Extract NE from Text

Now use the named_entity_finding() function to extract all NEs for each document.

pos_sents holds the list of lists of tokens with POS tags

In [10]:
#Defining an empty list for storing pos tags for each doc in list_of_doc
pos_sents=[]
for d in list_of_doc:
    pos_sents.append(ie_preprocess(d))
    
#Defining function to stire the extracted NEs for each doc.
def NE_flat_list_fn(pos_sents): 
    NE=[]
    for i in pos_sents:
        NE.append(named_entity_finding(i))
    
    #Flattening list
    NE_flat_list = [y for x in NE for y in x]
    
    return NE_flat_list

#Calling the function
NE_flat_list_fn(pos_sents)

['Cristiano Ronaldo',
 'Santos Aveiro',
 'ComM',
 'GOIH',
 'Portuguese',
 'Spanish',
 'Real Madrid',
 'Portugal',
 'Ballon',
 'FIFA',
 'FIFA Ballon',
 'Ronaldo',
 'Ronaldo',
 'Portuguese',
 'Portuguese Football Federation',
 'European Golden Shoe',
 'ESPN',
 'Manchester United',
 'England',
 'United',
 'UEFA Champions League',
 'FIFA Club',
 'Ballon',
 'FIFA',
 'Manchester United',
 'Madrid',
 'Spain',
 'UEFA Champions League',
 'Ronaldo',
 'La Liga',
 'Ronaldo',
 'UEFA Champions League',
 'Real Madrid',
 'La Liga',
 'Lionel Messi',
 'Portugal',
 'Portugal',
 'European',
 'FIFA World Cups',
 'Portuguese',
 'Portugal',
 'Portugal',
 'Portugal',
 'Ronaldo',
 'UEFA European',
 'European',
 'Michel Platini',
 'Portugal',
 'France',
 'Silver Boot',
 'Lionel Andrés',
 'Spanish',
 'Argentina',
 'Messi',
 'FIFA Ballons',
 'European Golden',
 'Messi',
 'La Liga',
 'La Liga',
 'Copa',
 'Argentina',
 'Messi',
 'Spain',
 'Barcelona',
 'Barcelona',
 'Messi',
 'Ballon',
 'FIFA',
 'Barcelona',
 'Span

# Extract Information Complete

Write functions to extract the name of the player, country of origin and date of birth as well as the following relations: team(s) of the player and position(s) of the player.

Hint: Use the re.compile() function to create the extraction patterns

Reference: https://docs.python.org/3/howto/regex.html

In [11]:
#Defining function for extracting name of the player.
def name_of_the_player(doc):
    
    #Validation Check. By default there is an empty line present at each odd index in list_of_doc. If the entered index is odd
    #a message is displayed to the user
    if doc == '':
        return("odd numbered documents are empty lines, please enter another document number")
    
    #Else extracting the names. The named_entity_finding() extracts all the named entities from the document. The first element
    # in the returned list has the name of the player. Thus, using the functions defined above to extract the
    #name of the player here. Another method is implemented here using String.split() which extracts the names along with 
    # various title as well.
    else:
        pos_sent1=ie_preprocess(doc)
        ne_1=named_entity_finding(pos_sent1)
        name = ne_1[0]
        
        #Extracting Name with various title.
        full_name=doc.split("(")[0]
        print(full_name)
    return name

#############################################################################################################

#Defining a function to extract country.
def country_of_origin(doc):
    #declaring an empty list to store the country values
    country = []
    
    #Validation Check. By default there is an empty line present at each odd index in list_of_doc. If the entered index is odd
    #a message is displayed to the user
    if doc == '':
        return ("odd numbered documents are empty lines, please enter another document number")
    else:
        #Extracting the sentence which contains the words national team. The word before national is name pof the country.
        c1=re.findall(r'\b \w+ national team\b', doc)
        
        #Extracting the first word of the sentence fragment.
        c2=c1[0]
        
        #Storing the country name
        country=re.findall(r'\w+',c2)[0]
    
    #returning the country name
    return country

#############################################################################################################

#Defining a function to extract date of birth
def date_of_birth(doc):
    
    #Empty list for storing date
    date = []
    #Validation Check. By default there is an empty line present at each odd index in list_of_doc. If the entered index is odd
    #a message is displayed to the user
    if doc == '':
        return("odd numbered documents are empty lines, please enter another document number")
    else:   
        #Finding first occurence of date in the given format.
        date=re.findall(r'\d{1,2}\s(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}', doc)
        
        #unlisting the value extracted
        date=''.join(date)
    #returning date
    return date

#############################################################################################################

#Extracting the team of the player
def team_of_the_player(doc):
    #empty list for storing the team extracted
    team=[]
    #Validation Check. By default there is an empty line present at each odd index in list_of_doc. If the entered index is odd
    #a message is displayed to the user
    
    #There are two types of team present in the document. One is the national team of the player
    #another is the club for which the player plays. The club name starts with either the word club or FC or United.
    
    #Checking for the presence of the term club and rextracting the sentence.
    if 'club' in doc:
        #extravcting the club of the player
        r1=re.findall(r'\w+ club [^.]*\b',doc)
        tg_sent=ie_preprocess(r1[0])
        #Name has NNP tag, thus extracting the NNPs fromt he returned set.
        propernouns = [word for word,pos in tg_sent if pos == 'NNP']
        team1=' '.join(propernouns)
#         print(team1)
        #National team name
        team2=re.findall(r'\b \w+ national team\b', doc)[0]
        #Storing both the team names in a list
        team.append(team1)
        team.append(team2)
    
    #Checking for the presence of the term United or FC and rextracting the sentence.
    else:
        r1=re.findall(r'\w+ United|FC [^.]*\b',doc)
        tg_sent=ie_preprocess(r1[0])
        propernouns = [word for word,pos in tg_sent if pos == 'NNP']
        team1=' '.join(propernouns) 
#         print(team1)
        #National team
        team2=re.findall(r'\b \w+ national team\b', doc)[0]
        team.append(team1)
        team.append(team2)
       
    #Returning team
    return team

#############################################################################################################

#Defining Function to extract the position of the players in the team.
def position_of_the_player(doc):
    #code goes here
    
    #Defining a corpus which has all the different types of positions available
    positions=['Goalkeeper', 'Defender', 'Centre back', 'Sweeper', 'Full back', 'Striker', 'Wing back', 'Midfielder', 'Centre midfield', 'Defensive midfield', 'Attacking midfield', 
    'Wide midfield', 'Forward', 'Centre forward','Second striker', 'Winger', 'Player styles', 'Goalkeeper', 'Defensive', 'Midfield', 'Attacking']
    
    #Validation Check. By default there is an empty line present at each odd index in list_of_doc. If the entered index is odd
    #a message is displayed to the user 
    if doc == '':
        return("odd numbered documents are empty lines, please enter another document number")
    else:
        #Extracting position matched from the dictionary
        i1=[]
        for i in positions:
            if str(i).lower() in str(doc).lower():
                i1.append(i)
                position=' '.join(i1)
    #Returning the extracted posiiton.
    return position.split()

### Calling the Functions defined above

In [12]:
name_of_the_player(list_of_doc[4])

Neymar da Silva Santos Júnior 


'Neymar'

In [13]:
country_of_origin(list_of_doc[4])        

'Brazil'

Execute the below command to check your fuction


In [14]:
date_of_birth(list_of_doc[4])

'5 February 1992'

Expected output '5 February 1992'

In [15]:
team_of_the_player(list_of_doc[4])

['FC Barcelona Brazil', ' Brazil national team']

In [16]:
position_of_the_player(list_of_doc[4])

['Forward', 'Attacking']

# Create JSON for Data Consumption

Write a function using the outputs from the previous functions to generate JSON-LD output as follows.

Reference: https://json-ld.org/primer/latest/

{ "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",

    "name": "",
    "born": "",
    "country": "",
    "position": [
        { "@id": "http://my-soccer-ontology.com/position",
            "type": ""
        }
     ]   
     "team": [
        { "@id": "http://my-soccer-ontology.com/team",
            "name": ""
        }   
     ]
}


In [17]:
#Creating Json LD for the above defined function for all the documents in the list.

#Length of list_of_doc is 20 (as it has one empty line in between each doc).
for i in range (0,20):
    #Every odd numbered line is an empty line therefore when the line number is even, the functions are called
    #Storing the values returned by each function in the arg variables.
    if i%2 == 0:
        arg1=name_of_the_player(list_of_doc[i]).encode().decode()       
        arg2=date_of_birth(list_of_doc[i])
        arg3=country_of_origin(list_of_doc[i])
        arg4=position_of_the_player(list_of_doc[i])
        arg5=team_of_the_player(list_of_doc[i])
        
        #Defining function to generate the json ld
        def generate_jsonld(arg1,arg2,arg3,arg4,arg5):
            
            #Json LD has Key value pair i.e context and doc as the body
            #Assigning Values to each of the keys present in the context in the doc area.       
            #Name, Born, Country, Position and team hold the values generated from the functions defined above.
            #The output will be given out in this format.
            context = {
                
                #Defining the keys.
                "name": "http://schema.org/name",
                "born": "http://schema.org/born",
                "country": "http://schema.org/country",
                "position":{"@id":"http://schema.org/position",
                            "@type":"@id"},
                "team": {"@id":"http://schema.org/team",
                        "@type":"@id"}
            }
            
            #Defining the values. unicode.unidecod() is used to convert non english characters to readable characters. Otherwise
            #they are printed in \u**** format.
            doc = {
                "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",
                "http://schema.org/name": unidecode.unidecode(arg1),
                "http://schema.org/born": arg2,
                "http://schema.org/country": arg3,                
                "http://schema.org/position":{"@id":"http://schema.org/position",
                            "@type":arg4} ,
                "http://schema.org/team":{"@id":"http://schema.org/position",
                            "@type":arg5} 
            }
            
            #Generating a compacted version of the doc anf context.
            compacted = jsonld.compact(doc, context)
            
            #We need to visualise on the doc part therefore popping the @context part
            compacted.pop("@context")
            
            #Returning the value
            return json.dumps(compacted, indent=2)
#             



        #Calling the Json ld function witht the required arguments.
        print(generate_jsonld(arg1,arg2,arg3,arg4,arg5))
# Note: Because i have printed the name in two ways. Extraction through one way is returned and the extracted name through
#the second way is printed. The full Name is thus printed before the json ld gets printed for each document. It also enhances
#readability. Thus, i have not removed it.

Cristiano Ronaldo dos Santos Aveiro, ComM, GOIH 
{
  "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",
  "born": "5 February 1985",
  "country": "Portugal",
  "name": "Cristiano Ronaldo",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/Forward"
  },
  "team": {
    "@id": "http://schema.org/position",
    "@type": [
      "/Real Madrid Portugal",
      "/ Portugal national team"
    ]
  }
}
Lionel Andrés "Leo" Messi 
{
  "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",
  "born": "24 June 1987",
  "country": "Argentina",
  "name": "Lionel Andres",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/Forward"
  },
  "team": {
    "@id": "http://schema.org/position",
    "@type": [
      "/FC Barcelona Argentina",
      "/ Argentina national team"
    ]
  }
}
Neymar da Silva Santos Júnior 
{
  "@id": "http://my-soccer-ontology.com/footballer/name_of_the_player",
  "born": "5 February 1992",
  "country": "B

# Data Relationship
Identify one other relation (besides team and player) and write a function to extract this. Also extend the JSON-LD output accordingly.

In [18]:
#Extracting debut information about each player.
#Using regex to find the lines giving information about the debut matches of the player for each doc.

#Defining the function.
def debut_info(doc):   
    
    #Fetching information related to debut match and returning the result.
    return (re.findall(r'[^.]* debut [^.]*\.|[^.]* debut [^.]*\.', doc))

In [19]:

#Extending Json LD

#This part of the code is done with the doc at 0 in list_of_doc
arg_1=name_of_the_player(list_of_doc[0])
arg_2=date_of_birth(list_of_doc[0])
arg_3=country_of_origin(list_of_doc[0])
arg_4=position_of_the_player(list_of_doc[0])
arg_5=team_of_the_player(list_of_doc[0])
arg_6=debut_info(list_of_doc[0])

#Creating Json LD using all the 6 arguments here
def generate_jsonld2(arg_1,arg_2,arg_3,arg_4,arg_5,arg_6):
   
    #The Value part of the Json LD part    
    doc = {
                "@id": "http://my-soccer-ontology.com/footballer",
                "http://schema.org/name": unidecode.unidecode(arg_1),
                "http://schema.org/born": arg_2,
                "http://schema.org/country": arg_3,                
                "http://schema.org/position":{"@id":"http://schema.org/position",
                            "@type":arg_4} ,
                "http://schema.org/debut":{"@id":"http://schema.org/debut",
                            "@type":arg_6}
            }
    #The Key part of the Json LD part
    context = {

                "name": "http://schema.org/name",
                "born": "http://schema.org/born",
                "country": "http://schema.org/country",
                "position":{"@id":"http://schema.org/position",
                            "@type":"@id"},
                "team": {"@id":"http://schema.org/team",
                        "@type":"@id"},
                "debut": {"@id":"http://schema.org/debut",
                        "@type":"@id"}
            }
    compacted = jsonld.compact(doc, context)
    compacted.pop("@context")
    #Printing final Json LD model
    print(json.dumps(compacted, indent=2))

#Calling the function
generate_jsonld2(arg_1,arg_2,arg_3,arg_4,arg_5, arg_6)

Cristiano Ronaldo dos Santos Aveiro, ComM, GOIH 
{
  "@id": "http://my-soccer-ontology.com/footballer",
  "born": "5 February 1985",
  "country": "Portugal",
  "debut": {
    "@id": "http://schema.org/debut",
    "@type": "/ Ronaldo made his international debut for Portugal in August 2003, at the age of 18."
  },
  "name": "Cristiano Ronaldo",
  "position": {
    "@id": "http://schema.org/position",
    "@type": "/Forward"
  }
}
