In [140]:
"""
Main module for the wikitree application.
"""

import argparse
import spacy
nlp = spacy.load('en_core_web_sm')
! pip install wikipedia
import wikipedia
from wikipedia.exceptions import DisambiguationError, PageError
from tabulate import tabulate
from spacy.matcher import Matcher
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

#parser = argparse.ArgumentParser()
#parser.add_argument('query', type=str, metavar='Q',
#                    help='Query to retrieve as root from Wikipedia.')
#parser.add_argument('--depth', '-d', type=int, default=2,
#                   help='Max tree depth.')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [141]:

class RelationshipGraph(object):
    """
    Container for a relationship graph: the nodes that were fetched and the edges that link them.

    Nodes live in a dict keyed by node name; edges live in a list of
    ``(from_name, to_name, label)`` tuples.
    """

    def __init__(self, query: str, depth: int = 2, width: int = 2) -> None:
        """
        Create an empty graph that remembers the query for its root node.
        :param query: Query used to create the initial node when :meth:`fetch` is called.
        :param depth: Depth value handed to the root node's ``fetch``.
        :param width: Width value handed to the root node's ``fetch``.
        """
        self.initial_query = query
        self.depth = depth
        self.width = width
        # Name -> node mapping, filled in by the nodes themselves while fetching.
        self.nodes = {}
        # List of 3-tuples: name of the source node, name of the target node, relationship label.
        self.edges = []

    def fetch(self):
        """
        Create the root node from the initial query and let it populate this graph.
        """
        root = GraphNode(self.initial_query)
        root.fetch(self, self.depth, self.width)

    def display(self):
        """
        Print the edges as a table, followed by a blank line and one node name per line.
        """
        edge_table = tabulate(self.edges, headers=('From', 'To', 'Label'))
        print(edge_table)

        node_names = [node.name for node in self.nodes.values()]
        print('\n' + '\n'.join(node_names))


class GraphNode(object):
    """
    This class represents a node in the concept graph.

    A node is created from a query string. Calling :meth:`fetch` resolves the query to a Wikipedia
    page, registers the node in the graph under the page title and, while depth allows, expands the
    most frequently mentioned PERSON entities of the page into child nodes.
    """

    def __init__(self, query):
        """
        Initialize graph node.
        :param query: Query for this node.
        """
        self.query = query
        self.page = None  # Wikipedia page object; set by fetch() or pre-seeded by the parent node
        self.name = None  # title of the resolved page; set by fetch()

    @staticmethod
    def _load_page(query, announce=False):
        """
        Load the Wikipedia page for ``query``, falling back to the first disambiguation option.
        :param query: Page title to look up (auto-suggest is disabled).
        :param announce: When true, print which option a disambiguation was resolved to.
        :return: The page object returned by ``wikipedia.page``.
        :raises PageError: If the query, or the chosen disambiguation option, has no page.
        :raises DisambiguationError: If there is no option to fall back to, or the chosen option is
            itself ambiguous.
        """
        try:
            return wikipedia.page(query, auto_suggest=False)
        except DisambiguationError as err:
            options = err.options
            if not options:
                raise
            first_option = options[0]
            if announce:
                print(f'Disambiguating to {first_option}')
            return wikipedia.page(first_option, auto_suggest=False)

    def _ranked_candidates(self, labels=('PERSON', )):
        """
        Return the entity texts found in this node's page, most frequent first.
        :param labels: Entity labels to keep.
        :return: List of entity texts whose label is in ``labels``.
        """
        entity_counts = {}
        for ent in nlp(self.page.content).ents:
            key = (ent.text, ent.label_)
            entity_counts[key] = entity_counts.get(key, 0) + 1

        ascending = sorted(entity_counts.items(), key=lambda item: item[1])
        return [text for (text, label), _count in reversed(ascending) if label in labels]

    def fetch(self, graph: "RelationshipGraph", depth: int = 2, width: int = 2):
        """
        Retrieve information for this node in the graph from Wikipedia. Determine candidates for adjacent
        nodes and fetch for those as well with depth-1.
        :param graph: Graph to register this node and its outgoing edges in.
        :param depth: Depth of search. With depth 0 the node is registered but not expanded.
        :param width: Maximum number of adjacent nodes selected for this node.
        :raises PageError: If no page exists for this node's query.
        """
        print(f'Fetching: {self.query}')
        # A parent node may already have downloaded the page while selecting candidates;
        # reuse it instead of requesting it a second time.
        if self.page is None:
            self.page = self._load_page(self.query, announce=True)
        self.name = self.page.title

        graph.nodes[self.name] = self

        if depth <= 0:
            return

        # Select entities: walk the candidates from most to least frequent until `width`
        # distinct, not-yet-known pages have been found.
        selected = []          # (entity text, resolved page) pairs, in selection order
        selected_titles = set()
        for candidate in self._ranked_candidates():
            if len(selected) >= width:
                break
            try:
                page = self._load_page(candidate)
            except (PageError, DisambiguationError):
                # No usable page for this entity text; try the next candidate.
                continue
            print(f'{candidate} -> {page.title}')
            title = page.title
            # Different entity texts ("Messi", "Lionel Messi") can resolve to the same page, so
            # de-duplicate on the resolved title rather than on the entity text.
            if title == self.name or title in graph.nodes or title in selected_titles:
                continue
            selected_titles.add(title)
            selected.append((candidate, page))

        # Get selected entities
        for candidate, page in selected:
            # graph.nodes is keyed by page title; a previously expanded sibling may have added
            # this page in the meantime.
            node = graph.nodes.get(page.title)
            if node is None:
                node = GraphNode(candidate)
                node.page = page
                node.fetch(graph, depth=depth - 1, width=width)

            graph.edges.append((self.name, node.name, 'UNK'))




In [142]:
# Script entry point: build the relationship graph for a fixed root query and print it.
if __name__ == '__main__':
    print('Welcome to Wikitree!')
    # Command-line parsing is disabled (the argparse parser is commented out in the first cell),
    # so the root query is hard-coded below.
    #args = parser.parse_args()

    # depth and width are left at RelationshipGraph's defaults (2 and 2).
    graph = RelationshipGraph("Lionel Messi")
    graph.fetch()

    graph.display()


Welcome to Wikitree!
Fetching: Lionel Messi
Messi -> Lionel Messi
Maradona -> Diego Maradona
Cristiano Ronaldo -> Cristiano Ronaldo
Fetching: Maradona
Maradona -> Diego Maradona
Diego Maradona -> Diego Maradona
Boca Juniors -> Boca Juniors
Diego -> Diego
Fetching: Boca Juniors
Fetching: Diego
Fetching: Cristiano Ronaldo
Ronaldo -> Ronaldo
Cristiano Ronaldo -> Cristiano Ronaldo
Atlético Madrid -> Atlético Madrid
Fetching: Ronaldo
Fetching: Atlético Madrid
From               To                 Label
-----------------  -----------------  -------
Diego Maradona     Boca Juniors       UNK
Diego Maradona     Diego              UNK
Lionel Messi       Diego Maradona     UNK
Cristiano Ronaldo  Ronaldo            UNK
Cristiano Ronaldo  Atlético Madrid    UNK
Lionel Messi       Cristiano Ronaldo  UNK

Lionel Messi
Diego Maradona
Boca Juniors
Diego
Cristiano Ronaldo
Ronaldo
Atlético Madrid


In [143]:
# Build the graph for the same root query again under the name `test`; the following cells
# inspect `test.nodes`.
# NOTE(review): this repeats the full Wikipedia crawl already done for `graph` — consider reusing it.
test = RelationshipGraph("Lionel Messi")
test.fetch()
test.display()

Fetching: Lionel Messi
Messi -> Lionel Messi
Maradona -> Diego Maradona
Cristiano Ronaldo -> Cristiano Ronaldo
Fetching: Maradona
Maradona -> Diego Maradona
Diego Maradona -> Diego Maradona
Boca Juniors -> Boca Juniors
Diego -> Diego
Fetching: Boca Juniors
Fetching: Diego
Fetching: Cristiano Ronaldo
Ronaldo -> Ronaldo
Cristiano Ronaldo -> Cristiano Ronaldo
Atlético Madrid -> Atlético Madrid
Fetching: Ronaldo
Fetching: Atlético Madrid
From               To                 Label
-----------------  -----------------  -------
Diego Maradona     Boca Juniors       UNK
Diego Maradona     Diego              UNK
Lionel Messi       Diego Maradona     UNK
Cristiano Ronaldo  Ronaldo            UNK
Cristiano Ronaldo  Atlético Madrid    UNK
Lionel Messi       Cristiano Ronaldo  UNK

Lionel Messi
Diego Maradona
Boca Juniors
Diego
Cristiano Ronaldo
Ronaldo
Atlético Madrid


In [144]:
# Get the text content of the "Lionel Messi" Wikipedia page again and keep it in `pag`.
# If the title is ambiguous, fall back to the first option carried by the disambiguation error.
try:
  pag = wikipedia.page("Lionel Messi", auto_suggest=False).content
except DisambiguationError as err:
  pag = wikipedia.page(err.args[1][0], auto_suggest=False).content

In [145]:
# Nodes: dict mapping each fetched page title to its GraphNode object.
nodes = test.nodes
nodes

{'Atlético Madrid': <__main__.GraphNode at 0x7f1ddddfd9d0>,
 'Boca Juniors': <__main__.GraphNode at 0x7f1ddde11510>,
 'Cristiano Ronaldo': <__main__.GraphNode at 0x7f1ddde24dd0>,
 'Diego': <__main__.GraphNode at 0x7f1ddddfd8d0>,
 'Diego Maradona': <__main__.GraphNode at 0x7f1dc2172710>,
 'Lionel Messi': <__main__.GraphNode at 0x7f1dc212add0>,
 'Ronaldo': <__main__.GraphNode at 0x7f1dc11d5290>}

In [146]:
# Helper that returns the names of the nodes as a list
def getList(dict):
    """
    Collect the keys of a mapping into a new list.
    :param dict: Mapping (anything with a ``keys()`` method) whose keys are wanted.
    :return: List of the keys, in the mapping's iteration order.
    """
    return [name for name in dict.keys()]

In [147]:
# List of node names (the keys of `nodes`):
List=getList(nodes)
List

['Lionel Messi',
 'Diego Maradona',
 'Boca Juniors',
 'Diego',
 'Cristiano Ronaldo',
 'Ronaldo',
 'Atlético Madrid']

In [148]:
# Create the lists with the different relationships between the nodes.
# Each list is a pair of node names, picked by their position in `List` as displayed above.
# (Obviously this needs to be done in a better way.)
Nivel_1_a = [List[i] for i in (0, 1)]
Nivel_1_b = [List[i] for i in (0, 4)]
Nivel_2_a = [List[i] for i in (1, 2)]
Nivel_2_b = [List[i] for i in (1, 3)]
Nivel_2_c = [List[i] for i in (4, 5)]
Nivel_2_d = [List[i] for i in (4, 6)]

In [149]:
# Function that extracts the relationship from a sentence
# (Taken from the professor's article)
def get_relation(sent):
    """
    Extract the relation phrase of a sentence: its ROOT token, optionally followed by a
    preposition, an agent and an adjective.

    The pattern is registered with ``matcher.add(name, [pattern])``, the signature accepted by
    spaCy 2.2.2 and later (including spaCy 3, which rejects the older ``add(name, None, pattern)``).

    :param sent: Sentence text to analyse.
    :return: Text of the last match reported by the matcher, or an empty string when nothing
        matched (for example for an empty sentence).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # Define the pattern: only the ROOT token is mandatory, every other token is optional.
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # Nothing to index into; the original `matches[-1]` lookup would raise IndexError here.
        return ''

    # Each match is a (match_id, start, end) triple; use the last one reported.
    _match_id, start, end = matches[-1]
    return doc[start:end].text

In [150]:
# The text does not refer to them by first name and surname, so use only the surnames to try out the functions:
Nivel_1_a = ['Messi', 'Maradona']


In [151]:
# Function that extracts the sentences that mention both entities
def get_sen_w_words(text,list1):
    """
    Return the sentences of ``text`` in which more than one of the given names is recognised
    as an entity.
    :param text: Text to split into sentences.
    :param list1: Entity texts to look for.
    :return: List of the sentences whose recognised entity texts share at least two distinct
        values with ``list1``, in their original order.
    """
    matching_sentences = []
    for sentence in sent_tokenize(text):
        # Distinct entity texts recognised in this sentence.
        entity_texts = {ent.text for ent in nlp(sentence).ents}
        shared = entity_texts.intersection(list1)
        if len(shared) >= 2:
            matching_sentences.append(sentence)
    return matching_sentences

In [152]:
# Obtain the sentences of the Wikipedia page text (`pag`) in which both entities of Nivel_1_a are recognised
sentences_to_analyze = get_sen_w_words(pag,Nivel_1_a)

In [154]:
# Extract one relation phrase per selected sentence (same order as sentences_to_analyze).
relations = [get_relation(sentence) for sentence in sentences_to_analyze]
relations


['proved',
 'collected',
 'criticised for',
 'scored',
 'outlined',
 'followed in',
 'gained greater',
 'was',
 'said in',
 'been',
 'echoed',
 'held in lesser',
 'is in',
 'questioned']