In [1]:
# Import libraries 
import wikipedia as wiki 
import re 
import requests 
import spacy 
import spacy_transformers 
from spacy import displacy 
from spacy.matcher import Matcher 
import networkx as nx 
from pyvis.network import Network 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set up language of response 
wiki.set_lang("en") 

# Open any article from Wikipedia 
title = "Hoboken" 
data = wiki.page(title).content 

print(data)

Hoboken ( HOH-boh-kən; Unami: Hupokàn) is a city in Hudson County in the U.S. state of New Jersey. Hoboken is part of the New York metropolitan area and is the site of Hoboken Terminal, a major transportation hub. As of the 2020 United States census, the city's population was 60,419, an increase of 10,414 (+20.8%) from the 2010 census count of 50,005, which in turn reflected an increase of 11,428 (+29.6%) from the 38,577 counted in the 2000 census. The Census Bureau's Population Estimates Program calculated a population of 57,010 for 2023, making it the 708th-most populous municipality in the nation. With more than 42,400 inhabitants per square mile (16,400/km2) in data from the 2010 census, Hoboken was ranked as the third-most densely populated municipality in the United States among cities with a population above 50,000. In the 2020 census, the city's population density climbed to more than 48,300 inhabitants per square mile (18,600/km2) of land, ranked fourth in the county behind Gu

In [3]:
def preprocess_data(data: str) -> str:
    """
    Preprocess the data by removing references and formatting.
    """
    # Convert to lowercase and replace newlines 
    data = data.lower().replace('\n', "") 
    # Remove last part of the text, certain punctuation marks, headings, as well as any text within the parentheses 
    data = re.sub('== see also ==.*|[@#:&\"]|===.*?===|==.*?==|\(.*?\)', '', data) 
    return data 

data = preprocess_data(data) 
print(data)

hoboken  is a city in hudson county in the u.s. state of new jersey. hoboken is part of the new york metropolitan area and is the site of hoboken terminal, a major transportation hub. as of the 2020 united states census, the city's population was 60,419, an increase of 10,414  from the 2010 census count of 50,005, which in turn reflected an increase of 11,428  from the 38,577 counted in the 2000 census. the census bureau's population estimates program calculated a population of 57,010 for 2023, making it the 708th-most populous municipality in the nation. with more than 42,400 inhabitants per square mile  in data from the 2010 census, hoboken was ranked as the third-most densely populated municipality in the united states among cities with a population above 50,000. in the 2020 census, the city's population density climbed to more than 48,300 inhabitants per square mile  of land, ranked fourth in the county behind guttenberg, union city and west new york.hoboken was first settled by eu

In [4]:
# Load language model 
nlp = spacy.load("en_core_web_lg") 
doc = nlp(data) 

# Display named entities (comment out to see the output, quite large output)
#displacy.render(doc, style="ent", jupyter=True) 

In [5]:
# Compute coreference clusters 
# Add coreference resolution to the pipeline 
nlp.add_pipe("coreferee") 

# Process the document 
doc = nlp(data) 

In [6]:
# Print the coreference clusters 
doc._.coref_chains.print()

0: hoboken(0), hoboken(17)
1: county(7), state(12)
2: york(23), city(48)
3: states(142), city(157)
4: york.hoboken(187), city(214), hoboken(260)
5: township(242), it(244)
6: institute(275), it(290)
7: states(288), city(309), city(321)
8: jersey(334), city(352), city(430)
9: hoboken(412), hoboken(500), hoboken(567), hoboken(585), hoboken(628)
10: stevens(418), he(420)
11: hackingh(479), its(484)
12: weehawken(482), city(529), town(583), city(601), it(604)
13: population(624), it(638)
14: bergen.hoboken(652), hoboken(780)
15: hudson(722), his(736), hudson(758)
16: company(732), company(826)
17: area(776), area(798), area(934)
18: pauw(811), he(838), he(951), his(956)
19: land(869), land(948), land(977)
20: pavonia(942), pavonia(1037)
21: putten(981), putten(1025), his(1029)
22: amsterdam.in(1051), they(1070)
23: hoboken(1101), it(1109)
24: dutch(1131), it(1134)
25: bayard(1149), bayard(1202), bayard(1223)
26: york(1168), city(1173)
27: stevens(1235), stevens(1247), stevens(1263)
28: worl

In [7]:
# Resolving coreferences 
resolved_data: str = "" 

for token in doc: 
    resolved_coref = doc._.coref_chains.resolve(token) 
    if resolved_coref: 
        resolved_data += " " + " and ".join(r.text for r in resolved_coref) 
    elif token.dep_ == "punct":  
        resolved_data += token.text 
    else: 
        resolved_data += " " + token.text 

In [8]:
print(resolved_data) 

 hoboken   is a city in hudson county in the u.s . county of new jersey. hoboken is part of the new york metropolitan area and is the site of hoboken terminal, a major transportation hub. as of the 2020 united states census, the york 's population was 60,419, an increase of 10,414   from the 2010 census count of 50,005, which in turn reflected an increase of 11,428   from the 38,577 counted in the 2000 census. the census bureau 's population estimates program calculated a population of 57,010 for 2023, making it the 708th- most populous municipality in the nation. with more than 42,400 inhabitants per square mile   in data from the 2010 census, hoboken was ranked as the third- most densely populated municipality in the united states among cities with a population above 50,000. in the 2020 census, the states 's population density climbed to more than 48,300 inhabitants per square mile   of land, ranked fourth in the county behind guttenberg, union city and west new york.hoboken was firs

In [9]:
# Extract relationships 
def extract_relationship(sentence): 
    doc = nlp(sentence) 

    first, last = None, None 

    for chunk in doc.noun_chunks: 
        if not first: 
            first = chunk 
        else: 
            last = chunk 
    
    if first and last: 
        return (first.text.strip(), last.text.strip(), str(doc[first.end:last.start]).strip()) 
    
    return (None, None, None) 

In [15]:
print_five_words = lambda sentence: '\n'.join(' '.join(sentence.split()[i:i+5]) for i in range(0, len(sentence.split()), 5))

In [16]:
# Create a graph 
graph_doc = nlp(resolved_data) 

nx_graph = nx.DiGraph() 

for sent in enumerate(graph_doc.sents): 
    if len(sent[1]) > 3: 
        (a, b, c) = extract_relationship(str(sent[1])) 

        # Add the relationship to the graph 
        if a and b: 
            nx_graph.add_node(a, size=5) 
            nx_graph.add_node(b, size=5) 
            nx_graph.add_edge(a, b, weight=1, title=print_five_words(c), arrows="to")

In [17]:
g = Network(notebook=True, cdn_resources='in_line', width="100%", height="600px") 
g.from_nx(nx_graph)  
g.show("graph.html")

graph.html


In [18]:
print(nx_graph.edges(['stevens']))

[('stevens', 'steel piers'), ('stevens', 'coastal stability'), ('stevens', 'delivery')]
