In [1]:
import fitz  # PyMuPDF
import spacy
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Preprocess text using spaCy for NLP."""
    doc = nlp(text)
    return doc

def extract_entities(doc):
    """Extract entities like diseases, drugs, and treatments from text."""
    # Expanded the list of entity types to capture more relevant medical entities
    target_labels = [
        "DISEASE", "PRODUCT", "TREATMENT", "PERSON", "ORG", "GPE", "FACILITY",
        "MEDICAL_DEVICE", "PROCEDURE", "CLINICAL_TRIAL", "RESEARCHER", 
        "STUDY_CENTER", "INTERVENTION", "MEDICATION", "COMORBIDITY", 
        "FOLLOW_UP_VISIT", "HEALTHCARE_PROVIDER", "EVENT"
    ]
    
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in target_labels]
    return entities

def extract_relationships(doc):
    """Extract subject-verb-object triples for relationships."""
    relationships = []
    
    # Expanding logic to capture more relationships by including compound subjects/objects
    for sent in doc.sents:
        subject = None
        object_ = None
        relationship = None

        for token in sent:
            if token.dep_ in ('nsubj', 'nsubjpass', 'csubj', 'csubjpass'):  # Subject types
                subject = token.text
            if token.dep_ in ('dobj', 'pobj', 'attr', 'prep'):  # Object types
                object_ = token.text
            if token.dep_ == 'ROOT':  # Verb (Relationship)
                relationship = token.text
        
        if subject and object_ and relationship:
            relationships.append((subject, relationship, object_))
    
    return relationships

# Store extracted entities and relationships in lists
entities_list = []
relationships_list = []

# List of PDF paths
pdf_paths = [
    "C:/Users/user/Downloads/41467_2022_Article_32307.pdf", 
    "C:/Users/user/Downloads/K_G_2.pdf", 
    "C:/Users/user/Downloads/K_G_3.pdf", 
    "C:/Users/user/Downloads/K_G_4.pdf", 
    "C:/Users/user/Downloads/K_G_5.pdf"
]

# Initialize total entity and relationship counts
total_entities = 0
total_relationships = 0

for path in pdf_paths:
    # Extract text from PDF
    text = extract_text_from_pdf(path)
    
    # Preprocess the text
    doc = preprocess_text(text)
    
    # Extract entities and relationships
    entities = extract_entities(doc)
    relationships = extract_relationships(doc)
    
    # Append the results to the lists
    entities_list.extend(entities)
    relationships_list.extend(relationships)

    # Update total counts
    total_entities += len(entities)
    total_relationships += len(relationships)

# Convert to pandas DataFrame for easier viewing
entities_df = pd.DataFrame(entities_list, columns=["Entity", "Label"])
relationships_df = pd.DataFrame(relationships_list, columns=["Subject", "Relationship", "Object"])

# Display total entity and relationship counts
print(f"Total entities extracted from all PDFs: {total_entities}")
print(f"Total relationships extracted from all PDFs: {total_relationships}")

# Display first 5 rows of entities and relationships for quick inspection
print("\nEntities:")
print(entities_df.head())

print("\nRelationships:")
print(relationships_df.head())


Total entities extracted from all PDFs: 2600
Total relationships extracted from all PDFs: 1247

Entities:
                                              Entity    Label
0  Article\nhttps://doi.org/10.1038/s41467-022-32...  PRODUCT
1                                              GLP-1      ORG
2                                         ﬁve years1   PERSON
3                                            foods20  PRODUCT
4                                      liraglutide21   PERSON

Relationships:
  Subject  Relationship    Object
0    loss       appears      loss
1  adults    randomized      loss
2   which           was    weight
3     who  investigated  protocol
4   group     decreased       day


In [1]:
!pip install numpy<2.0


The system cannot find the file specified.


In [2]:
!pip install matplotlib

import fitz  # PyMuPDF
import spacy
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Preprocess text using spaCy for NLP."""
    doc = nlp(text)
    return doc

def extract_entities(doc):
    """Extract entities like diseases, drugs, and treatments from text."""
    # Expanded the list of entity types to capture more relevant medical entities
    target_labels = [
        "DISEASE", "PRODUCT", "TREATMENT", "PERSON", "ORG", "GPE", "FACILITY",
        "MEDICAL_DEVICE", "PROCEDURE", "CLINICAL_TRIAL", "RESEARCHER", 
        "STUDY_CENTER", "INTERVENTION", "MEDICATION", "COMORBIDITY", 
        "FOLLOW_UP_VISIT", "HEALTHCARE_PROVIDER", "EVENT"
    ]
    
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in target_labels]
    return entities

def extract_relationships(doc):
    """Extract subject-verb-object triples for relationships."""
    relationships = []
    
    # Expanding logic to capture more relationships by including compound subjects/objects
    for sent in doc.sents:
        subject = None
        object_ = None
        relationship = None

        for token in sent:
            if token.dep_ in ('nsubj', 'nsubjpass', 'csubj', 'csubjpass'):  # Subject types
                subject = token.text
            if token.dep_ in ('dobj', 'pobj', 'attr', 'prep'):  # Object types
                object_ = token.text
            if token.dep_ == 'ROOT':  # Verb (Relationship)
                relationship = token.text
        
        if subject and object_ and relationship:
            relationships.append((subject, relationship, object_))
    
    return relationships

def create_knowledge_graph(relationships):
    """Create a knowledge graph using NetworkX from extracted relationships."""
    G = nx.DiGraph()  # Directed graph to show relationships

    for subj, rel, obj in relationships:
        G.add_edge(subj, obj, label=rel)  # Create an edge with a relationship label

    return G

def draw_knowledge_graph(G):
    """Draws the knowledge graph using NetworkX and Matplotlib."""
    pos = nx.spring_layout(G)  # Layout for positioning nodes
    plt.figure(figsize=(12, 8))
    
    # Draw nodes and edges
    nx.draw(G, pos, with_labels=True, node_size=3000, node_color="skyblue", font_size=10, font_weight="bold", arrows=True, arrowstyle="->", arrowsize=20)

    # Draw edge labels (relationships)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')

    plt.title("Knowledge Graph of Extracted Entities and Relationships")
    plt.show()


# all nodes along with their relationships (edges)
def visualize_graph_pyvis_with_relationships(entities, relationships):
    # Initialize pyvis Network object with physics enabled for interaction
    net = Network(notebook=True, height='750px', width='100%', directed=True)
    
    # Create a set to track nodes and avoid duplicates
    nodes_set = set()

    # Add nodes for entities
    for entity, label in entities:
        if entity not in nodes_set:
            net.add_node(entity, label=entity)
            nodes_set.add(entity)

    # Add edges for relationships (subject-relationship-object)
    for subject, relationship, object_ in relationships:
        if subject not in nodes_set:
            net.add_node(subject, label=subject)
            nodes_set.add(subject)

        if object_ not in nodes_set:
            net.add_node(object_, label=object_)
            nodes_set.add(object_)

        # Add edge (relationship) between subject and object
        net.add_edge(subject, object_, label=relationship)

    # Generate and display the interactive graph with nodes and relationships
    return net.show("nodes_with_relationships_graph.html")

# Call the function to visualize nodes with relationships
visualize_graph_pyvis_with_relationships(entities_list, relationships_list)


# Store extracted entities and relationships in lists
entities_list = []
relationships_list = []

# List of PDF paths
pdf_paths = [
    "C:/Users/user/Downloads/41467_2022_Article_32307.pdf", 
    "C:/Users/user/Downloads/K_G_2.pdf", 
    "C:/Users/user/Downloads/K_G_3.pdf", 
    "C:/Users/user/Downloads/K_G_4.pdf", 
    "C:/Users/user/Downloads/K_G_5.pdf"
]

# Initialize total entity and relationship counts
total_entities = 0
total_relationships = 0

for path in pdf_paths:
    # Extract text from PDF
    text = extract_text_from_pdf(path)
    
    # Preprocess the text
    doc = preprocess_text(text)
    
    # Extract entities and relationships
    entities = extract_entities(doc)
    relationships = extract_relationships(doc)
    
    # Append the results to the lists
    entities_list.extend(entities)
    relationships_list.extend(relationships)

    # Update total counts
    total_entities += len(entities)
    total_relationships += len(relationships)

# Convert to pandas DataFrame for easier viewing
entities_df = pd.DataFrame(entities_list, columns=["Entity", "Label"])
relationships_df = pd.DataFrame(relationships_list, columns=["Subject", "Relationship", "Object"])

# Display total entity and relationship counts
print(f"Total entities extracted from all PDFs: {total_entities}")
print(f"Total relationships extracted from all PDFs: {total_relationships}")

# Display first 5 rows of entities and relationships for quick inspection
print("\nEntities:")
print(entities_df.head())

print("\nRelationships:")
print(relationships_df.head())

# Create and display the knowledge graph
G = create_knowledge_graph(relationships_list)
draw_knowledge_graph(G)


Defaulting to user installation because normal site-packages is not writeable


ModuleNotFoundError: No module named 'matplotlib.pyplot'

In [6]:
import sys
print(sys.executable)
print(sys.version)


C:\ProgramData\anaconda3\python.exe
3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]


In [2]:
!pip install --upgrade spacy torch matplotlib


Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/5a/6a/775b93d6888c31f1f1fc457e4f5cc89f0984412d5dcdef792b8f2aa6e812/torch-2.4.1-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.4.1-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting sympy (from torch)
  Obtaining dependency information for sympy from https://files.pythonhosted.org/packages/99/ff/c87e0622b1dadea79d2fb0b25ade9ed98954c9033722eb707053d310d4f3/sympy-1.13.3-py3-none-any.whl.metadata
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.19.0 (from spacy)
  Obtaining dependency information for numpy>=1.19.0 from https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl.metadata
  Downloading numpy-2.0.2-cp311-cp311-win_amd64.whl.metadata (59 kB)
     ---------------------

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\user\\AppData\\Roaming\\Python\\Python311\\site-packages\\~orch\\lib\\asmjit.dll'
Check the permissions.




   ------- -------------------------------- 1.1/6.2 MB 2.7 MB/s eta 0:00:02
   -------- ------------------------------- 1.2/6.2 MB 2.6 MB/s eta 0:00:02
   --------- ------------------------------ 1.4/6.2 MB 2.6 MB/s eta 0:00:02
   --------- ------------------------------ 1.5/6.2 MB 2.6 MB/s eta 0:00:02
   ---------- ----------------------------- 1.7/6.2 MB 2.7 MB/s eta 0:00:02
   ----------- ---------------------------- 1.8/6.2 MB 2.7 MB/s eta 0:00:02
   ----------- ---------------------------- 1.8/6.2 MB 2.5 MB/s eta 0:00:02
   ------------ --------------------------- 1.9/6.2 MB 2.5 MB/s eta 0:00:02
   ------------- -------------------------- 2.0/6.2 MB 2.5 MB/s eta 0:00:02
   ------------- -------------------------- 2.1/6.2 MB 2.5 MB/s eta 0:00:02
   -------------- ------------------------- 2.2/6.2 MB 2.4 MB/s eta 0:00:02
   -------------- ------------------------- 2.3/6.2 MB 2.4 MB/s eta 0:00:02
   --------------- ------------------------ 2.4/6.2 MB 2.4 MB/s eta 0:00:02
   --------

In [1]:
import fitz  # PyMuPDF
import spacy
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Preprocess text using spaCy for NLP."""
    doc = nlp(text)
    return doc

def extract_entities(doc):
    """Extract entities like diseases, drugs, and treatments from text."""
    # Expanded the list of entity types to capture more relevant medical entities
    target_labels = [
        "DISEASE", "PRODUCT", "TREATMENT", "PERSON", "ORG", "GPE", "FACILITY",
        "MEDICAL_DEVICE", "PROCEDURE", "CLINICAL_TRIAL", "RESEARCHER", 
        "STUDY_CENTER", "INTERVENTION", "MEDICATION", "COMORBIDITY", 
        "FOLLOW_UP_VISIT", "HEALTHCARE_PROVIDER", "EVENT"
    ]
    
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in target_labels]
    return entities

def extract_relationships(doc):
    """Extract subject-verb-object triples for relationships."""
    relationships = []
    
    # Expanding logic to capture more relationships by including compound subjects/objects
    for sent in doc.sents:
        subject = None
        object_ = None
        relationship = None

        for token in sent:
            if token.dep_ in ('nsubj', 'nsubjpass', 'csubj', 'csubjpass'):  # Subject types
                subject = token.text
            if token.dep_ in ('dobj', 'pobj', 'attr', 'prep'):  # Object types
                object_ = token.text
            if token.dep_ == 'ROOT':  # Verb (Relationship)
                relationship = token.text
        
        if subject and object_ and relationship:
            relationships.append((subject, relationship, object_))
    
    return relationships

# Store extracted entities and relationships in lists
entities_list = []
relationships_list = []

# List of PDF paths
pdf_paths = [
    "C:/Users/user/Downloads/41467_2022_Article_32307.pdf", 
    "C:/Users/user/Downloads/K_G_2.pdf", 
    "C:/Users/user/Downloads/K_G_3.pdf", 
    "C:/Users/user/Downloads/K_G_4.pdf", 
    "C:/Users/user/Downloads/K_G_5.pdf"
]

# Initialize total entity and relationship counts
total_entities = 0
total_relationships = 0

for path in pdf_paths:
    # Extract text from PDF
    text = extract_text_from_pdf(path)
    
    # Preprocess the text
    doc = preprocess_text(text)
    
    # Extract entities and relationships
    entities = extract_entities(doc)
    relationships = extract_relationships(doc)
    
    # Append the results to the lists
    entities_list.extend(entities)
    relationships_list.extend(relationships)

    # Update total counts
    total_entities += len(entities)
    total_relationships += len(relationships)

# Convert to pandas DataFrame for easier viewing
entities_df = pd.DataFrame(entities_list, columns=["Entity", "Label"])
relationships_df = pd.DataFrame(relationships_list, columns=["Subject", "Relationship", "Object"])

# Display total entity and relationship counts
print(f"Total entities extracted from all PDFs: {total_entities}")
print(f"Total relationships extracted from all PDFs: {total_relationships}")

# Display first 5 rows of entities and relationships for quick inspection
print("\nEntities:")
print(entities_df.head())

print("\nRelationships:")
print(relationships_df.head())


Total entities extracted from all PDFs: 2600
Total relationships extracted from all PDFs: 1247

Entities:
                                              Entity    Label
0  Article\nhttps://doi.org/10.1038/s41467-022-32...  PRODUCT
1                                              GLP-1      ORG
2                                         ﬁve years1   PERSON
3                                            foods20  PRODUCT
4                                      liraglutide21   PERSON

Relationships:
  Subject  Relationship    Object
0    loss       appears      loss
1  adults    randomized      loss
2   which           was    weight
3     who  investigated  protocol
4   group     decreased       day


In [2]:
from pyvis.network import Network
# all nodes along with their relationships (edges)
def visualize_graph_pyvis_with_relationships(entities, relationships):
    # Initialize pyvis Network object with physics enabled for interaction
    net = Network(notebook=True, height='750px', width='100%', directed=True)
    
    # Create a set to track nodes and avoid duplicates
    nodes_set = set()

    # Add nodes for entities
    for entity, label in entities:
        if entity not in nodes_set:
            net.add_node(entity, label=entity)
            nodes_set.add(entity)

    # Add edges for relationships (subject-relationship-object)
    for subject, relationship, object_ in relationships:
        if subject not in nodes_set:
            net.add_node(subject, label=subject)
            nodes_set.add(subject)

        if object_ not in nodes_set:
            net.add_node(object_, label=object_)
            nodes_set.add(object_)

        # Add edge (relationship) between subject and object
        net.add_edge(subject, object_, label=relationship)

    # Generate and display the interactive graph with nodes and relationships
    return net.show("nodes_with_relationships_graph.html")

# Call the function to visualize nodes with relationships
visualize_graph_pyvis_with_relationships(entities_list, relationships_list)


nodes_with_relationships_graph.html


In [3]:
from pyvis.network import Network
#visualize only the nodes without any edges or relationships
def visualize_graph_pyvis_nodes_only(entities):
    # Initialize pyvis Network object
    net = Network(notebook=True, height='750px', width='100%')
    
    # Create a set to track nodes and avoid duplicates
    nodes_set = set()

    # Add nodes to the graph (without edges)
    for entity, label in entities:
        if entity not in nodes_set:
            net.add_node(entity, label=entity)
            nodes_set.add(entity)

    # Generate and display the graph
    return net.show("nodes_only_graph.html")

# Call the function to visualize only nodes
visualize_graph_pyvis_nodes_only(entities_list)


nodes_only_graph.html


In [4]:
from pyvis.network import Network
# only the relationships (edges) between entities without displaying individual nodes explicitly
def visualize_graph_pyvis_relationships_only(relationships):
    # Initialize pyvis Network object
    net = Network(notebook=True, height='750px', width='100%', directed=True)
    
    # Create a set to track nodes and avoid duplicates
    nodes_set = set()

    # Add edges (relationships) between nodes
    for subject, relationship, object_ in relationships:
        if subject not in nodes_set:
            net.add_node(subject, label="")  # Add node without labels
            nodes_set.add(subject)
        
        if object_ not in nodes_set:
            net.add_node(object_, label="")  # Add node without labels
            nodes_set.add(object_)

        # Add edge (relationship) between subject and object
        net.add_edge(subject, object_, label=relationship)

    # Generate and display the graph focusing on relationships
    return net.show("relationships_only_graph.html")

# Call the function to visualize only relationships
visualize_graph_pyvis_relationships_only(relationships_list)


relationships_only_graph.html


In [5]:
from pyvis.network import Network
# only the relationships (edges) between entities without displaying individual nodes explicitly
def visualize_graph_pyvis_relationships_only(relationships):
    # Initialize pyvis Network object
    net = Network(notebook=True, height='750px', width='100%', directed=True)
    
    # Create a set to track nodes and avoid duplicates
    nodes_set = set()

    # Add edges (relationships) between nodes
    for subject, relationship, object_ in relationships:
        if subject not in nodes_set:
            net.add_node(subject, label="")  # Add node without labels
            nodes_set.add(subject)
        
        if object_ not in nodes_set:
            net.add_node(object_, label="")  # Add node without labels
            nodes_set.add(object_)

        # Add edge (relationship) between subject and object
        net.add_edge(subject, object_, label=relationship)

    # Generate and display the graph focusing on relationships
    return net.show("relationships_only_graph.html")

# Call the function to visualize only relationships
visualize_graph_pyvis_relationships_only(relationships_list)


relationships_only_graph.html


In [2]:
import sys
print(sys.executable)


C:\ProgramData\anaconda3\python.exe


In [7]:
import matplotlib.pyplot as plt
print("Matplotlib is installed successfully!")


ModuleNotFoundError: No module named 'matplotlib.pyplot'

In [11]:
import fitz  # PyMuPDF
import spacy
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

# Load spaCy model for NLP
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

def preprocess_text(text):
    """Preprocess text using spaCy for NLP."""
    return nlp(text)

def extract_entities(doc):
    """Extract entities like diseases, drugs, and treatments from text."""
    target_labels = [
        "DISEASE", "PRODUCT", "TREATMENT", "PERSON", "ORG", "GPE", "FACILITY",
        "MEDICAL_DEVICE", "PROCEDURE", "CLINICAL_TRIAL", "RESEARCHER", 
        "STUDY_CENTER", "INTERVENTION", "MEDICATION", "COMORBIDITY", 
        "FOLLOW_UP_VISIT", "HEALTHCARE_PROVIDER", "EVENT"
    ]
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in target_labels]

def extract_relationships(doc):
    """Extract subject-verb-object triples for relationships."""
    relationships = []
    for sent in doc.sents:
        subject = None
        object_ = None
        relationship = None

        for token in sent:
            if token.dep_ in ('nsubj', 'nsubjpass', 'csubj', 'csubjpass'):  # Subject types
                subject = token.text
            if token.dep_ in ('dobj', 'pobj', 'attr', 'prep'):  # Object types
                object_ = token.text
            if token.dep_ == 'ROOT':  # Verb (Relationship)
                relationship = token.text
        
        if subject and object_ and relationship:
            relationships.append((subject, relationship, object_))
    
    return relationships

def create_knowledge_graph(relationships):
    """Create a knowledge graph using NetworkX from extracted relationships."""
    G = nx.DiGraph()
    for subj, rel, obj in relationships:
        G.add_edge(subj, obj, label=rel)
    return G

def draw_knowledge_graph(G):
    """Draws the knowledge graph using NetworkX and Matplotlib."""
    pos = nx.spring_layout(G)
    plt.figure(figsize=(12, 8))
    nx.draw(G, pos, with_labels=True, node_size=3000, node_color="skyblue", font_size=10, arrows=True, arrowstyle="->", arrowsize=20)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')
    plt.title("Knowledge Graph of Extracted Entities and Relationships")
    plt.show()

def visualize_graph_pyvis_with_relationships(entities, relationships):
    """Visualize graph with PyVis for interactive exploration."""
    net = Network(notebook=True, height='750px', width='100%', directed=True)
    nodes_set = set()

    for entity, label in entities:
        if entity not in nodes_set:
            net.add_node(entity, label=entity)
            nodes_set.add(entity)

    for subject, relationship, object_ in relationships:
        if subject not in nodes_set:
            net.add_node(subject, label=subject)
            nodes_set.add(subject)

        if object_ not in nodes_set:
            net.add_node(object_, label=object_)
            nodes_set.add(object_)

        net.add_edge(subject, object_, label=relationship)

    net.show("nodes_with_relationships_graph.html")

# List of PDF paths
pdf_paths = [
    "C:/Users/user/Downloads/41467_2022_Article_32307.pdf", 
    "C:/Users/user/Downloads/K_G_2.pdf", 
    "C:/Users/user/Downloads/K_G_3.pdf", 
    "C:/Users/user/Downloads/K_G_4.pdf", 
    "C:/Users/user/Downloads/K_G_5.pdf"
]

# Initialize lists to store entities and relationships
entities_list = []
relationships_list = []
total_entities, total_relationships = 0, 0

# Process each PDF
for path in pdf_paths:
    text = extract_text_from_pdf(path)
    doc = preprocess_text(text)
    
    # Extract entities and relationships
    entities = extract_entities(doc)
    relationships = extract_relationships(doc)
    
    # Append results to lists
    entities_list.extend(entities)
    relationships_list.extend(relationships)

    total_entities += len(entities)
    total_relationships += len(relationships)

# Convert extracted entities and relationships to pandas DataFrames
entities_df = pd.DataFrame(entities_list, columns=["Entity", "Label"])
relationships_df = pd.DataFrame(relationships_list, columns=["Subject", "Relationship", "Object"])

# Display total counts and preview data
print(f"Total entities extracted: {total_entities}")
print(f"Total relationships extracted: {total_relationships}")
print("\nEntities:\n", entities_df.head())
print("\nRelationships:\n", relationships_df.head())

# Visualize knowledge graph with NetworkX
G = create_knowledge_graph(relationships_list)
draw_knowledge_graph(G)

# Visualize interactive graph with PyVis
visualize_graph_pyvis_with_relationships(entities_list, relationships_list)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\ProgramData\anaconda3\Lib\runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\ProgramData\anaconda3\Lib\runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\ProgramData\anaconda3\Lib\site-pac

AttributeError: _ARRAY_API not found

ImportError: initialization failed

In [6]:
!pip show matplotlib


Name: matplotlib
Version: 3.9.2
Summary: Python plotting package
Home-page: 
Author: John D. Hunter, Michael Droettboom
Author-email: Unknown <matplotlib-users@python.org>
License: License agreement for matplotlib versions 1.3.0 and later

1. This LICENSE AGREEMENT is between the Matplotlib Development Team
("MDT"), and the Individual or Organization ("Licensee") accessing and
otherwise using matplotlib software in source or binary form and its
associated documentation.

2. Subject to the terms and conditions of this License Agreement, MDT
hereby grants Licensee a nonexclusive, royalty-free, world-wide license
to reproduce, analyze, test, perform and/or display publicly, prepare
derivative works, distribute, and otherwise use matplotlib
alone or in any derivative version, provided, however, that MDT's
License Agreement and MDT's notice of copyright, i.e., "Copyright (c)
2012- Matplotlib Development Team; All Rights Reserved" are retained in
matplotlib  alone or in any derivative versio

In [8]:
import matplotlib.pyplot as plt
print("Matplotlib is installed successfully!")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\ProgramData\anaconda3\Lib\runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\ProgramData\anaconda3\Lib\runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\ProgramData\anaconda3\Lib\site-pac

AttributeError: _ARRAY_API not found

ImportError: initialization failed