In [1]:
!pip install pandas networkx matplotlib spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import pandas as pd

# Read the token-level table from the gzip-compressed parquet file.
file_path = '/content/cord-ner-full.parquet.gzip'
data = pd.read_parquet(file_path)

# Overview: table dimensions followed by the leading rows.
print("Data Shape:", data.shape)
print("First Few Rows:", data.head(), sep="\n")

# List every distinct value of the entity tag column.
entity_types = data['entity'].unique()
print("\nUnique Entity Types:", entity_types)


Data Shape: (63308700, 3)
First Few Rows:
                     word            entity  sentence
0  angiotensin-converting  B-GENE_OR_GENOME         0
1                  enzyme  I-GENE_OR_GENOME         0
2                       2  I-GENE_OR_GENOME         0
3                    ace2  B-GENE_OR_GENOME         0
4                      as             Other         0

Unique Entity Types: ['B-GENE_OR_GENOME' 'I-GENE_OR_GENOME' 'Other' 'B-CORONAVIRUS'
 'B-CHEMICAL' 'B-EVOLUTION' 'B-WILDLIFE' 'B-NORP' 'I-NORP' 'B-ORGANISM'
 'B-EUKARYOTE' 'B-PERSON' 'I-PERSON' 'B-PHYSICAL_SCIENCE'
 'I-PHYSICAL_SCIENCE' 'I-CHEMICAL' 'B-DISEASE_OR_SYNDROME'
 'I-DISEASE_OR_SYNDROME' 'I-ORGANISM' 'B-DATE' 'B-CARDINAL'
 'B-MOLECULAR_FUNCTION' 'B-EXPERIMENTAL_MODEL_OF_DISEASE'
 'I-EXPERIMENTAL_MODEL_OF_DISEASE' 'B-CELL'
 'B-BODY_PART_ORGAN_OR_ORGAN_COMPONENT' 'B-LIVESTOCK' 'B-ORG' 'B-TISSUE'
 'I-TISSUE' 'B-GROUP' 'I-CELL' 'B-CELL_COMPONENT' 'I-CELL_COMPONENT'
 'B-FOOD' 'B-CELL_FUNCTION' 'I-DATE' 'B-GPE'
 'B-DAILY_O

In [None]:
import networkx as nx
from collections import defaultdict

# Initialize structures
entity_pairs = []                      # (word, next entity word) tuples
sentences = data['sentence'].unique()  # sentence ids in order of first appearance
entity_dict = defaultdict(list)        # word -> every word that directly follows it

# Collect the entity words of every sentence in ONE pass over the table.
# Filtering `data` once per sentence id rescans all rows for each sentence,
# which is quadratic and does not finish on a table with tens of millions
# of rows. groupby keeps the original row order inside each group.
entity_rows = data.loc[data['entity'] != 'Other', ['sentence', 'word']]
words_by_sentence = (
    entity_rows.groupby('sentence', sort=False)['word'].agg(list).to_dict()
)

# Extract entity pairs: each entity token is linked to the next entity token
# of the same sentence. Sentences are visited in order of first appearance,
# so the resulting lists match a per-sentence scan exactly.
for sentence_id in sentences:
    words = words_by_sentence.get(sentence_id, [])
    # zip() yields nothing for sentences with fewer than two entity tokens.
    for source_word, target_word in zip(words, words[1:]):
        entity_pairs.append((source_word, target_word))

        # Save entity relationships
        entity_dict[source_word].append(target_word)

# Print extracted entity pairs
print("\nSample Entity Pairs:")
print(entity_pairs[:5])

In [None]:
import matplotlib.pyplot as plt

# Drawing every node of a corpus-wide graph is neither feasible nor readable
# (each node is drawn at 3000 pt^2), so only the best-connected nodes are
# plotted. The full graph `G` is still built and kept for later cells.
MAX_DRAWN_NODES = 50
# Fixed seed so the spring layout, and therefore the figure, is reproducible.
LAYOUT_SEED = 42

# Initialize a directed graph and add one edge per (source, target) pair;
# nodes are created implicitly and repeated pairs collapse into one edge.
G = nx.DiGraph()
G.add_edges_from(entity_pairs)

# Pick the nodes with the highest total degree (in + out) for display.
degree_ranked = sorted(G.degree(), key=lambda item: item[1], reverse=True)
top_nodes = [node for node, degree in degree_ranked[:MAX_DRAWN_NODES]]
G_drawn = G.subgraph(top_nodes)

# Compute node positions explicitly so the seed can be supplied.
pos = nx.spring_layout(G_drawn, seed=LAYOUT_SEED)

# Visualize the graph
plt.figure(figsize=(12, 12))
nx.draw_networkx(
    G_drawn,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=3000,
    font_size=10,
    font_color='black',
    edge_color='gray'
)
plt.title("Knowledge Graph")
plt.show()

In [None]:
# Count entity frequencies
entity_counts = data[data['entity'] != 'Other']['word'].value_counts()

# Plot top entities
entity_counts.head(10).plot(kind='bar', figsize=(10, 6), title="Top 10 Frequent Entities")
plt.show()


In [None]:
# Map each sentence id to the list of its entity words. Tokens tagged
# 'Other' are excluded so that the result really contains entities only,
# as the variable name and the printed label state.
sentence_entities = (
    data.loc[data['entity'] != 'Other']
    .groupby('sentence')['word']
    .agg(list)
)

# Display entities in a specific sentence
sentence_id = 0  # Change as needed
# .get() returns an empty list for a sentence without any tagged entity
# instead of raising KeyError.
print("\nEntities in Sentence ID", sentence_id, ":", sentence_entities.get(sentence_id, []))


In [None]:
# Save graph to file
nx.write_gexf(G, "knowledge_graph.gexf")

# Download the graph (for local viewing in tools like Gephi)
from google.colab import files
files.download("knowledge_graph.gexf")

In [None]:
import spacy

# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on a single hand-written example sentence.
example_sentence = "The spike protein of SARS-CoV-2 interacts with ACE2 for host cell entry."
doc = nlp(example_sentence)

# Named entities recognised in the sentence, one line per entity.
entity_lines = [f"Entity: {ent.text}, Type: {ent.label_}" for ent in doc.ents]
for line in entity_lines:
    print(line)

# Dependency parse: every token with its syntactic head and relation label.
dependency_lines = [
    f"Word: {token.text}, Head: {token.head.text}, Relation: {token.dep_}"
    for token in doc
]
for line in dependency_lines:
    print(line)
