In [6]:
# ==============================================================================
# FINAL SCRIPT 1: KNOWLEDGE GRAPH CREATION (Nodes + Relationships with Cleaning)
# This script is designed to run without errors from start to finish.
# ==============================================================================

# --- Step 1: Setup and Installations ---
print("--- Initializing Setup for Knowledge Graph Creation ---")
!pip install spacy py2neo langdetect tqdm -q --progress-bar off
!python -m spacy download en_core_web_lg -q --progress-bar off

import json
import spacy
import os
import re
from py2neo import Graph, Node
from langdetect import detect, LangDetectException
from tqdm.notebook import tqdm
import pandas as pd
from IPython.display import display

# --- Step 2: Configuration and Connections ---
print("\n--- Configuring Connections for KG ---")

# Neo4j Configuration
NEO4J_URI = "neo4j+s://9dae82f0.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "DUHXAf9g5PK25qfmJ63RbEbaw9tYyWeSu9MJjPwAnic"

# Connect to Neo4j (Initial connection test)
try:
    graph = Graph(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
    graph.run("RETURN 1") # Test connection
    print("✅ Successfully connected to Neo4j AuraDB!")
except Exception as e:
    print(f"❌ Failed to connect to Neo4j. Please check your credentials. Error: {e}")
    raise e # Critical: Stop execution if initial connection fails

# Load spaCy Model
nlp = spacy.load("en_core_web_lg")
print("✅ spaCy model 'en_core_web_lg' loaded.")

# Mount Drive and Copy Corpus Locally for reliable I/O
from google.colab import drive
drive.mount('/content/drive', force_remount=True) # Ensure fresh mount every time
print("\nCopying unified_corpus.json from Google Drive to local Colab for KG processing...")
DRIVE_CORPUS_PATH = '/content/drive/MyDrive/extracted_content/unified_corpus.json'
LOCAL_CORPUS_PATH = '/content/unified_corpus.json' # Local path in Colab

# Ensure parent directory exists for local copy
os.makedirs(os.path.dirname(LOCAL_CORPUS_PATH), exist_ok=True)
!cp -f "{DRIVE_CORPUS_PATH}" "{LOCAL_CORPUS_PATH}" # Use -f to force overwrite if exists
print("Copy complete. KG will use local corpus.")

# Load Corpus from LOCAL file (now guaranteed to be there and accessible)
corpus_path = LOCAL_CORPUS_PATH
try:
    print(f"Loading data from local corpus: {corpus_path}")
    with open(corpus_path, 'r', encoding='utf-8') as f:
        corpus = json.load(f)
    print(f"✅ Successfully loaded {len(corpus)} documents.")
except Exception as e:
    print(f"❌ ERROR loading local corpus: {e}")
    raise e # Critical: Stop if corpus cannot be loaded

# --- Step 3: Define All Helper Functions ---
print("\n--- Defining Helper Functions for KG ---")

# --- Define patterns BEFORE add_custom_entity_rules ---
patterns = [
    {"label": "SATELLITE", "pattern": [{"LOWER": "cartosat"}, {"IS_PUNCT": True, "OP": "?"}, {"LIKE_NUM": True}]},
    {"label": "SATELLITE", "pattern": [{"LOWER": "oceansat"}, {"IS_PUNCT": True, "OP": "?"}, {"LIKE_NUM": True}]},
    {"label": "SATELLITE", "pattern": [{"LOWER": "insat"}, {"IS_PUNCT": True, "OP": "?"}, {"TEXT": {"REGEX": "^(3D|3DR|3DS)$"}}]},
    {"label": "SATELLITE", "pattern": "Megha-Tropiques"}, {"label": "SATELLITE", "pattern": "SCATSAT-1"}, {"label": "SATELLITE", "pattern": "SARAL-AltiKa"},
    {"label": "SENSOR", "pattern": "LISS-IV"}, {"label": "SENSOR", "pattern": "LISS-III"}, {"label": "SENSOR", "pattern": "OCM"},
    {"label": "PRODUCT", "pattern": "GeoTIFF"}, {"label": "PRODUCT", "pattern": "HDF5"},
    {"label": "ORG", "pattern": "MOSDAC"}, {"label": "ORG", "pattern": "NRSC"}, {"label": "ORG", "pattern": "ISRO"},
    {"label": "SCIENTIFIC_TERM", "pattern": "NDVI"}, {"label": "SCIENTIFIC_TERM", "pattern": "SST"}
]

def add_custom_entity_rules(nlp_model, patterns_local): # Pass patterns as argument
    """Adds a configured EntityRuler to the spaCy pipeline."""
    if "entity_ruler" in nlp_model.pipe_names: nlp_model.remove_pipe("entity_ruler")
    ruler = nlp_model.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(patterns_local) # Use local patterns
    print("✅ Custom entity rules configured.")
    return nlp_model

def clean_text_for_ner(text):
    """Cleans text to remove noise before NER. This is the robust version for KG."""
    if not isinstance(text, str) or len(text) < 20: return None # Filter very short or non-string texts
    try:
        # Filter for English content, avoiding crashes on very short/noisy strings
        if len(text.strip()) > 50:
            if detect(text) != 'en': return None
    except LangDetectException:
        return None # Filter out if language detection fails

    # Aggressive regex cleaning for web noise
    text = re.sub(r'https?://\S+|www.\S+', '', text, flags=re.MULTILINE) # Remove full URLs
    text = re.sub(r'<.*?>', '', text) # Remove HTML tags
    text = re.sub(r'\[.*?\]\(.*?\)', '', text) # Remove markdown links (e.g., [text](url) -> text)
    text = re.sub(r'\w+=\S+|&[a-z_]+=', '', text) # Remove URL parameters (e.g., id=123&param=foo)
    text = re.sub(r'[^A-Za-z0-9\s\.\-]', ' ', text) # Keep alphanumeric, spaces, dots, dashes; replace others with space
    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with single space and strip whitespace
    return text

def sanitize_and_filter_entities(doc_spacy, all_target_labels_set):
    """Extracts, sanitizes, and filters entities from a single spaCy doc."""
    org_whitelist = {'isro', 'nrsc', 'mosdac', 'sac', 'imd', 'nasa', 'mygov'}
    # Expanded blacklist for common web/nav noise
    junk_blacklist = {'sun', 'wave', 'height', 'access', 'plugin', 'oct', 'jul', 'guid',
                      'thermal', 'infrared', 'products', 'data', 'services', 'reflectivity', 'albedo',
                      'home', 'login', 'signup', 'logout', 'menu', 'search', 'reports', 'catalog',
                      'galleries', 'image', 'satellite', 'forecast', 'help', 'tools', 'atlases',
                      'section', 'table', 'fig', 'figure', 'content', 'read', 'more', 'details', 'contact'}

    final_entities_set = set() # Renamed to avoid confusion with the global final_entities list
    for ent in doc_spacy.ents:
        name = ent.text.strip().replace('"', '').replace("'", "")
        label = ent.label_

        # --- Comprehensive Filtering Logic ---
        if len(name) <= 2 or len(name) > 50: continue # Length check (min 3 chars, max 50 chars)
        if any(char.isdigit() for char in name): continue # Reject if contains digits
        if any(char in name for char in '()[]{}<>»&='): continue # Reject if contains specific punctuation/symbols
        if any(word in name.lower().split() for word in junk_blacklist): continue # Reject if contains blacklisted words
        if label in ['ORG', 'GPE'] and ' ' not in name and name.lower() not in org_whitelist: continue # Stricter for single-word ORG/GPE
        if not name.isascii(): continue # Reject non-ASCII characters

        # CORRECTED STOP WORD CHECK: Use nlp.Defaults.stop_words
        if name.lower() in nlp.Defaults.stop_words: continue # Check if the lowercase name is in spaCy's default stop words

        if label in all_target_labels_set: # Only include if it's one of our desired labels
            final_entities_set.add((name, label))
    return list(final_entities_set) # Return as list

# --- ADDED MISSING FUNCTION DEFINITION ---
def filter_entities(entity_list_raw):
    """
    Applies final filtering rules to a list of (name, label) tuples.
    This is called after initial extraction and deduplication.
    This ensures consistency with the filtering applied in sanitize_and_filter_entities.
    """
    org_whitelist = {'isro', 'nrsc', 'mosdac', 'sac', 'imd', 'nasa', 'mygov'}
    junk_blacklist = {'sun', 'wave', 'height', 'access', 'plugin', 'oct', 'jul', 'guid',
                      'thermal', 'infrared', 'products', 'data', 'services', 'reflectivity', 'albedo',
                      'home', 'login', 'signup', 'logout', 'menu', 'search', 'reports', 'catalog',
                      'galleries', 'image', 'satellite', 'forecast', 'help', 'tools', 'atlases',
                      'section', 'table', 'fig', 'figure', 'content', 'read', 'more', 'details', 'contact'}

    filtered_entities_set = set()
    for name, label in entity_list_raw: # entity_list_raw is expected to be a list of (name, label)
        name = name.strip()

        # Apply the same filtering rules for consistency
        if len(name) <= 2 or len(name) > 50: continue
        if any(char.isdigit() for char in name): continue
        if any(char in name for char in '()[]{}<>»&='): continue
        if any(word in name.lower().split() for word in junk_blacklist): continue
        if label in ['ORG', 'GPE'] and ' ' not in name and name.lower() not in org_whitelist: continue
        if not name.isascii(): continue

        # Stop word check (consistent with sanitize_and_filter_entities)
        if name.lower() in nlp.Defaults.stop_words: continue

        filtered_entities_set.add((name, label))
    return list(filtered_entities_set)


def populate_clean_nodes(graph_db_obj, clean_entity_list, entity_source_map_local):
    """Populates the graph with the final, clean list of nodes and their sources."""
    print(f"\nPopulating graph with {len(clean_entity_list)} final, clean nodes...")
    # Get a fresh connection within the function scope for robustness
    try:
        graph_db_reconnected_nodes = Graph(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        graph_db_reconnected_nodes.run("RETURN 1")
        print("✅ Reconnected to Neo4j for node population!")
    except Exception as e:
        print(f"❌ Failed to reconnect to Neo4j for nodes. Error: {e}")
        raise e # Critical: Stop if reconnection fails here

    # Clear the entire graph before populating (ensures clean slate for every run)
    graph_db_reconnected_nodes.delete_all()
    print("✅ Cleared any existing data from the graph.")

    all_labels = {label for name, label in clean_entity_list}
    for label in tqdm(all_labels, desc="Creating Constraints"):
        try: graph_db_reconnected_nodes.schema.create_uniqueness_constraint(label, "name")
        except Exception: pass

    for name, label in tqdm(clean_entity_list, desc="Populating Nodes"):
        sources = list(entity_source_map_local.get((name, label), []))
        node = Node(label, name=name, sources=sources, description="")
        graph_db_reconnected_nodes.merge(node, label, "name")
    print("\n✅ Node population complete.")

def find_candidate_sentences_for_relations(text, known_entity_names_set):
    """Finds sentences in a text that contain at least two known entities."""
    candidate_sentences = []
    # More robust sentence splitting regex
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for sentence in sentences:
        found_entities_in_sentence = {entity for entity in known_entity_names_set if re.search(r'\b' + re.escape(entity) + r'\b', sentence, re.IGNORECASE)}
        if len(found_entities_in_sentence) >= 2:
            candidate_sentences.append(sentence)
    return candidate_sentences

def extract_triplets_from_corpus(corpus_local, nlp_model_local, known_entity_names_set_local):
    """The main function to extract relationship triplets (Subject, Predicate, Object)."""
    all_relationships = []
    print("\n--- Scanning Corpus for Candidate Sentences ---")
    candidate_sentences = []
    for doc_data in tqdm(corpus_local, desc="Scanning Documents"):
        cleaned_text = clean_text_for_ner(doc_data.get("text_content", ""))
        if cleaned_text:
            candidates = find_candidate_sentences_for_relations(cleaned_text, known_entity_names_set_local)
            candidate_sentences.extend(candidates)

    print(f"\nFound {len(candidate_sentences)} candidate sentences. Now extracting triplets...")
    for sentence in tqdm(candidate_sentences, desc="Generating Triplets"):
        doc_spacy = nlp_model_local(sentence)
        for token in doc_spacy:
            if token.pos_ == 'VERB':
                subjects = [child for child in token.children if child.dep_ == 'nsubj']
                # Added 'oprd' and more robust object detection
                objects = [child for child in token.children if token.dep_ in ('dobj', 'pobj', 'attr', 'prep') or (token.dep_ == 'ROOT' and child.dep_ == 'oprd')]
                if subjects and objects:
                    for subj in subjects:
                        for obj in objects:
                            # Check if the found subject and object are in our clean entity list
                            if subj.text.lower() in known_entity_names_set_local and obj.text.lower() in known_entity_names_set_local:
                                predicate = token.lemma_.upper()
                                all_relationships.append({'subject': subj.text, 'predicate': predicate, 'object': obj.text})
    print(f"\nTriplet extraction complete. Found {len(all_relationships)} relationships.")
    return all_relationships

def populate_relationships_in_graph(graph_db_obj, relationship_list_local, final_polished_entities_local):
    """Populates the graph with relationships between existing nodes."""
    print(f"\n--- Populating graph with {len(relationship_list_local)} relationships... ---")

    # Reconnect to Neo4j right before populating relationships to avoid timeouts
    try:
        graph_db_reconnected_rels = Graph(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
        graph_db_reconnected_rels.run("RETURN 1")
        print("✅ Reconnected to Neo4j AuraDB for relationship population!")
    except Exception as e:
        print(f"❌ Failed to reconnect to Neo4j for relationships. Error: {e}")
        raise e # Critical: Stop if reconnection fails

    for rel in tqdm(relationship_list_local, desc="Creating Relationships"):
        predicate_formatted = re.sub(r'[^A-Z0-9_]', '', rel['predicate'].upper())
        if not predicate_formatted: continue

        # Find the original cased name for matching in the graph (from the cleaned list)
        subj_name_in_graph = next((name for name, _ in final_polished_entities_local if name.lower() == rel['subject'].lower()), None)
        obj_name_in_graph = next((name for name, _ in final_polished_entities_local if name.lower() == rel['object'].lower()), None)

        if subj_name_in_graph and obj_name_in_graph:
            query = f"""
            MATCH (a), (b)
            WHERE a.name = $subj_name AND b.name = $obj_name
            MERGE (a)-[r:{predicate_formatted}]->(b)
            """
            try:
                graph_db_reconnected_rels.run(query, subj_name=subj_name_in_graph, obj_name=obj_name_in_graph)
            except Exception as e:
                print(f"Error merging relationship for ({subj_name_in_graph})-[:{predicate_formatted}]->({obj_name_in_graph}): {e}")

    print("✅ Graph relationship population complete.")

# --- Step 4: Execute the Full Knowledge Graph Creation Pipeline ---
# This section now calls all the functions in sequence.
print("\n--- Starting Full Knowledge Graph Creation Pipeline ---")

# Add custom rules to the NLP model
nlp = add_custom_entity_rules(nlp, patterns) # Pass patterns to the function

# 1. Extract, Clean, and Filter Entities
print("\nStep 1: Extracting, cleaning, and filtering entities...")
entity_source_map = {} # This will store (name, label) -> [source_urls]
# Define all_target_labels_set *after* patterns are defined
all_target_labels = {p['label'] for p in patterns}.union({"PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "DATE", "QUANTITY"})

for doc_data in tqdm(corpus, desc="Processing Documents for Entities"):
    cleaned_text = clean_text_for_ner(doc_data.get("text_content", ""))
    if cleaned_text:
        doc_spacy = nlp(cleaned_text)
        doc_entities = sanitize_and_filter_entities(doc_spacy, all_target_labels) # Use all_target_labels
        for name, label in doc_entities:
            key = (name, label)
            if key not in entity_source_map: entity_source_map[key] = set()
            entity_source_map[key].add(doc_data.get("source_url", "N/A"))

final_polished_entities = filter_entities(list(entity_source_map.keys())) # Final filter on the unique names
print(f"Extraction and filtering complete. Found {len(final_polished_entities)} high-quality unique entities.")

# 2. Populate Neo4j with Clean Nodes (Using the re-connection logic within the function)
print("\nStep 2: Clearing old data and populating graph with clean nodes...")
populate_clean_nodes(graph, final_polished_entities, entity_source_map) # Pass graph object and maps

# 3. Extract and Populate Relationships
print("\nStep 3: Extracting and populating relationships...")
# Ensure global variables from node creation are accessible
# Pass 'final_polished_entities' and 'nlp' (global) to relationship functions
relationship_triplets = extract_triplets_from_corpus(corpus, nlp, {name.lower() for name, label in final_polished_entities}) # Pass lowercase set of names
populate_relationships_in_graph(graph, relationship_triplets, final_polished_entities) # Pass graph object and clean entities

# --- Final Verification ---
print("\n--- Final Verification ---")
print("Displaying a sample of the final, polished entities loaded into the graph:")
df_final = pd.DataFrame(final_polished_entities, columns=['Name', 'Label'])
display(df_final.sample(min(len(df_final), 50))) # Display max 50 samples

print("\n\n✅✅✅ KNOWLEDGE GRAPH CREATION COMPLETE! ✅✅✅")

--- Initializing Setup for Knowledge Graph Creation ---
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

--- Configuring Connections for KG ---
✅ Successfully connected to Neo4j AuraDB!
✅ spaCy model 'en_core_web_lg' loaded.
Mounted at /content/drive

Copying unified_corpus.json from Google Drive to local Colab for KG processing...
Copy complete. KG will use local corpus.
Loading data from local corpus: /content/unified_corpus.json
✅ Successfully loaded 67838 documents.

--- Defining Helper Functions for KG ---

--- Starting Full Knowledge Graph Creation Pipeline ---
✅ Custom entity rules configured.

Step 1: Extracting, cleaning, and filtering entities...


Processing Documents for Entities:   0%|          | 0/67838 [00:00<?, ?it/s]

Extraction and filtering complete. Found 1065 high-quality unique entities.

Step 2: Clearing old data and populating graph with clean nodes...

Populating graph with 1065 final, clean nodes...
✅ Reconnected to Neo4j for node population!
✅ Cleared any existing data from the graph.


Creating Constraints:   0%|          | 0/12 [00:00<?, ?it/s]

Populating Nodes:   0%|          | 0/1065 [00:00<?, ?it/s]


✅ Node population complete.

Step 3: Extracting and populating relationships...

--- Scanning Corpus for Candidate Sentences ---


Scanning Documents:   0%|          | 0/67838 [00:00<?, ?it/s]


Found 46983 candidate sentences. Now extracting triplets...


Generating Triplets:   0%|          | 0/46983 [00:00<?, ?it/s]


Triplet extraction complete. Found 929 relationships.

--- Populating graph with 929 relationships... ---
✅ Reconnected to Neo4j AuraDB for relationship population!


Creating Relationships:   0%|          | 0/929 [00:00<?, ?it/s]

✅ Graph relationship population complete.

--- Final Verification ---
Displaying a sample of the final, polished entities loaded into the graph:


Unnamed: 0,Name,Label
684,dBW min,ORG
26,South East,LOC
840,day,DATE
295,INSAT Multi,ORG
335,YAAS.pdf lin,PERSON
1015,MAR,PERSON
573,ocean-from-space.jpg Wave-Atlas-Cover,ORG
731,ps Missions,ORG
734,GHRSST SMAP,ORG
727,CRIDA Farm,ORG




✅✅✅ KNOWLEDGE GRAPH CREATION COMPLETE! ✅✅✅


In [7]:
#  Final KG Verification Script
import pandas as pd
from py2neo import Graph
from IPython.display import display

print("--- Final KG Verification---")
print("Let's look at a sample of what was just loaded into the database.")

# Use the same credentials from your script
NEO4J_URI = "neo4j+s://9dae82f0.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "DUHXAf9g5PK25qfmJ63RbEbaw9tYyWeSu9MJjPwAnic"

try:
    graph_verify = Graph(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
    # Query to get a sample of 100 nodes
    query = "MATCH (n) RETURN n.name AS name, labels(n) AS labels LIMIT 100"

    # Fetch the data and display it as a table
    result_df = graph_verify.run(query).to_data_frame()

    print("✅ Here is a sample of 100 nodes currently in your Neo4j database:")
    display(result_df)

except Exception as e:
    print(f"❌ Failed to query the graph: {e}")

--- Final KG Verification---
Let's look at a sample of what was just loaded into the database.
✅ Here is a sample of 100 nodes currently in your Neo4j database:


Unnamed: 0,name,labels
0,Demonstration of GNSS,[ORG]
1,Bus Management,[ORG]
2,distance-icon-small.png,[PERSON]
3,scorpio,[PRODUCT]
4,Nowcast Current Events Alerts Met Applications,[ORG]
...,...,...
95,Feedback Feedback,[ORG]
96,Next Pau,[PERSON]
97,Remote Sens,[PRODUCT]
98,R. D. Cess J. A. Coakley D. A. H.,[PERSON]
