#### 1. Setup and Configuration

In [1]:
import os
import json
from concurrent.futures import ThreadPoolExecutor
from Neo4jRetrieval import Neo4jRetrieval
from AzureQuery import AzureQuery

# Define the search question
question = "Identify all British Columbia regulations that contain clauses that specify how the regulation applies to goods or services originating outside the province"

# Neo4j Configuration
# Each setting can be overridden through an environment variable of the same
# name so real credentials never need to be saved in the notebook. The defaults
# are the local development values that were previously hard-coded.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "admin")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "admin")

# Azure Configuration
endpoint = os.getenv("AZURE_AI_ENDPOINT", "")
key = os.getenv("AZURE_AI_KEY", "")

# Surface a missing configuration here instead of letting it show up later as
# an opaque failure on the first LLM call.
if not endpoint or not key:
    print("Warning: AZURE_AI_ENDPOINT and/or AZURE_AI_KEY is not set; Azure calls will likely fail")

azure = AzureQuery(endpoint, key)

#### 2a. Generate Initial Terms from User Query

In [2]:
def generate_initial_terms(question):
    """Ask the LLM for an initial mix of broad and specific search terms.

    The prompt is sent through the module-level ``azure`` client and the reply
    is parsed as a Python list literal.

    Args:
        question (str): The user question the search terms should relate to.

    Returns:
        list: The string terms parsed from the model reply. A built-in fallback
        list is returned when the reply cannot be parsed, is not a list/tuple,
        or is non-empty but contains no strings.
    """
    import ast  # local import: `ast` is not part of the notebook's import cell

    # Fallback with mix of broad and specific
    fallback_terms = ["extraprovincial", "place of supply rules", "interprovincial", "cross-border commerce", 
                      "out-of-province", "territorial application", "jurisdiction", "goods and services classification"]

    prompt = f"""
    Based on this question about British Columbia regulations:
    "{question}"
    
    Generate 8-10 search terms focused on INTERPROVINCIAL and INTERTERRITORIAL trade barriers within Canada only (NOT international trade):
    
    BROAD terms (3-4 terms) - General concepts:
    - Examples: "interprovincial", "interterritorial", "out-of-province", "jurisdiction"
    
    SPECIFIC terms (4-6 terms) - Precise legal/regulatory phrases:  
    - Examples: "interprovincial trade barriers", "territorial application within Canada", "cross-provincial commerce"
    
    Focus on terms related to:
    - Trade barriers between Canadian provinces and territories
    - Provincial regulatory authority over out-of-province goods/services
    - Interprovincial goods and services classification
    - Territorial boundaries within Canada
    - Province-specific standards
    
    Return the terms as a Python list format only, no explanations.
    Example format: ["broad_term1", "specific_phrase_here", "broad_term2", "detailed_regulatory_concept"]
    """
    
    response = azure.call_agent(prompt)
    try:
        parsed = ast.literal_eval(response.strip())
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate
        return fallback_terms

    # A bare quoted string is a valid literal too; returning it would make the
    # caller's `set.update` add one character at a time.
    if not isinstance(parsed, (list, tuple)):
        return fallback_terms

    # Keep only real strings: nested containers are unhashable and would break
    # the set the caller stores these terms in.
    terms = [term for term in parsed if isinstance(term, str)]
    if parsed and not terms:
        return fallback_terms
    return terms

# Generate new terms based on the original question and existing terms
def generate_additional_terms(question, existing_terms):
    """Ask the LLM for search terms that are not yet in ``existing_terms``.

    The prompt is sent through the module-level ``azure`` client. Both the raw
    reply and the parsed terms are printed so progress is visible in the cell
    output.

    Args:
        question (str): The original user question.
        existing_terms (iterable of str): Terms collected so far.

    Returns:
        list: New string terms parsed from the reply, or ``[]`` when the reply
        is empty, cannot be parsed, or is not a list/tuple.
    """
    import ast  # local import: `ast` is not part of the notebook's import cell

    # Sorted so the prompt is identical between runs even when a set is passed
    existing_terms_str = ", ".join(sorted(existing_terms))
    
    prompt = f"""
    Based on this original question about British Columbia regulations:
    "{question}"
    
    We already have these search terms: {existing_terms_str}
    
    Generate 4-6 NEW search terms that maintain a balance of BROAD and SPECIFIC terms, focused on INTERPROVINCIAL/INTERTERRITORIAL trade within Canada only:
    
    BROAD terms (2-3 new terms) - Single words or short general concepts:
    - Examples: "territorial", "jurisdictional", "provincial"
    
    SPECIFIC terms (2-3 new terms) - Multi-word precise legal phrases:
    - Examples: "interprovincial supply chain regulations", "territorial compliance requirements", "inter-jurisdictional trade rules within Canada"
    
    Ensure these are different from existing terms and focus on:
    - Trade barriers between Canadian provinces and territories
    - Provincial regulatory authority over other provinces/territories
    - Interprovincial goods and services classification
    
    Stay grounded to the original question: "{question}"
    
    If no relevant new terms can be generated, return an empty list: []
    
    Return the terms in list format only, no explanations, no code blocks.
    Example format: ["broad_term", "specific multi-word phrase", "another_broad_term"] or []
    """
    
    response = azure.call_agent(prompt)
    print(response)
    try:
        parsed = ast.literal_eval(response.strip())
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate
        return []

    # A bare quoted string is a valid literal; treating it as a term list would
    # make the caller's `set.update` add single characters.
    if not isinstance(parsed, (list, tuple)):
        return []

    # Drop anything that is not a string (e.g. nested lists, which are unhashable)
    terms = [term for term in parsed if isinstance(term, str)]
    print(terms)
    return terms

# Generate initial terms
all_terms = set()
initial_terms = generate_initial_terms(question)
all_terms.update(initial_terms)
print("Initial terms:", initial_terms)

# Loop to generate additional terms
target_terms = 30  # Stop once this many unique terms have been collected
# Hard cap on LLM round-trips. Without it the loop never ends when the model
# keeps returning terms that are already in `all_terms`, because the set stops
# growing and `target_terms` is never reached.
max_iterations = 20
iteration = 0

while len(all_terms) < target_terms and iteration < max_iterations:
    new_terms = generate_additional_terms(question, all_terms)
    if not new_terms:  # No more terms to generate
        print(f"No more new terms generated after {iteration + 1} iterations")
        break
    
    all_terms.update(new_terms)
    print(f"Iteration {iteration + 1} - New terms added:", new_terms)
    iteration += 1

if len(all_terms) < target_terms and iteration >= max_iterations:
    print(f"Stopped after {max_iterations} iterations without reaching {target_terms} terms")

print(f"Total terms generated: {len(all_terms)}")
print("All terms:", all_terms)

Initial terms: ['extraprovincial', 'place of supply rules', 'interprovincial', 'cross-border commerce', 'out-of-province', 'territorial application', 'jurisdiction', 'goods and services classification']
["interterritorial", "regional", "interprovincial trade barriers", "provincial cross-border regulations", "intra-Canadian trade compliance"]
['interterritorial', 'regional', 'interprovincial trade barriers', 'provincial cross-border regulations', 'intra-Canadian trade compliance']
Iteration 1 - New terms added: ['interterritorial', 'regional', 'interprovincial trade barriers', 'provincial cross-border regulations', 'intra-Canadian trade compliance']
["territoriality", "jurisdictional application", "interjurisdictional commerce frameworks", "interprovincial regulatory compliance", "provincial trade enforcement policies"]
['territoriality', 'jurisdictional application', 'interjurisdictional commerce frameworks', 'interprovincial regulatory compliance', 'provincial trade enforcement polici

#### 2b. Add Seed Terms to Set
These are terms that have been manually identified as flags for potentially relevant chunks of information.

In [3]:
# Read the manually curated seed words: one term per line, blank lines skipped
with open('seed_words.txt', 'r') as seed_file:
    seed_terms = [entry for entry in map(str.strip, seed_file) if entry]

# Merge the seed words into the running set of search terms and show the result
all_terms.update(seed_terms)
print(all_terms)

{'trade barrier', 'cross-territorial supply chain rules', 'profession', 'jurisdiction', 'regional trade barriers', 'territorial trade', 'jurisdictional trade governance', 'goods and services classification', 'procurement', 'provincial cross-border regulations', 'cross-border commerce', 'cross-province trade specifications', 'regional', 'region-specific trade regulations', 'extraprovincial', 'jurisdictional application', 'certification', 'authorization', 'territorial application', 'interterritorial', 'territorial market entry conditions', 'authority', 'cross-border trade', 'interjurisdictional commerce frameworks', 'licence', 'permit', 'interprovincial regulatory compliance', 'interprovincial commerce conditions', 'good', 'intra-Canadian trade compliance', 'provincial trade governance', 'territoriality', 'interterritorial commerce guidelines', 'extraprovincial regulatory frameworks', 'service', 'toll', 'fee', 'interprovincial trade barriers', 'charge', 'specification', 'provincial trade

#### 3. Refine and Expand Terms

In [4]:
def refine_terms(all_terms):
    """Expand every search term via the LLM and build a blacklist of misleading terms.

    Uses the module-level ``azure`` client and the module-level ``question``.
    One LLM call produces the blacklist; then one call per term asks for 2-4
    related terms. A term is treated as "broad" when it has at most two words
    and contains none of a few regulatory keywords, otherwise as "specific".

    Args:
        all_terms (iterable of str): Terms to expand.

    Returns:
        tuple: ``(refined_terms, blacklisted_terms)``. ``refined_terms`` holds
        the expansions, with the original term kept wherever its expansion
        could not be parsed as a list. ``blacklisted_terms`` is lower-cased; a
        built-in default list is used when that reply cannot be parsed.
    """
    import ast  # local import: `ast` is not part of the notebook's import cell

    refined_terms = []
    blacklisted_terms = []
    
    # First, generate blacklisted terms
    blacklist_prompt = f"""
    Based on this original question about British Columbia regulations:
    "{question}"
    
    Generate a list of blacklisted terms that would give FALSE POSITIVES for interprovincial/interterritorial trade within Canada.
    
    Focus on terms that relate to:
    - International trade (outside Canada)
    - Foreign countries or jurisdictions
    - Global commerce
    - Import/export with non-Canadian entities
    - Terms that might confuse interprovincial with international
    
    Stay grounded to the original question: "{question}"
    Keep in mind we want out-of-province but NOT out-of-country content.
    
    Return 5-8 blacklisted terms in list format only, no explanations, no code blocks.
    Example format: ["international", "foreign trade", "import duties", "export regulations"]
    """
    
    blacklist_response = azure.call_agent(blacklist_prompt)
    try:
        parsed_blacklist = ast.literal_eval(blacklist_response.strip())
        # A bare quoted string parses fine but would be lower-cased character
        # by character, so reject anything that is not a list/tuple.
        if not isinstance(parsed_blacklist, (list, tuple)):
            raise ValueError("blacklist response is not a list")
        blacklisted_terms = [term.lower() for term in parsed_blacklist]
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate
        blacklisted_terms = ["international", "foreign", "import", "export", "overseas", "global trade"]
    
    print(f"Blacklisted terms: {blacklisted_terms}")
    
    # Then refine existing terms
    for term in all_terms:
        is_broad = len(term.split()) <= 2 and not any(keyword in term.lower() 
                     for keyword in ['rule', 'regulation', 'requirement', 'compliance', 'application'])
        
        expand_prompt = f"""
        Based on this original question about British Columbia regulations:
        "{question}"
        
        Expand this {"broad" if is_broad else "specific"} term: "{term}"
        
        Focus on interprovincial/interterritorial trade within Canada only (NOT international):
        - Trade barriers between Canadian provinces and territories
        - Out-of-province but within Canada regulations
        - Provincial regulatory authority over other provinces/territories
        
        Stay grounded to the original question: "{question}"
        
        Generate 2-4 related {"broad" if is_broad else "specific"} terms that enhance search coverage for:
        - Regulatory language alternatives  
        - Jurisdictional concepts within Canada
        {"- Keep terms general and short (1-2 words)" if is_broad else "- Keep terms specific and detailed (multi-word phrases)"}
        
        Return the terms in list format only, no explanations, no code blocks.
        Example format: {["broad1", "broad2", "broad3"] if is_broad else ["specific detailed phrase 1", "specific regulatory concept 2", "specific legal term 3"]}
        """
        
        response = azure.call_agent(expand_prompt)
        try:
            expanded_terms = ast.literal_eval(response.strip())
            # `extend` on a bare string would append its individual characters
            if not isinstance(expanded_terms, (list, tuple)):
                raise ValueError("expansion response is not a list")
            refined_terms.extend(expanded_terms)
        except Exception:
            refined_terms.append(term)  # Keep original if expansion fails
    
    return refined_terms, blacklisted_terms

# Expand all collected terms and obtain the blacklist from the same call
refined_terms_nested, blacklisted_terms = refine_terms(all_terms=all_terms)
print(f"Refined terms (nested): {refined_terms_nested}")
print(f"Blacklisted terms: {blacklisted_terms}")

# Collapse one level of list nesting into a single flat list
def flatten_terms(nested_terms):
    """Return a flat list in which every list element of ``nested_terms`` is spliced in.

    Only one level is unpacked: elements that are lists contribute their items,
    and every other element is kept as-is.
    """
    return [
        leaf
        for entry in nested_terms
        for leaf in (entry if isinstance(entry, list) else [entry])
    ]

# Unpack any nested lists produced by the expansion step
flattened_terms = flatten_terms(nested_terms=refined_terms_nested)
print(f"Flattened terms ({len(flattened_terms)}): {flattened_terms}")

# Strip leading/trailing quote characters from each term
def remove_quotes(terms):
    """Return ``terms`` with any leading or trailing ``"`` / ``'`` characters removed."""
    quote_chars = '"\''
    return [term.strip(quote_chars) for term in terms]

# Drop stray surrounding quote characters from the flattened terms
cleaned_terms = remove_quotes(terms=flattened_terms)
print(f"Cleaned terms ({len(cleaned_terms)}): {cleaned_terms}")

# Convert all terms to lowercase and remove duplicates
def lowercase_terms(terms):
    """Return the lower-cased, de-duplicated terms in first-seen order.

    A dict is used for de-duplication because it keeps insertion order. Going
    through a plain ``set`` gives an ordering that can change between kernel
    runs, which makes the printed lists and the downstream search order
    non-reproducible.

    Args:
        terms (iterable of str): Terms to normalise.

    Returns:
        list: Unique lower-case terms.
    """
    return list(dict.fromkeys(term.lower() for term in terms))

# Normalise case so duplicates that differ only in capitalisation collapse
lowercased_terms = lowercase_terms(terms=cleaned_terms)
print(f"Lowercased unique terms ({len(lowercased_terms)}): {lowercased_terms}")

# Drop every term that exactly matches a blacklisted entry
def filter_blacklisted_terms(terms, blacklisted_terms):
    """Return the terms, in order, that are not equal to any blacklisted term."""
    kept = []
    for candidate in terms:
        if candidate in blacklisted_terms:
            continue
        kept.append(candidate)
    return kept

# Remove blacklisted entries to obtain the terms used for the graph search
final_terms = filter_blacklisted_terms(
    terms=lowercased_terms,
    blacklisted_terms=blacklisted_terms,
)
print(f"Final filtered terms ({len(final_terms)}): {final_terms}")

Blacklisted terms: ['international', 'global markets', 'foreign jurisdictions', 'non-canadian entities', 'import/export tariffs', 'overseas commerce', 'worldwide trade']
Refined terms (nested): ['interprovincial trade', 'territorial regulation', 'cross-border commerce', 'regional jurisdiction', 'interprovincial trade compliance standards', 'domestic territorial trade provisions', 'regional supply chain governance', 'out-of-province regulatory enforcement', 'occupation', 'trade restrictions', 'external jurisdiction', 'out-of-province provisions', 'interprovincial', 'trade barriers', 'cross-border', 'regulatory scope', 'interprovincial trade restrictions', 'cross-border regulatory compliance', 'provincial jurisdictional authority', 'inter-territorial trade regulations', 'interprovincial trade', 'cross-border commerce', 'jurisdictional authority', 'regional regulation', 'interprovincial commerce clauses', 'cross-border regulatory schema', 'internal Canadian trade provisions', 'provincial 

#### 4. Search Neo4j Database for Matching Nodes

In [6]:
# Use the generated terms for searching
terms = final_terms

neo4j_worker = Neo4jRetrieval(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)
try:
    nodes = neo4j_worker.search_many(terms, label='latest')
    print(f"{len(nodes)} nodes found using generated terms")
finally:
    # Always release the connection, even when the search raises
    neo4j_worker.close()

17324 nodes found using generated terms


#### 5. Define Relevance Assessment Function

In [7]:
def create_prompt(question, node):
    """Build the relevance-classification prompt for one node.

    The returned text embeds ``question`` and the string form of ``node`` and
    instructs the model to answer with only "1" (related) or "0" (not related).
    """
    return f"""
    Your task is to determine if the following object is related to the question: "{question}".

    Object: {node}

    Focus on content that addresses:
    - Trade barriers between Canadian provinces and territories
    - Out-of-province (but within Canada) goods and services
    - Interprovincial regulatory compliance
    - NOT international or out-of-country trade

    Stay grounded to the original question about interprovincial/interterritorial regulations within Canada.

    If the object is related to interprovincial/interterritorial trade within Canada, return "1". 
    If it is not related or deals with international trade, return "0".
    Do not return any explanation, just the number.

    For example:
    Question: "What BC regulations have clauses about extraprovincial services?"
    Object: {{ "text": "This regulation applies to services provided from Alberta to British Columbia customers." }}
    Response: 1
    """

#### 6. Filter Nodes for Relevance Using LLM

In [None]:
related_set = set()
unrelated_set = set()

# Load existing related and unrelated nodes from files so that a re-run skips
# nodes already classified. `set.add` is required here: `set.update` on a
# string inserts each character separately, so whole ids would never be found.
if os.path.exists('related_nodes_V4.jsonl'):
    with open('related_nodes_V4.jsonl', 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the results file
            node = json.loads(line.strip())
            related_set.add(node.get("elementId"))
if os.path.exists('unrelated_nodes.txt'):
    with open('unrelated_nodes.txt', 'r') as f:
        for line in f:
            element_id = line.strip()
            if element_id:
                unrelated_set.add(element_id)

new_related_nodes = []
with open("related_nodes_V4.jsonl", "a") as rf, open("unrelated_nodes.txt", "a") as uf:

    # Use LLM to determine if the node is relevant
    def add_or_dispose(node):
        """Classify one node with the LLM and record the outcome in memory and on disk."""
        element_id = node.get("elementId")
        # If this node is already in related or unrelated nodes, skip it
        if element_id in related_set or element_id in unrelated_set:
            return
        # Bound before the try so the error handler can print it even when the
        # failure happens before the LLM has answered.
        azure_response = None
        try:
            prompt = create_prompt(question, node)
            azure_response = azure.call_agent(prompt)
            print(azure_response.strip())
            # A missing "text" must not raise after the record has been written
            preview = node.get("text") or ""
            if int(azure_response.strip()):
                new_related_nodes.append(node)
                related_set.add(element_id)
                rf.write(json.dumps(node) + "\n")
                print("Relevant:", preview[:100] + "...")
            else:
                unrelated_set.add(element_id)
                uf.write(element_id + "\n")
                print("Not relevant:", preview[:50] + "...")
        except Exception as e:
            print("Error processing azure_response:", azure_response, e)

    # Process nodes in parallel
    # NOTE(review): worker threads write to `rf` / `uf` without a lock; confirm
    # that interleaved lines are not a concern for these files.
    with ThreadPoolExecutor(10) as executor:
        executor.map(add_or_dispose, nodes)

print(f"{len(new_related_nodes)} relevant nodes found")

0
Not relevant: the area of the greater vancouver water district s...
0
Not relevant: 1 the corporation shall extend its service to and ...
0
Not relevant: 1 in this part ,  court means a court of british c...
0
Not relevant: a court has territorial competence in a proceeding...
0
Not relevant: country other than canada ,  including ,  in addit...
0
Not relevant: a court that under section 3 lacks territorial com...
0
Not relevant: a court has territorial competence in a proceeding...
0
Not relevant: in this act :  person includes a state ;  plaintif...
0
Not relevant: 1 if an affidavit in respect of the lien has been ...
0
Not relevant: a court has territorial competence in a proceeding...
0
Not relevant: 1 after considering the interests of the parties t...
0
Not relevant: 1 on a transfer of a proceeding from the supreme c...
0
Not relevant: 1 after the filing of a request made by a court ou...
0
Not relevant: 1 the supreme court by order may request a court o...
0
Not relevant: if t

#### 7. Convert to CSV

In [12]:
import json
import pandas as pd


def jsonl_to_csv(jsonl_file, csv_file):
    """
    Convert a JSONL file to CSV format

    Blank or whitespace-only lines are skipped, so a trailing newline or an
    empty line left by an interrupted append does not abort the conversion.

    Args:
      jsonl_file (str): Path to the input JSONL file
      csv_file (str): Path to the output CSV file
    """
    # Read JSONL file, one JSON object per non-empty line
    with open(jsonl_file, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Convert to DataFrame for easier handling
    df = pd.DataFrame(data)

    # Write to CSV
    df.to_csv(csv_file, index=False)
    print(f"Converted {jsonl_file} to {csv_file}")

# Export the accumulated relevant nodes as a CSV next to the JSONL file
input_file, output_file = "related_nodes_V4.jsonl", "related_nodes_V4.csv"
jsonl_to_csv(jsonl_file=input_file, csv_file=output_file)

Converted related_nodes.jsonl to related_nodes_V4.csv
