In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
print(f"Python Executable: {sys.executable}")
print(f"sys.path: {sys.path}")

Python Executable: /opt/miniconda3/envs/retrieval-env/bin/python
sys.path: ['/opt/miniconda3/envs/retrieval-env/lib/python311.zip', '/opt/miniconda3/envs/retrieval-env/lib/python3.11', '/opt/miniconda3/envs/retrieval-env/lib/python3.11/lib-dynload', '', '/opt/miniconda3/envs/retrieval-env/lib/python3.11/site-packages']


In [3]:
import sys

sys.path.append("../../")

from src.utils.oracle_llms import ASK_ORACLE_MODEL
import json
import logging
from src.utils import logging_utils
logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.DEBUG,
    format=logging_utils.DEFAULT_FORMAT,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    stream=sys.stdout,
)

In [4]:
import numpy as np
import itertools
import networkx as nx
import random
import matplotlib.pyplot as plt

2025-05-16 09:03:27 matplotlib DEBUG    matplotlib data path: /opt/miniconda3/envs/retrieval-env/lib/python3.11/site-packages/matplotlib/mpl-data
2025-05-16 09:03:27 matplotlib DEBUG    CONFIGDIR=/Users/giordanorogers/.matplotlib
2025-05-16 09:03:27 matplotlib DEBUG    interactive is False
2025-05-16 09:03:27 matplotlib DEBUG    platform is darwin
2025-05-16 09:03:27 matplotlib DEBUG    CACHEDIR=/Users/giordanorogers/.matplotlib
2025-05-16 09:03:27 matplotlib.font_manager DEBUG    Using fontManager instance from /Users/giordanorogers/.matplotlib/fontlist-v390.json


In [7]:
def read_off_shape_file(path):
    """ Read OFF shapefile but ignore comment lines for. """
    with open(path) as f:
        if f.readline().strip() != "OFF":
            raise ValueError("Not an OFF file")
        # counts line
        while True:
            line = f.readline()
            if line.strip() and not line.lstrip().startswith("#"):
                n_v, n_f, _ = map(int, line.split())
                break
        # vertices
        verts = []
        while len(verts) < n_v:
            line = f.readline()
            if line.strip() and not line.lstrip().startswith("#"):
                verts.append(list(map(float, line.split())))
        verts = np.asarray(verts)
        # faces
        faces = []
        while len(faces) < n_f:
            line = f.readline()
            if line.strip() and not line.lstrip().startswith("#"):
                nums = list(map(int, line.split()))
                faces.append(nums[1 : 1 + nums[0]])
    return verts, faces

In [9]:
# Load the truncated icosahedron from the OFF file
try:
    verts, faces = read_off_shape_file("Truncated_icosahedron.off")
    logger.info(f"Loaded truncated icosahedron: {len(verts)} vertices, {len(faces)} faces")
    
    # Create the face-node adjacency graph
    G_geo = nx.Graph()
    G_geo.add_nodes_from(range(len(faces)))
    
    # Add edges between faces that share an edge (2 vertices)
    for i, j in itertools.combinations(range(len(faces)), 2):
        if len(set(faces[i]) & set(faces[j])) == 2:
            G_geo.add_edge(i, j)
    
    logger.info(f"Created graph with {G_geo.number_of_nodes()} nodes and {G_geo.number_of_edges()} edges")
    
    # Verify the truncated icosahedron structure
    if G_geo.number_of_nodes() != 32 or G_geo.number_of_edges() != 90:
        logger.warning(f"Graph doesn't match expected truncated icosahedron structure (32 nodes, 90 edges)")
        
    # Identify pentagons and hexagons
    pentagon_nodes = []
    hexagon_nodes = []
    
    for i, face in enumerate(faces):
        if len(face) == 5:
            pentagon_nodes.append(i)
            G_geo.nodes[i]['type'] = 'pentagon'
        elif len(face) == 6:
            hexagon_nodes.append(i)
            G_geo.nodes[i]['type'] = 'hexagon'
    
    logger.info(f"Identified {len(pentagon_nodes)} pentagons and {len(hexagon_nodes)} hexagons")
    
except FileNotFoundError:
    logger.error("Could not find Truncated_icosahedron.off file")

2025-05-16 09:12:20 __main__ INFO     Loaded truncated icosahedron: 60 vertices, 32 faces
2025-05-16 09:12:20 __main__ INFO     Created graph with 32 nodes and 90 edges
2025-05-16 09:12:20 __main__ INFO     Identified 12 pentagons and 20 hexagons


In [10]:
names = [
    "Amara Vellon",
    "Eamon Thistledown",
    "Calista Norwick",
    "Dorian Fairmont",
    "Isolde Brackenridge",
    "Lachlan Everfield",
    "Mirella Stonebridge",
    "Orin Hawkwell",
    "Soraya Bramble",
    "Thaddeus Elmworth",
    "Zelena Rookwood",
    "Bastian Crowther",
    "Cressida Fenner",
    "Jorah Lockwood",
    "Kestrel Hathersage",
    "Linnea Wexford",
    "Marcellus Quill",
    "Nerissa Greycliff",
    "Oren Voss",
    "Phaedra Dunston",
    "Quinlan Vire",
    "Rosabel Hawthorne",
    "Soren Blackwell",
    "Thalia Ashbourne",
    "Ulric Pembroke",
    "Vesper Larkspur",
    "Wynnifred Ormond",
    "Xavian Merrow",
    "Yara Coldridge",
    "Zephyrine Marlton",
    "Ambrose Fallow",
    "Elysia Thornwick",
]

In [11]:
ids_to_entities = {i: name for i, name in enumerate(names)}
node_labels = ids_to_entities
print(node_labels)

{0: 'Amara Vellon', 1: 'Eamon Thistledown', 2: 'Calista Norwick', 3: 'Dorian Fairmont', 4: 'Isolde Brackenridge', 5: 'Lachlan Everfield', 6: 'Mirella Stonebridge', 7: 'Orin Hawkwell', 8: 'Soraya Bramble', 9: 'Thaddeus Elmworth', 10: 'Zelena Rookwood', 11: 'Bastian Crowther', 12: 'Cressida Fenner', 13: 'Jorah Lockwood', 14: 'Kestrel Hathersage', 15: 'Linnea Wexford', 16: 'Marcellus Quill', 17: 'Nerissa Greycliff', 18: 'Oren Voss', 19: 'Phaedra Dunston', 20: 'Quinlan Vire', 21: 'Rosabel Hawthorne', 22: 'Soren Blackwell', 23: 'Thalia Ashbourne', 24: 'Ulric Pembroke', 25: 'Vesper Larkspur', 26: 'Wynnifred Ormond', 27: 'Xavian Merrow', 28: 'Yara Coldridge', 29: 'Zephyrine Marlton', 30: 'Ambrose Fallow', 31: 'Elysia Thornwick'}


In [13]:
attributes = [
    "nationality",
    "occupation",
    "university",
    "hobby",
    "pet",
    "type of car",
    "allergy",
    "favorite food",
    "favorite drink",
    "favorite color",
    "biggest fear"
]

In [15]:
def assign_unique_edge_attributes(G, attribute_list):
    """
    Assign edge attributes. Each node gets at most one connection of each attribute type.

    Args:
        G: Networkx graph
        attribute_list: List of possible attributes
    """
    # Trck which attributes are used for each node
    node_attributes = {node: set() for node in G.nodes()}

    # Create a list of edges to process
    ## Shuffle to randomize assignment order
    edges = list(G.edges())
    random.shuffle(edges)

    # Keep track of remaining attrbute options for each edge
    edge_attribute_options = {
        edge: set(attribute_list) for edge in edges
    }

    unassigned_edges = []

    # First pass: Try to assign attributes while maintaining uniqueness
    for edge in edges:
        u, v = edge
        # Find attributes that are still available for both nodes
        valid_attributes = [
            attr for attr in attribute_list 
            if attr not in node_attributes[u] and attr not in node_attributes[v]
        ]

        if valid_attributes:
            # If there are valid options, choose one randomly
            chosen_attr = random.choice(list(valid_attributes))
            G.edges[u, v]['label'] = chosen_attr

            # Mark this attribute as used for both nodes
            node_attributes[u].add(chosen_attr)
            node_attributes[v].add(chosen_attr)

        else:
            # If no valid options, leave for second pass
            unassigned_edges.append(edge)
            G.edges[u, v]['label'] = None

    if unassigned_edges:
        print(f"Warning: {len(unassigned_edges)} edges couldn't be assigned with strict uniqueness.")
        print(unassigned_edges)

    return G

In [16]:
# Validate that we have enough attributes for the degree of the graph
max_degree = max(dict(G_geo.degree()).values())
if len(attributes) < max_degree:
    logger.warning(f"Warning: Not enough unique attributes ({len(attributes)}) for nodes with degree {max_degree}.")
    logger.warning("Some connections will have to share attribute types.")

In [17]:
# Assign attributes to edges
if G_geo.number_of_edges() > 0:
    # Apply the unique attribute assignment
    G_geo = assign_unique_edge_attributes(G_geo, attributes)

    # Log the results
    logger.info("Edge attributes assigned with uniqueness constraints:")

    # Verify uniqueness and log connections
    attribute_counts = {}
    duplicates_found = False

    # For each node, show its connections and verify uniqueness
    for node in sorted(G_geo.nodes()):
        node_connections = {}
        node_name = ids_to_entities[node]

        # Collect all connections for this node
        for neighbor in G_geo.neighbors(node):
            attr = G_geo.edges[node, neighbor]['label']
            if attr in node_connections:
                duplicates_found = True
                logger.warning(f"DUPLICATE FOUND: {node_name} has multiple '{attr}' connections with {node_connections[attr]} and {ids_to_entities[neighbor]}")
            node_connections[attr] = ids_to_entities[neighbor]

        # Log this node's connections
        logger.info(f"Node {node} ({node_name}) connections:")
        for attr, neighbor in sorted(node_connections.items()):
            logger.info(f"  - {attr}: {neighbor}")
            attribute_counts[attr] = attribute_counts.get(attr, 0) + 1

    # Print statistics
    logger.info("Attribute usage statistics:")
    for attr, count in sorted(attribute_counts.items(), key=lambda x: x[1], reverse=True):
        logger.info(f"  - {attr}: used {count} times")
    
    if duplicates_found:
        logger.warning("Duplicates were found! Some nodes have multiple connections with the same attribute type.")
        logger.warning("Consider increasing the number of distinct attributes or accepting some duplicates.")
    else:
        logger.info("Success! All nodes have at most one connection of each attribute type.")

2025-05-16 09:16:50 __main__ INFO     Edge attributes assigned with uniqueness constraints:
2025-05-16 09:16:50 __main__ INFO     Node 0 (Amara Vellon) connections:
2025-05-16 09:16:50 __main__ INFO       - biggest fear: Calista Norwick
2025-05-16 09:16:50 __main__ INFO       - favorite drink: Eamon Thistledown
2025-05-16 09:16:50 __main__ INFO       - favorite food: Orin Hawkwell
2025-05-16 09:16:50 __main__ INFO       - hobby: Dorian Fairmont
2025-05-16 09:16:50 __main__ INFO       - pet: Mirella Stonebridge
2025-05-16 09:16:50 __main__ INFO       - university: Lachlan Everfield
2025-05-16 09:16:50 __main__ INFO     Node 1 (Eamon Thistledown) connections:
2025-05-16 09:16:50 __main__ INFO       - biggest fear: Bastian Crowther
2025-05-16 09:16:50 __main__ INFO       - favorite color: Calista Norwick
2025-05-16 09:16:50 __main__ INFO       - favorite drink: Amara Vellon
2025-05-16 09:16:50 __main__ INFO       - favorite food: Zelena Rookwood
2025-05-16 09:16:50 __main__ INFO       - o

In [18]:
# Map attributes to natural language descriptions
edge_type_descriptions = {
    "nationality": "are from the same country",  
    "occupation": "work in the same field or profession",
    "university": "attended the same university",
    "hobby": "have the same hobby (not a sport or a game)",
    "pet": "have the same type of pet",
    "car": "have the same type of car",
    "allergy": "have the same type of allergy",
    "favorite food": "have the same favorite food",
    "favorite drink": "have the same favorite drink",
    "favorite color": "have the same favorite color",
    "biggest fear": "have the same same biggest fear"
}

In [20]:
print(ids_to_entities)

{0: 'Amara Vellon', 1: 'Eamon Thistledown', 2: 'Calista Norwick', 3: 'Dorian Fairmont', 4: 'Isolde Brackenridge', 5: 'Lachlan Everfield', 6: 'Mirella Stonebridge', 7: 'Orin Hawkwell', 8: 'Soraya Bramble', 9: 'Thaddeus Elmworth', 10: 'Zelena Rookwood', 11: 'Bastian Crowther', 12: 'Cressida Fenner', 13: 'Jorah Lockwood', 14: 'Kestrel Hathersage', 15: 'Linnea Wexford', 16: 'Marcellus Quill', 17: 'Nerissa Greycliff', 18: 'Oren Voss', 19: 'Phaedra Dunston', 20: 'Quinlan Vire', 21: 'Rosabel Hawthorne', 22: 'Soren Blackwell', 23: 'Thalia Ashbourne', 24: 'Ulric Pembroke', 25: 'Vesper Larkspur', 26: 'Wynnifred Ormond', 27: 'Xavian Merrow', 28: 'Yara Coldridge', 29: 'Zephyrine Marlton', 30: 'Ambrose Fallow', 31: 'Elysia Thornwick'}


In [21]:
# Generate connection descriptions
connection_descriptions = []
for u, v, data in G_geo.edges(data=True):
    profile_u_name = ids_to_entities[u]
    profile_v_name = ids_to_entities[v]
    attribute_label = data['label']
    desc_phrase = edge_type_descriptions.get(attribute_label, f"share the attribute '{attribute_label}'")
    connection_descriptions.append(f"{profile_u_name} and {profile_v_name} {desc_phrase}.")

In [22]:
# Format the descriptions for the prompt
conn_desc_str = "\n".join([f"* {conn}" for conn in connection_descriptions])
logger.info(f"Generated {len(connection_descriptions)} connection descriptions")

2025-05-16 09:19:31 __main__ INFO     Generated 90 connection descriptions


In [23]:
logger.info("Connection descriptions:")
for i in range(len(connection_descriptions)):
    logger.info(connection_descriptions[i])

2025-05-16 09:19:35 __main__ INFO     Connection descriptions:
2025-05-16 09:19:35 __main__ INFO     Amara Vellon and Eamon Thistledown have the same favorite drink.
2025-05-16 09:19:35 __main__ INFO     Amara Vellon and Calista Norwick have the same same biggest fear.
2025-05-16 09:19:35 __main__ INFO     Amara Vellon and Dorian Fairmont have the same hobby (not a sport or a game).
2025-05-16 09:19:35 __main__ INFO     Amara Vellon and Lachlan Everfield attended the same university.
2025-05-16 09:19:35 __main__ INFO     Amara Vellon and Mirella Stonebridge have the same type of pet.
2025-05-16 09:19:35 __main__ INFO     Amara Vellon and Orin Hawkwell have the same favorite food.
2025-05-16 09:19:35 __main__ INFO     Eamon Thistledown and Calista Norwick have the same favorite color.
2025-05-16 09:19:35 __main__ INFO     Eamon Thistledown and Dorian Fairmont attended the same university.
2025-05-16 09:19:35 __main__ INFO     Eamon Thistledown and Thaddeus Elmworth work in the same fiel

2025-05-16 09:19:35 __main__ INFO     Calista Norwick and Orin Hawkwell have the same hobby (not a sport or a game).
2025-05-16 09:19:35 __main__ INFO     Calista Norwick and Soraya Bramble share the attribute 'type of car'.
2025-05-16 09:19:35 __main__ INFO     Calista Norwick and Thaddeus Elmworth are from the same country.
2025-05-16 09:19:35 __main__ INFO     Dorian Fairmont and Isolde Brackenridge have the same favorite food.
2025-05-16 09:19:35 __main__ INFO     Dorian Fairmont and Lachlan Everfield have the same favorite drink.
2025-05-16 09:19:35 __main__ INFO     Dorian Fairmont and Bastian Crowther are from the same country.
2025-05-16 09:19:35 __main__ INFO     Isolde Brackenridge and Lachlan Everfield have the same type of allergy.
2025-05-16 09:19:35 __main__ INFO     Isolde Brackenridge and Bastian Crowther work in the same field or profession.
2025-05-16 09:19:35 __main__ INFO     Isolde Brackenridge and Ulric Pembroke attended the same university.
2025-05-16 09:19:35 __