In [12]:
!pip install PyOpenGL PyOpenGL_accelerate
!pip install PyGLM

Collecting PyOpenGL
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Collecting PyOpenGL_accelerate
  Downloading PyOpenGL-accelerate-3.1.7.tar.gz (562 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.1/562.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hDownloading PyOpenGL-3.1.7-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: PyOpenGL_accelerate
  Building wheel for PyOpenGL_accelerate (pyproject.toml) ... [?25ldone
[?25h  Created wheel for PyOpenGL_accelerate: filename=PyOpenGL_accelerate-3.1.7-cp310-cp310-macosx_11_0_arm64.whl size=410478 sha

In [43]:
import numpy as np
import glm


def create_projection_matrix(fov, aspect_ratio, near, far):
    """Create a perspective projection matrix using GLM."""
    # Note: glm.perspective expects the FOV to be in radians.
    return glm.perspective(glm.radians(fov), aspect_ratio, near, far)


def create_view_matrix(position, pitch, yaw):
    # Convert pitch and yaw from degrees to radians.
    pitch = np.radians(pitch)
    yaw = np.radians(yaw)

    # Calculate the camera's forward direction vector.
    # Assuming a right-handed coordinate system with y up.
    forward = glm.vec3(
        np.cos(yaw) * np.cos(pitch), np.sin(pitch), np.sin(yaw) * np.cos(pitch)
    )
    forward = glm.normalize(forward)

    # The up vector for the camera.
    up = glm.vec3(0, 1, 0)

    # Create the view matrix.
    return glm.lookAt(glm.vec3(position), glm.vec3(position) + forward, up)


def convert_to_screen_coordinates(
    pos_3d, fov, aspect_ratio, near, far, cam_position, cam_pitch, cam_yaw
):
    """Convert 3D game coordinates to 2D screen coordinates using OpenGL matrices."""
    # Create projection and view matrices.
    projection_matrix = create_projection_matrix(fov, aspect_ratio, near, far)
    view_matrix = create_view_matrix(cam_position, cam_pitch, cam_yaw)

    # Transform 3D coordinates to clip space.
    world_pos = glm.vec4(pos_3d[0], pos_3d[1], pos_3d[2], 1.0)
    print("World Space:", world_pos)
    print(projection_matrix)
    print(view_matrix)
    clip_space_pos = projection_matrix * view_matrix * world_pos
    print("Clip Space:", clip_space_pos)

    # Perform perspective division to convert to NDC space.
    if clip_space_pos.w != 0:
        ndc_space_pos = glm.vec3(
            clip_space_pos.x / clip_space_pos.w,
            clip_space_pos.y / clip_space_pos.w,
            clip_space_pos.z / clip_space_pos.w,
        )
    else:
        ndc_space_pos = glm.vec3(
            clip_space_pos.x,
            clip_space_pos.y,
            clip_space_pos.z,
        )

    # Convert from NDC space to screen space.
    screen_pos = glm.vec2((ndc_space_pos.x + 1) / 2, (1 - ndc_space_pos.y) / 2)

    return (screen_pos.x, screen_pos.y)


# Camera setup based on the agent's position and orientation
cam_position = (-245.5, 69.0, -191.5)  # Position of the camera (agent's position)
cam_pitch = 0.0  # Pitch of the camera (agent's pitch)
cam_yaw = 0.0  # Yaw of the camera (agent's yaw)

# Example usage
fov = 60.0  # Field of view
aspect_ratio = 16.0 / 9.0  # Aspect ratio
near = 0.1  # Near clipping plane
far = 1000.0  # Far clipping plane

# Test position (should be different from the camera's position to be visible)
test_pos_3d = (-245.5, 69.0, -200.0)  # Example position
# Example position

screen_coordinates = convert_to_screen_coordinates(
    test_pos_3d, fov, aspect_ratio, near, far, cam_position, cam_pitch, cam_yaw
)

print("Screen Coordinates:", screen_coordinates)

World Space: vec4(       -245.5,           69,         -200,            1 )
[     0.974279 ][            0 ][            0 ][            0 ]
[            0 ][      1.73205 ][            0 ][            0 ]
[            0 ][            0 ][      -1.0002 ][     -0.20002 ]
[            0 ][            0 ][           -1 ][            0 ]
[            0 ][            0 ][            1 ][        191.5 ]
[            0 ][            1 ][            0 ][          -69 ]
[           -1 ][           -0 ][           -0 ][       -245.5 ]
[            0 ][            0 ][            0 ][            1 ]
Clip Space: vec4(     -8.28137,            0,    -0.200027,            0 )
Screen Coordinates: (-3.64068603515625, 0.5)


In [68]:
import math


def rotate_point(x, y, z, pitch, yaw):
    """
    Rotates a point in 3D space around the X axis (pitch) and then around the Y axis (yaw).
    Angles are given in degrees.
    """
    # Convert angles to radians
    pitch_rad = math.radians(pitch)
    yaw_rad = math.radians(yaw)

    # Rotate around x-axis (pitch)
    y_rot = y * math.cos(pitch_rad) - z * math.sin(pitch_rad)
    z_rot = y * math.sin(pitch_rad) + z * math.cos(pitch_rad)

    # Rotate around y-axis (yaw)
    x_rot = x * math.cos(yaw_rad) - z_rot * math.sin(yaw_rad)
    z_rot_final = x * math.sin(yaw_rad) + z_rot * math.cos(yaw_rad)

    return x_rot, y_rot, z_rot_final


def project_to_normalized_screen(x, y, z, fov):
    """
    Projects a 3D point onto a normalized 2D screen (coordinates between 0 and 1) with a given field of view (fov).
    Assumes the camera is looking along the z-axis.
    """
    # Projection calculations
    f = 1 / math.tan(math.radians(fov / 2))
    x_proj = f * x / z
    y_proj = f * y / z

    # Normalize to screen coordinates (0 to 1)
    x_screen = (x_proj + 1) / 2
    y_screen = (1 - y_proj) / 2

    return x_screen, y_screen


def calculate_normalized_screen_coordinates(agent_pos, entity_pos, pitch, yaw, fov):
    """
    Calculates the normalized screen coordinates (0 to 1) of an entity based on the agent's position,
    the entity's position, and the camera's pitch and yaw.
    """
    # Calculate relative position
    rel_x = entity_pos[0] - agent_pos[0]
    rel_y = entity_pos[1] - agent_pos[1]
    rel_z = entity_pos[2] - agent_pos[2]

    # Apply rotation for pitch and yaw
    rot_x, rot_y, rot_z = rotate_point(rel_x, rel_y, rel_z, pitch, yaw)

    # Check if the entity is behind the camera
    if rot_z < 0:
        return None  # Entity is behind the camera

    # Project onto the normalized screen
    return project_to_normalized_screen(rot_x, rot_y, rot_z+1, fov)


# Recalculate using the normalized function

In [69]:
# Example usage
agent_position = (0, 0, 0)  # Agent's position (x, y, z)
entity_position = (10, 5, 15)  # Entity's position (x, y, z)
pitch = 10  # Camera's pitch in degrees
yaw = 30  # Camera's yaw in degrees
screen_width = 1920  # Screen width in pixels
screen_height = 1080  # Screen height in pixels
fov = 90  # Field of view in degrees

screen_coordinates = calculate_normalized_screen_coordinates(
    agent_position, entity_position, pitch, yaw, fov
)

screen_coordinates

(0.5214908605619626, 0.4406671161052259)

In [72]:
import json
import numpy as np
from PIL import Image, ImageDraw

# Load the entities from JSON
with open("test.json", "r") as file:
    entities = json.load(file)["entities"]

# Load the image
img = Image.open("test.jpg")
draw = ImageDraw.Draw(img)


# Process each entity
for entity in entities:
    pos_3d = (entity["x"], entity["y"], entity["z"])
    entity_yaw = entity["yaw"]
    entity_pitch = entity["pitch"]
    screen_pos = calculate_normalized_screen_coordinates(
        (-245.5, 69.0, -191.5), pos_3d, entity_pitch, entity_yaw, 90
    )
    if screen_pos is None:
        continue
    x, y = screen_pos[0] * img.width, (screen_pos[1]) * img.height

    # Draw a thick point on the image
    radius = 5  # Radius of the point
    draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill="red")
    # Draw the name of the entity
    entity_name = entity["name"]
    print(entity_name,screen_pos)

    text_position = (x + radius + 5, y - radius)  # Adjust as needed
    draw.text(text_position, entity_name, fill="blue")

# Display the result
img.show()

MineDojoAgent0 (0.5, 0.5)
Horse (0.5018512154772318, 0.6027301350185992)
Horse (0.8818846236690518, 0.5976297314002229)
Horse (0.041304086062316725, 0.33478365575073316)
Horse (0.765183663180668, 0.38694018291319104)
Squid (8.913199798642664, 6.101770591867582)
Sheep (0.625, 0.5)
