In [2]:
import os

# Run every following cell from the repository root so that the `src.*` imports
# and the relative paths used later (e.g. ./output, data/...) resolve.
os.chdir('/kaggle/working/Affordance3DHighlighter')

In [None]:
# Fetch the full-shape archive from Google Drive and extract it into the
# repository's data/ directory, where the cells below read it from.
# NOTE(review): recent gdown releases deprecate the `--id` flag — confirm it is
# still accepted by the installed version.
!pip install gdown
!gdown --id 1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF
!unzip full-shape.zip -d /kaggle/working/Affordance3DHighlighter/data/

In [None]:
import pickle

# Load the pickled full-shape training data from the repository's data/ directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this on an archive obtained from a trusted source.
with open('/kaggle/working/Affordance3DHighlighter/data/full_shape_train_data.pkl', 'rb') as train_file:
    train_data = pickle.load(train_file)
# Inspect the contents: the container type and its first element.
# NOTE(review): the [:1] slice assumes train_data is a sequence — confirm.
print(f"Training Data Type: {type(train_data)}")
print(f"Training Data Example: {train_data[:1]}")

In [None]:
# Install CLIP from source and a pinned kaolin build.
# NOTE(review): the kaolin wheel index is specific to torch 2.5.1 / CUDA 12.1 —
# confirm it matches the torch build installed in this environment.
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html

In [27]:

import sys
import torch

# Install PyTorch3D only if it is not already importable.
# Strategy: try a prebuilt wheel matching this Python/CUDA/torch combination
# first (Linux only), and fall back to building from source if that fails.
need_pytorch3d = False
try:
    import pytorch3d
except ModuleNotFoundError:
    need_pytorch3d = True
if need_pytorch3d:
    # Build the wheel-directory tag, e.g. "py3<minor>_cu<cuda>_pyt<torch>",
    # from the running interpreter and the installed torch build.
    pyt_version_str = torch.__version__.split("+")[0].replace(".", "")
    version_str = "".join([
        f"py3{sys.version_info.minor}_cu",
        torch.version.cuda.replace(".", ""),
        f"_pyt{pyt_version_str}"
    ])
    !pip install iopath
    if sys.platform.startswith("linux"):
        print("Trying to install wheel for PyTorch3D")
        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
        # Re-check through `pip freeze` whether the wheel install actually succeeded.
        pip_list = !pip freeze
        need_pytorch3d = not any(i.startswith("pytorch3d==") for i in pip_list)
    if need_pytorch3d:
        print(f"failed to find/install wheel for {version_str}")
# Still missing (no matching wheel, or not on Linux): build from the stable branch.
if need_pytorch3d:
    print("Installing PyTorch3D from source")
    !pip install ninja
    !pip install 'git+https://github.com/facebookresearch/pytorch3d.git@stable'

In [None]:
# Install open3d.
# NOTE(review): no cell in this notebook imports open3d directly — presumably
# needed by the `src` package; confirm.
!pip install open3d

In [None]:
# Download the PyTorch3D PittsburghBridge sample point cloud (.npz) into data/PittsburghBridge.
!mkdir -p data/PittsburghBridge
!wget -P data/PittsburghBridge https://dl.fbaipublicfiles.com/pytorch3d/data/PittsburghBridge/pointcloud.npz

In [28]:

from src.mesh import Mesh
from pytorch3d.structures import Pointclouds

from src.convertor import obj_to_pointcloud


def bounding_sphere_normalize(points: torch.Tensor) -> torch.Tensor:
    """
    Center a point set on its centroid and scale it to fit the unit sphere.

    Args:
        points: (N, 3) tensor of point coordinates.

    Returns:
        Tensor of the same shape whose centroid is at the origin and whose
        farthest point lies at distance 1 from it. An empty input is returned
        unchanged. A degenerate cloud (all points identical) is returned
        centered but unscaled rather than as NaNs.
    """
    if points.shape[0] == 0:
        # mean()/max() over zero points are undefined; nothing to normalize.
        return points

    center = points.mean(dim=0, keepdim=True)
    centered = points - center
    max_dist = centered.norm(p=2, dim=1).max()
    # A zero radius (single point / all points coincident) would give 0/0 = NaN;
    # divide by 1 in that case so the centered points come back as zeros.
    scale = torch.where(max_dist > 0, max_dist, torch.ones_like(max_dist))
    return centered / scale


def load_3d_data(file_path, num_points=10000, device="cuda", do_normalize=True):
    """
    Load 3D data from an NPZ point cloud or an OBJ mesh.

    Args:
        file_path: Path (str or path-like) to a .npz point cloud or .obj mesh file.
        num_points: Maximum number of points kept from an NPZ cloud, or the number
            of points requested from obj_to_pointcloud for a mesh.
        device: Device the tensors are placed on (used for both file types).
        do_normalize: If True, pass the points through bounding_sphere_normalize
            before building the returned Pointclouds object.

    Returns:
        Tuple (points, point_cloud): the point coordinate tensor and a PyTorch3D
        Pointclouds object built from those points and their colour features.

    Raises:
        ValueError: If the file extension is neither .npz nor .obj.
    """
    # str() so that pathlib.Path inputs are accepted as well as plain strings.
    file_ext = str(file_path).split('.')[-1].lower()

    if file_ext == 'npz':
        # np.load returns a lazily-read archive; close it once both arrays are copied out.
        with np.load(file_path) as pointcloud:
            verts = torch.Tensor(pointcloud['verts']).to(device)
            rgb = torch.Tensor(pointcloud['rgb']).to(device)

        print("lenght of the data")
        print(len(verts))

        # Randomly subsample when the cloud has more points than requested.
        if len(verts) > num_points:
            idx = torch.randperm(len(verts))[:num_points]
            verts = verts[idx]
            rgb = rgb[idx]

        if do_normalize:
            verts = bounding_sphere_normalize(verts)

        point_cloud = Pointclouds(points=[verts], features=[rgb])
        return verts, point_cloud

    elif file_ext == 'obj':
        # Honour the caller's device instead of always forcing "cuda".
        points, point_cloud = obj_to_pointcloud(
            file_path,
            num_points=num_points,
            device=device
        )
        if do_normalize:
            points = bounding_sphere_normalize(points)
            # Rebuild the Pointclouds object so it carries the normalized
            # coordinates together with the original per-point features.
            rgb = point_cloud.features_packed()
            point_cloud = Pointclouds(points=[points], features=[rgb])
        return points, point_cloud

    else:
        raise ValueError(f"Unsupported file format: {file_ext}. Only .npz and .obj are supported.")



In [29]:
def print_grad_fn(tensor, depth=0):
    """Print a tensor's grad_fn and its direct parent functions (one level only).

    Output is indented by two spaces per `depth`; the parent functions are
    printed one indentation level deeper. A tensor without a grad_fn is
    reported as a leaf.
    """
    indent = "  " * depth
    root_fn = tensor.grad_fn

    if root_fn is not None:
        print(indent + str(root_fn))
        child_indent = "  " * (depth + 1)
        for entry in root_fn.next_functions:
            parent_fn = entry[0]
            # Inputs that do not take part in autograd show up as None; skip them.
            if parent_fn is None:
                continue
            print(child_indent + str(parent_fn))
    else:
        print(indent + "None (leaf tensor)")

In [34]:

from src.render.cloud_point_renderer import MultiViewPointCloudRenderer
from src.save_results import save_renders, save_results
from src.neural_highlighter import NeuralHighlighter
from src.Clip.loss_function import clip_loss
from src.Clip.clip_model import get_clip_model, encode_text, setup_clip_transforms

import torch
import numpy as np
import random
from tqdm import tqdm

# Constrain most sources of randomness so runs are as repeatable as possible
# (some torch backward functions within CLIP are non-deterministic, so results
# may still vary slightly between runs).
# One seed is shared by torch (CPU and every CUDA device), Python's `random`
# module and NumPy.
seed = 0  # Any integer works; change it to get a different, still repeatable run.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
# Turn off the cuDNN autotuner and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


def optimize_point_cloud(points, clip_model, renderer, encoded_text, log_dir: str, **kwargs):
    """
    Train a NeuralHighlighter on one point cloud using a CLIP-based loss.

    Each iteration predicts two per-point values, blends a highlight colour and
    a base gray with them, renders the coloured cloud from several views and
    minimises clip_loss between the renders and the encoded text prompt.

    Args:
        points: Point coordinates fed to the network and to the renderer.
        clip_model: CLIP model passed through to clip_loss.
        renderer: Object providing create_point_cloud() and render_all_views().
        encoded_text: Encoded prompt passed through to clip_loss.
        log_dir: Directory handed to save_renders every 100 iterations.
        **kwargs: Optional settings (unknown keys are ignored):
            num_iterations (default 1000), learning_rate (default 1e-4),
            network_depth (default 5; the legacy key `depth` is still honoured),
            network_width (default 256), n_views (default 4), n_augs (default 1),
            clipavg (default 'view'), device (default 'cuda').

    Returns:
        The trained NeuralHighlighter.
    """
    num_iterations = kwargs.get('num_iterations', 1000)
    learning_rate = kwargs.get('learning_rate', 1e-4)
    # Callers pass `network_depth` (parallel to `network_width`); previously only
    # `depth` was read, so the requested depth was silently ignored.
    depth = kwargs.get('network_depth', kwargs.get('depth', 5))
    width = kwargs.get('network_width', 256)
    n_views = kwargs.get("n_views", 4)
    n_augs = kwargs.get('n_augs', 1)
    clipavg = kwargs.get('clipavg', 'view')
    device = kwargs.get('device', 'cuda')

    # save_renders' own directory handling is not visible from here; creating
    # log_dir up front is harmless either way.
    os.makedirs(log_dir, exist_ok=True)

    # Initialize network and optimizer
    net = NeuralHighlighter(
        depth=depth,  # Number of hidden layers
        width=width,  # Width of each layer
        out_dim=2,  # Two outputs per point (highlight / no-highlight)
        input_dim=3,  # 3D coordinates (x,y,z)
        positional_encoding=False
    ).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Set up the transforms
    clip_transform, augment_transform = setup_clip_transforms()

    # The two blend colours never change, so build them once instead of per iteration.
    highlight_color = torch.tensor([204 / 255, 1.0, 0.0], device=device)
    base_color = torch.tensor([180 / 255, 180 / 255, 180 / 255], device=device)

    # Training loop
    for i in tqdm(range(num_iterations)):
        optimizer.zero_grad()

        # Per-point predictions: column 0 weights the highlight colour,
        # column 1 weights the base colour.
        pred_class = net(points)
        colors = pred_class[:, 0:1] * highlight_color + pred_class[:, 1:2] * base_color

        # Create and render point cloud
        point_cloud = renderer.create_point_cloud(points, colors)
        view_images = renderer.render_all_views(point_cloud=point_cloud, n_views=n_views)

        # Stack the per-view images (dict values) into one batch and move the
        # channel axis forward for CLIP: [B, H, W, C] -> [B, C, H, W].
        rendered_tensor = torch.stack([img.to(device) for img in view_images.values()])
        rendered_images = rendered_tensor.permute(0, 3, 1, 2)

        # Calculate CLIP loss
        loss = clip_loss(
            rendered_images=rendered_images,
            encoded_text=encoded_text,
            clip_transform=clip_transform,
            augment_transform=augment_transform,
            clip_model=clip_model,
            n_augs=n_augs,
            clipavg=clipavg
        )
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f"Iteration {i}, Loss: {loss.item():.4f}")
            save_renders(log_dir, i, rendered_images)

    return net


def main(input_path, object_name, highlight_region, **kwargs):
    """
    Run the highlighting pipeline on a single 3D file.

    Loads the file as a point cloud, encodes the prompt
    "A 3D render of a gray {object_name} with highlighted {highlight_region}",
    trains a highlighter with optimize_point_cloud and saves the results.

    Args:
        input_path: Path to the input 3D file (.obj mesh or .npz point cloud).
        object_name: Name of the object, used in the prompt.
        highlight_region: Region to highlight, used in the prompt.
        **kwargs: Optional settings read here:
            n_views: Number of views passed to save_results (default: 4)
            num_points: Number of points to load/sample (default: 10000)
            device: Device to run on (default: "cuda")
            output_dir: Directory for outputs (default: "./output")
            do_normalize: Normalize points into the unit sphere (default: True)
          All keyword arguments are also forwarded unchanged to
          optimize_point_cloud, which reads its own training options from them
          (e.g. num_iterations, learning_rate, n_augs, clipavg).

    Returns:
        Tuple (net, points): the trained network and the loaded points.

    Raises:
        Exception: Any error raised during processing is printed and re-raised.
    """
    # Extract parameters from kwargs with defaults
    n_views = kwargs.get("n_views", 4)
    num_points = kwargs.get("num_points", 10000)
    device = kwargs.get("device", "cuda")
    output_dir = kwargs.get("output_dir", "./output")
    do_normalize = kwargs.get("do_normalize", True)

    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Load 3D data (either mesh or point cloud)
        print(f"Loading 3D data from {input_path}...")
        # do_normalize was previously read but never forwarded, so the caller's
        # choice had no effect on loading.
        points, point_cloud = load_3d_data(
            input_path,
            num_points=num_points,
            device=device,
            do_normalize=do_normalize
        )
        print(f"Loaded {len(points)} points")

        # Setup CLIP model
        print("Setting up CLIP model...")
        clip_model, preprocess, resolution = get_clip_model()

        # Create and encode prompt
        prompt = f"A 3D render of a gray {object_name} with highlighted {highlight_region}"
        print(f"Using prompt: {prompt}")
        text_features = encode_text(clip_model, prompt, device)

        # Initialize renderer
        print("Setting up renderer...")
        renderer = MultiViewPointCloudRenderer(
            image_size=512,
            base_dist=30,  # Default view distance
            base_elev=10,  # Default elevation
            base_azim=0,  # Default azimuth
            device=device
        )

        # Optimize point cloud highlighting
        print("Starting optimization...")
        net = optimize_point_cloud(
            points=points,
            renderer=renderer,
            clip_model=clip_model,
            encoded_text=text_features,
            log_dir=output_dir,
            **kwargs
        )

        # Save results
        print("Saving results...")
        save_results(
            net=net,
            points=points,
            n_views=n_views,
            prompt=prompt,
            output_dir=output_dir,
            renderer=renderer,
            device=device
        )

        print("Processing complete!")
        return net, points

    except Exception as e:
        # Log for the notebook output, then let the caller see the original error.
        print(f"Error in processing: {str(e)}")
        raise



In [None]:
# Run the single-file pipeline on the candle mesh with the prompt
# "A 3D render of a gray candle with highlighted head".
# All keyword arguments reach main() through **kwargs.
main(
    input_path="/kaggle/working/Affordance3DHighlighter/data/candle.obj",
    object_name="candle",
    highlight_region="head",
    n_views=4,
    n_augs=1,
    clipavg="view",
    network_depth=5,
    network_width=256,
    learning_rate=1e-4,
    num_iterations=500,
    num_points=100000,
    device="cuda",
    output_dir="./output"
)

### Evaluation for part 3 

Main entry point for the full-shape dataset (processes one dataset entry at a time).

In [None]:
from src.evaluation_fullshape import evaluate_single_object, visualize_single_object
from src.data_loader_fullshape import FullShapeDataset, create_dataset_splits
from src.render.cloud_point_renderer import MultiViewPointCloudRenderer
from src.neural_highlighter import NeuralHighlighter
from src.Clip.clip_model import get_clip_model, encode_text

def main(data_entry, net, clip_model, renderer, device="cuda", **kwargs):
    """
    Process a single dataset entry: optimize a highlighter, save and visualize it.

    Note: this definition shadows any earlier `main` defined in the notebook.

    Args:
        data_entry (dict): Single object data; the keys "coords", "shape_id",
            "shape_class" and "affordances" are read here.
        net: Accepted for interface compatibility but not used:
            optimize_point_cloud builds and trains its own network, and that
            trained network is what is returned.
        clip_model: CLIP model.
        renderer: Renderer used for optimization and for saving results.
        device (str): Device for computation; forwarded to the optimizer.
        **kwargs: Additional parameters. Read here: output_dir (default
            "./output"), n_views (default 4), visualize (default True).
            All of them are also forwarded to optimize_point_cloud.

    Returns:
        Tuple (net, points): the trained network and the entry's points.

    Raises:
        Exception: Any error raised during processing is printed and re-raised.
    """
    output_dir = kwargs.get("output_dir", "./output")
    n_views = kwargs.get("n_views", 4)

    try:
        # Make sure the output location exists before anything is written to it.
        os.makedirs(output_dir, exist_ok=True)

        # Extract information from the dataset entry
        points = data_entry["coords"]  # Nx3 point cloud
        shape_id = data_entry["shape_id"]
        shape_class = data_entry["shape_class"]
        highlight_region = data_entry["affordances"][0]  # Use the first affordance for testing

        # Generate prompt
        prompt = f"A 3D render of a gray {shape_class} with highlighted {highlight_region}"
        print(f"Using prompt: {prompt}")
        text_features = encode_text(clip_model, prompt, device)

        # Optimize point cloud highlighting
        print("Starting optimization...")
        # `device` is a named parameter here, so it is never inside kwargs;
        # pass it explicitly or the optimizer falls back to its own default.
        net = optimize_point_cloud(
            points=points,
            renderer=renderer,
            clip_model=clip_model,
            encoded_text=text_features,
            log_dir=output_dir,
            device=device,
            **kwargs
        )

        # Save results
        print("Saving results...")
        save_results(
            net=net,
            points=points,
            n_views=n_views,
            prompt=prompt,
            output_dir=output_dir,
            renderer=renderer,
            device=device
        )

        print(f"Processing complete for shape_id: {shape_id}")

        # Optional visualization
        if kwargs.get("visualize", True):
            visualize_single_object(data_entry, net, clip_model, device=device, out_dir=output_dir)

        return net, points

    except Exception as e:
        # Log for the notebook output, then let the caller see the original error.
        print(f"Error in processing shape_id {data_entry['shape_id']}: {str(e)}")
        raise

# Loading the dataset
# We only use the val_data and test_data for part 3. 10 percent of the train set is the
# validation set and 5 percent of the train set is the test set.
# When the dataset is loaded (see the dataset loader), specific classes and affordance
# labels are filtered as required in part 3.
dataset = FullShapeDataset("/kaggle/working/Affordance3DHighlighter/data/full_shape_train_data.pkl", device="cuda")
train_data, val_data, test_data = create_dataset_splits(dataset, val_ratio=0.1, test_ratio=0.05)

# Ensure test set is not empty
if len(test_data) == 0:
    raise ValueError("Test dataset is empty. Check your dataset and split ratios.")

# Select a single object from the test set
data_index = 0  # You can adjust this to test different objects
data_entry = test_data[data_index]

# Setup CLIP and Renderer
clip_model, preprocess, resolution = get_clip_model()
renderer = MultiViewPointCloudRenderer(image_size=512, base_dist=30, base_elev=10, device="cuda")

# Setup Neural Highlighter
net = NeuralHighlighter(depth=5, width=256, out_dim=2, input_dim=3).to("cuda")

# Run the main function for a single object.
# main() returns the network it actually trained; keep that one, otherwise the
# evaluation below would run on the untrained `net` created above.
net, _ = main(
    data_entry=data_entry,
    net=net,
    clip_model=clip_model,
    renderer=renderer,
    device="cuda",
    num_iterations=500,
    learning_rate=1e-4,
    output_dir="./results",
    visualize=True
)

# Evaluate affordances for the object
results = evaluate_single_object(data_entry, net, clip_model, device="cuda")
print("Evaluation Results:", results)

# Visualize predictions
visualize_single_object(data_entry, net, clip_model, device="cuda", out_dir="./results")


### New strategy for evaluation
Hyperparameter and strategy tuning on the validation split, followed by evaluation on the test split.

In [None]:
from src.evaluation_fullshape import (
    compute_mIoU,  # needed for final checks
    evaluate_single_object,
    grid_search_validation,   # We'll define a simple approach
    evaluate_dataset
)

# Candidate strategies and highlight-probability thresholds to compare.
strategies_list = ["basic", "functional", "descriptive", "action"]
thresholds_list = [0.3, 0.5, 0.7]

clip_model2, _, _= get_clip_model()
renderer2 = MultiViewPointCloudRenderer(
    image_size=512, base_dist=30, base_elev=10, device="cuda"
)
net2 = NeuralHighlighter(depth=5, width=256, out_dim=2, input_dim=3).to("cuda")

# Here we pick (up to) 4 objects from val
val_size = len(val_data)
num_val_objects = min(4, val_size)
val_indices = list(range(num_val_objects))

best_strat = None
best_th = None
# Start below any attainable IoU. The previous start value of 1.0 could never
# be exceeded, so no configuration was ever selected.
best_iou = float("-inf")

# For each validation object we train from scratch, then measure how well each
# (strategy, threshold) pair performs across all affordances of that shape.
# The IoU is averaged across the shapes to pick the best configuration.
val_results = []  # (strategy, threshold, mean IoU over the validation shapes)

for strategy in strategies_list:
    # NOTE(review): `strategy` is not used when building the prompt below, so all
    # strategies currently train with the same prompt — confirm intended usage.
    # The threshold is only applied after training, so each shape is trained
    # once per strategy and every threshold is scored on that same network.
    per_threshold_sum = [0.0] * len(thresholds_list)
    count = 0
    for idx in val_indices:
        val_entry = val_data[idx]
        shape_coords = val_entry["coords"]
        shape_class = val_entry["shape_class"]
        # Here we pick the first affordance for the main prompt
        aff = val_entry["affordances"][0]
        prompt = f"A 3D render of a gray {shape_class} with highlighted {aff}"

        # short training
        txt_feats = encode_text(clip_model2, prompt, device = "cuda")
        shape_renderer = MultiViewPointCloudRenderer(image_size = 256, base_dist=20, base_elev=10, device="cuda")
        shape_net = optimize_point_cloud(
            points=shape_coords,
            clip_model=clip_model2,
            renderer=shape_renderer,
            encoded_text=txt_feats,
            log_dir="./val_tmp",
            num_iterations=200,
            device="cuda",
            n_views=2
        )

        # Highlight probability per point and binarized ground truth per affordance.
        aff_list = val_entry["affordances"]
        with torch.no_grad():
            pred2 = shape_net(shape_coords)
            highlight_prob2 = pred2[:, 0]
        gt_bins = [(val_entry["labels_dict"][a2] > 0.5).long() for a2 in aff_list]

        # measure IoU across all affs in that shape for every threshold
        for t_pos, threshold in enumerate(thresholds_list):
            bin_pred = (highlight_prob2 >= threshold).long()
            shape_sum = 0.0
            for gt_bin in gt_bins:
                shape_sum += compute_mIoU(bin_pred, gt_bin)
            shape_mean = shape_sum / len(gt_bins) if gt_bins else 0.0
            per_threshold_sum[t_pos] += shape_mean
        count += 1

    for t_pos, threshold in enumerate(thresholds_list):
        avg_iou = per_threshold_sum[t_pos] / count if count > 0 else 0.0
        val_results.append((strategy, threshold, avg_iou))
        if avg_iou > best_iou:
            best_iou = avg_iou
            best_strat = strategy
            best_th = threshold

print(f"[Val Done] best strategy={best_strat}, threshold={best_th}, meanIoU={best_iou:.3f}")


# With the best strategy and threshold found on validation, evaluate the test set.
if best_th is None:
    # Comparing probabilities against None would fail with an obscure TypeError
    # inside the loop; fail early with a clear message instead.
    raise ValueError("No threshold was selected during validation; cannot evaluate the test set.")

test_count = len(test_data)
test_iou_sum = 0.0
for tidx in range(test_count):
    test_entry = test_data[tidx]
    # train a net for that shape, prompted with its first affordance
    shape_coords2 = test_entry["coords"]
    aff_main2 = test_entry["affordances"][0]
    prompt_main2 = f"A 3D render of a gray {test_entry['shape_class']} with highlighted {aff_main2}"

    txt_feats_main2 = encode_text(clip_model2, prompt_main2, device="cuda")
    test_renderer = MultiViewPointCloudRenderer(image_size=256, base_dist=20, base_elev=10, device="cuda")

    shape_net2 = optimize_point_cloud(
        points=shape_coords2,
        clip_model=clip_model2,
        renderer=test_renderer,
        encoded_text=txt_feats_main2,
        log_dir="./test_tmp",
        num_iterations=200,
        device="cuda",
        n_views=2
    )

    # measure iou across all affs with best_th
    with torch.no_grad():
        pclass_test = shape_net2(shape_coords2)
        highlight_prob_test = pclass_test[:, 0]
    # The binarized prediction is the same for every affordance; compute it once.
    bin_preds_tst = (highlight_prob_test >= best_th).long()

    sum_tiou = 0.0
    c3 = 0
    for aff_tst in test_entry["affordances"]:
        gt_tst = (test_entry["labels_dict"][aff_tst] > 0.5).long()
        sum_tiou += compute_mIoU(bin_preds_tst, gt_tst)
        c3 += 1
    shape_avg_tiou = sum_tiou / c3 if c3 > 0 else 0.0
    test_iou_sum += shape_avg_tiou

final_test_mIoU = test_iou_sum/test_count if test_count>0 else 0.0
print(f"[Test] Using best strategy={best_strat}, threshold={best_th}, final test mIoU={final_test_mIoU:.3f}")

import matplotlib.pyplot as plt

# Visualize the last test shape from several views, coloured by the highlight
# probability of the network trained for it (shape_net2 from the loop above).
print("\nVisualizing multi-view for the last test shape with final threshold:")
test_shape = test_data[test_count-1]
coords_test = test_shape["coords"]
with torch.no_grad():
    test_pred = shape_net2(coords_test)
    highlight_sc = test_pred[:,0]

# Blend highlight colour and base gray per point by the highlight probability.
highlight_w = highlight_sc.unsqueeze(1)
test_cloud = test_renderer.create_point_cloud(
    coords_test,
    highlight_w*torch.tensor([204/255,1.0,0.0],device="cuda") +
    (1.0 - highlight_w)*torch.tensor([180/255,180/255,180/255],device="cuda")
)
rendered_testviews = test_renderer.render_all_views(test_cloud,n_views=4)

# squeeze=False keeps `axes` a 2-D array even when only one view is returned;
# otherwise a bare Axes object comes back and cannot be zipped over.
n_panels = len(rendered_testviews)
fig,axes=plt.subplots(1,n_panels,figsize=(4*n_panels,4),squeeze=False)
for ax,(vname,imgT) in zip(axes[0], rendered_testviews.items()):
    ax.imshow(imgT.detach().cpu().numpy())
    ax.set_title(vname)
    ax.axis('off')
plt.suptitle(f"Test Shape {test_shape['shape_id']} - final threshold={best_th}")
plt.show()