# 19 july


## Tensor definition and AutogradGraph

In [None]:
import torch
import torch.nn.functional as F
import weakref
import numbers
import rustworkx as rx
import math

# Computation-graph container used by the custom autograd engine
class AutogradGraph:
    """
    Manages the computation graph for automatic differentiation.

    Nodes hold weak proxies to tensors; edges are added from an operand to the
    tensor computed from it. Strong references to intermediate (non-leaf)
    tensors are kept in ``intermediate_tensors`` so they stay alive until the
    graph is cleaned up. Usable as a context manager.
    """
    __slots__ = ('graph', 'intermediate_tensors', '_check_cycles', '_auto_cleanup', '__weakref__')

    def __init__(self, check_for_cycles=True, auto_cleanup=True):
        """
        Args:
            check_for_cycles: verify on context exit that the graph is acyclic.
            auto_cleanup: clear the graph and intermediate references on context exit.
        """
        self.graph = rx.PyDiGraph()
        self.intermediate_tensors = {}
        self._check_cycles = check_for_cycles
        self._auto_cleanup = auto_cleanup

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Validate and (optionally) tear down the graph when the ``with`` block ends.

        Raises:
            RuntimeError: if cycle checking is enabled and the graph has a cycle.
        """
        try:
            if self._check_cycles and self.check_cycle():
                raise RuntimeError("Cycle detected in autograd graph on context exit.")
        finally:
            # Clean up even when the cycle check raises, otherwise the strong
            # references to intermediate tensors would outlive the context.
            if self._auto_cleanup:
                self.intermediate_tensors.clear()
                self.graph.clear()

    def add_tensor_graph(self, tensor):
        """Add ``tensor`` as a node (stored as a weak proxy) and record its index on the tensor.

        Raises:
            ValueError: if the tensor does not require grad.
        """
        if not tensor._custom_requires_grad:
            raise ValueError("Tensor with requires_grad=False cannot be added to the graph.")
        ref = weakref.proxy(tensor)
        tensor._node_id = self.graph.add_node(ref)

    def add_non_leaf_tensor_reference(self, tensor):
        """Keep a strong reference to an intermediate tensor, keyed by its node id.

        Raises:
            ValueError: if the tensor does not require grad or is already registered.
        """
        if not tensor._custom_requires_grad:
            raise ValueError("Tensor must require grad.")
        if tensor._node_id in self.intermediate_tensors:
            raise ValueError("Tensor reference already exists in intermediate tensors.")
        self.intermediate_tensors[tensor._node_id] = tensor

    def add_edge(self, node_from, node_to, weight=None):
        """Add a directed edge ``node_from -> node_to``.

        Raises:
            TypeError: if either index is not an int.
            ValueError: if either node is missing from the graph.
        """
        if not (isinstance(node_from, int) and isinstance(node_to, int)):
            raise TypeError("Node indices must be integers.")
        if not self.graph.has_node(node_from) or not self.graph.has_node(node_to):
            raise ValueError("Nodes must exist before adding edge.")
        self.graph.add_edge(node_from, node_to, weight)

    def check_cycle(self):
        """Return True when the graph is not a DAG."""
        return not rx.is_directed_acyclic_graph(self.graph)

    def reverse_toposort_from_tensor(self, tensor_index):
        """Return the node payloads of ``tensor_index`` and its ancestors in reverse topological order."""
        graph = self.graph
        predecessors = list(rx.ancestors(graph, tensor_index))
        predecessors.append(tensor_index)
        # Sorting only the ancestor subgraph avoids visiting unrelated nodes.
        sub_graph = graph.subgraph(predecessors)
        return [sub_graph[i] for i in reversed(rx.topological_sort(sub_graph))]
    # def alternative_reverse_toposort_from_tensor(self, tensor_index):
    #     graph = self.graph
    #     relevant_nodes = rx.ancestors(graph, tensor_index)
    #     relevant_nodes.add(tensor_index)
    #     full_topo = rx.topological_sort(graph)
    #     relevant_topo = [graph[_node_id] for _node_id in reversed(full_topo) if _node_id in relevant_nodes]
    #     return relevant_topo

    def delete_node(self, node_index):
        """Remove a node if present; a missing node is ignored.

        Raises:
            TypeError: if ``node_index`` is not an int.
        """
        if not isinstance(node_index, int):
            raise TypeError("Node index must be an integer.")
        if self.graph.has_node(node_index):
            self.graph.remove_node(node_index)

    def delete_edge(self, node_from, node_to):
        """Remove the edge ``node_from -> node_to``.

        Raises:
            ValueError: if the edge does not exist.
        """
        if not self.graph.has_edge(node_from, node_to):
            raise ValueError("Edge does not exist.")
        self.graph.remove_edge(node_from, node_to)

    def del_non_leaf_tensor_reference(self, tensor_node_id):
        """Drop the strong reference for ``tensor_node_id``; unknown ids are ignored."""
        self.intermediate_tensors.pop(tensor_node_id, None)

    def delete_all_non_leaf_nodes(self):
        """Remove every intermediate node from the graph and forget their strong references."""
        self.graph.remove_nodes_from(list(self.intermediate_tensors))
        self.intermediate_tensors.clear()

    def __repr__(self):
        return f"CustomAutogradGraph(nodes={self.graph.num_nodes()}, edges={self.graph.num_edges()})"

# Tensor wrapper that records operations on an AutogradGraph
class CustomTensor:
    """
    A custom tensor class that wraps a PyTorch tensor to enable a custom
    autograd engine. It tracks operations to build a computation graph.
    """
    __slots__ = ('tensor', '_node_id', '_custom_requires_grad', '_backward', 'graph', '__weakref__','_is_leaf')

    def __new__(cls, data, *, _custom_requires_grad=False, device="cpu", dtype=torch.float32, graph=None, due_to_operation=False, is_leaf=False):
        if isinstance(data, CustomTensor):
            return data  # Don't rewrap
        return super().__new__(cls)

    def __init__(self, data, *, _custom_requires_grad=False, device="cpu", dtype=torch.float32, graph=None, due_to_operation=False, is_leaf=False):
        if isinstance(data, CustomTensor):
            return

        self.tensor = data if due_to_operation else torch.as_tensor(data, dtype=dtype, device=device)
        self.tensor.requires_grad_(False)
        self._custom_requires_grad = _custom_requires_grad
        self._node_id = None
        self._backward = lambda: None
        self.graph = None
        self._is_leaf = is_leaf

        if _custom_requires_grad:
            self._init_graph(graph)

    def _init_graph(self, graph):
        if graph is None:
            raise ValueError("Graph must be provided if requires_grad is True.")
        is_leaf=self._is_leaf
        if is_leaf:
            self.graph = weakref.proxy(graph)
        else:
            self.graph = graph # this line is only reached for tensors which are created by operations and graph passed is already a weakreference hence no need for wrapping
        graph.add_tensor_graph(self)
        if not is_leaf:
            graph.add_non_leaf_tensor_reference(self)

    def _zero_grad(self):
        """Sets the gradient of the underlying tensor to zero."""
        self.tensor.grad = torch.zeros_like(self.tensor)

    def zero_(self):
        """Sets the gradient of the underlying tensor to zero."""
        if self.tensor.grad is not None:
            self.tensor.grad.zero_()


    # --- Broadcasting Helper ---
    def _reduce_grad_for_broadcast(self, grad, target_shape):
        """Reduces a gradient to match the shape of a tensor that was broadcasted."""
        if grad.shape == target_shape:
            return grad
        
        # Add singleton dimensions to the front of target_shape to match grad's ndim
        padded_target_shape = (1,) * (grad.ndim - len(target_shape)) + target_shape
        
        # Identify dimensions that were broadcasted
        sum_dims = [i for i, (grad_dim, target_dim) in enumerate(zip(grad.shape, padded_target_shape)) if target_dim == 1 and grad_dim > 1]

        if sum_dims:
            grad = grad.sum(dim=sum_dims, keepdim=True)
        
        # Remove singleton dimensions to match the final target shape
        return grad.reshape(target_shape)



    # --- Basic arithmetic operators ---
    def __add__(self, other):
        # ... [Your original implementation]
        if isinstance(other, numbers.Number):
            return self._add_scalar(other)
        elif isinstance(other, CustomTensor):
            return self._add_tensor(other)
        return NotImplemented
    def __radd__(self,other):
        return self + other
    def __iadd__(self,other):
        if isinstance(other, numbers.Number):
            self.tensor.add_(other)
        elif isinstance(other,CustomTensor):
            self.tensor.add_(other.tensor)
    def _add_scalar(self, scalar):
        result_tensor = torch.add(self.tensor, scalar)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)
        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None: self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad)
        result._backward = _backward
        return result
    def _add_tensor(self, other):
        result_tensor = torch.add(self.tensor, other.tensor)
        requires_grad = self._custom_requires_grad or other._custom_requires_grad
        if not requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)
        graph = self.graph if self._custom_requires_grad else other.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        self_ref = weakref.proxy(self)
        other_ref = weakref.proxy(other)
        if self._custom_requires_grad:
            graph.add_edge(self._node_id, result._node_id)
        if other._custom_requires_grad:
            graph.add_edge(other._node_id, result._node_id)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref._custom_requires_grad:
                if self_ref.tensor.grad is None: self_ref._zero_grad()
                grad_for_self = self_ref._reduce_grad_for_broadcast(result_ref.tensor.grad, self_ref.tensor.shape)
                self_ref.tensor.grad.add_(grad_for_self)
            if other_ref._custom_requires_grad:
                if other_ref.tensor.grad is None: other_ref._zero_grad()
                grad_for_other = other_ref._reduce_grad_for_broadcast(result_ref.tensor.grad, other_ref.tensor.shape)
                other_ref.tensor.grad.add_(grad_for_other)
        result._backward = _backward
        return result

    def __mul__(self, other):
        # ... [Your original implementation]
        if isinstance(other, numbers.Number):
            return self._mul_scalar(other)
        elif isinstance(other, CustomTensor):
            return self._mul_tensor(other)
        return NotImplemented
    def __rmul__(self,other):
        return self*other
    def __imul__(self,other):
        if isinstance(other, numbers.Number):
            self.tensor.mul_(other)
        elif isinstance(other,CustomTensor):
            self.tensor.mul_(other.tensor)
    def _mul_scalar(self, scalar):
        result_tensor = torch.mul(self.tensor, scalar)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)
        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad * scalar)
        result._backward = _backward
        return result
    def _mul_tensor(self, other):
        result_tensor = torch.mul(self.tensor, other.tensor)
        requires_grad = self._custom_requires_grad or other._custom_requires_grad
        if not requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)
        graph = self.graph if self._custom_requires_grad else other.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        self_ref = weakref.proxy(self)
        other_ref = weakref.proxy(other)
        result_ref = weakref.proxy(result)
        if self._custom_requires_grad:
            graph.add_edge(self._node_id, result._node_id)
        if other._custom_requires_grad:
            graph.add_edge(other._node_id, result._node_id)
        def _backward():
            if self_ref._custom_requires_grad:
                if self_ref.tensor.grad is None: self_ref._zero_grad()
                grad_for_self = self_ref._reduce_grad_for_broadcast(result_ref.tensor.grad * other_ref.tensor, self_ref.tensor.shape)
                self_ref.tensor.grad.add_(grad_for_self)
            if other_ref._custom_requires_grad:
                if other_ref.tensor.grad is None: other_ref._zero_grad()
                grad_for_other = other_ref._reduce_grad_for_broadcast(result_ref.tensor.grad * self_ref.tensor, other_ref.tensor.shape)
                other_ref.tensor.grad.add_(grad_for_other)
        result._backward = _backward
        return result

    def __sub__(self, other):
        if isinstance(other, numbers.Number):
            return self._sub_scalar(other)
        elif isinstance(other, CustomTensor):
            return self._sub_tensor(other)
        return NotImplemented
    
    def __rsub__(self, other):
        if isinstance(other, numbers.Number):
            return self._rsub_scalar(other)
        
    def __isub__(self,other):
        if isinstance(other, numbers.Number):
            self.tensor.sub_(other)
        elif isinstance(other,CustomTensor):
            self.tensor.sub_(other.tensor)
        
    def _rsub_scalar(self, scalar):
        result_tensor = torch.sub(scalar, self.tensor)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            # Derivative of scalar - x is -1
            self_ref.tensor.grad.sub_(result_ref.tensor.grad) # No broadcasting specific logic for scalar op

        result._backward = _backward
        return result

    
    def _sub_scalar(self, scalar):
        result_tensor = torch.sub(self.tensor, scalar)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad) # No broadcasting specific logic for scalar op
        result._backward = _backward
        return result

    def _sub_tensor(self, other):
        result_tensor = torch.sub(self.tensor, other.tensor)
        requires_grad = self._custom_requires_grad or other._custom_requires_grad
        if not requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph if self._custom_requires_grad else other.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)

        self_ref = weakref.proxy(self)
        other_ref = weakref.proxy(other)
        result_ref = weakref.proxy(result)

        if self._custom_requires_grad:
            graph.add_edge(self._node_id, result._node_id)
        if other._custom_requires_grad:
            graph.add_edge(other._node_id, result._node_id)

        def _backward():
            if self_ref._custom_requires_grad:
                if self_ref.tensor.grad is None:
                    self_ref._zero_grad()
                grad_for_self = self_ref._reduce_grad_for_broadcast(result_ref.tensor.grad, self_ref.tensor.shape)
                self_ref.tensor.grad.add_(grad_for_self)
            if other_ref._custom_requires_grad:
                if other_ref.tensor.grad is None:
                    other_ref._zero_grad()
                grad_for_other = other_ref._reduce_grad_for_broadcast(-result_ref.tensor.grad, other_ref.tensor.shape)
                other_ref.tensor.grad.add_(grad_for_other)
        result._backward = _backward
        return result

    def __truediv__(self, other):
        if isinstance(other, numbers.Number):
            return self._div_scalar(other)
        elif isinstance(other, CustomTensor):
            return self._div_tensor(other)
        return NotImplemented
    def __itruediv__(self,other):
        if isinstance(other, numbers.Number):
            self.tensor.div_(other)
        elif isinstance(other,CustomTensor):
            self.tensor.div_(other.tensor)
    def _div_scalar(self, scalar):
        result_tensor = torch.div(self.tensor, scalar)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad / scalar)
        result._backward = _backward
        return result

    def _div_tensor(self,other):
        result_tensor = torch.div(self.tensor, other.tensor)
        requires_grad = self._custom_requires_grad or other._custom_requires_grad
        if not requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph if self._custom_requires_grad else other.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)

        self_ref = weakref.proxy(self)
        other_ref = weakref.proxy(other)
        result_ref = weakref.proxy(result)

        if self._custom_requires_grad:
            graph.add_edge(self._node_id, result._node_id)
        if other._custom_requires_grad:
            graph.add_edge(other._node_id, result._node_id)

        def _backward():
            if self_ref._custom_requires_grad:
                if self_ref.tensor.grad is None:
                    self_ref._zero_grad()
                grad_for_self = self_ref._reduce_grad_for_broadcast(result_ref.tensor.grad / other_ref.tensor, self_ref.tensor.shape)
                self_ref.tensor.grad.add_(grad_for_self)
            if other_ref._custom_requires_grad:
                if other_ref.tensor.grad is None:
                    other_ref._zero_grad()
                grad_for_other = other_ref._reduce_grad_for_broadcast(-result_ref.tensor.grad * self_ref.tensor / other_ref.tensor.pow(2), other_ref.tensor.shape)
                other_ref.tensor.grad.add_(grad_for_other)
        result._backward = _backward
        return result

    def pow(self, scalar):
        result_tensor = torch.pow(self.tensor, scalar)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            grad_contrib = scalar * self_ref.tensor.pow(scalar - 1)
            self_ref.tensor.grad.add_(result_ref.tensor.grad * grad_contrib)
        result._backward = _backward
        return result
    def __ipow__(self,other):
        self.tensor.pow_(other)

    def exp(self):
        out = torch.exp(self.tensor)
        if not self._custom_requires_grad:
            return CustomTensor(out,due_to_operation=True)
        
        graph = self.graph
        result = CustomTensor(out, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad * out)
        result._backward = _backward
        return result

    def log(self):
        out = torch.log(self.tensor)
        if not self._custom_requires_grad:
            return CustomTensor(out,due_to_operation=True)
        
        graph = self.graph
        result = CustomTensor(out, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad / self_ref.tensor)
        result._backward = _backward
        return result

    def sin(self):
        out = torch.sin(self.tensor)
        if not self._custom_requires_grad:
            return CustomTensor(out,due_to_operation=True)
        
        graph = self.graph
        result = CustomTensor(out, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad * torch.cos(self_ref.tensor))
        result._backward = _backward
        return result

    def cos(self):
        out = torch.cos(self.tensor)
        if not self._custom_requires_grad:
            return CustomTensor(out,due_to_operation=True)
        
        graph = self.graph
        result = CustomTensor(out, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(-result_ref.tensor.grad*torch.sin(self_ref.tensor))
        result._backward = _backward
        return result 

    def sqrt(self):
        out = torch.sqrt(self.tensor)
        if not self._custom_requires_grad:
            return CustomTensor(out,due_to_operation=True)
        
        graph = self.graph
        result = CustomTensor(out, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)
        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            self_ref.tensor.grad.add_(result_ref.tensor.grad*0.5*self_ref.tensor.pow(-0.5))
        result._backward = _backward
        return result

    def matmul(self, other):
        result_tensor = torch.matmul(self.tensor, other.tensor)
        requires_grad = self._custom_requires_grad or other._custom_requires_grad
        if not requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph if self._custom_requires_grad else other.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)

        self_ref = weakref.proxy(self)
        other_ref = weakref.proxy(other)
        result_ref = weakref.proxy(result)

        if self._custom_requires_grad:
            graph.add_edge(self._node_id, result._node_id)
        if other._custom_requires_grad:
            graph.add_edge(other._node_id, result._node_id)

        def _backward():
            if self_ref._custom_requires_grad:
                if self_ref.tensor.grad is None: self_ref._zero_grad()
                # Use robust broadcasting for matmul gradient
                grad_for_self = torch.matmul(result_ref.tensor.grad, other_ref.tensor.transpose(-2, -1))
                self_ref.tensor.grad.add_(self_ref._reduce_grad_for_broadcast(grad_for_self, self_ref.tensor.shape))
            if other_ref._custom_requires_grad:
                if other_ref.tensor.grad is None: other_ref._zero_grad()
                grad_for_other = torch.matmul(self_ref.tensor.transpose(-2, -1), result_ref.tensor.grad)
                other_ref.tensor.grad.add_(other_ref._reduce_grad_for_broadcast(grad_for_other, other_ref.tensor.shape))
        result._backward = _backward
        return result
    def dot(self, other):
        # torch.dot only works for 1D tensors, or for higher-D tensors,
        # it flattens them to 1D and then computes the dot product.
        # This means the gradients will also be 1D, so no complex broadcasting
        # reduction is needed on the output gradient itself.
        # However, the input tensors themselves could have been results of broadcasting ops.
        # For a truly general dot product, you'd use torch.matmul.
        result_tensor = torch.dot(self.tensor.reshape(-1), other.tensor.reshape(-1))
        requires_grad = self._custom_requires_grad or other._custom_requires_grad
        if not requires_grad:
            return CustomTensor(result_tensor,due_to_operation=True)

        graph = self.graph if self._custom_requires_grad else other.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)

        self_ref = weakref.proxy(self)
        other_ref = weakref.proxy(other)
        result_ref = weakref.proxy(result)

        if self._custom_requires_grad:
            graph.add_edge(self._node_id, result._node_id)
        if other._custom_requires_grad:
            graph.add_edge(other._node_id, result._node_id)

        def _backward():
            if self_ref._custom_requires_grad:
                if self_ref.tensor.grad is None:
                    self_ref._zero_grad()
                # The grad from result_ref.tensor.grad will be a scalar.
                # It needs to be multiplied by the other_ref.tensor (original shape)
                # and then potentially re-shaped if original was >1D
                grad_contrib = result_ref.tensor.grad * other_ref.tensor
                self_ref.tensor.grad.add_(grad_contrib)
            if other_ref._custom_requires_grad:
                if other_ref.tensor.grad is None:
                    other_ref._zero_grad()
                grad_contrib = result_ref.tensor.grad * self_ref.tensor
                other_ref.tensor.grad.add_(grad_contrib)
        result._backward = _backward
        return result


    
    # --- Reductions and shape operations ---
    
    def sum(self, dim=None, keepdim=False):
        """Computes the sum of elements along given dimensions."""
        result_tensor = self.tensor.sum(dim=dim, keepdim=keepdim)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor, due_to_operation=True)
            
        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)

        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
                
            grad = result_ref.tensor.grad
            # If keepdim was false, the summed dim was squeezed. We need to unsqueeze it back for broadcasting.
            if not keepdim and dim is not None:
                grad = grad.unsqueeze(dim)
            
            self_ref.tensor.grad.add_(grad)

        result._backward = _backward
        return result

    def mean(self, dim=None, keepdim=False):
        """Computes the mean of elements along given dimensions."""
        result_tensor = self.tensor.mean(dim=dim, keepdim=keepdim)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor, due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)
        
        # Determine the number of elements that were averaged
        if dim is None:
            n = self.tensor.numel()
        else:
            n = self.tensor.shape[dim]

        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            
            grad = result_ref.tensor.grad
            if not keepdim and dim is not None:
                grad = grad.unsqueeze(dim)
            
            # Distribute gradient evenly
            self_ref.tensor.grad.add_(grad / n)

        result._backward = _backward
        return result

    def reshape(self, *shape):
        """Return a tensor with the same data viewed in ``shape``; records the op when gradients are tracked."""
        shape_before = self.shape
        reshaped = self.tensor.reshape(*shape)
        if not self._custom_requires_grad:
            return CustomTensor(reshaped, due_to_operation=True)

        owner = self.graph
        node = CustomTensor(reshaped, _custom_requires_grad=True, graph=owner, due_to_operation=True, is_leaf=False)
        owner.add_edge(self._node_id, node._node_id)

        src = weakref.proxy(self)
        dst = weakref.proxy(node)

        def _backward():
            # Reshaping only rearranges elements, so the gradient is reshaped back.
            if src.tensor.grad is None:
                src._zero_grad()
            upstream = dst.tensor.grad
            src.tensor.grad.add_(upstream.reshape(shape_before))

        node._backward = _backward
        return node
        
    def transpose(self, dim0, dim1):
        """Transposes dimensions dim0 and dim1."""
        result_tensor = self.tensor.transpose(dim0, dim1)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor, due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)

        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            # The gradient operation for transpose is another transpose
            self_ref.tensor.grad.add_(result_ref.tensor.grad.transpose(dim0, dim1))
            
        result._backward = _backward
        return result

    @property
    def T(self):
        """Transpose of the last two dimensions; only defined for tensors with at least 2 dimensions."""
        if self.ndim >= 2:
            return self.transpose(-2, -1)
        raise ValueError("`.T` is only supported on tensors with 2 or more dimensions.")
        
    # --- Activation Functions ---

    def relu(self):
        """Apply ReLU element-wise via ``F.relu``.

        Returns an untracked ``CustomTensor`` when this tensor does not require
        grad; otherwise a non-leaf tensor wired into this tensor's graph with a
        backward step attached.
        """
        activated = F.relu(self.tensor)
        if self._custom_requires_grad:
            owner_graph = self.graph
            out = CustomTensor(activated, _custom_requires_grad=True, graph=owner_graph, due_to_operation=True, is_leaf=False)
            owner_graph.add_edge(self._node_id, out._node_id)

            src = weakref.proxy(self)
            dst = weakref.proxy(out)

            def _backward():
                if src.tensor.grad is None:
                    src._zero_grad()
                # The output gradient passes through only where the input was
                # strictly positive; everywhere else it is multiplied by 0.
                x = src.tensor
                passthrough = (x > 0).to(x.dtype)
                src.tensor.grad.add_(dst.tensor.grad * passthrough)

            out._backward = _backward
            return out

        return CustomTensor(activated, due_to_operation=True)

    def tanh(self):
        """Apply the hyperbolic tangent element-wise via ``torch.tanh``.

        Returns an untracked ``CustomTensor`` when this tensor does not require
        grad; otherwise a non-leaf tensor wired into this tensor's graph with a
        backward step attached.
        """
        activated = torch.tanh(self.tensor)
        if self._custom_requires_grad:
            owner_graph = self.graph
            out = CustomTensor(activated, _custom_requires_grad=True, graph=owner_graph, due_to_operation=True, is_leaf=False)
            owner_graph.add_edge(self._node_id, out._node_id)

            src = weakref.proxy(self)
            dst = weakref.proxy(out)

            def _backward():
                if src.tensor.grad is None:
                    src._zero_grad()
                # d/dx tanh(x) = 1 - tanh(x)^2, computed from the forward output.
                slope = 1 - activated ** 2
                src.tensor.grad.add_(dst.tensor.grad * slope)

            out._backward = _backward
            return out

        return CustomTensor(activated, due_to_operation=True)

    def leaky_relu(self, negative_slope=0.01):
        """Apply Leaky ReLU element-wise via ``F.leaky_relu``.

        Args:
            negative_slope: Slope applied to inputs that are not strictly
                positive.

        Returns:
            An untracked ``CustomTensor`` when this tensor does not require
            grad; otherwise a non-leaf tensor wired into this tensor's graph
            with a backward step attached.
        """
        result_tensor = F.leaky_relu(self.tensor, negative_slope)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor, due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)

        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            x = self_ref.tensor
            # Slope is 1 only for strictly positive inputs; every other input,
            # including x == 0, uses negative_slope. This is the same `> 0`
            # boundary relu() uses, so leaky_relu(negative_slope=0) and relu()
            # agree at zero, and it matches PyTorch's leaky_relu gradient.
            local_grad = torch.where(
                x > 0,
                torch.ones_like(x),
                torch.full_like(x, negative_slope),
            )
            self_ref.tensor.grad.add_(result_ref.tensor.grad * local_grad)

        result._backward = _backward
        return result

    def elu(self, alpha=1.0):
        """Apply the Exponential Linear Unit element-wise via ``F.elu``.

        Args:
            alpha: Scale of the exponential branch, ``alpha * (exp(x) - 1)``.

        Returns:
            An untracked ``CustomTensor`` when this tensor does not require
            grad; otherwise a non-leaf tensor wired into this tensor's graph
            with a backward step attached.
        """
        result_tensor = F.elu(self.tensor, alpha)
        if not self._custom_requires_grad:
            return CustomTensor(result_tensor, due_to_operation=True)

        graph = self.graph
        result = CustomTensor(result_tensor, _custom_requires_grad=True, graph=graph, due_to_operation=True, is_leaf=False)
        graph.add_edge(self._node_id, result._node_id)

        self_ref = weakref.proxy(self)
        result_ref = weakref.proxy(result)

        def _backward():
            if self_ref.tensor.grad is None:
                self_ref._zero_grad()
            x = self_ref.tensor
            # ELU is x for x > 0 and alpha * (exp(x) - 1) for x <= 0, so the
            # slope is 1 on the positive side and alpha * exp(x), which equals
            # output + alpha, on the exponential side. x == 0 belongs to the
            # exponential branch, so its slope is alpha rather than 1.
            local_grad = torch.where(
                x <= 0,
                result_tensor + alpha,
                torch.ones_like(x),
            )
            self_ref.tensor.grad.add_(result_ref.tensor.grad * local_grad)

        result._backward = _backward
        return result
        
    def silu(self):
        """Apply SiLU (``x * sigmoid(x)``) element-wise via ``F.silu``.

        Returns an untracked ``CustomTensor`` when this tensor does not require
        grad; otherwise a non-leaf tensor wired into this tensor's graph with a
        backward step attached.
        """
        activated = F.silu(self.tensor)
        if self._custom_requires_grad:
            owner_graph = self.graph
            out = CustomTensor(activated, _custom_requires_grad=True, graph=owner_graph, due_to_operation=True, is_leaf=False)
            owner_graph.add_edge(self._node_id, out._node_id)

            src = weakref.proxy(self)
            dst = weakref.proxy(out)

            def _backward():
                if src.tensor.grad is None:
                    src._zero_grad()
                # d/dx [x * s(x)] = s(x) + x * s(x) * (1 - s(x))
                #                 = s(x) * (1 + x * (1 - s(x)))
                gate = torch.sigmoid(src.tensor)
                slope = gate * (1 + src.tensor * (1 - gate))
                src.tensor.grad.add_(dst.tensor.grad * slope)

            out._backward = _backward
            return out

        return CustomTensor(activated, due_to_operation=True)
    
    # Expose the same method under the name `swish`: this is a class-level
    # alias bound to the `silu` function object, not a separate implementation.
    swish = silu

    def gelu(self):
        """Apply the Gaussian Error Linear Unit element-wise via ``F.gelu``.

        Returns an untracked ``CustomTensor`` when this tensor does not require
        grad; otherwise a non-leaf tensor wired into this tensor's graph with a
        backward step attached.
        """
        activated = F.gelu(self.tensor)
        if self._custom_requires_grad:
            owner_graph = self.graph
            out = CustomTensor(activated, _custom_requires_grad=True, graph=owner_graph, due_to_operation=True, is_leaf=False)
            owner_graph.add_edge(self._node_id, out._node_id)

            src = weakref.proxy(self)
            dst = weakref.proxy(out)

            def _backward():
                if src.tensor.grad is None:
                    src._zero_grad()
                # d/dx [x * Phi(x)] = Phi(x) + x * phi(x), where Phi and phi are
                # the standard normal CDF and PDF.
                x = src.tensor
                normal_cdf = 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
                normal_pdf = torch.exp(-0.5 * x**2) / math.sqrt(2.0 * math.pi)
                slope = normal_cdf + x * normal_pdf
                src.tensor.grad.add_(dst.tensor.grad * slope)

            out._backward = _backward
            return out

        return CustomTensor(activated, due_to_operation=True)

    def softmax(self, dim=-1):
        """Apply softmax along ``dim`` via ``torch.softmax``.

        Returns an untracked ``CustomTensor`` when this tensor does not require
        grad; otherwise a non-leaf tensor wired into this tensor's graph with a
        backward step attached.
        """
        probs = torch.softmax(self.tensor, dim=dim)
        if self._custom_requires_grad:
            owner_graph = self.graph
            out = CustomTensor(probs, _custom_requires_grad=True, graph=owner_graph, due_to_operation=True, is_leaf=False)
            owner_graph.add_edge(self._node_id, out._node_id)

            src = weakref.proxy(self)
            dst = weakref.proxy(out)

            def _backward():
                if src.tensor.grad is None:
                    src._zero_grad()
                # Jacobian-vector product of softmax along `dim`:
                #   y * (g - sum(g * y))
                upstream = dst.tensor.grad
                weighted_total = (upstream * probs).sum(dim=dim, keepdim=True)
                src.tensor.grad.add_(probs * (upstream - weighted_total))

            out._backward = _backward
            return out

        return CustomTensor(probs, due_to_operation=True)
    def backward(self, weightage_tensor=1):
        if not self._custom_requires_grad:
            raise RuntimeError("Output tensor does not require grad.")
        if self.graph is None:
            raise RuntimeError("Output tensor is not part of a graph.")
        graph = self.graph
        
        # Initialize gradient for the output tensor
        if isinstance(weightage_tensor, numbers.Number):
            self.tensor.grad = torch.full_like(self.tensor, fill_value=weightage_tensor)
        elif isinstance(weightage_tensor, torch.Tensor):
            self.tensor.grad = weightage_tensor.clone()

        nodes_to_process = graph.reverse_toposort_from_tensor(self._node_id)
        
        for tensor_node in nodes_to_process:
            tensor_node._backward()
            #try:
                # The node is a weakref.proxy, check if it's still alive
                #if tensor_node.__class__ is weakref.ProxyType:
            #        tensor_node._backward()
            # except ReferenceError:
            #     # The tensor object was garbage collected, skip.
            #     print("dead reference node encountered")
            #     continue
    # --- Properties and Dunder Methods ---
    @property
    def dtype(self): return self.tensor.dtype
    @property
    def ndim(self): return self.tensor.ndim
    @property
    def shape(self): return self.tensor.shape
    @property
    def grad(self): return self.tensor.grad
    def __repr__(self): return f"CustomTensor({self.tensor}, grad_fn={self._backward != None}, requires_grad={self._custom_requires_grad})"
    def __del__(self):
        if self._node_id is not None and self._is_leaf:
            try:
                if self.graph: self.graph.delete_node(self._node_id)
            except ReferenceError: # Graph might be gone first
                pass

## Autograd Tester

In [17]:
import torch
import numpy as np
import numbers
import weakref
import rustworkx as rx
from typing import Optional, Any
import sys
import gc
import pytest


class AutogradTester:
    def __init__(self):
        self.passed_tests = 0
        self.failed_tests = 0
        self.tolerance = 1e-6  # Increased tolerance slightly for complex ops

    def assert_tensors_close(self, custom_tensor, pytorch_tensor, test_name, check_grad=True):
        """Compare custom tensor with PyTorch tensor values and optionally gradients."""
        try:
            # Check values
            np.testing.assert_allclose(
                custom_tensor.tensor.detach().cpu().numpy(),  # Ensure on CPU for numpy
                pytorch_tensor.detach().cpu().numpy(),
                rtol=self.tolerance,
                atol=self.tolerance,
                err_msg=f"Mismatch in tensor values for {test_name}"
            )

            # Check gradients if requested and they exist for PyTorch tensor
            if check_grad and pytorch_tensor.grad is not None:
                if custom_tensor.tensor.grad is None:
                    raise AssertionError(f"Custom tensor has no gradient for {test_name}, but PyTorch does.")

                np.testing.assert_allclose(
                    custom_tensor.tensor.grad.detach().cpu().numpy(),  # Ensure on CPU for numpy
                    pytorch_tensor.grad.detach().cpu().numpy(),
                    rtol=self.tolerance,
                    atol=self.tolerance,
                    err_msg=f"Mismatch in gradients for {test_name}"
                )
            elif check_grad and pytorch_tensor.grad is None and custom_tensor.tensor.grad is not None:
                raise AssertionError(f"Custom tensor has gradient for {test_name}, but PyTorch does not (should be no_grad).")

            print(f"✓ {test_name}")
            self.passed_tests += 1

        except Exception as e:
            print(f"✗ {test_name}: {str(e)}")
            self.failed_tests += 1

    def test_basic_operations(self):
        """Compare scalar and tensor addition (values and leaf gradients) with PyTorch."""
        print("\n=== Testing Basic Operations ===")
        check = self.assert_tensors_close

        # Case 1: tensor + Python scalar.
        with AutogradGraph() as graph:
            tx = torch.tensor([2.0, 3.0], requires_grad=True)
            ty = tx + 5.0
            ty.backward(torch.ones_like(ty))

            cx = CustomTensor([2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = cx + 5.0
            cy.backward(torch.ones_like(cy.tensor))

            check(cx, tx, "Scalar Addition - x")
            check(cy, ty, "Scalar Addition - y (result)", check_grad=False)

        # Case 2: tensor + tensor, both operands are leaves.
        with AutogradGraph() as graph:
            tx = torch.tensor([1.0, 2.0], requires_grad=True)
            ty = torch.tensor([3.0, 4.0], requires_grad=True)
            tz = tx + ty
            tz.backward(torch.ones_like(tz))

            cx = CustomTensor([1.0, 2.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = CustomTensor([3.0, 4.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cz = cx + cy
            cz.backward(torch.ones_like(cz.tensor))

            check(cx, tx, "Tensor Addition - x")
            check(cy, ty, "Tensor Addition - y")
            check(cz, tz, "Tensor Addition - z (result)", check_grad=False)

    def test_multiplication(self):
        """Compare scalar and element-wise tensor multiplication with PyTorch."""
        print("\n=== Testing Multiplication ===")
        check = self.assert_tensors_close

        # Case 1: tensor * Python scalar.
        with AutogradGraph() as graph:
            tx = torch.tensor([2.0, 3.0], requires_grad=True)
            ty = tx * 4.0
            ty.backward(torch.ones_like(ty))

            cx = CustomTensor([2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = cx * 4.0
            cy.backward(torch.ones_like(cy.tensor))

            check(cx, tx, "Scalar Multiplication - x")
            check(cy, ty, "Scalar Multiplication - y (result)", check_grad=False)

        # Case 2: tensor * tensor, both operands are leaves.
        with AutogradGraph() as graph:
            tx = torch.tensor([2.0, 3.0], requires_grad=True)
            ty = torch.tensor([4.0, 5.0], requires_grad=True)
            tz = tx * ty
            tz.backward(torch.ones_like(tz))

            cx = CustomTensor([2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = CustomTensor([4.0, 5.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cz = cx * cy
            cz.backward(torch.ones_like(cz.tensor))

            check(cx, tx, "Tensor Multiplication - x")
            check(cy, ty, "Tensor Multiplication - y")
            check(cz, tz, "Tensor Multiplication - z (result)", check_grad=False)

    def test_subtraction_division(self):
        """Compare subtraction and division with PyTorch.

        Covers ``x - C``, ``C - x`` (reverse subtraction), tensor - tensor,
        ``x / C`` and tensor / tensor. Each case runs the custom implementation
        and PyTorch side by side and records the comparison through
        ``assert_tensors_close``.
        """
        print("\n=== Testing Subtraction and Division ===")

        # Test scalar subtraction (x - C)
        with AutogradGraph() as graph:
            x_custom = CustomTensor([5.0, 6.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            y_custom = x_custom - 2.0
            y_custom.backward(torch.ones_like(y_custom.tensor))

            x_pytorch = torch.tensor([5.0, 6.0], requires_grad=True)
            y_pytorch = x_pytorch - 2.0
            y_pytorch.backward(torch.ones_like(y_pytorch))

            self.assert_tensors_close(x_custom, x_pytorch, "Scalar Subtraction (x - C) - x")
            self.assert_tensors_close(y_custom, y_pytorch, "Scalar Subtraction (x - C) - y (result)", check_grad=False)

        # Test scalar reverse subtraction (C - x)
        with AutogradGraph() as graph:
            x_custom = CustomTensor([5.0, 6.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            y_custom = 10.0 - x_custom  # Uses __rsub__
            y_custom.backward(torch.ones_like(y_custom.tensor))

            x_pytorch = torch.tensor([5.0, 6.0], requires_grad=True)
            y_pytorch = 10.0 - x_pytorch
            y_pytorch.backward(torch.ones_like(y_pytorch))

            self.assert_tensors_close(x_custom, x_pytorch, "Scalar Reverse Subtraction (C - x) - x")
            self.assert_tensors_close(y_custom, y_pytorch, "Scalar Reverse Subtraction (C - x) - y (result)", check_grad=False)

        # Test tensor subtraction
        with AutogradGraph() as graph:
            x_custom = CustomTensor([7.0, 8.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            y_custom = CustomTensor([2.0, 1.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            z_custom = x_custom - y_custom
            z_custom.backward(torch.ones_like(z_custom.tensor))

            x_pytorch = torch.tensor([7.0, 8.0], requires_grad=True)
            y_pytorch = torch.tensor([2.0, 1.0], requires_grad=True)
            z_pytorch = x_pytorch - y_pytorch
            z_pytorch.backward(torch.ones_like(z_pytorch))

            self.assert_tensors_close(x_custom, x_pytorch, "Tensor Subtraction - x")
            self.assert_tensors_close(y_custom, y_pytorch, "Tensor Subtraction - y")
            self.assert_tensors_close(z_custom, z_pytorch, "Tensor Subtraction - z (result)", check_grad=False)

        # Test scalar division
        with AutogradGraph() as graph:
            x_custom = CustomTensor([8.0, 12.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            y_custom = x_custom / 4.0
            y_custom.backward(torch.ones_like(y_custom.tensor))

            x_pytorch = torch.tensor([8.0, 12.0], requires_grad=True)
            y_pytorch = x_pytorch / 4.0
            y_pytorch.backward(torch.ones_like(y_pytorch))

            self.assert_tensors_close(x_custom, x_pytorch, "Scalar Division - x")
            self.assert_tensors_close(y_custom, y_pytorch, "Scalar Division - y (result)", check_grad=False)

        # Test tensor division
        with AutogradGraph() as graph:
            x_custom = CustomTensor([8.0, 12.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            y_custom = CustomTensor([5.0, 10.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            z_custom = x_custom / y_custom
            z_custom.backward(torch.ones_like(z_custom.tensor))

            x_pytorch = torch.tensor([8.0, 12.0], requires_grad=True)
            y_pytorch = torch.tensor([5.0, 10.0], requires_grad=True)
            z_pytorch = x_pytorch / y_pytorch
            z_pytorch.backward(torch.ones_like(z_pytorch))

            self.assert_tensors_close(x_custom, x_pytorch, "Tensor Division - x")
            # Label previously read "Tensir Division - y".
            self.assert_tensors_close(y_custom, y_pytorch, "Tensor Division - y")
            self.assert_tensors_close(z_custom, z_pytorch, "Tensor Division - z (result)", check_grad=False)


    def test_power_function(self):
        """Compare ``pow`` with a positive and a negative exponent against PyTorch."""
        print("\n=== Testing Power Function ===")

        # (label prefix, exponent); both cases use the same input values.
        cases = (
            ("Power Function", 3.0),
            ("Power Function (Negative Exponent)", -2.0),
        )
        for label, exponent in cases:
            with AutogradGraph() as graph:
                cx = CustomTensor([2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
                cy = cx.pow(exponent)
                cy.backward(torch.ones_like(cy.tensor))

                tx = torch.tensor([2.0, 3.0], requires_grad=True)
                ty = torch.pow(tx, exponent)
                ty.backward(torch.ones_like(ty))

                self.assert_tensors_close(cx, tx, f"{label} - x")
                self.assert_tensors_close(cy, ty, f"{label} - y (result)", check_grad=False)

    def test_unary_functions(self):
        """Compare exp, log, sin, cos and sqrt (values and gradients) with PyTorch."""
        print("\n=== Testing Unary Functions ===")

        # (label prefix, name shared by the CustomTensor method and the torch
        # function, input values)
        cases = (
            ("Exponential Function", "exp", [1.0, 2.0]),
            ("Logarithm Function", "log", [1.0, 2.0]),
            ("Sine Function", "sin", [0.5, 1.0]),
            ("Cosine Function", "cos", [0.5, 1.0]),
            ("Square Root Function", "sqrt", [4.0, 9.0]),
        )
        for label, op_name, values in cases:
            with AutogradGraph() as graph:
                cx = CustomTensor(values, _custom_requires_grad=True, graph=graph, is_leaf=True)
                cy = getattr(cx, op_name)()
                cy.backward(torch.ones_like(cy.tensor))

                tx = torch.tensor(values, requires_grad=True)
                ty = getattr(torch, op_name)(tx)
                ty.backward(torch.ones_like(ty))

                self.assert_tensors_close(cx, tx, f"{label} - x")
                self.assert_tensors_close(cy, ty, f"{label} - y (result)", check_grad=False)

    def test_matrix_operations(self):
        """Compare matrix multiplication and the vector dot product with PyTorch."""
        print("\n=== Testing Matrix Operations ===")
        check = self.assert_tensors_close

        # (label prefix, left operand, right operand) for the matmul cases.
        matmul_cases = (
            (
                "Matrix Multiplication (2x2 @ 2x2)",
                [[1.0, 2.0], [3.0, 4.0]],
                [[5.0, 6.0], [7.0, 8.0]],
            ),
            (
                "Matrix Multiplication (2x3 @ 3x2)",
                [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
                [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]],
            ),
        )
        for label, left, right in matmul_cases:
            with AutogradGraph() as graph:
                cx = CustomTensor(left, _custom_requires_grad=True, graph=graph, is_leaf=True)
                cy = CustomTensor(right, _custom_requires_grad=True, graph=graph, is_leaf=True)
                cz = cx.matmul(cy)
                cz.backward(torch.ones_like(cz.tensor))

                tx = torch.tensor(left, requires_grad=True)
                ty = torch.tensor(right, requires_grad=True)
                tz = torch.matmul(tx, ty)
                tz.backward(torch.ones_like(tz))

                check(cx, tx, f"{label} - x")
                check(cy, ty, f"{label} - y")
                check(cz, tz, f"{label} - z (result)", check_grad=False)

        # Dot product of two vectors; backward() is called without a seed on
        # both sides.
        with AutogradGraph() as graph:
            cx = CustomTensor([1.0, 2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = CustomTensor([4.0, 5.0, 6.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cz = cx.dot(cy)
            cz.backward()

            tx = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
            ty = torch.tensor([4.0, 5.0, 6.0], requires_grad=True)
            tz = torch.dot(tx, ty)
            tz.backward()

            check(cx, tx, "Dot Product (vector) - x")
            check(cy, ty, "Dot Product (vector) - y")
            check(cz, tz, "Dot Product (vector) - z (result)", check_grad=False)

    def test_complex_chain(self):
        """Compare multi-step expressions that reuse their inputs against PyTorch."""
        print("\n=== Testing Complex Chains ===")
        check = self.assert_tensors_close

        # Chain 1: z = (x + y) * (x - y) + x^2 - sin(y)
        with AutogradGraph() as graph:
            # PyTorch reference.
            tx = torch.tensor([3.0, 4.0], requires_grad=True)
            ty = torch.tensor([1.0, 2.0], requires_grad=True)
            t_sum = tx + ty
            t_diff = tx - ty
            t_prod = t_sum * t_diff
            t_x_sq = torch.pow(tx, 2.0)
            t_sin_y = torch.sin(ty)
            t_partial = t_prod + t_x_sq
            tz = t_partial - t_sin_y
            tz.backward(torch.ones_like(tz))

            # Custom implementation; every intermediate keeps its own name.
            cx = CustomTensor([3.0, 4.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = CustomTensor([1.0, 2.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            c_sum = cx + cy
            c_diff = cx - cy
            c_prod = c_sum * c_diff
            c_x_sq = cx.pow(2.0)
            c_sin_y = cy.sin()
            c_partial = c_prod + c_x_sq
            cz = c_partial - c_sin_y
            cz.backward(torch.ones_like(cz.tensor))

            check(cx, tx, "Complex Chain 1 - x")
            check(cy, ty, "Complex Chain 1 - y")
            check(cz, tz, "Complex Chain 1 - z (result)", check_grad=False)

        # Chain 2: several paths into the same leaves,
        # z = x*y + x*x + y*z_fixed, where z_fixed does not require grad.
        with AutogradGraph() as graph:
            tx = torch.tensor([2.0], requires_grad=True)
            ty = torch.tensor([3.0], requires_grad=True)
            t_fixed = torch.tensor([0.5])
            t_term1 = tx * ty
            t_term2 = tx * tx
            t_term3 = ty * t_fixed
            t_partial = t_term1 + t_term2
            tz = t_partial + t_term3
            tz.backward()

            cx = CustomTensor([2.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = CustomTensor([3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            c_fixed = CustomTensor([0.5])
            c_term1 = cx * cy
            c_term2 = cx * cx  # x is used for both operands
            c_term3 = cy * c_fixed  # second use of y, paired with a no-grad tensor
            c_partial = c_term1 + c_term2
            cz = c_partial + c_term3
            cz.backward()

            check(cx, tx, "Complex Chain 2 (Multiple Paths) - x")
            check(cy, ty, "Complex Chain 2 (Multiple Paths) - y")
            check(cz, tz, "Complex Chain 2 (Multiple Paths) - z (result)", check_grad=False)

        # Chain 3: deeper mix of operations, z = (exp(x) * log(y)) / sqrt(x + y)
        with AutogradGraph() as graph:
            tx = torch.tensor([1.5], requires_grad=True)
            ty = torch.tensor([2.5], requires_grad=True)
            t_exp_x = torch.exp(tx)
            t_log_y = torch.log(ty)
            t_numerator = t_exp_x * t_log_y
            t_sum = tx + ty
            t_root = torch.sqrt(t_sum)
            tz = t_numerator / t_root
            tz.backward()

            cx = CustomTensor([1.5], _custom_requires_grad=True, graph=graph, is_leaf=True)
            cy = CustomTensor([2.5], _custom_requires_grad=True, graph=graph, is_leaf=True)
            c_exp_x = cx.exp()
            c_log_y = cy.log()
            c_numerator = c_exp_x * c_log_y
            c_sum = cx + cy
            c_root = c_sum.sqrt()
            cz = c_numerator / c_root
            cz.backward()

            check(cx, tx, "Complex Chain 3 (Deeper Mixed Ops) - x")
            check(cy, ty, "Complex Chain 3 (Deeper Mixed Ops) - y")
            check(cz, tz, "Complex Chain 3 (Deeper Mixed Ops) - z (result)", check_grad=False)

    def test_mixed_operations(self):
        """Compare binary operations where only the left operand requires grad."""
        print("\n=== Testing Mixed Operations ===")

        # (label prefix, x values, y values, binary operation); y never
        # requires grad.
        cases = (
            ("Mixed Operations (X*Y, Y no grad)", [2.0, 3.0], [4.0, 5.0], lambda a, b: a * b),
            ("Mixed Operations (X+Y, Y no grad)", [10.0, 20.0], [1.0, 2.0], lambda a, b: a + b),
        )
        for label, x_values, y_values, combine in cases:
            with AutogradGraph() as graph:
                cx = CustomTensor(x_values, _custom_requires_grad=True, graph=graph, is_leaf=True)
                cy = CustomTensor(y_values)  # No grad
                cz = combine(cx, cy)
                cz.backward(torch.ones_like(cz.tensor))

                tx = torch.tensor(x_values, requires_grad=True)
                ty = torch.tensor(y_values)  # No grad
                tz = combine(tx, ty)
                tz.backward(torch.ones_like(tz))

                self.assert_tensors_close(cx, tx, f"{label} - x")
                # Only the values of y and z are compared, not their gradients.
                self.assert_tensors_close(cy, ty, f"{label} - y", check_grad=False)
                self.assert_tensors_close(cz, tz, f"{label} - z (result)", check_grad=False)

    def test_broadcasting(self):
        """Compare broadcasting arithmetic against PyTorch.

        Three cases are exercised: vector + Python float, matrix + vector
        (both operands are grad-tracking leaves), and matrix * Python float.
        """
        print("\n=== Testing Broadcasting ===")

        matrix_rows = [[1.0, 2.0], [3.0, 4.0]]  # shared 2x2 input for cases 2 and 3

        # Case 1: a three-element vector shifted by a Python float.
        with AutogradGraph() as graph:
            vec = CustomTensor([1.0, 2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            shifted = vec + 10.0
            shifted.backward(torch.tensor([1.0, 1.0, 1.0]))

            ref_vec = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
            ref_shifted = ref_vec + 10.0
            ref_shifted.backward(torch.tensor([1.0, 1.0, 1.0]))

            self.assert_tensors_close(vec, ref_vec, "Broadcasting: Vector + Scalar - x")
            self.assert_tensors_close(shifted, ref_shifted, "Broadcasting: Vector + Scalar - y (result)", check_grad=False)

        # Case 2: a 2x2 matrix plus a 2-element vector; both are leaves.
        with AutogradGraph() as graph:
            mat = CustomTensor(matrix_rows, _custom_requires_grad=True, graph=graph, is_leaf=True)
            row = CustomTensor([10.0, 20.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            summed = mat + row
            summed.backward(torch.ones_like(summed.tensor))

            ref_mat = torch.tensor(matrix_rows, requires_grad=True)
            ref_row = torch.tensor([10.0, 20.0], requires_grad=True)
            ref_summed = ref_mat + ref_row
            ref_summed.backward(torch.ones_like(ref_summed))

            self.assert_tensors_close(mat, ref_mat, "Broadcasting: Matrix + Vector (row) - x")
            # The vector's gradient has the vector's own shape in PyTorch, i.e.
            # it is reduced over the broadcast dimension; the custom tensor is
            # expected to match that here.
            self.assert_tensors_close(row, ref_row, "Broadcasting: Matrix + Vector (row) - y")
            self.assert_tensors_close(summed, ref_summed, "Broadcasting: Matrix + Vector (row) - z (result)", check_grad=False)

        # Case 3: a 2x2 matrix scaled by a Python float.
        with AutogradGraph() as graph:
            mat = CustomTensor(matrix_rows, _custom_requires_grad=True, graph=graph, is_leaf=True)
            scaled = mat * 5.0
            scaled.backward(torch.ones_like(scaled.tensor))

            ref_mat = torch.tensor(matrix_rows, requires_grad=True)
            ref_scaled = ref_mat * 5.0
            ref_scaled.backward(torch.ones_like(ref_scaled))

            self.assert_tensors_close(mat, ref_mat, "Broadcasting: Matrix * Scalar - x")
            self.assert_tensors_close(scaled, ref_scaled, "Broadcasting: Matrix * Scalar - y (result)", check_grad=False)

    def test_backward_with_custom_grad(self):
        """Seed backward() with a non-uniform upstream gradient and compare to PyTorch."""
        print("\n=== Testing Backward with Custom Grad ===")

        upstream = [0.5, 2.0]  # per-element seed handed to both backward() calls

        with AutogradGraph() as graph:
            # Custom side: y = 4x + 1, seeded with the explicit gradient.
            leaf = CustomTensor([2.0, 3.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            affine = leaf * 4.0 + 1.0
            affine.backward(torch.tensor(upstream))

            # PyTorch side: identical expression and seed (a separate tensor object).
            ref_leaf = torch.tensor([2.0, 3.0], requires_grad=True)
            ref_affine = ref_leaf * 4.0 + 1.0
            ref_affine.backward(torch.tensor(upstream))

            self.assert_tensors_close(leaf, ref_leaf, "Backward with Custom Grad - x")
            self.assert_tensors_close(affine, ref_affine, "Backward with Custom Grad - y (result)", check_grad=False)

    def test_zero_grad_behavior(self):
        """Run backward, zero every gradient, run backward again, and compare to PyTorch."""
        print("\n=== Testing Zero Grad Behavior ===")
        with AutogradGraph() as graph:
            leaf = CustomTensor([1.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            doubled = leaf * 2
            shifted = doubled + 3

            # Pass 1; afterwards the leaf is compared with check_grad=False only.
            shifted.backward()
            self.assert_tensors_close(leaf, torch.tensor([1.0], requires_grad=True), "Zero Grad Init (first backward) - x",check_grad=False)

            # Reset gradients on every custom tensor: output first, leaf last.
            for node in (shifted, doubled, leaf):
                node._zero_grad()

            # Pass 2 over the same graph.
            shifted.backward()

            # PyTorch reference: two backward passes with the leaf's grad
            # zeroed in between, so the second pass starts from a clean slate.
            ref_leaf = torch.tensor([1.0], requires_grad=True)
            ref_shifted = ref_leaf * 2 + 3
            ref_shifted.backward(retain_graph=True)
            ref_leaf.grad.zero_()
            ref_shifted.backward()

            self.assert_tensors_close(leaf, ref_leaf, "Zero Grad Behavior - x (after 2nd backward)")
            self.assert_tensors_close(shifted, ref_shifted, "Zero Grad Behavior - z (result, after 2nd backward)", check_grad=False)

    def test_no_grad_flow(self):
        """Verify that a tensor created without requires-grad ends up with no gradient.

        ``x`` (requires grad) is compared against PyTorch as usual. ``y`` is
        checked by hand: its underlying ``tensor.grad`` must still be ``None``
        after ``backward()``, and that check is counted as its own pass/fail.
        """
        print("\n=== Testing No Grad Flow ===")
        with AutogradGraph() as graph:
            tracked = CustomTensor([5.0], _custom_requires_grad=True, graph=graph, is_leaf=True)
            frozen = CustomTensor([2.0], _custom_requires_grad=False)
            product = tracked * frozen
            product.backward()

            ref_tracked = torch.tensor([5.0], requires_grad=True)
            ref_frozen = torch.tensor([2.0], requires_grad=False)
            ref_product = ref_tracked * ref_frozen
            ref_product.backward()

            self.assert_tensors_close(tracked, ref_tracked, "No Grad Flow - x (requires grad)")

            # PyTorch leaves .grad as None for tensors that do not require
            # grad; the custom tensor's wrapped torch tensor must do the same.
            try:
                leaked = frozen.tensor.grad is not None
                if leaked:
                    raise AssertionError("Custom non-grad tensor unexpectedly has a gradient.")
                print("✓ No Grad Flow - y (no grad, custom correctly None)")
                self.passed_tests += 1
            except Exception as err:
                print(f"✗ No Grad Flow - y (no grad): {str(err)}")
                self.failed_tests += 1

    def test_basic_add_scalar_grad_system(self):
        """Chain two scalar additions and verify gradients plus graph topology.

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: Basic Scalar Add Grad ===")
        try:
            with AutogradGraph() as graph:
                root = CustomTensor(torch.tensor([2.0, 3.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                mid = root + 5.0    # root + 5
                out = mid + 10.0    # root + 5 + 10

                out.backward(weightage_tensor=1)

                # d(out)/d(root) and d(out)/d(mid) are both 1 per element.
                unit = torch.tensor([1.0, 1.0])
                assert torch.allclose(root.tensor.grad, unit)
                assert mid.tensor.grad is not None
                assert torch.allclose(mid.tensor.grad, unit)

                # Expected topology: root -> mid -> out, with no cycle.
                dag = graph.graph
                assert dag.num_nodes() == 3
                assert dag.num_edges() == 2
                assert dag.has_edge(root._node_id, mid._node_id)
                assert dag.has_edge(mid._node_id, out._node_id)
                assert graph.check_cycle() is False
            print("✓ System Test: Basic Scalar Add Grad")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: Basic Scalar Add Grad: {str(err)}")
            self.failed_tests += 1

    def test_basic_add_tensor_grad_system(self):
        """Add two leaves, shift by a scalar, and verify gradients plus graph topology.

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: Basic Tensor Add Grad ===")
        try:
            with AutogradGraph() as graph:
                left = CustomTensor(torch.tensor([2.0, 3.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                right = CustomTensor(torch.tensor([1.0, 2.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                total = left + right      # left + right
                shifted = total + 5.0     # left + right + 5

                shifted.backward(weightage_tensor=1)

                # Both leaves feed the sum directly, so each gradient is all ones.
                unit = torch.tensor([1.0, 1.0])
                assert torch.allclose(left.tensor.grad, unit)
                assert torch.allclose(right.tensor.grad, unit)

                # Expected topology: left -> total, right -> total, total -> shifted.
                dag = graph.graph
                assert dag.num_nodes() == 4
                assert dag.num_edges() == 3
                for source, sink in ((left, total), (right, total), (total, shifted)):
                    assert dag.has_edge(source._node_id, sink._node_id)
                assert graph.check_cycle() is False
            print("✓ System Test: Basic Tensor Add Grad")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: Basic Tensor Add Grad: {str(err)}")
            self.failed_tests += 1

    def test_mixed_requires_grad_tensor_add_system(self):
        """Add a grad-tracking leaf to a plain tensor and inspect grads and graph contents.

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: Mixed Requires Grad Tensor Add ===")
        try:
            with AutogradGraph() as graph:
                tracked = CustomTensor(torch.tensor([2.0, 3.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                plain = CustomTensor(torch.tensor([1.0, 2.0]), _custom_requires_grad=False)
                total = tracked + plain

                total.backward(weightage_tensor=1)

                # Only the tracked operand receives a gradient; the sum itself
                # must be flagged as requiring grad.
                assert torch.allclose(tracked.tensor.grad, torch.tensor([1.0, 1.0]))
                assert plain.tensor.grad is None
                assert total._custom_requires_grad is True

                # The graph is expected to hold exactly `tracked` and `total`.
                dag = graph.graph
                assert dag.num_nodes() == 2
                assert dag.num_edges() == 1
                for member in (tracked, total):
                    assert dag.has_node(member._node_id)
                assert dag.has_edge(tracked._node_id, total._node_id)
                # NOTE: no has_node() check is made for `plain`; its absence is
                # covered only by the node count above.
            print("✓ System Test: Mixed Requires Grad Tensor Add")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: Mixed Requires Grad Tensor Add: {str(err)}")
            self.failed_tests += 1

    def test_no_requires_grad_system(self):
        """Verify that arithmetic on plain tensors leaves the graph empty and backward() raises.

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: No Requires Grad ===")
        try:
            # A graph context is open, but no tensor is ever registered with it.
            with AutogradGraph() as graph:
                lhs = CustomTensor(torch.tensor([1.0]))
                rhs = CustomTensor(torch.tensor([2.0]))
                summed = lhs + rhs
                shifted = summed + 3.0

                for tensor in (lhs, rhs, summed, shifted):
                    assert not tensor._custom_requires_grad
                assert graph.graph.num_nodes() == 0
                assert graph.graph.num_edges() == 0

                # backward() on a tensor that does not require grad must fail loudly.
                with pytest.raises(RuntimeError, match="Output tensor does not require grad."):
                    shifted.backward(weightage_tensor=1)
            print("✓ System Test: No Requires Grad")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: No Requires Grad: {str(err)}")
            self.failed_tests += 1

    def test_autograd_graph_context_manager_system(self):
        """Verify that leaving an auto-cleanup graph context empties the graph.

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: Autograd Graph Context Manager ===")
        try:
            kept = None  # outlives the `with` block so the graph can be inspected afterwards
            with AutogradGraph(check_for_cycles=True, auto_cleanup=True) as active:
                kept = active
                leaf = CustomTensor(torch.tensor([1.0]), _custom_requires_grad=True, graph=kept, is_leaf=True)
                bumped = leaf + 1.0

                # Inside the context: leaf -> bumped, with `bumped` held as the
                # single intermediate tensor.
                assert kept.graph.num_nodes() == 2
                assert kept.graph.num_edges() == 1
                assert len(kept.intermediate_tensors) == 1

            # Outside the context: nodes, edges and intermediates are all gone.
            assert kept.graph.num_nodes() == 0
            assert kept.graph.num_edges() == 0
            assert len(kept.intermediate_tensors) == 0
            print("✓ System Test: Autograd Graph Context Manager")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: Autograd Graph Context Manager: {str(err)}")
            self.failed_tests += 1

    def test_cycle_detection_system(self):
        """Build a two-node cycle by hand and expect a RuntimeError about it.

        Any exception escaping the ``pytest.raises`` block (including the
        failure pytest raises when no RuntimeError occurs) is reported and
        counted as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: Cycle Detection ===")
        try:
            # pytest.raises is entered first, so it also observes anything the
            # graph raises while its own context is being exited.
            with pytest.raises(RuntimeError, match="Cycle detected in autograd graph."), \
                    AutogradGraph(check_for_cycles=True, auto_cleanup=False) as graph:
                first = CustomTensor(torch.tensor([1.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                second = CustomTensor(torch.tensor([2.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)

                # Hand-wire first -> second -> first.
                graph.add_edge(first._node_id, second._node_id)
                graph.add_edge(second._node_id, first._node_id)
                graph.check_cycle()  # explicit check; its return value is not used
            print("✓ System Test: Cycle Detection")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: Cycle Detection: {str(err)}")
            self.failed_tests += 1

    def test_no_circular_references_non_leaf_tensors_die_system(self):
        """Check that non-leaf tensors are freed once nothing references them strongly.

        Part 1 (blocks 1-4) follows a single output tensor ``d``: it must stay
        alive while ``intermediate_tensors`` still has an entry for it, and be
        collected after ``del_non_leaf_tensor_reference`` is called for its
        node id. Part 2 (block 5) builds a fresh graph and checks that all
        intermediates are collected after ``intermediate_tensors`` is cleared
        and the local names are deleted.

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        # This test relies on the garbage collector. It's a heuristic test
        # as Python's GC timing is not strictly deterministic.
        # However, with weakrefs, it should work for non-leaf tensors.

        print("\n--- Starting System Test: No Circular References (Part 1) ---")
        try:
            graph_ref = None
            output_tensor_weak_ref = None
            node_id_d = -1  # Placeholder; overwritten with d's node id while d is alive

            # BLOCK 1: Create graph and tensors
            with AutogradGraph(auto_cleanup=False) as graph:  # Keep graph for inspection
                graph_ref = weakref.ref(graph)
                a = CustomTensor(torch.tensor([1.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                b = a + 1.0  # Intermediate tensor
                c = b + 2.0  # Intermediate tensor
                d = c + 3.0  # Output tensor (also intermediate from graph's perspective)

                # Store weak reference to 'd' BEFORE its strong reference is potentially removed
                output_tensor_weak_ref = weakref.ref(d)
                node_id_d = d._node_id  # Store node_id while d is alive

                # At this point `d` is referenced both by the local name `d` and
                # (per the count asserted below) by an entry in `graph.intermediate_tensors`.
                assert len(graph.intermediate_tensors) == 3  # b, c, d should be in intermediate_tensors

            # BLOCK 2: After exiting context manager (auto_cleanup=False)
            # The 'graph' variable still holds a strong reference to the AutogradGraph instance.
            # graph_ref() should return the graph object.
            assert graph_ref() is not None, "Graph object should still be alive."
            assert len(graph_ref().intermediate_tensors) == 3, "Intermediate tensors should still be referenced by the graph."

            # BLOCK 3: Remove strong reference 'd' from local scope
            del d  # Remove the local strong reference to the CustomTensor object.
            gc.collect()  # Force garbage collection

            # Now, output_tensor_weak_ref() *still* shouldn't be None because `graph_ref().intermediate_tensors`
            # holds the strong reference.
            assert output_tensor_weak_ref() is not None, "d should still be alive due to intermediate_tensors."
            # Expected count of 2: presumably one reference from intermediate_tensors plus
            # the temporary argument reference that sys.getrefcount itself adds.
            # NOTE(review): the `if output_tensor_weak_ref()` guard tests the tensor's
            # truthiness, not `is not None` -- confirm CustomTensor is always truthy.
            current_d_refcount_after_del_d = sys.getrefcount(output_tensor_weak_ref()) if output_tensor_weak_ref() else 'N/A'
            assert current_d_refcount_after_del_d == 2, f"Expected refcount 2, got {current_d_refcount_after_del_d}"

            # BLOCK 4: Remove strong reference from intermediate_tensors
            # Presumably drops the graph's entry for this node id (method defined outside this view).
            graph_ref().del_non_leaf_tensor_reference(node_id_d)  # THIS IS THE CRUCIAL STEP
            gc.collect()  # Force garbage collection again

            # Now, with the last strong reference gone, 'd' should be garbage collected.
            assert output_tensor_weak_ref() is None, "Output tensor (non-leaf) should be garbage collected after its strong reference is deleted from intermediate_tensors."

            # BLOCK 5: Verify other intermediate tensors are collected when graph is cleared
            intermediate_tensors_wrefs = []
            # Create a new graph and new tensors to avoid interference from previous block
            with AutogradGraph(auto_cleanup=False) as graph_new:
                a_new = CustomTensor(torch.tensor([1.0]), _custom_requires_grad=True, graph=graph_new, is_leaf=True)
                b_new = a_new + 1.0  # Intermediate
                c_new = b_new + 2.0  # Intermediate
                d_new = c_new + 3.0  # Intermediate (output of a chain)

                # Store weak references to the intermediate tensors
                intermediate_tensors_wrefs.append(weakref.ref(b_new))
                intermediate_tensors_wrefs.append(weakref.ref(c_new))
                intermediate_tensors_wrefs.append(weakref.ref(d_new))

                # Verify they are initially alive
                assert all(wref() is not None for wref in intermediate_tensors_wrefs)
                assert len(graph_new.intermediate_tensors) == 3

            assert graph_new is not None, "New graph object should still be alive after 'with' block."
            assert len(graph_new.intermediate_tensors) == 3, "New graph intermediate_tensors should still hold refs."

            # Manually clear the intermediate_tensors dictionary and remove graph reference
            graph_new.intermediate_tensors.clear()
            del graph_new  # Remove the strong reference to the graph itself
            del b_new, c_new, d_new  # deleting the local variable strong references
            gc.collect()

            # Now, all non-leaf tensors should be garbage collected
            # (the leaf `a_new` is deliberately not deleted and is not checked).
            for i, wref in enumerate(intermediate_tensors_wrefs):
                assert wref() is None, f"Intermediate tensor {i} should be garbage collected after graph context and intermediate_tensors are cleared."
            print("✓ System Test: No Circular References (Non-leaf tensors die)")
            self.passed_tests += 1
        except Exception as e:
            print(f"✗ System Test: No Circular References (Non-leaf tensors die): {str(e)}")
            self.failed_tests += 1

    def test_topological_sort_order_system(self):
        """Verify that the reverse topological order lists every consumer before its inputs.

        The graph has two leaves and a diamond-like shape::

            t3 = t1 + t2;  t4 = t3 + 5;  t5 = t2 + 10;  t6 = t4 + t5

        Any exception (including a failed ``assert``) is reported and counted
        as one failed test; otherwise one passed test is recorded.
        """
        print("\n=== System Test: Topological Sort Order ===")
        try:
            with AutogradGraph() as graph:
                t1 = CustomTensor(torch.tensor([1.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                t2 = CustomTensor(torch.tensor([2.0]), _custom_requires_grad=True, graph=graph, is_leaf=True)
                t3 = t1 + t2
                t4 = t3 + 5.0
                t5 = t2 + 10.0  # second branch hanging off t2
                t6 = t4 + t5

                order = graph.reverse_toposort_from_tensor(t6._node_id)

                # The final output must come first. __repr__ is called as a
                # method on the returned entry on purpose, so the comparison
                # goes through the entry's own attribute lookup.
                assert order[0].__repr__() == t6.__repr__()

                # Swap each returned entry for the object its bound __repr__
                # belongs to, so the entries can be used as dict keys below.
                order = [entry.__repr__.__self__ for entry in order]
                rank = {tensor: index for index, tensor in enumerate(order)}

                # (consumer, input): the consumer must be ranked strictly earlier.
                must_precede = (
                    (t6, t4),
                    (t6, t5),
                    (t4, t3),
                    (t5, t2),
                    (t3, t1),
                    (t3, t2),
                )
                for consumer, producer in must_precede:
                    assert rank[consumer] < rank[producer]

                # Every tensor appears exactly once.
                assert len(order) == 6
                assert set(order) == {t1, t2, t3, t4, t5, t6}
                order = None  # drop the list of strong references before the context exits

            print("✓ System Test: Topological Sort Order")
            self.passed_tests += 1
        except Exception as err:
            print(f"✗ System Test: Topological Sort Order: {str(err)}")
            self.failed_tests += 1

    def test_very_deep_computation_graph(self):
        """Differentiate through a long chain of scalar additions and compare to PyTorch.

        The comparisons run after the graph context has been exited. Any
        exception raised along the way is reported and counted as a failure.
        """
        print("\n=== Testing Very Deep Computation Graph ===")

        try:
            depth = 50  # number of chained "+ 1.0" steps

            with AutogradGraph() as graph:
                leaf = CustomTensor([1.0], _custom_requires_grad=True, graph=graph, is_leaf=True)

                # leaf -> leaf+1 -> (leaf+1)+1 -> ... repeated `depth` times
                tip = leaf
                for _ in range(depth):
                    tip = tip + 1.0

                tip.backward()

            # Same chain in plain PyTorch.
            ref_leaf = torch.tensor([1.0], requires_grad=True)
            ref_tip = ref_leaf
            for _ in range(depth):
                ref_tip = ref_tip + 1.0

            ref_tip.backward()

            self.assert_tensors_close(leaf, ref_leaf, f"Deep Graph (depth={depth}) - x")
            self.assert_tensors_close(tip, ref_tip, f"Deep Graph (depth={depth}) - final", check_grad=False)

        except Exception as err:
            print(f"✗ Very Deep Computation Graph: {str(err)}")
            self.failed_tests += 1

    def test_wide_computation_graph(self):
        """Sum many independent leaves and compare each leaf against PyTorch.

        The comparisons run after the graph context has been exited. Any
        exception raised along the way is reported and counted as a failure.
        """
        print("\n=== Testing Wide Computation Graph ===")

        try:
            width = 20  # number of input leaves

            with AutogradGraph() as graph:
                # Leaves hold the values 1.0, 2.0, ..., width.
                leaves = [
                    CustomTensor([float(k + 1)], _custom_requires_grad=True, graph=graph, is_leaf=True)
                    for k in range(width)
                ]

                # Left fold: ((l0 + l1) + l2) + ...
                total = leaves[0]
                for term in leaves[1:]:
                    total = total + term

                total.backward()

            # Same construction in plain PyTorch.
            ref_leaves = [torch.tensor([float(k + 1)], requires_grad=True) for k in range(width)]

            ref_total = ref_leaves[0]
            for term in ref_leaves[1:]:
                ref_total = ref_total + term

            ref_total.backward()

            # One comparison (and therefore one pass/fail count) per leaf.
            for k, (ours, ref) in enumerate(zip(leaves, ref_leaves)):
                self.assert_tensors_close(ours, ref, f"Wide Graph (width={width}) - input_{k}")

        except Exception as err:
            print(f"✗ Wide Computation Graph: {str(err)}")
            self.failed_tests += 1

    def test_nan_and_inf_handling(self):
        """Check that backward() still populates a gradient for NaN and Inf inputs.

        Two single-element leaves are exercised: ``nan + 1.0`` and
        ``inf * 2.0``. After ``backward()`` the leaf's ``tensor.grad`` must not
        be ``None``. Whether that gradient is finite is not asserted.

        The two checks together count as one passed test; any exception
        (including a failed check) is reported and counted as one failed test.
        """
        print("\n=== Testing NaN and Inf Handling ===")

        try:
            # NaN input
            with AutogradGraph() as graph:
                x_custom = CustomTensor([float('nan')], _custom_requires_grad=True, graph=graph, is_leaf=True)
                y_custom = x_custom + 1.0
                y_custom.backward()

                # Test for None directly. torch.isnan() does not accept None,
                # so probing the gradient's contents first would turn a missing
                # gradient into an unrelated TypeError.
                if x_custom.tensor.grad is None:
                    raise AssertionError("NaN input: backward() left x.tensor.grad as None")

            # Inf input
            with AutogradGraph() as graph:
                x_custom = CustomTensor([float('inf')], _custom_requires_grad=True, graph=graph, is_leaf=True)
                y_custom = x_custom * 2.0
                y_custom.backward()

                # Same reasoning as above, with torch.isinf() in place of torch.isnan().
                if x_custom.tensor.grad is None:
                    raise AssertionError("Inf input: backward() left x.tensor.grad as None")

            print("ℹ NaN/Inf Handling - Consider adding explicit handling for edge numerical cases")
            self.passed_tests += 1

        except Exception as e:
            print(f"✗ NaN and Inf Handling: {str(e)}")
            self.failed_tests += 1

    def test_zero_gradients(self):
        """Differentiate ``x - x`` and compare the leaf against PyTorch.

        The comparison runs after the graph context has been exited. Any
        exception raised along the way is reported and counted as a failure.
        """
        print("\n=== Testing Zero Gradients ===")

        try:
            with AutogradGraph() as graph:
                leaf = CustomTensor([2.0], _custom_requires_grad=True, graph=graph, is_leaf=True)

                # The same tensor is used as both operands, so the two
                # contributions to its gradient cancel.
                difference = leaf - leaf
                difference.backward()

            ref_leaf = torch.tensor([2.0], requires_grad=True)
            ref_difference = ref_leaf - ref_leaf
            ref_difference.backward()

            self.assert_tensors_close(leaf, ref_leaf, "Zero Gradients - x")

        except Exception as err:
            print(f"✗ Zero Gradients: {str(err)}")
            self.failed_tests += 1


    def test_memory_efficiency(self):
        """Repeat a forward/backward cycle and report growth in GC-tracked objects.

        The number of objects reported by ``gc.get_objects()`` is sampled
        before and after five build/backward/teardown rounds. Growth below 1000
        prints a success line and larger growth prints a warning; both outcomes
        are counted as a pass. Only an exception counts as a failure.
        """
        print("\n=== Testing Memory Efficiency ===")

        try:
            objects_before = len(gc.get_objects())

            for _round in range(5):
                with AutogradGraph() as graph:
                    x_custom = CustomTensor([1.0] * 100, _custom_requires_grad=True, graph=graph, is_leaf=True)

                    # Ten rounds of "+ 1.0" followed by "* 1.1".
                    current = x_custom
                    for _step in range(10):
                        current = (current + 1.0) * 1.1

                    current.backward(torch.ones(100))

                # Drop the local names, then force a collection pass.
                del current, x_custom
                gc.collect()

            objects_after = len(gc.get_objects())

            growth = objects_after - objects_before
            print(f"Object count growth: {growth}")

            if growth < 1000:  # below the threshold
                print("✓ Memory Efficiency - Reasonable memory usage")
            else:
                print(f"⚠ Memory Efficiency - High memory growth: {growth} objects")
            # Counted as a pass on either branch; the warning is advisory only.
            self.passed_tests += 1

        except Exception as err:
            print(f"✗ Memory Efficiency: {str(err)}")
            self.failed_tests += 1

    def run_all_tests(self):
        """Run every correctness and system test, print a summary, and report success.

        Returns:
            bool: ``True`` when ``self.failed_tests`` is zero after all tests
            have run, ``False`` otherwise.
        """
        rule = "=" * 50

        # Method names in execution order. Each is looked up on the instance
        # only when its turn comes.
        correctness_suite = (
            "test_basic_operations",
            "test_multiplication",
            "test_subtraction_division",
            "test_power_function",
            "test_unary_functions",
            "test_matrix_operations",
            "test_complex_chain",
            "test_mixed_operations",
            "test_broadcasting",
            "test_backward_with_custom_grad",
            "test_zero_grad_behavior",
            "test_no_grad_flow",
        )
        system_suite = (
            "test_basic_add_scalar_grad_system",
            "test_basic_add_tensor_grad_system",
            "test_mixed_requires_grad_tensor_add_system",
            "test_no_requires_grad_system",
            "test_autograd_graph_context_manager_system",
            "test_cycle_detection_system",
            "test_no_circular_references_non_leaf_tensors_die_system",
            "test_topological_sort_order_system",
            "test_very_deep_computation_graph",
            "test_wide_computation_graph",
            "test_nan_and_inf_handling",
            "test_zero_gradients",
            "test_memory_efficiency",
        )

        print("Running Custom Autograd Correctness Tests")
        print(rule)
        for test_name in correctness_suite:
            getattr(self, test_name)()

        print("\n" + rule)
        print("Running Custom Autograd System Tests")
        print(rule)
        for test_name in system_suite:
            getattr(self, test_name)()

        print("\n" + rule)
        print(f"Test Results: {self.passed_tests} passed, {self.failed_tests} failed")

        succeeded = self.failed_tests == 0
        if succeeded:
            print("🎉 All tests passed! Your autograd implementation is correct.")
        else:
            print("❌ Some tests failed. Check the implementation.")

        return succeeded



In [18]:
# Build the tester and run the whole suite. run_all_tests() is the last
# expression in the cell, so the boolean it returns is what the notebook displays.
t=AutogradTester()
t.run_all_tests()

Running Custom Autograd Correctness Tests

=== Testing Basic Operations ===
✓ Scalar Addition - x
✓ Scalar Addition - y (result)
✓ Tensor Addition - x
✓ Tensor Addition - y
✓ Tensor Addition - z (result)

=== Testing Multiplication ===
✓ Scalar Multiplication - x
✓ Scalar Multiplication - y (result)
✓ Tensor Multiplication - x
✓ Tensor Multiplication - y
✓ Tensor Multiplication - z (result)

=== Testing Subtraction and Division ===
✓ Scalar Subtraction (x - C) - x
✓ Scalar Subtraction (x - C) - y (result)
✓ Scalar Reverse Subtraction (C - x) - x
✓ Scalar Reverse Subtraction (C - x) - y (result)
✓ Tensor Subtraction - x
✓ Tensor Subtraction - y
✓ Tensor Subtraction - z (result)
✓ Scalar Division - x
✓ Scalar Division - y (result)
✓ Tensor Division - x
✓ Tensir Division - y
✓ Tensor Division - z (result)

=== Testing Power Function ===
✓ Power Function - x
✓ Power Function - y (result)
✓ Power Function (Negative Exponent) - x
✓ Power Function (Negative Exponent) - y (result)

=== Testing

True