In [1]:
import numpy as np

matmul_call_count = 0 # A counter to keep track of matmul calls

def reset_matmul_call_counter():
    global matmul_call_count
    matmul_call_count = 0

def matmul_counter_wrapper(func):
    def wrapper(*args, **kwargs):
        global matmul_call_count
        matmul_call_count += 1
        return func(*args, **kwargs)
    return wrapper

# Replace np.matmul with the logged version
np.matmul = matmul_counter_wrapper(np.matmul)

class ag: # AutoGrad
    """
    A barebone version of the AutoGrad library that we've been working with
    it only supports 
    - matmul 
        - between two numpy matrices ONLY,
            -i.e., arrays X such that len(X.shape) == 2
    - sum 
        - "axis = None" ONLY
            - so everything gets summed up
    - add (entrywise)
    """

    #################
    # REDUCTIVE OPS #
    #################
    def sum(input):
        output = ag.Tensor(np.sum(input.value), inputs = [input], op='sum')
        def _backward():
            # YOUR CODE HERE FOR initialize the grad if they are none
            input.grad += output.grad
            # YOUR CODE HERE FOR discarding the grad
            return None
        output._backward = _backward
        return output

    ##########
    # MATMUL #
    ##########
    def matmul(input1, input2):
        return input1@input2

    class Tensor: # Tensor with grads
        def __init__(self,
                     value,
                     requires_grad=False,
                     op="",
                     _backward= lambda : None,
                     inputs=[],
                     label=""):

            if type(value) in [float ,int]:
                value = np.array(value)
            
            self.requires_grad = requires_grad
            
            self.value = 1.0*value
            self.grad = None

            
            if self.requires_grad:
                self.grad = np.zeros_like(self.value)

            self.shape = value.shape

            self._backward = _backward
            self.inputs = inputs

            self.op = op
            self.label = label

        def topological_sort(self):
            topo_order = []
            visited = set()

            def dfs(node):
                if node not in visited:
                    visited.add(node)
                    for input in node.inputs:
                        dfs(input)
                    topo_order.append(node)

            dfs(self)
            return topo_order


        def backward(self):
            self.grad = np.array(1.0)

            
            topo_order = self.topological_sort()
            
            start_trace()  # added to trace memory used
            mem_usage = [] # added to trace memory used

            for node in reversed(topo_order):
                node._backward()
                mem_usage.append(snapshot_trace())
            end_trace()
            return mem_usage

        ##########
        # MATMUL #
        ##########
        def __matmul__(self,other):
            """
            matrix multiplication between two MATRICES only
            """

            assert(len(self.shape) == 2)
            assert(len(other.shape) == 2)

            output = ag.Tensor(np.matmul(self.value,other.value),
                               inputs = [self,other],
                               op="matmul")
            
            def _backward():
                # YOUR CODE HERE FOR initializing the grad
                    
                self.grad += np.matmul(output.grad, other.value.T)
                other.grad += np.matmul(self.value.T, output.grad)

                # YOUR CODE HERE FOR discarding the grad
                return None
            output._backward = _backward
            return output
        
        def zero_grad(self):
            self.grad = np.zeros_like(self.value)
            return None
        
        def discard_grad(self):
            self.grad = None
            return None
            
        def __repr__(self) -> str:
            return "Value:\n"+self.value.__repr__() + "\nGrad:\n" + self.grad.__repr__()


In [2]:
import tracemalloc
import numpy as np
# code adapted from 
# https://numpy.org/doc/2.0/reference/c-api/data_memory.html#example-of-memory-tracing-with-np-lib-tracemalloc-domain

def start_trace():
    tracemalloc.start()
    return None

def snapshot_trace():
    snapshot = tracemalloc.take_snapshot()

    # only keep track of the allocations by numpy
    dom_filter = tracemalloc.DomainFilter(inclusive=True,
                                          domain=np.lib.tracemalloc_domain)
    
    snapshot = snapshot.filter_traces([dom_filter])
    top_stats = snapshot.statistics('traceback')

    return top_stats
    
def end_trace():
    
    tracemalloc.clear_traces()
    tracemalloc.stop()
    return None
    
def print_trace_stats(stats):
    mem_allocated = 0
    for stat in stats:
        mem_allocated += stat.size
    print(f"memory allocated: {mem_allocated//  1000000} MB")
    return None

In [3]:
np.random.seed(42)

num_layers = 10
num_samples = 4096
dim_hidden = 1000

weights = [ag.Tensor(0.02*np.random.randn(dim_hidden, dim_hidden), 
                     requires_grad = True) for _ in range(num_layers)]
X = ag.Tensor(np.random.randn(num_samples, dim_hidden))

def forward(x, weights):
    for w in weights:
        x = ag.matmul(x, w)
    return ag.sum(x)

def forward_traced_grad_discard(x, weights):
    start_trace()
    mem_usage = []
    for w in weights:
        x = ag.matmul(x, w)
        mem_usage.append(snapshot_trace())
    l = ag.sum(x)
    mem_usage.append(snapshot_trace())
    end_trace()
    return l, mem_usage

reset_matmul_call_counter()
l, mem_usage_forward = forward_traced_grad_discard(X,weights)
mem_usage_backward = l.backward()
print(matmul_call_count)

TypeError: unsupported operand type(s) for +: 'NoneType' and 'float'

In [None]:
print( f"size of the numpy array {( dim_hidden * num_samples * 8 ) // 1000000} MB")

# Sanity checks

In [None]:
for i, trace_stats in enumerate(mem_usage_forward):
    print(f"layer {i}")
    print_trace_stats(trace_stats)
# expected output
# layer 0
# memory allocated: 32 MB
# layer 1
# memory allocated: 65 MB
# layer 2
# memory allocated: 98 MB
# ...

In [None]:
for i, trace_stats in enumerate(mem_usage_backward):
    print(f"backward step {i}")
    print_trace_stats(trace_stats)

# expected output
# backward step 0
# memory allocated: 32 MB
# backward step 1
# memory allocated: 32 MB
# backward step 2
# memory allocated: 32 MB
# ...

In [None]:
weights[0].grad
# EXPECTED OUTPUT

# array([[ 0.12749943, -0.28241071, -0.05621888, ..., -0.02374291,
#          0.27343724,  0.48198191],
#        [ 0.25550796, -0.56594908, -0.11266225, ..., -0.04758063,
#          0.5479663 ,  0.96588835],
# ...