Proper implementation of arrayjit_copy_with_padding, by Grok

lukstafi · lukstafi · commit 61cf318cf8b1 · 2025-07-11T13:50:10.000+02:00
diff --git a/arrayjit/lib/arrayjit_stubs.c b/arrayjit/lib/arrayjit_stubs.c
@@ -1,10 +1,12 @@
 #include <caml/alloc.h>
+#include <caml/fail.h>
 #include <caml/memory.h>
 #include <caml/mlvalues.h>
 #include <caml/bigarray.h>
 #include <math.h>
 #include <stdint.h>
 #include <string.h>
+#include <stdlib.h>
 
 /* Pure C conversion functions for use in C backends */
 
@@ -14,14 +16,14 @@ static inline float bfloat16_to_float(uint16_t bf16)
   /* BFloat16 format: 1 sign bit, 8 exponent bits, 7 mantissa bits
      To convert to float32, we shift left by 16 bits */
   uint32_t f32 = ((uint32_t)bf16) << 16;
-  return *((float*)&f32);
+  return *((float *)&f32);
 }
 
 /* Float to BFloat16 conversion (C function) */
 static inline uint16_t float_to_bfloat16(float f)
 {
-  uint32_t f32 = *((uint32_t*)&f);
-  
+  uint32_t f32 = *((uint32_t *)&f);
+
   /* Round to nearest even */
   uint32_t rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);
   return (uint16_t)(rounded >> 16);
@@ -32,88 +34,105 @@ static inline uint16_t float_to_bfloat16(float f)
 static inline float fp8_to_float(uint8_t fp8)
 {
   /* Handle zero */
-  if (fp8 == 0) {
+  if (fp8 == 0)
+  {
     return 0.0f;
   }
-  
+
   uint32_t sign = (fp8 >> 7) & 1;
   uint32_t exp = (fp8 >> 2) & 0x1F;
   uint32_t mant = fp8 & 0x3;
-  
+
   /* Handle special cases */
-  if (exp == 0x1F) {  /* Infinity or NaN */
-    if (mant == 0) {
+  if (exp == 0x1F)
+  { /* All-ones exponent: signed infinity when mant == 0, NaN otherwise */
+    if (mant == 0)
+    {
       return sign ? -INFINITY : INFINITY;
-    } else {
+    }
+    else
+    {
       return NAN;
     }
   }
-  
+
   /* Denormalized numbers */
-  if (exp == 0) {
+  if (exp == 0)
+  {
     float result = ldexpf((float)mant / 4.0f, -14);
-    if (sign) result = -result;
+    if (sign)
+      result = -result;
     return result;
   }
-  
+
   /* Normalized numbers */
   float result = (1.0f + (float)mant * 0.25f) * ldexpf(1.0f, (int)exp - 15);
-  if (sign) result = -result;
-  
+  if (sign)
+    result = -result;
+
   return result;
 }
 
 /* Float to FP8 E5M2 conversion (C function) */
 static inline uint8_t float_to_fp8(float f)
 {
   /* Handle zero */
-  if (f == 0.0f) {
+  if (f == 0.0f)
+  {
     return 0;
   }
-  
+
   uint32_t sign = (f < 0) ? 1 : 0;
   f = fabsf(f);
-  
+
   /* Handle special cases */
-  if (isinf(f)) {
-    return (sign << 7) | 0x7C;  /* Infinity: exp=0x1F, mant=0 */
+  if (isinf(f))
+  {
+    return (sign << 7) | 0x7C; /* Infinity: exp=0x1F, mant=0 */
   }
-  if (isnan(f)) {
-    return (sign << 7) | 0x7F;  /* NaN: exp=0x1F, mant!=0 */
+  if (isnan(f))
+  {
+    return (sign << 7) | 0x7F; /* NaN: exp=0x1F, mant!=0 */
   }
-  
+
   /* Get exponent and mantissa */
   int exp_val;
   float mant_f = frexpf(f, &exp_val);
-  int exp = exp_val + 14;  /* Bias is 15, but frexp gives us mantissa in [0.5, 1) */
-  
+  int exp = exp_val + 14; /* Bias is 15, but frexp gives us mantissa in [0.5, 1) */
+
   /* Clamp to representable range */
-  if (exp < 0) {
+  if (exp < 0)
+  {
     /* Underflow to zero */
     return sign << 7;
   }
-  if (exp > 30) {
+  if (exp > 30)
+  {
     /* Overflow to infinity */
     return (sign << 7) | 0x7C;
   }
-  
+
   /* Handle denormalized numbers */
-  if (exp == 0) {
+  if (exp == 0)
+  {
     float denorm_mant = f * ldexpf(1.0f, 14) * 4.0f;
     uint32_t mant_bits = (uint32_t)(denorm_mant + 0.5f);
-    if (mant_bits > 3) mant_bits = 3;
+    if (mant_bits > 3)
+      mant_bits = 3;
     return (sign << 7) | mant_bits;
   }
-  
-  /* Normalized numbers: convert mantissa from [0.5, 1) to [0, 0.75] */
-  mant_f = (mant_f - 0.5f) * 4.0f;
-  uint32_t mant_bits = (uint32_t)(mant_f + 0.5f);  /* Round to nearest */
-  if (mant_bits > 3) mant_bits = 3;
-  
-  return (uint8_t)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));
+
+  /* Normalized numbers: frexpf gives mant_f in [0.5, 1), so the 2-bit fraction is
+     (2 * mant_f - 1) * 4 = (mant_f - 0.5) * 8, which rounds to an integer in 0..4 */
+  uint32_t mant_bits = (uint32_t)((mant_f - 0.5f) * 8.0f + 0.5f); /* Round to nearest */
+
+  /* Add (rather than OR) the fraction so a rounded-up 4 carries into the exponent;
+     exp == 30 plus a carry yields exp == 0x1F, mant == 0, i.e. infinity */
+  return (uint8_t)((sign << 7) | (((uint32_t)exp << 2) + mant_bits));
 }
 
-typedef struct {
+typedef struct
+{ /* Four 32-bit unsigned words */
   uint32_t v[4];
 } uint4x32_t;
 
@@ -141,7 +160,7 @@ CAMLprim value arrayjit_bfloat16_to_float(value v_bf16)
 }
 
 /* Float to BFloat16 conversion (OCaml wrapper) */
-CAMLprim value arrayjit_float_to_bfloat16(value v_float) 
+CAMLprim value arrayjit_float_to_bfloat16(value v_float)
 {
   CAMLparam1(v_float);
   float f = (float)Double_val(v_float);
@@ -167,45 +186,118 @@ CAMLprim value arrayjit_float_to_fp8(value v_float)
   CAMLreturn(Val_int(fp8));
 }
 
-/* Efficient copying with padding support */
-CAMLprim value arrayjit_copy_with_padding(value v_source, value v_target, 
-                                          value v_source_dims, value v_padding)
-{
-  CAMLparam4(v_source, v_target, v_source_dims, v_padding);
-  
-  /* Get the bigarray data pointers */
-  void* source_data = Caml_ba_data_val(v_source);
-  void* target_data = Caml_ba_data_val(v_target);
-  
-  /* Get element size in bytes */
-  int kind = Caml_ba_kind_val(v_source);
-  size_t elem_size;
-  switch(kind) {
-    case CAML_BA_FLOAT32: elem_size = 4; break;
-    case CAML_BA_FLOAT64: elem_size = 8; break;
-    case CAML_BA_SINT8:
-    case CAML_BA_UINT8: elem_size = 1; break;
-    case CAML_BA_SINT16:
-    case CAML_BA_UINT16: elem_size = 2; break;
-    case CAML_BA_INT32: elem_size = 4; break;
-    case CAML_BA_COMPLEX64: elem_size = 16; break;
-    default: elem_size = 8; break;
-  }
-  
-  /* FIXME: For now, implement a simple flat copy */
-  /* The proper padding-aware copy would require more complex logic */
-  /* but this provides a foundation for optimization */
-  struct caml_ba_array* source_ba = Caml_ba_array_val(v_source);
-  intnat* source_dims_ba = source_ba->dim;
-  int source_ndim = source_ba->num_dims;
-  
-  size_t source_total = 1;
-  for(int i = 0; i < source_ndim; i++) {
-    source_total *= source_dims_ba[i];
-  }
-  
-  /* FIXME: Simple memcpy for now - must implement proper padding */
-  memcpy(target_data, source_data, source_total * elem_size);
-  
-  CAMLreturn(Val_unit);
-} 
+/* Copies every element of the source bigarray into the interior of the target bigarray,
+   shifted by the left padding of each axis; the target's margins are left untouched.
+   v_padding holds one (left, right) pair per axis and each target dimension must equal
+   source dimension + left + right. Indexing is row-major (C layout). A zero-dimensional
+   or empty source is a no-op. Raises Failure on inconsistent shapes or padding. */
+CAMLprim value arrayjit_copy_with_padding(value v_source, value v_target, value v_padding)
+{
+  CAMLparam3(v_source, v_target, v_padding);
+
+  struct caml_ba_array *source_ba = Caml_ba_array_val(v_source);
+  struct caml_ba_array *target_ba = Caml_ba_array_val(v_target);
+  int ndim = source_ba->num_dims;
+
+  if (ndim != target_ba->num_dims)
+  {
+    caml_failwith("Source and target must have the same number of dimensions");
+  }
+
+  if (ndim == 0)
+  {
+    CAMLreturn(Val_unit);
+  }
+
+  if (Wosize_val(v_padding) != (uintnat)ndim)
+  {
+    caml_failwith("Padding array length mismatch");
+  }
+
+  /* A bigarray has at most CAML_BA_MAX_NUM_DIMS axes, so fixed-size stack arrays are
+     enough; unlike malloc'ed buffers they cannot leak when caml_failwith raises. */
+  intnat left[CAML_BA_MAX_NUM_DIMS];
+  intnat source_stride[CAML_BA_MAX_NUM_DIMS];
+  intnat target_stride[CAML_BA_MAX_NUM_DIMS];
+  intnat indices[CAML_BA_MAX_NUM_DIMS] = {0};
+
+  intnat *source_shape = source_ba->dim;
+  intnat *target_shape = target_ba->dim;
+  uintnat source_elems = 1;
+
+  /* Extract the paddings and verify target dimensions match source + padding */
+  for (int d = 0; d < ndim; d++)
+  {
+    value pad = Field(v_padding, d);
+    intnat right = Long_val(Field(pad, 1));
+    left[d] = Long_val(Field(pad, 0));
+    if (left[d] < 0 || right < 0)
+    {
+      caml_failwith("Negative padding");
+    }
+    if (target_shape[d] != source_shape[d] + left[d] + right)
+    {
+      caml_failwith("Target dimensions do not match source + padding");
+    }
+    source_elems *= (uintnat)source_shape[d];
+  }
+
+  /* An empty source has nothing to copy; entering the loop below would read one
+     element past the end of its data. */
+  if (source_elems == 0)
+  {
+    CAMLreturn(Val_unit);
+  }
+
+  /* caml_ba_byte_size is the byte size of the WHOLE array, so divide by the element
+     count to obtain the size of a single element. */
+  size_t elem_size = caml_ba_byte_size(source_ba) / source_elems;
+
+  /* Row-major strides, counted in elements */
+  intnat s_stride = 1;
+  intnat t_stride = 1;
+  for (int d = ndim - 1; d >= 0; d--)
+  {
+    source_stride[d] = s_stride;
+    target_stride[d] = t_stride;
+    s_stride *= source_shape[d];
+    t_stride *= target_shape[d];
+  }
+
+  char *source_data = (char *)source_ba->data;
+  char *target_data = (char *)target_ba->data;
+
+  /* The innermost axis is contiguous in both arrays, so copy one whole row per memcpy
+     and iterate only over the outer axes. */
+  int inner = ndim - 1;
+  size_t row_bytes = (size_t)source_shape[inner] * elem_size;
+
+  while (1)
+  {
+    intnat source_offset = 0;
+    intnat target_offset = left[inner];
+    for (int d = 0; d < inner; d++)
+    {
+      source_offset += indices[d] * source_stride[d];
+      target_offset += (indices[d] + left[d]) * target_stride[d];
+    }
+
+    memcpy(target_data + (size_t)target_offset * elem_size,
+           source_data + (size_t)source_offset * elem_size,
+           row_bytes);
+
+    /* Increment the outer indices (odometer-style); done once the outermost wraps */
+    int d = inner - 1;
+    while (d >= 0 && ++indices[d] == source_shape[d])
+    {
+      indices[d] = 0;
+      d--;
+    }
+    if (d < 0)
+    {
+      break;
+    }
+  }
+
+  CAMLreturn(Val_unit);
+}
diff --git a/arrayjit/lib/ndarray.ml b/arrayjit/lib/ndarray.ml
@@ -387,13 +387,12 @@ let hash_t nd = Nativeint.hash @@ to_native nd
 
 (** C function declarations for efficient copying *)
 external copy_with_padding_c : 
-  ('a, 'b) bigarray -> ('a, 'b) bigarray -> int array -> axis_padding array -> unit
+  ('a, 'b) bigarray -> ('a, 'b) bigarray -> axis_padding array -> unit
   = "arrayjit_copy_with_padding"
 
 let copy_with_padding ~source ~target ~padding =
-  let source_dims = dims source in
   let copy_impl source_arr target_arr =
-    copy_with_padding_c source_arr target_arr source_dims padding
+    copy_with_padding_c source_arr target_arr padding
   in
   map2 { f2 = copy_impl } source target