
Commit 2611a1b

Broken: Constant_fill via unrolling, fix Tensor.params field typing, final round of refactoring / plumbing (mostly by Claude Sonnet)

It's broken because most of the new functionality is not implemented yet. Also, neither parameter optimization nor input optimization is done properly anywhere yet!

1 parent 874fa31 · commit 2611a1b

21 files changed: +249 -163 lines

arrayjit/lib/c_syntax.ml (+74 -7)

@@ -267,7 +267,7 @@ struct
       match op with
       | Ops.Satur01_gate -> (
          match prec with
-         | Ops.Byte_prec _ | Ops.Uint16_prec _ | Ops.Int32_prec _ ->
+         | Ops.Byte_prec _ | Ops.Uint16_prec _ | Ops.Int32_prec _ | Ops.Uint4x32_prec _ ->
              let open PPrint in
              group
                (parens
@@ -592,16 +592,44 @@ module C_syntax (B : C_syntax_config) = struct
         let prefix, postfix = B.convert_precision ~from:scope_prec ~to_:prec in
         let expr = string prefix ^^ string ("v" ^ Int.to_string id.scope_id) ^^ string postfix in
         (empty, expr)
-    | Access (Ops.Merge_buffer { source_node_id }, Some idcs) ->
-        let tn = Option.value_exn ~here:[%here] @@ Tn.find ~id:source_node_id in
+    | Access (Low_level.Merge_buffer { source }, Some idcs) ->
+        let tn = source in
         let from_prec = Lazy.force tn.prec in
         let prefix, postfix = B.convert_precision ~from:from_prec ~to_:prec in
         let offset_doc = pp_array_offset (idcs, Lazy.force tn.dims) in
         let expr =
           string prefix ^^ string "merge_buffer" ^^ brackets offset_doc ^^ string postfix
         in
         (empty, expr)
-    | Access _ -> failwith "C_syntax: Access / FFI NOT IMPLEMENTED YET"
+    | Access (Low_level.C_function f_name, None) ->
+        let expr = string (f_name ^ "()") in
+        (empty, expr)
+    | Access (Low_level.External_unsafe { ptr; prec = source_prec; dims }, Some idcs) ->
+        let dims_val = Lazy.force dims in
+        let prefix, postfix = B.convert_precision ~from:source_prec ~to_:prec in
+        let offset_doc = pp_array_offset (idcs, dims_val) in
+        let ptr_str = Ops.c_rawptr_to_string (Ctypes.raw_address_of_ptr @@ Ctypes.to_voidp ptr) source_prec in
+        let expr =
+          string prefix ^^ string ("(*(" ^ ptr_str ^ " + ") ^^ offset_doc ^^ string "))" ^^ string postfix
+        in
+        (empty, expr)
+    | Access (Low_level.File_mapped (file, source_prec), Some idcs) ->
+        let prefix, postfix = B.convert_precision ~from:source_prec ~to_:prec in
+        let expr =
+          string prefix ^^ string ("file_mapped_data_" ^ file ^ "[") ^^ pp_array_offset (idcs, [||]) ^^ string "]" ^^ string postfix
+        in
+        (empty, expr)
+    | Access (Low_level.Uint4x32_to_prec_uniform { source; prec = source_prec }, Some idcs) ->
+        let tn = source in
+        let prefix, postfix = B.convert_precision ~from:source_prec ~to_:prec in
+        let offset_doc = pp_array_offset (idcs, Lazy.force tn.dims) in
+        let source_ident = string (get_ident tn) in
+        let expr =
+          string prefix ^^ string ("uint4x32_to_" ^ Ops.prec_string source_prec ^ "_uniform(")
+          ^^ source_ident ^^ brackets offset_doc ^^ string ")" ^^ string postfix
+        in
+        (empty, expr)
+    | Access _ -> failwith "C_syntax: Access cases with wrong indices / FFI NOT IMPLEMENTED YET"
     | Get (tn, idcs) ->
         let ident_doc = string (get_ident tn) in
         let from_prec = Lazy.force tn.prec in
@@ -665,8 +693,8 @@ module C_syntax (B : C_syntax_config) = struct
         let prefix, postfix = B.convert_precision ~from:scope_prec ~to_:prec in
         let v_doc = string prefix ^^ string ("v" ^ Int.to_string id.scope_id) ^^ string postfix in
         (v_doc ^^ braces (string ("=" ^ B.float_log_style)), [ `Value v_doc ])
-    | Access (Ops.Merge_buffer { source_node_id }, Some idcs) ->
-        let tn = Option.value_exn ~here:[%here] @@ Tn.find ~id:source_node_id in
+    | Access (Low_level.Merge_buffer { source }, Some idcs) ->
+        let tn = source in
         let from_prec = Lazy.force tn.prec in
         let dims = Lazy.force tn.dims in
         let prefix, postfix = B.convert_precision ~from:from_prec ~to_:prec in
@@ -681,7 +709,46 @@ module C_syntax (B : C_syntax_config) = struct
           ^^ braces (string ("=" ^ B.float_log_style))
         in
         (expr_doc, [ `Accessor (idcs, dims); `Value access_doc ])
-    | Access _ -> failwith "C_syntax: Access / FFI NOT IMPLEMENTED YET"
+    | Access (Low_level.C_function f_name, None) ->
+        let expr_doc = string (f_name ^ "()") in
+        (expr_doc, [])
+    | Access (Low_level.External_unsafe { ptr; prec = source_prec; dims }, Some idcs) ->
+        let dims_val = Lazy.force dims in
+        let prefix, postfix = B.convert_precision ~from:source_prec ~to_:prec in
+        let offset_doc = pp_array_offset (idcs, dims_val) in
+        let ptr_str = Ops.c_rawptr_to_string (Ctypes.raw_address_of_ptr @@ Ctypes.to_voidp ptr) source_prec in
+        let access_doc =
+          string prefix ^^ string ("(*(" ^ ptr_str ^ " + ") ^^ offset_doc ^^ string "))" ^^ string postfix
+        in
+        let expr_doc =
+          string prefix ^^ string ("external[%u]{=" ^ B.float_log_style ^ "}") ^^ string postfix
+        in
+        (expr_doc, [ `Accessor (idcs, dims_val); `Value access_doc ])
+    | Access (Low_level.File_mapped (file, source_prec), Some idcs) ->
+        let prefix, postfix = B.convert_precision ~from:source_prec ~to_:prec in
+        let access_doc =
+          string prefix ^^ string ("file_mapped_data_" ^ file ^ "[") ^^ pp_array_offset (idcs, [||]) ^^ string "]" ^^ string postfix
+        in
+        let expr_doc =
+          string prefix ^^ string ("file_mapped_" ^ file ^ "[%u]{=" ^ B.float_log_style ^ "}") ^^ string postfix
+        in
+        (expr_doc, [ `Accessor (idcs, [||]); `Value access_doc ])
+    | Access (Low_level.Uint4x32_to_prec_uniform { source; prec = source_prec }, Some idcs) ->
+        let tn = source in
+        let prefix, postfix = B.convert_precision ~from:source_prec ~to_:prec in
+        let dims = Lazy.force tn.dims in
+        let offset_doc = pp_array_offset (idcs, dims) in
+        let source_ident = string (get_ident tn) in
+        let access_doc =
+          string prefix ^^ string ("uint4x32_to_" ^ Ops.prec_string source_prec ^ "_uniform(")
+          ^^ source_ident ^^ brackets offset_doc ^^ string ")" ^^ string postfix
+        in
+        let expr_doc =
+          string prefix ^^ string ("uint4x32_to_" ^ Ops.prec_string source_prec ^ "_uniform(")
+          ^^ source_ident ^^ brackets (string "%u") ^^ string "){=" ^^ string B.float_log_style ^^ string "}" ^^ string postfix
+        in
+        (expr_doc, [ `Accessor (idcs, dims); `Value access_doc ])
+    | Access _ -> failwith "C_syntax: Access cases with wrong indices / FFI NOT IMPLEMENTED YET"
     | Get (tn, idcs) ->
         let ident_doc = string (get_ident tn) in
         let from_prec = Lazy.force tn.prec in
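For orientation: the new match arms above pin down the shape of the Low_level access payload that replaces the old Ops.Merge_buffer node-id lookup. A minimal sketch of what that variant presumably looks like, reconstructed purely from the patterns matched here; it is not the authoritative definition, and the type name and field types are guesses:

(* Hypothetical reconstruction, inferred solely from the match arms above. *)
type memory_access =
  | Merge_buffer of { source : Tn.t }  (* a tnode held directly, no Tn.find lookup *)
  | C_function of string  (* nullary FFI call, rendered as "f_name()" *)
  | External_unsafe of {
      ptr : unit Ctypes.ptr;  (* raw pointer baked into the generated C *)
      prec : Ops.prec;
      dims : int array Lazy.t;
    }
  | File_mapped of string * Ops.prec  (* file path and element precision *)
  | Uint4x32_to_prec_uniform of { source : Tn.t; prec : Ops.prec }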

arrayjit/lib/metal_backend.ml (+3 -0)

@@ -443,6 +443,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     | Ops.Byte_prec _ -> "uchar"
     | Ops.Uint16_prec _ -> "ushort"
     | Ops.Int32_prec _ -> "int"
+    | Ops.Uint4x32_prec _ -> "uint4" (* Metal's uint4 type - 128-bit *)
     | Ops.Half_prec _ -> "half"
     | Ops.Bfloat16_prec _ -> "bfloat" (* Metal supports bfloat16 natively *)
     | Ops.Fp8_prec _ -> invalid_arg "Metal backend does not support FP8 precision"
@@ -454,6 +455,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     | Ops.Byte_prec _ -> ""
     | Ops.Uint16_prec _ -> ""
     | Ops.Int32_prec _ -> ""
+    | Ops.Uint4x32_prec _ -> "" (* No specific suffix for uint4 *)
     | Ops.Half_prec _ -> "h"
     | Ops.Bfloat16_prec _ -> "bf" (* TODO: Verify actual Metal suffix for bfloat16 *)
     | Ops.Fp8_prec _ -> invalid_arg "Metal backend does not support FP8 precision"
@@ -523,6 +525,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
              ^^ space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
              ^^ string ("0.0" ^ s)))
     | ToPowOf, _ -> func "pow"
+    | Threefry4x32, _ -> func "threefry4x32" (* Metal implementation of Threefry4x32 *)
     | Arg1, _ | Arg2, _ -> invalid_arg "Metal C_syntax_config: Arg1/Arg2 not operators"

   let unop_syntax prec op =
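The threefry4x32 helper named here is a counter-based RNG from the Random123 family: it maps a 128-bit counter and key to four fresh 32-bit words, which the uint4x32_to_*_uniform helpers from c_syntax.ml above then squash into uniform floats. As a sketch of the per-lane conversion such a helper presumably performs, assuming plain unsigned scaling by 2^-32 (the actual helper may use bit manipulation instead):

(* Hypothetical sketch: turn one 32-bit lane of a Threefry4x32 output
   block into a single-precision uniform in [0, 1). *)
let uniform_of_uint32 (u : int32) : float =
  (* Reinterpret the int32 as unsigned before scaling. *)
  let unsigned = Int64.logand (Int64.of_int32 u) 0xFFFF_FFFFL in
  Int64.to_float unsigned *. (1.0 /. 4294967296.0)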

arrayjit/test/test_numerical_types.ml (+0 -3)

@@ -16,7 +16,6 @@ let test_bfloat16_conversions () =
   (* Test round-trip through ndarray *)
   let arr =
     Ndarray.create_array ~debug:"test" Ops.bfloat16 ~dims:[| 3; 2 |] ~padding:None
-      (Assignments.Constant_fill [| 1.0; 2.0; 3.14; -1.5; 0.125; 1000.0 |])
   in

   Stdio.printf "\nBFloat16 array values:\n";
@@ -37,7 +36,6 @@ let test_fp8_conversions () =
   (* Test round-trip through ndarray *)
   let arr =
     Ndarray.create_array ~debug:"test" Ops.fp8 ~dims:[| 2; 2 |] ~padding:None
-      (Ops.Constant_fill { values = [| 1.0; 0.5; 2.0; -1.0 |]; strict = true })
   in

   Stdio.printf "\nFP8 array values:\n";
@@ -56,7 +54,6 @@ let test_padding () =
   let arr =
     Ndarray.create_array ~debug:"padded_test" Ops.single ~dims:padded_dims
       ~padding:(Some (padding_config, padding_value))
-      (Ops.Constant_fill { values = [| 1.0; 2.0; 3.0; 4.0; 5.0; 6.0 |]; strict = true })
   in

   Stdio.printf "Padded array (dims 4x6, unpadded region 2x3):\n";

bin/compilation_speed.ml (+1 -1)

@@ -36,7 +36,7 @@ let benchmark_overhead backend () =
     Train.to_routine (module Backend) ctx ~name:"init_assign_x" IDX.empty mock_update_x
   in
   let f_routine =
-    Train.to_routine (module Backend) init_assign_x.context IDX.empty update_f.fwd_bprop
+    Train.to_routine (module Backend) init_assign_x.context IDX.empty update_f
   in
   Tensor.print_tree ~with_grad:true ~with_backend_info:true ~depth:9 f;

bin/micrograd_basic.ml (+2 -2)

@@ -25,7 +25,7 @@ let%diagn_sexp _suspended () =
   (* List.iter ~f:(function Some diff -> Train.set_hosted diff.grad | None -> ()) [ a.diff; b.diff
      ]; *)
   let update = Train.grad_update d in
-  let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in
+  let routine = Train.to_routine (module Backend) ctx IDX.empty update in
   Train.run routine;
   Tensor.print_tree ~with_grad:true ~depth:9 d;
   Stdio.print_endline "\n";
@@ -52,7 +52,7 @@ let%diagn_sexp () : unit =
   List.iter ~f:(function Some diff -> Train.set_hosted diff.grad | None -> ()) [ a.diff; b.diff ];
   (* Train.every_non_literal_on_host g; *)
   let update = Train.grad_update g in
-  let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in
+  let routine = Train.to_routine (module Backend) ctx IDX.empty update in
   Utils.capture_stdout_logs @@ fun () ->
   Train.run routine;
   (* Tensor.print_tree ~with_grad:true ~depth:9 g; *)

bin/micrograd_demo.ml (+2 -2)

@@ -77,13 +77,13 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () =
   let update = Train.grad_update scalar_loss in
   let%op learning_rate = 0.1 *. (!..steps - !@step_n) /. !..steps in
   Train.set_hosted learning_rate.value;
-  let sgd = Train.sgd_update ~learning_rate ~weight_decay update in
+  let sgd = Train.sgd_update ~learning_rate ~weight_decay scalar_loss in

   let module Backend = (val Backends.fresh_backend ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
   let routine =
-    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ])
+    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update; sgd ])
   in
   (* Stdio.print_endline "\n******** scalar_loss **********"; Tensor.print_tree ~with_id:true
      ~with_grad:false ~depth:9 scalar_loss; Stdio.print_endline "\n******** learning_rate
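The pattern repeated across these drivers outlines the refactored Train API: Train.grad_update now returns a value that to_routine and Asgns.sequence consume directly (callers no longer project out .fwd_bprop), and Train.sgd_update takes the loss tensor itself rather than the gradient-update value. A hedged sketch of the apparent signatures, inferred only from these call sites; comp is a placeholder name, and per zero2hero_1of7.ml below it carries an asgns field:

(* Hypothetical interface sketch; the real Train module may differ. *)
module type Train_sketch = sig
  type comp  (* has an [asgns] field, cf. [code.asgns] in zero2hero_1of7.ml *)
  val grad_update : Tensor.t -> comp
  (* weight_decay is optional: zero2hero_1of7.ml calls sgd_update without it. *)
  val sgd_update : learning_rate:Tensor.t -> ?weight_decay:float -> Tensor.t -> comp
end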

bin/micrograd_demo_logging.ml (+1 -1)

@@ -31,7 +31,7 @@ let () =
   let%op g = g + (10. /. f) in
   List.iter ~f:(Option.iter ~f:(fun diff -> Train.set_hosted diff.Tensor.grad)) [ a.diff; b.diff ];
   let update = Train.grad_update g in
-  let step = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in
+  let step = Train.to_routine (module Backend) ctx IDX.empty update in
   Utils.capture_stdout_logs @@ fun () ->
   Train.run step;
   Tensor.print ~with_code:false ~with_grad:false `Default g;

bin/moons_demo.ml (+2 -2)

@@ -55,13 +55,13 @@ let demo () =
   let update = Train.grad_update scalar_loss in
   let%op learning_rate = 0.1 *. (!..steps - !@step_n) /. !..steps in
   Train.set_hosted learning_rate.value;
-  let sgd = Train.sgd_update ~learning_rate ~weight_decay update in
+  let sgd = Train.sgd_update ~learning_rate ~weight_decay scalar_loss in

   let module Backend = (val Backends.fresh_backend ~backend_name:"cuda" ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
   let routine =
-    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ])
+    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update; sgd ])
   in

   let points = Tn.points_2d ~xdim:0 ~ydim:1 moons_flat.value in

bin/primitive_ops.ml (+2 -2)

@@ -27,7 +27,7 @@ let%debug_sexp graph_t () : unit =
   let xs = Array.init size ~f:Float.(fun i -> (of_int i / 10.) + 0.1) in
   let x_flat =
     Tensor.term ~grad_spec:Require_grad ~label:[ "x_flat" ]
-      ~fetch_op:(Constant_fill { values = xs; strict = true })
+      ~fetch_op:(fun ~v:_ -> Constant_fill xs)
       ()
   in
   let step_sym, bindings = IDX.get_static_symbol ~static_range:size IDX.empty in
@@ -37,7 +37,7 @@ let%debug_sexp graph_t () : unit =
   Train.set_hosted x_flat.value;
   Train.set_hosted (Option.value_exn ~here:[%here] xkcd.diff).grad;
   let update = Train.grad_update fx in
-  let fx_routine = Train.to_routine (module Backend) ctx bindings update.fwd_bprop in
+  let fx_routine = Train.to_routine (module Backend) ctx bindings update in
   let step_ref = IDX.find_exn fx_routine.bindings step_sym in
   Tensor.print_tree ~with_shape:true ~with_grad:true ~depth:9 xkcd;
   let ys, dys =
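The fetch_op rewrite here is the "Constant_fill via unrolling" piece of the commit title: Constant_fill now carries a bare float array (the strict flag is gone), and ~fetch_op takes a callback that receives the tensor's value node. A sketch of the apparent new shape, with all names inferred from the call sites and labeled hypothetical:

(* Hypothetical: the callback gets the value node ~v, letting a fetch op
   depend on the tensor it initializes; these call sites ignore it. *)
type fetch_op = Constant_fill of float array (* | ... other cases *)
type fetch_callback = v:Tn.t -> fetch_op

let example : fetch_callback = fun ~v:_ -> Constant_fill [| 0.1; 0.2 |]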

bin/zero2hero_1of7.ml (+9 -12)

@@ -22,12 +22,12 @@ let _suspended () =
   let%op v = ("w" [ (-3, 1) ] * "x" [ 2; 0 ]) + "b" [ 6.7 ] in
   Train.every_non_literal_on_host v;
   let code = Train.grad_update v in
-  let routine = Train.to_routine (module Backend) ctx IDX.empty code.fwd_bprop in
+  let routine = Train.to_routine (module Backend) ctx IDX.empty code in
   Train.run routine;
   Stdio.printf "\n%!";
   Tensor.print_tree ~with_id:true ~with_grad:true ~depth:9 v;
   Stdio.printf "\nHigh-level code:\n%!";
-  Ir.Assignments.to_doc () code.fwd_bprop.asgns |> PPrint.ToChannel.pretty 0.7 100 Stdio.stdout;
+  Ir.Assignments.to_doc () code.asgns |> PPrint.ToChannel.pretty 0.7 100 Stdio.stdout;
   Stdio.printf "\n%!"

 let _suspended () =
@@ -57,7 +57,7 @@ let _suspended () =
   let x_flat =
     Tensor.term ~grad_spec:Tensor.Require_grad
       ~label:[ "x_flat" ] (* ~input_dims:[] ~output_dims:[ 1 ] *)
-      ~fetch_op:(Constant_fill { values; strict = true })
+      ~fetch_op:(fun ~v:_ -> Constant_fill values)
       ()
   in
   let step_sym, bindings = IDX.get_static_symbol ~static_range:size IDX.empty in
@@ -70,7 +70,7 @@ let _suspended () =
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
   let update = Train.grad_update fx in
-  let routine = Train.to_routine (module Backend) ctx bindings update.fwd_bprop in
+  let routine = Train.to_routine (module Backend) ctx bindings update in
   let step_ref = IDX.find_exn routine.bindings step_sym in
   let ys = Array.create ~len:size 0. and dys = Array.create ~len:size 0. in
   let open Operation.At in
@@ -111,7 +111,7 @@ let _suspended () =
   (* Yay, the whole shape gets inferred! *)
   let x_flat =
     Tensor.term ~grad_spec:Require_grad ~label:[ "x_flat" ]
-      ~fetch_op:(Constant_fill { values = xs; strict = true })
+      ~fetch_op:(fun ~v:_ -> Constant_fill xs)
       ()
   in
   let step_sym, bindings = IDX.get_static_symbol ~static_range:size IDX.empty in
@@ -120,7 +120,7 @@ let _suspended () =
   Train.set_hosted x.value;
   Train.set_hosted (Option.value_exn ~here:[%here] x.diff).grad;
   let update = Train.grad_update fx in
-  let fx_routine = Train.to_routine (module Backend) ctx bindings update.fwd_bprop in
+  let fx_routine = Train.to_routine (module Backend) ctx bindings update in
   let step_ref = IDX.find_exn fx_routine.bindings step_sym in
   let%track_sexp () =
     let ys, dys =
@@ -155,9 +155,7 @@ let () =
   let module Backend = (val Backends.fresh_backend ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let update = Train.grad_update l in
-  let routine =
-    Train.to_routine (module Backend) (Backend.make_context stream) IDX.empty update.fwd_bprop
-  in
+  let routine = Train.to_routine (module Backend) (Backend.make_context stream) IDX.empty update in
   Train.run routine;
   (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool));
      Backend.await stream; *)
@@ -168,8 +166,7 @@ let () =
   Tensor.print_tree ~with_grad:true ~depth:9 l;
   let%op learning_rate = 0.1 in
   let routine =
-    Train.to_routine (module Backend) routine.context IDX.empty
-    @@ Train.sgd_update ~learning_rate update
+    Train.to_routine (module Backend) routine.context IDX.empty @@ Train.sgd_update ~learning_rate l
   in
   (* learning_rate is virtual so this will not print anything. *)
   Stdio.print_endline
@@ -185,7 +182,7 @@ let () =
   Tensor.print_tree ~with_grad:true ~depth:9 l;
   (* We could reuse the jitted code if we did not use `jit_and_run`. *)
   let update = Train.grad_update l in
-  let routine = Train.to_routine (module Backend) routine.context IDX.empty update.fwd_bprop in
+  let routine = Train.to_routine (module Backend) routine.context IDX.empty update in
   Train.run routine;
   (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool));
      Backend.await stream; *)
