Commit 1324aac

Untested: basic transformer and its building blocks, collab with Claude
Future work, by Claude:

1. Positional encoding options: the transformer function uses a learned { pos_encoding } but doesn't offer sinusoidal positional encoding (the original transformer approach). Could add a comment or helper; a host-side sketch follows this list.
2. Embedding initialization: the embedding matrices (src_embed, tgt_embed) use default initialization. Transformers often benefit from specific initialization scales.
3. Dropout locations: while attention dropout is covered, transformers typically also use embedding dropout (after embeddings + position) and residual dropout (after sublayers, before the residual add).
4. Missing GELU activation: modern transformers often use GELU instead of ReLU in the FFN. If OCANNL supports it, it could be worth adding.
5. Causal mask generation: for decoder self-attention, users need to create the causal mask themselves. A helper function might be useful; see the sketch after this list.
6. Output projection initialization: the final w_out in transformer projects to the vocabulary; it often benefits from weights tied with the embeddings or special initialization.
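
Hypothetical sketches for items 1 and 5 (not part of this commit): plain-OCaml, host-side helpers that build the sinusoidal table and a causal 0/1 mask as float arrays. The names sinusoidal_pos_encoding and causal_mask are made up, and how the arrays would be loaded into OCANNL tensors is left open.

open! Base

(* Sinusoidal positional encoding from "Attention Is All You Need":
   PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)). *)
let sinusoidal_pos_encoding ~max_len ~d_model =
  Array.init max_len ~f:(fun pos ->
      Array.init d_model ~f:(fun i ->
          let exponent = Float.of_int (2 * (i / 2)) /. Float.of_int d_model in
          let angle = Float.of_int pos /. Float.( ** ) 10000. exponent in
          if i % 2 = 0 then Float.sin angle else Float.cos angle))

(* Causal (lower-triangular) mask for decoder self-attention: 1.0 where query position [t]
   may attend to key position [s] (i.e. s <= t), 0.0 elsewhere. Assumes the convention of
   [where mask scores !.(-1e9)] in the diff below, i.e. that [where] keeps [scores] wherever
   the mask is nonzero. *)
let causal_mask ~seq_len =
  Array.init seq_len ~f:(fun t ->
      Array.init seq_len ~f:(fun s -> if s <= t then 1.0 else 0.0))
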
1 parent 48aecf5 commit 1324aac

lib/nn_blocks.ml

Lines changed: 147 additions & 10 deletions
@@ -1,26 +1,163 @@
+(** This file contains basic building blocks for neural networks, with limited functionality. Feel
+    free to copy-paste and modify as needed.
+
+    We follow "the principle of least commitment": where possible, we use row variables to remain
+    agnostic to the number of axes. This flexibility often remains unused, but it makes explicit the
+    architectural structure. *)
+
 open! Base
 open Operation.DSL_modules
+module Tn = Ir.Tnode
 
 let%op mlp_layer ~label ~hid_dim () x = relu (({ w = uniform () } * x) + { b = 0.; o = [ hid_dim ] })
 
-let mlp ~label ~hid_dims () =
+(** Set rate=0.0 during inference. *)
+let%op dropout ~rate () x =
+  if Float.(rate <= 0.0) then x
+  else
+    let keep_prob = 1.0 - !.rate in
+    let mask = !.rate < uniform () *. x in
+    (* Creates 0/1 mask *)
+    (* Scale by 1/keep_prob to maintain expected value *)
+    x *. mask /. keep_prob
+
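For reference (not part of the diff), the 1/keep_prob scaling noted in the comments is the usual inverted-dropout identity: with keep probability p = 1 - rate and a per-element 0/1 mask m,

\mathbb{E}\!\left[\frac{x \odot m}{p}\right] = \frac{x \odot \mathbb{E}[m]}{p} = x, \qquad m_i \sim \mathrm{Bernoulli}(p),\ p = 1 - \mathrm{rate}.
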
+(** Multi-layer perceptron of depth [List.length hid_dims + 1], with a linear output layer. *)
+let%op mlp ~label ~hid_dims () =
   let layers =
     List.mapi hid_dims ~f:(fun i hid_dim ->
         mlp_layer ~label:(("L" ^ Int.to_string i) :: label) ~hid_dim ())
   in
-  fun x -> List.fold layers ~init:x ~f:(fun x layer -> layer x)
+  fun x ->
+    let hidden = List.fold layers ~init:x ~f:(fun x layer -> layer x) in
+    { w_out } * hidden
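
For reference (not part of the diff), with hid_dims = [d_1; ...; d_L] the mlp block computes

h_0 = x, \qquad h_i = \mathrm{ReLU}(W_i h_{i-1} + b_i) \ (i = 1, \dots, L), \qquad \mathrm{mlp}(x) = W_{\mathrm{out}} h_L.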

-let%op softmax x =
-  let max_vals = x @^^ "...|...t->... => ...|...0->..." in
-  let exp_vals = exp (x - max_vals) in
-  exp_vals /. (exp_vals ++ "...|...t->... => ...|...0->...")
+let reduce_specified_axes spec =
+  let lhs =
+    if String.contains spec ',' then
+      Str.global_replace (Str.regexp "[A-Za-z][A-Za-z_0-9]*") "0" spec
+    else Str.global_replace (Str.regexp "[A-Za-z]") "0" spec
+  in
+  spec ^ " => " ^ lhs
+
+(** Softmax across specified axes. Does not support non-default row variables. *)
+let%op softmax ~spec ?(temperature = 1.0) () =
+  let spec = reduce_specified_axes spec in
+  fun x ->
+    let x_scaled = if Float.(temperature <> 1.0) then x /. !.temperature else x in
+    let max_vals = x_scaled @^^ spec in
+    let exp_vals = exp (x_scaled - max_vals) in
+    exp_vals /. (exp_vals ++ spec)
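
For reference (not part of the diff): reduce_specified_axes appends a reduced copy of the spec (e.g. " ... | ... t -> ..." becomes " ... | ... t -> ... => ... | ... 0 -> ...", if I read the Str calls correctly), and the closure then computes a numerically stabilized softmax with temperature T over the reduced axes:

\mathrm{softmax}_T(x)_i = \frac{\exp(x_i / T - m)}{\sum_j \exp(x_j / T - m)}, \qquad m = \max_j x_j / T.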

-let%op basic_multi_head_attention ~label ~num_heads () x =
+let%op multi_head_attention ~label ~num_heads ?temperature ?(dropout_rate = 0.0) () ?mask x =
   let q = { w_q } * x in
   let k = { w_k } * x in
   let v = { w_v } * x in
-  let scores = q +* "...s|h...; ...t|h... => ...|st->h" [ "h" ] k in
+  (* Works with arbitrary number of model axes via `..d..` (row variable syntax). *)
+  let scores =
+    (q +* " ... s | h ..d..; ... t | h ..d.. => ... | s t -> h " [ "h"; "d" ] k) /. sqrt (dim d)
+  in
+  Shape.set_dim h num_heads;
+  (* We don't need to lift [softmax ~spec ()] because it doesn't introduce any new params. *)
+  let attn_weights =
+    softmax ~spec:" ... | ... t -> ..." ?temperature ()
+      (match mask with None -> scores | Some mask -> where mask scores !.(-1e9))
+  in
+  let attn_weights =
+    if Float.(dropout_rate > 0.0) then dropout ~rate:dropout_rate () attn_weights else attn_weights
+  in
+  let attended = attn_weights +* " ... | s t -> h; ... t | h ... => ... s | h ... " v in
+  { w_o } * attended
+
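For reference (not part of the diff), per head this is scaled dot-product attention, where d is the per-head model dimension (the `..d..` axes) and positions rejected by the mask have their scores replaced by -1e9 before the softmax:

\mathrm{Attn}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d}}\right) V,

followed by the output projection w_o mixing the heads.
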
+let%op layer_norm ~label ?(epsilon = 1e-5) () x =
+  let mean = x ++ " ... | ..d.. => ... | 0 " [ "d" ] in
+  let centered = (x - mean) /. dim d in
+  let variance = (centered * centered) ++ " ... | ... => ... | 0 " in
+  let std_dev = sqrt (variance + !.epsilon) in
+  let normalized = centered /. std_dev in
+  (* gamma and beta are learned, but initialized to good defaults *)
+  ({ gamma = 1. } *. normalized) + { beta = 0. }
+
+let%op transformer_encoder_block ~label ~num_heads ~d_ff ?(epsilon = 1e-5) () =
+  let mha = multi_head_attention ~label:(label @ [ "mha" ]) ~num_heads () in
+  (* Standard 2-layer FFN: expand to d_ff then contract back to d_model *)
+  let ffn = mlp ~label:(label @ [ "ffn" ]) ~hid_dims:[ d_ff ] () in
+  let ln1 = layer_norm ~label:(label @ [ "ln1" ]) ~epsilon () in
+  let ln2 = layer_norm ~label:(label @ [ "ln2" ]) ~epsilon () in
+  fun input ->
+    let attn_output = mha input in
+    let x1 = ln1 (input + attn_output) in
+    let ffn_output = ffn x1 in
+    ln2 (x1 + ffn_output)
+
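For reference (not part of the diff), this is the post-norm residual layout of the original transformer encoder layer:

x_1 = \mathrm{LN}_1(x + \mathrm{MHA}(x)), \qquad \mathrm{out} = \mathrm{LN}_2(x_1 + \mathrm{FFN}(x_1)).
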
+let%op cross_attention ~label ~num_heads ?temperature ?(dropout_rate = 0.0) () x ~enc_output =
+  let q = { w_q } * x in
+  let k = { w_k } * enc_output in
+  let v = { w_v } * enc_output in
+  let scores =
+    (q +* " ... s | h ..d..; ... t | h ..d.. => ... | s t -> h " [ "h"; "d" ] k) /. sqrt (dim d)
+  in
   Shape.set_dim h num_heads;
-  let attn_weights = softmax scores in
-  let attended = attn_weights +* "...|st->h; ...t|h... => ...s|h..." v in
+  let attn_weights = softmax ~spec:" ... | ... t -> ..." ?temperature () scores in
+  let attn_weights =
+    if Float.(dropout_rate > 0.0) then dropout ~rate:dropout_rate () attn_weights else attn_weights
+  in
+  let attended = attn_weights +* " ... | s t -> h; ... t | h ... => ... s | h ... " v in
   { w_o } * attended
+
+let%op transformer_decoder_block ~label ~num_heads ~d_ff ?(epsilon = 1e-5) () =
+  let masked_mha = multi_head_attention ~label:(label @ [ "masked_mha" ]) ~num_heads () in
+  let cross_mha = cross_attention ~label:(label @ [ "cross_mha" ]) ~num_heads () in
+  (* Standard 2-layer FFN: expand to d_ff then contract back to d_model *)
+  let ffn = mlp ~label:(label @ [ "ffn" ]) ~hid_dims:[ d_ff ] () in
+  let ln1 = layer_norm ~label:(label @ [ "ln1" ]) ~epsilon () in
+  let ln2 = layer_norm ~label:(label @ [ "ln2" ]) ~epsilon () in
+  let ln3 = layer_norm ~label:(label @ [ "ln3" ]) ~epsilon () in
+  fun target ~enc_output ~mask ->
+    let self_attn_output = masked_mha ~mask target in
+    let x1 = ln1 (target + self_attn_output) in
+    let cross_attn_output = cross_mha x1 ~enc_output in
+    let x2 = ln2 (x1 + cross_attn_output) in
+    let ffn_output = ffn x2 in
+    ln3 (x2 + ffn_output)
+
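For reference (not part of the diff), the decoder layer wraps masked self-attention and encoder-decoder cross-attention in the same post-norm residual pattern:

x_1 = \mathrm{LN}_1(t + \mathrm{SelfAttn}_{\mathrm{mask}}(t)), \quad x_2 = \mathrm{LN}_2(x_1 + \mathrm{CrossAttn}(x_1, \mathrm{enc})), \quad \mathrm{out} = \mathrm{LN}_3(x_2 + \mathrm{FFN}(x_2)).
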
+let transformer_encoder ~label ~num_layers ~num_heads ~d_ff ?(epsilon = 1e-5) () =
+  let layers =
+    List.init num_layers ~f:(fun i ->
+        transformer_encoder_block
+          ~label:(label @ [ "layer" ^ Int.to_string i ])
+          ~num_heads ~d_ff ~epsilon ())
+  in
+  fun x -> List.fold layers ~init:x ~f:(fun x layer -> layer x)
+
+let transformer_decoder ~label ~num_layers ~num_heads ~d_ff ?(epsilon = 1e-5) () =
+  let layers =
+    List.init num_layers ~f:(fun i ->
+        transformer_decoder_block
+          ~label:(label @ [ "layer" ^ Int.to_string i ])
+          ~num_heads ~d_ff ~epsilon ())
+  in
+  fun target ~enc_output ~mask ->
+    List.fold layers ~init:target ~f:(fun x layer -> layer x ~enc_output ~mask)
+
+let%op transformer ~label ~num_encoder_layers ~num_decoder_layers ~num_heads ~d_model ~d_ff
+    ?(epsilon = 1e-5) () =
+  let encoder =
+    transformer_encoder ~label:(label @ [ "encoder" ]) ~num_layers:num_encoder_layers ~num_heads
+      ~d_ff ~epsilon ()
+  in
+  let decoder =
+    transformer_decoder ~label:(label @ [ "decoder" ]) ~num_layers:num_decoder_layers ~num_heads
+      ~d_ff ~epsilon ()
+  in
+  (* All inline definitions, including for d, are lifted up to the unit parameter above. *)
+  Shape.set_dim d d_model;
+  fun src tgt mask ->
+    (* Learned positional encoding *)
+    let enc_output =
+      encoder
+        (src +* " ... s | ..v.. ; ..v.. -> d => ... s | d " [ "d" ] { src_embed } + { pos_encoding })
+    in
+    let tgt_embedded =
+      tgt +* " ... t | ..v.. ; ..v.. -> d => ... t | d " { tgt_embed } + pos_encoding
+    in
+    decoder tgt_embedded ~enc_output ~mask +* " ... | d; d -> ..v.. => ... | ..v.. " { w_out }
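
For orientation (not part of the diff), a hypothetical instantiation of the new top-level entry point, assuming the [%op] extension keeps the labelled-argument signature as written; the hyperparameter values and the "demo" label are made up, and constructing the actual src/tgt/mask tensors is outside the scope of this file.

let demo_transformer =
  transformer ~label:[ "demo" ] ~num_encoder_layers:2 ~num_decoder_layers:2 ~num_heads:4
    ~d_model:64 ~d_ff:256 ()

(* [demo_transformer] is the [fun src tgt mask -> ...] closure from the definition above:
   applying it to source/target token tensors and a causal mask yields scores over the
   target vocabulary axes via the final [w_out] projection. *)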
