@@ -58,22 +58,24 @@ let%op softmax ~spec ?(temperature = 1.0) () =
   let exp_vals = exp (x_scaled - max_vals) in
   exp_vals /. (exp_vals ++ spec)
 
-let%op multi_head_attention ~label ~num_heads ?temperature ?(dropout_rate = 0.0) () ~train_step
-    ?mask x =
+let%op multi_head_attention ~label ~num_heads ~d_attention ?temperature ?(dropout_rate = 0.0) ()
+    ~train_step ?mask x =
   let q = { w_q } * x in
   let k = { w_k } * x in
   let v = { w_v } * x in
-  (* Works with arbitrary number of model axes via `..d..` (row variable syntax). *)
   let scores =
-    (q +* "... s | h ..d..; ... t | h ..d.. => ... s | t -> h" [ "h"; "d" ] k) /. sqrt (dim d)
+    (q +* "... s | h d; ... t | h d => ... s | t -> h" [ "h"; "d" ] k) /. sqrt (dim d)
   in
   Shape.set_dim h num_heads;
+  (* NOTE: often d_attention = d_model / num_heads, but we allow for other values. *)
+  Shape.set_dim d d_attention;
   (* We don't need to lift [softmax ~spec ()] because it doesn't introduce any new params. *)
   let attn_weights =
     softmax ~spec:"... | t -> ..." ?temperature ()
       (match mask with None -> scores | Some mask -> where mask scores !.(-1e9))
   in
   let attn_weights = dropout ~rate:dropout_rate () ~train_step attn_weights in
+  (* w_o output shape will automatically be set to the model dimension(s) by shape inference. *)
   { w_o } * (attn_weights +* "... s | t -> h; ... t | h ... => ... s | h ..." v)
 
 let%op layer_norm ~label ?(epsilon = 1e-5) () x =
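For reference, a minimal usage sketch of the updated `multi_head_attention` signature, mirroring how `transformer_encoder_block` applies it further down; the label and the values num_heads = 8 and d_attention = 64 are illustrative assumptions, not taken from this commit:

  (* Illustrative only; assumes x, train_step, and the %op scaffolding of this file. *)
  let mha = multi_head_attention ~label:[ "demo"; "mha" ] ~num_heads:8 ~d_attention:64 () in
  (* Residual connection around the attention output, as in the encoder block. *)
  x + mha ~train_step x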
@@ -85,8 +87,8 @@ let%op layer_norm ~label ?(epsilon = 1e-5) () x =
   (* gamma and beta are learned, but initialized to good defaults *)
   ({ gamma = 1. } *. normalized) + { beta = 0. }
 
-let%op transformer_encoder_block ~label ~num_heads ~d_ff ?(epsilon = 1e-5) () =
-  let mha = multi_head_attention ~label:(label @ [ "mha" ]) ~num_heads () in
+let%op transformer_encoder_block ~label ~num_heads ~d_attention ~d_ff ?(epsilon = 1e-5) () =
+  let mha = multi_head_attention ~label:(label @ [ "mha" ]) ~num_heads ~d_attention () in
   (* Standard 2-layer FFN: expand to d_ff then contract back to d_model *)
   let ffn = mlp ~label:(label @ [ "ffn" ]) ~hid_dims:[ d_ff ] () in
   let ln1 = layer_norm ~label:(label @ [ "ln1" ]) ~epsilon () in
@@ -95,22 +97,25 @@ let%op transformer_encoder_block ~label ~num_heads ~d_ff ?(epsilon = 1e-5) () =
     let x1 = ln1 (input + mha ~train_step input) in
     ln2 (x1 + ffn x1)
 
-let%op cross_attention ~label ~num_heads ?temperature ?(dropout_rate = 0.0) () ~train_step x
-    ~enc_output =
+let%op cross_attention ~label ~num_heads ~d_attention ?temperature ?(dropout_rate = 0.0) ()
+    ~train_step x ~enc_output =
   let q = { w_q } * x in
   let k = { w_k } * enc_output in
   let v = { w_v } * enc_output in
   let scores =
-    (q +* "... s | h ..d..; ... t | h ..d.. => ... | s t -> h" [ "h"; "d" ] k) /. sqrt (dim d)
+    (q +* "... s | h d; ... t | h d => ... s | t -> h" [ "h"; "d" ] k) /. sqrt (dim d)
   in
   Shape.set_dim h num_heads;
-  let attn_weights = softmax ~spec:"... | ... t -> ..." ?temperature () scores in
+  Shape.set_dim d d_attention;
+  let attn_weights = softmax ~spec:"... | t -> ..." ?temperature () scores in
   let attn_weights = dropout ~rate:dropout_rate () ~train_step attn_weights in
-  { w_o } * (attn_weights +* "... | s t -> h; ... t | h ... => ... s | h ..." v)
+  { w_o } * (attn_weights +* "... s | t -> h; ... t | h ... => ... s | h ..." v)
 
-let%op transformer_decoder_block ~label ~num_heads ~d_ff ?(epsilon = 1e-5) () =
-  let masked_mha = multi_head_attention ~label:(label @ [ "masked_mha" ]) ~num_heads () in
-  let cross_mha = cross_attention ~label:(label @ [ "cross_mha" ]) ~num_heads () in
+let%op transformer_decoder_block ~label ~num_heads ~d_attention ~d_ff ?(epsilon = 1e-5) () =
+  let masked_mha =
+    multi_head_attention ~label:(label @ [ "masked_mha" ]) ~num_heads ~d_attention ()
+  in
+  let cross_mha = cross_attention ~label:(label @ [ "cross_mha" ]) ~num_heads ~d_attention () in
   (* Standard 2-layer FFN: expand to d_ff then contract back to d_model *)
   let ffn = mlp ~label:(label @ [ "ffn" ]) ~hid_dims:[ d_ff ] () in
   let ln1 = layer_norm ~label:(label @ [ "ln1" ]) ~epsilon () in
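A hypothetical standalone instantiation of the decoder block with the new parameter, analogous to how `transformer_decoder` constructs its layers below; the numeric values are illustrative assumptions:

  (* Illustrative values: 8 heads of width 64 and a 2048-wide feed-forward expansion. *)
  let decoder_block =
    transformer_decoder_block ~label:[ "demo"; "dec0" ] ~num_heads:8 ~d_attention:64 ~d_ff:2048 ()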
@@ -121,21 +126,21 @@ let%op transformer_decoder_block ~label ~num_heads ~d_ff ?(epsilon = 1e-5) () =
     let x2 = ln2 (x1 + cross_mha ~train_step x1 ~enc_output) in
     ln3 (x2 + ffn x2)
 
-let transformer_encoder ~label ~num_layers ~num_heads ~d_ff ?(epsilon = 1e-5) () =
+let transformer_encoder ~label ~num_layers ~num_heads ~d_attention ~d_ff ?(epsilon = 1e-5) () =
   let layers =
     List.init num_layers ~f:(fun i ->
         transformer_encoder_block
           ~label:(label @ [ "layer" ^ Int.to_string i ])
-          ~num_heads ~d_ff ~epsilon ())
+          ~num_heads ~d_attention ~d_ff ~epsilon ())
   in
   fun ~train_step x -> List.fold layers ~init:x ~f:(fun x layer -> layer ~train_step x)
 
-let transformer_decoder ~label ~num_layers ~num_heads ~d_ff ?(epsilon = 1e-5) () =
+let transformer_decoder ~label ~num_layers ~num_heads ~d_attention ~d_ff ?(epsilon = 1e-5) () =
   let layers =
     List.init num_layers ~f:(fun i ->
         transformer_decoder_block
           ~label:(label @ [ "layer" ^ Int.to_string i ])
-          ~num_heads ~d_ff ~epsilon ())
+          ~num_heads ~d_attention ~d_ff ~epsilon ())
   in
   fun ~train_step target ~enc_output ~mask ->
     List.fold layers ~init:target ~f:(fun x layer -> layer ~train_step x ~enc_output ~mask)
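The stacking helpers simply thread `~d_attention` through to every block; as a hedged sketch with assumed values, a two-layer encoder built this way folds the input through layer 0 and then layer 1:

  (* Illustrative values, not from this commit. *)
  let encoder =
    transformer_encoder ~label:[ "demo"; "enc" ] ~num_layers:2 ~num_heads:8 ~d_attention:64
      ~d_ff:2048 ()
  (* [encoder ~train_step x] then applies both blocks in order via List.fold. *)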
@@ -144,11 +149,11 @@ let%op transformer ~label ~num_encoder_layers ~num_decoder_layers ~num_heads ~d_
     ?(epsilon = 1e-5) () =
   let encoder =
     transformer_encoder ~label:(label @ [ "encoder" ]) ~num_layers:num_encoder_layers ~num_heads
-      ~d_ff ~epsilon ()
+      ~d_attention:(d_model / num_heads) ~d_ff ~epsilon ()
   in
   let decoder =
     transformer_decoder ~label:(label @ [ "decoder" ]) ~num_layers:num_decoder_layers ~num_heads
-      ~d_ff ~epsilon ()
+      ~d_attention:(d_model / num_heads) ~d_ff ~epsilon ()
   in
   (* All inline definitions, including for ds, dt, are lifted up to the unit parameter above. *)
   Shape.set_dim ds d_model;
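The top-level `transformer` keeps the conventional per-head width by passing `~d_attention:(d_model / num_heads)`. A worked sketch of that arithmetic with assumed values; note that OCaml's `(/)` is integer division, so a d_model not divisible by num_heads silently rounds the head width down:

  (* Illustrative numbers, not from this commit. *)
  let d_model = 512
  let num_heads = 8
  let d_attention = d_model / num_heads (* = 64; 513 / 8 would also give 64 since (/) truncates *)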