
Add triton implementation of layer norm #260

Draft: wants to merge 34 commits into main from petew/triton
Commits (34):
25086b6  add triton implementation of layer norm (epwalsh, Sep 6, 2023)
495cd73  add note about source (epwalsh, Sep 6, 2023)
f8854ac  reorganize (epwalsh, Sep 6, 2023)
31b4ed9  decorate tests (epwalsh, Sep 6, 2023)
aeeb623  add missing params to test (epwalsh, Sep 6, 2023)
6dddf66  add bfloat16 test (epwalsh, Sep 6, 2023)
978d29a  clean up tests (epwalsh, Sep 6, 2023)
eb8b1d8  more dtypes (epwalsh, Sep 6, 2023)
4f1e66c  adjust test tolerance (epwalsh, Sep 7, 2023)
d3c565d  fix (epwalsh, Sep 7, 2023)
f5cf624  Merge branch 'main' into petew/triton (epwalsh, Sep 7, 2023)
7718032  add triton build script (epwalsh, Sep 7, 2023)
cafa951  upload to s3 (epwalsh, Sep 7, 2023)
ea2ba9c  clean up tests (epwalsh, Sep 7, 2023)
54d0a14  increase tolerance (epwalsh, Sep 7, 2023)
ace4266  add lumi test script (epwalsh, Sep 7, 2023)
add5f7a  change names (epwalsh, Sep 7, 2023)
a9066e1  fix lints (epwalsh, Sep 7, 2023)
8344dc5  shorten names (epwalsh, Sep 7, 2023)
e1f54fc  Merge branch 'main' into petew/triton (epwalsh, Sep 20, 2023)
4c5c6cd  Merge branch 'petew/layer-norm' into petew/triton (epwalsh, Sep 20, 2023)
0a06206  add TritonLayerNorm class (epwalsh, Sep 20, 2023)
84df0f2  fix merge conflicts (epwalsh, Sep 20, 2023)
6ce3ed3  clean up (epwalsh, Sep 20, 2023)
dd5628d  update build script (epwalsh, Sep 20, 2023)
7415a72  Merge branch 'main' into petew/triton (epwalsh, Sep 21, 2023)
95547f1  add note about clearing out the cache (epwalsh, Sep 21, 2023)
570dd77  try no affines anywhere (epwalsh, Sep 22, 2023)
ffb697a  add warning (epwalsh, Sep 22, 2023)
17d27db  auto remove triton cache (epwalsh, Sep 22, 2023)
9b7519c  fix cleaning triton cache (epwalsh, Sep 22, 2023)
63bb3e5  add triton LN with linear (epwalsh, Sep 22, 2023)
940ea7b  fix (epwalsh, Sep 22, 2023)
2ce967d  add back elementwise affine to QK norm (epwalsh, Sep 22, 2023)
.flake8 (2 additions, 0 deletions)
@@ -9,6 +9,8 @@ ignore =
W503
# line too long, who cares?
E501
# don't assign a lambda expression
E731

exclude =
.venv
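For context, E731 is the flake8 rule against assigning a lambda expression to a name. A minimal illustration of the pattern the new ignore permits (not code from this PR):

    # Without the E731 ignore, flake8 would flag this assignment
    # and suggest a def instead.
    double = lambda x: 2 * x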
olmo/config.py (5 additions, 0 deletions)
@@ -174,6 +174,11 @@ class LayerNormType(StrEnum):
LayerNorm implemented manually to work around an issue with ROCm.
"""

triton = "triton"
"""
A triton implementation of layer norm.
"""


class ActivationType(StrEnum):
gelu = "gelu"
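For illustration, a hypothetical way to select the new layer norm type through the model config (this assumes the remaining ModelConfig fields have usable defaults; it is not code from this PR):

    from olmo.config import LayerNormType, ModelConfig

    # Hypothetical: pick the Triton layer norm via the enum member added above.
    config = ModelConfig(layer_norm_type=LayerNormType.triton)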
olmo/model.py (29 additions, 0 deletions)
@@ -98,6 +98,8 @@ def build(cls, config: ModelConfig, size: Optional[int] = None, **kwargs) -> LayerNormBase:
return RMSLayerNorm(config, size=size, low_precision=True, **kwargs)
elif config.layer_norm_type == LayerNormType.amd_compatible:
return AMDLayerNorm(config, size=size, **kwargs)
elif config.layer_norm_type == LayerNormType.triton:
return TritonLayerNorm(config, size=size, **kwargs)
else:
raise NotImplementedError(f"Not sure how to handle '{config.layer_norm_type}' LayerNorm type")

@@ -184,6 +186,33 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return x.to(og_dtype)


class TritonLayerNorm(LayerNormBase):
def __init__(
self,
config: ModelConfig,
size: Optional[int] = None,
elementwise_affine: Optional[bool] = None,
eps: float = 1e-05,
):
super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
try:
from .triton import layer_norm as triton_layer_norm # type: ignore

self._layer_norm = triton_layer_norm
except ModuleNotFoundError:
raise OlmoConfigurationError(
f"{self.__class__.__name__} is not available. Please check if you have triton installed"
)

def forward(self, x: torch.Tensor) -> torch.Tensor:
og_dtype = x.dtype
x = self._cast_if_autocast_enabled(x, dtype=torch.float32)
with torch.autocast(enabled=False, device_type=x.device.type):
return self._layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps).to(
og_dtype
)


class RMSLayerNorm(LayerNormBase):
"""
RMS layer norm, a simplified :class:`LayerNorm` implementation that can optionally run
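The kernel itself lives in olmo/triton/layer_norm.py, which this view does not show. As a rough reference, here is a minimal forward-only sketch in the style of the standard Triton layer-norm tutorial; the kernel name, the driver signature, and the assumption that weight and bias are always provided are mine, not necessarily what the PR implements, and a real training implementation would also need a backward pass:

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def _layer_norm_fwd(
        X, Y, W, B,        # pointers: input, output, weight, bias
        stride,            # elements between consecutive rows of X and Y
        N,                 # row length (the normalized dimension)
        eps,
        BLOCK_SIZE: tl.constexpr,
    ):
        # Each program instance normalizes one row of the flattened input.
        row = tl.program_id(0)
        X += row * stride
        Y += row * stride
        cols = tl.arange(0, BLOCK_SIZE)
        mask = cols < N
        x = tl.load(X + cols, mask=mask, other=0.0).to(tl.float32)
        # Mean and variance over the row, computed in fp32.
        mean = tl.sum(x, axis=0) / N
        diff = tl.where(mask, x - mean, 0.0)
        var = tl.sum(diff * diff, axis=0) / N
        rstd = 1.0 / tl.sqrt(var + eps)
        x_hat = (x - mean) * rstd
        # Affine transform; this sketch assumes weight and bias are given.
        w = tl.load(W + cols, mask=mask, other=1.0)
        b = tl.load(B + cols, mask=mask, other=0.0)
        tl.store(Y + cols, x_hat * w + b, mask=mask)


    def layer_norm(x, normalized_shape, weight, bias, eps=1e-5):
        # normalized_shape is accepted for API parity; this sketch only
        # normalizes over the last dimension and assumes x is contiguous.
        x_arg = x.reshape(-1, x.shape[-1])
        M, N = x_arg.shape
        y = torch.empty_like(x_arg)
        # BLOCK_SIZE must be a power of two that covers a full row.
        BLOCK_SIZE = triton.next_power_of_2(N)
        _layer_norm_fwd[(M,)](
            x_arg, y, weight, bias, x_arg.stride(0), N, eps, BLOCK_SIZE=BLOCK_SIZE
        )
        return y.view_as(x)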
olmo/triton/__init__.py (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
from .layer_norm import layer_norm

__all__ = ["layer_norm"]