Added cumulative hazard (#7)

JuliaStats · Dec 18, 2017 · efecd69 · efecd69
1 parent 3cbcaf0
commit efecd69
Show file tree

Hide file tree

Showing 9 changed files with 221 additions and 94 deletions.
diff --git a/docs/make.jl b/docs/make.jl
@@ -10,6 +10,7 @@ makedocs(
         "Home" => "index.md",
         "Event Times" => "events.md",
         "Kaplan-Meier" => "km.md",
+        "Nelson-Aalen" => "na.md",
         "Cox" => "cox.md",
     ],
 )

diff --git a/docs/src/km.md b/docs/src/km.md
@@ -24,10 +24,10 @@ using Greenwood's formula:
 
 ```@docs
 Survival.KaplanMeier
-StatsBase.fit(::Type{KaplanMeier},
+StatsBase.fit(::Type{S},
               times::AbstractVector{T},
-              status::AbstractVector{<:Integer}) where {T}
-StatsBase.confint
+              status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
+StatsBase.confint(km::KaplanMeier, α::Float64=0.05)
 ```
 
 ## References

diff --git a/docs/src/na.md b/docs/src/na.md
@@ -0,0 +1,36 @@
+# Nelson-Aalen Estimator
+
+The [Nelson-Aalen estimator](https://en.wikipedia.org/wiki/Nelson%E2%80%93Aalen_estimator)
+is a nonparametric estimator of the cumulative hazard function.
+
+The estimate is given by
+
+```math
+\hat{H}(t) = \sum_{i: t_i < t} \frac{d_i}{n_i}
+```
+
+where ``d_i`` is the number of observed events at time ``t_i`` and ``n_i`` is the
+number of subjects at risk for the event just before time ``t_i``.
+
+The pointwise standard error of the log of the survivor function can be computed
+directly as the standard error or a Bernoulli random variable with `d_i` successes
+from `n_i` samples:
+
+```math
+\text{SE}(\hat{H}(t)) = \sqrt{\sum_{i: t_i < t} \frac{d_i(n_i-d_i)}{n_i^3}}
+```
+
+## API
+
+```@docs
+Survival.NelsonAalen
+StatsBase.fit(::Type{S},
+              times::AbstractVector{T},
+              status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
+StatsBase.confint(na::NelsonAalen, α::Float64=0.05)
+```
+
+## References
+
+* Nelson, W. (1969). *Hazard plotting for incomplete failure data*.
+  Journal of Quality Technology 1, 27–52.
diff --git a/src/Survival.jl b/src/Survival.jl
@@ -11,7 +11,9 @@ export
     isevent,
     iscensored,
 
+    NonparametricEstimator,
     KaplanMeier,
+    NelsonAalen,
     fit,
     confint,
 
@@ -30,7 +32,9 @@ abstract type AbstractEstimator end
 abstract type NonparametricEstimator <: AbstractEstimator end
 
 include("eventtimes.jl")
+include("estimator.jl")
 include("kaplanmeier.jl")
+include("nelsonaalen.jl")
 include("cox.jl")
 include("optimization.jl")
 

diff --git a/src/cox.jl b/src/cox.jl
@@ -196,7 +196,7 @@ StatsModels.drop_intercept(::Type{CoxModel}) = true
     fit(::Type{CoxModel}, M::AbstractMatrix, y::AbstractVector; kwargs...)
 
 Given a matrix `M` of predictors and a corresponding vector of events, compute the
-Cox proportional hazard model estimate of coefficients. Returns a [`CoxModel`](@ref)
+Cox proportional hazard model estimate of coefficients. Returns a `CoxModel`
 object.
 """
 function StatsBase.fit(::Type{CoxModel}, M::AbstractMatrix, y::AbstractVector; kwargs...)

diff --git a/src/estimator.jl b/src/estimator.jl
@@ -0,0 +1,91 @@
+# Estimating functions with the following assumptions:
+#  * The input is nonempty
+#  * Time 0 is not included
+
+function _estimator(::Type{S}, tte::AbstractVector{T}, status::BitVector) where {S, T}
+    nobs = length(tte)
+    dᵢ = 0                   # Number of observed events at time t
+    cᵢ = 0                   # Number of censored events at time t
+    nᵢ = nobs                # Number remaining at risk at time t
+    es = estimator_start(S)  # Estimator starting point
+    gw = stderr_start(S)     # Standard Error starting point
+
+    times = T[]              # The set of unique event times
+    nevents = Int[]          # Total observed events at each time
+    ncensor = Int[]          # Total censored events at each time
+    natrisk = Int[]          # Number at risk at each time
+    estimator = Float64[]    # Estimates
+    stderr = Float64[]       # Pointwise standard errors
+
+    t_prev = zero(T)
+
+    @inbounds for i = 1:nobs
+        t = tte[i]
+        s = status[i]
+        # Aggregate over tied times
+        if t == t_prev
+            dᵢ += s
+            cᵢ += !s
+            continue
+        elseif !iszero(t_prev)
+            es = estimator_update(S, es, dᵢ, nᵢ)
+            gw = stderr_update(S, gw, dᵢ, nᵢ)
+            push!(times, t_prev)
+            push!(nevents, dᵢ)
+            push!(ncensor, cᵢ)
+            push!(natrisk, nᵢ)
+            push!(estimator, es)
+            push!(stderr, sqrt(gw))
+        end
+        nᵢ -= dᵢ + cᵢ
+        dᵢ = s
+        cᵢ = !s
+        t_prev = t
+    end
+
+    # We need to do this one more time to capture the last time
+    # since everything in the loop is lagged
+    push!(times, t_prev)
+    push!(nevents, dᵢ)
+    push!(ncensor, cᵢ)
+    push!(natrisk, nᵢ)
+    push!(estimator, es)
+    push!(stderr, sqrt(gw))
+
+    return S{T}(times, nevents, ncensor, natrisk, estimator, stderr)
+end
+
+"""
+    fit(::Type{S}, times, status) where S<:NonparametricEstimator
+
+Given a vector of times to events and a corresponding vector of indicators that
+dictate whether each time is an observed event or is right censored, compute the
+model of type `S`. Return an object of type `S`: [`KaplanMeier`](@ref) and 
+[`NelsonAalen`](@ref) are supported so far.
+"""
+function StatsBase.fit(::Type{S},
+                       times::AbstractVector{T},
+                       status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
+    nobs = length(times)
+    if length(status) != nobs
+        throw(DimensionMismatch("there must be as many event statuses as times"))
+    end
+    if nobs == 0
+        throw(ArgumentError("the sample must be nonempty"))
+    end
+    p = sortperm(times)
+    t = times[p]
+    s = BitVector(status[p])
+    return _estimator(S, t, s)
+end
+
+function StatsBase.fit(::Type{S}, ets::AbstractVector{<:EventTime}) where S<:NonparametricEstimator
+    length(ets) > 0 || throw(ArgumentError("the sample must be nonempty"))
+    x = sort(ets)
+    # TODO: Refactor, since iterating over the EventTime objects directly in
+    # the _km loop may actually be easier/more efficient than working with
+    # the times and statuses as separate vectors. Plus it might be nice to
+    # make this method the One True Method™ so that folks are encouraged to
+    # use EventTimes instead of raw values.
+    return fit(S, map(t->t.time, x), BitVector(map(t->t.status, x)))
+end
diff --git a/src/kaplanmeier.jl b/src/kaplanmeier.jl
@@ -24,96 +24,11 @@ struct KaplanMeier{T<:Real} <: NonparametricEstimator
     stderr::Vector{Float64}
 end
 
-# Internal Kaplan-Meier function with the following assumptions:
-#  * The input is nonempty
-#  * Time 0 is not included
-function _km(tte::AbstractVector{T}, status::BitVector) where {T}
-    nobs = length(tte)
-    dᵢ = 0                # Number of observed events at time t
-    cᵢ = 0                # Number of censored events at time t
-    nᵢ = nobs             # Number remaining at risk at time t
-    km = 1.0              # Ŝ(t)
-    gw = 0.0              # SE(log(Ŝ(t)))
+estimator_start(::Type{KaplanMeier}) = 1.0  # Estimator starting point
+stderr_start(::Type{KaplanMeier}) = 0.0 # StdErr starting point
 
-    times = T[]           # The set of unique event times
-    nevents = Int[]       # Total observed events at each time
-    ncensor = Int[]       # Total censored events at each time
-    natrisk = Int[]       # Number at risk at each time
-    survival = Float64[]  # Survival estimates
-    stderr = Float64[]    # Pointwise standard errors for log(Ŝ(t))
-
-    t_prev = zero(T)
-
-    @inbounds for i = 1:nobs
-        t = tte[i]
-        s = status[i]
-        # Aggregate over tied times
-        if t == t_prev
-            dᵢ += s
-            cᵢ += !s
-            continue
-        elseif !iszero(t_prev)
-            km *= 1 - dᵢ / nᵢ
-            gw += dᵢ / (nᵢ * (nᵢ - dᵢ))
-            push!(times, t_prev)
-            push!(nevents, dᵢ)
-            push!(ncensor, cᵢ)
-            push!(natrisk, nᵢ)
-            push!(survival, km)
-            push!(stderr, sqrt(gw))
-        end
-        nᵢ -= dᵢ + cᵢ
-        dᵢ = s
-        cᵢ = !s
-        t_prev = t
-    end
-
-    # We need to do this one more time to capture the last time
-    # since everything in the loop is lagged
-    push!(times, t_prev)
-    push!(nevents, dᵢ)
-    push!(ncensor, cᵢ)
-    push!(natrisk, nᵢ)
-    push!(survival, km)
-    push!(stderr, sqrt(gw))
-
-    return KaplanMeier{T}(times, nevents, ncensor, natrisk, survival, stderr)
-end
-
-"""
-    fit(::Type{KaplanMeier}, times, status)
-
-Given a vector of times to events and a corresponding vector of indicators that
-dictate whether each time is an observed event or is right censored, compute the
-Kaplan-Meier estimate of the survivor function. Returns a [`KaplanMeier`](@ref)
-object.
-"""
-function StatsBase.fit(::Type{KaplanMeier},
-                       times::AbstractVector{T},
-                       status::AbstractVector{<:Integer}) where {T}
-    nobs = length(times)
-    if length(status) != nobs
-        throw(DimensionMismatch("there must be as many event statuses as times"))
-    end
-    if nobs == 0
-        throw(ArgumentError("the sample must be nonempty"))
-    end
-    p = sortperm(times)
-    t = times[p]
-    s = BitVector(status[p])
-    return _km(t, s)
-end
-
-function StatsBase.fit(::Type{KaplanMeier}, ets::AbstractVector{<:EventTime})
-    length(ets) > 0 || throw(ArgumentError("the sample must be nonempty"))
-    x = sort(ets)
-    # TODO: Refactor, since iterating over the EventTime objects directly in
-    # the _km loop may actually be easier/more efficient than working with
-    # the times and statuses as separate vectors. Plus it might be nice to
-    # make this method the One True Method™ so that folks are encouraged to
-    # use EventTimes instead of raw values.
-    return _km(map(t->t.time, x), BitVector(map(t->t.status, x)))
-end
+estimator_update(::Type{KaplanMeier}, es, dᵢ, nᵢ) = es * (1 - dᵢ / nᵢ) # Estimator update rule
+stderr_update(::Type{KaplanMeier}, gw, dᵢ, nᵢ) = gw + dᵢ / (nᵢ * (nᵢ - dᵢ)) # StdErr update rule
 
 """
     confint(km::KaplanMeier, α=0.05)

diff --git a/src/nelsonaalen.jl b/src/nelsonaalen.jl
@@ -0,0 +1,43 @@
+"""
+    NelsonAalen
+
+An immutable type containing cumulative hazard function estimates computed
+using the Nelson-Aalen method.
+The type has the following fields:
+
+* `times`: Distinct event times
+* `nevents`: Number of observed events at each time
+* `ncensor`: Number of right censored events at each time
+* `natrisk`: Size of the risk set at each time
+* `chaz`: Estimate of the cumulative hazard at each time
+* `stderr`: Standard error of the cumulative hazard
+Use `fit(NelsonAalen, ...)` to compute the estimates and construct
+this type.
+"""
+struct NelsonAalen{T<:Real} <: NonparametricEstimator
+    times::Vector{T}
+    nevents::Vector{Int}
+    ncensor::Vector{Int}
+    natrisk::Vector{Int}
+    chaz::Vector{Float64}
+    stderr::Vector{Float64}
+end
+
+estimator_start(::Type{NelsonAalen}) = 0.0  # Estimator starting point
+stderr_start(::Type{NelsonAalen}) = 0.0 # StdErr starting point
+
+estimator_update(::Type{NelsonAalen}, es, dᵢ, nᵢ) = es + dᵢ / nᵢ # Estimator update rule
+stderr_update(::Type{NelsonAalen}, gw, dᵢ, nᵢ) = gw + dᵢ * (nᵢ - dᵢ) / (nᵢ^3) # StdErr update rule
+
+"""
+    confint(na::NelsonAalen, α=0.05)
+
+Compute the pointwise confidence intervals for the cumulative hazard
+function as a vector of tuples.
+"""
+function StatsBase.confint(na::NelsonAalen, α::Float64=0.05)
+    q = quantile(Normal(), 1 - α/2)
+    return map(na.chaz, na.stderr) do srv, se
+        srv - q * se, srv + q * se
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,7 @@
 using Survival
 using Compat
 using Compat.Test
+using Distributions
 using DataFrames, StatsModels, StatsBase, CSV
 
 @testset "Event times" begin
@@ -145,6 +146,42 @@ end
     @test_throws ArgumentError fit(KaplanMeier, EventTime{Int}[])
 end
 
+@testset "Nelson-Aalen" begin
+    t = [
+        310, 361, 654, 728,  61,  81, 520, 473, 107, 122, 965, 731, 153, 433, 145,  95, 765,
+        735,   5, 687, 345, 444,  60, 208, 821, 305, 226, 426, 705, 363, 167, 641, 740, 245,
+        588, 166, 559, 450, 529, 351, 201, 524, 199, 550, 551, 543, 293, 511, 511, 371, 201,
+         62, 356, 340, 315, 182, 364, 376, 384, 268, 266, 194, 348, 382, 296, 186, 145, 269,
+        350, 272, 292, 332, 285, 243, 276,  79, 240, 202, 235, 224, 239, 173, 252,  92, 192,
+        211, 175, 203, 105, 177,
+    ]
+    s = [
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
+        1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+        0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0,
+    ]
+
+    na = fit(NelsonAalen, t, s)
+    km = fit(KaplanMeier, t, s)
+
+    @test na.times == km.times
+    @test na.nevents == km.nevents
+    @test na.ncensor == km.ncensor
+    @test na.natrisk == km.natrisk
+    @test exp.(-na.chaz[1:50]) ≈ km.survival[1:50] rtol = 1e-2
+    @test na.stderr[1:50] ≈ km.stderr[1:50] rtol = 2e-2
+    na_conf = confint(na)
+    na_lower, na_upper = getindex.(na_conf, 1), getindex.(na_conf, 2)
+    @test cdf.(Normal.(na.chaz, na.stderr), na_lower) ≈ fill(0.025, length(na.chaz)) rtol = 1e-8
+    @test cdf.(Normal.(na.chaz, na.stderr), na_upper) ≈ fill(0.975, length(na.chaz)) rtol = 1e-8
+    na_conf = confint(na, 0.01)
+    na_lower, na_upper = getindex.(na_conf, 1), getindex.(na_conf, 2)
+    @test cdf.(Normal.(na.chaz, na.stderr), na_lower) ≈ fill(0.005, length(na.chaz)) rtol = 1e-8
+    @test cdf.(Normal.(na.chaz, na.stderr), na_upper) ≈ fill(0.995, length(na.chaz)) rtol = 1e-8
+end
+
+
 @testset "Cox" begin
     filepath = joinpath(@__DIR__, "rossi.csv")
     rossi = CSV.read(filepath)
@@ -220,4 +257,4 @@ end
     end
     @test_throws ErrorException Survival.newton_raphson(wrong_fgh!, [2.2])
 
-end
+end