Skip to content

Commit

Permalink
Added cumulative hazard (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
Pietro Vertechi authored and ararslan committed Dec 18, 2017
1 parent 3cbcaf0 commit efecd69
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 94 deletions.
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ makedocs(
"Home" => "index.md",
"Event Times" => "events.md",
"Kaplan-Meier" => "km.md",
"Nelson-Aalen" => "na.md",
"Cox" => "cox.md",
],
)
Expand Down
6 changes: 3 additions & 3 deletions docs/src/km.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ using Greenwood's formula:

```@docs
Survival.KaplanMeier
StatsBase.fit(::Type{KaplanMeier},
StatsBase.fit(::Type{S},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {T}
StatsBase.confint
status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
StatsBase.confint(km::KaplanMeier, α::Float64=0.05)
```

## References
Expand Down
36 changes: 36 additions & 0 deletions docs/src/na.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Nelson-Aalen Estimator

The [Nelson-Aalen estimator](https://en.wikipedia.org/wiki/Nelson%E2%80%93Aalen_estimator)
is a nonparametric estimator of the cumulative hazard function.

The estimate is given by

```math
\hat{H}(t) = \sum_{i: t_i < t} \frac{d_i}{n_i}
```

where ``d_i`` is the number of observed events at time ``t_i`` and ``n_i`` is the
number of subjects at risk for the event just before time ``t_i``.

The pointwise standard error of the log of the survivor function can be computed
directly as the standard error or a Bernoulli random variable with `d_i` successes
from `n_i` samples:

```math
\text{SE}(\hat{H}(t)) = \sqrt{\sum_{i: t_i < t} \frac{d_i(n_i-d_i)}{n_i^3}}
```

## API

```@docs
Survival.NelsonAalen
StatsBase.fit(::Type{S},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
StatsBase.confint(na::NelsonAalen, α::Float64=0.05)
```

## References

* Nelson, W. (1969). *Hazard plotting for incomplete failure data*.
Journal of Quality Technology 1, 27–52.
4 changes: 4 additions & 0 deletions src/Survival.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ export
isevent,
iscensored,

NonparametricEstimator,
KaplanMeier,
NelsonAalen,
fit,
confint,

Expand All @@ -30,7 +32,9 @@ abstract type AbstractEstimator end
abstract type NonparametricEstimator <: AbstractEstimator end

include("eventtimes.jl")
include("estimator.jl")
include("kaplanmeier.jl")
include("nelsonaalen.jl")
include("cox.jl")
include("optimization.jl")

Expand Down
2 changes: 1 addition & 1 deletion src/cox.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ StatsModels.drop_intercept(::Type{CoxModel}) = true
fit(::Type{CoxModel}, M::AbstractMatrix, y::AbstractVector; kwargs...)
Given a matrix `M` of predictors and a corresponding vector of events, compute the
Cox proportional hazard model estimate of coefficients. Returns a [`CoxModel`](@ref)
Cox proportional hazard model estimate of coefficients. Returns a `CoxModel`
object.
"""
function StatsBase.fit(::Type{CoxModel}, M::AbstractMatrix, y::AbstractVector; kwargs...)
Expand Down
91 changes: 91 additions & 0 deletions src/estimator.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Estimating functions with the following assumptions:
# * The input is nonempty
# * Time 0 is not included

function _estimator(::Type{S}, tte::AbstractVector{T}, status::BitVector) where {S, T}
nobs = length(tte)
dᵢ = 0 # Number of observed events at time t
cᵢ = 0 # Number of censored events at time t
nᵢ = nobs # Number remaining at risk at time t
es = estimator_start(S) # Estimator starting point
gw = stderr_start(S) # Standard Error starting point

times = T[] # The set of unique event times
nevents = Int[] # Total observed events at each time
ncensor = Int[] # Total censored events at each time
natrisk = Int[] # Number at risk at each time
estimator = Float64[] # Estimates
stderr = Float64[] # Pointwise standard errors

t_prev = zero(T)

@inbounds for i = 1:nobs
t = tte[i]
s = status[i]
# Aggregate over tied times
if t == t_prev
dᵢ += s
cᵢ += !s
continue
elseif !iszero(t_prev)
es = estimator_update(S, es, dᵢ, nᵢ)
gw = stderr_update(S, gw, dᵢ, nᵢ)
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(estimator, es)
push!(stderr, sqrt(gw))
end
nᵢ -= dᵢ + cᵢ
dᵢ = s
cᵢ = !s
t_prev = t
end

# We need to do this one more time to capture the last time
# since everything in the loop is lagged
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(estimator, es)
push!(stderr, sqrt(gw))

return S{T}(times, nevents, ncensor, natrisk, estimator, stderr)
end

"""
fit(::Type{S}, times, status) where S<:NonparametricEstimator
Given a vector of times to events and a corresponding vector of indicators that
dictate whether each time is an observed event or is right censored, compute the
model of type `S`. Return an object of type `S`: [`KaplanMeier`](@ref) and
[`NelsonAalen`](@ref) are supported so far.
"""
function StatsBase.fit(::Type{S},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
nobs = length(times)
if length(status) != nobs
throw(DimensionMismatch("there must be as many event statuses as times"))
end
if nobs == 0
throw(ArgumentError("the sample must be nonempty"))
end
p = sortperm(times)
t = times[p]
s = BitVector(status[p])
return _estimator(S, t, s)
end

function StatsBase.fit(::Type{S}, ets::AbstractVector{<:EventTime}) where S<:NonparametricEstimator
length(ets) > 0 || throw(ArgumentError("the sample must be nonempty"))
x = sort(ets)
# TODO: Refactor, since iterating over the EventTime objects directly in
# the _km loop may actually be easier/more efficient than working with
# the times and statuses as separate vectors. Plus it might be nice to
# make this method the One True Method™ so that folks are encouraged to
# use EventTimes instead of raw values.
return fit(S, map(t->t.time, x), BitVector(map(t->t.status, x)))
end
93 changes: 4 additions & 89 deletions src/kaplanmeier.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,96 +24,11 @@ struct KaplanMeier{T<:Real} <: NonparametricEstimator
stderr::Vector{Float64}
end

# Internal Kaplan-Meier function with the following assumptions:
# * The input is nonempty
# * Time 0 is not included
function _km(tte::AbstractVector{T}, status::BitVector) where {T}
nobs = length(tte)
dᵢ = 0 # Number of observed events at time t
cᵢ = 0 # Number of censored events at time t
nᵢ = nobs # Number remaining at risk at time t
km = 1.0 # Ŝ(t)
gw = 0.0 # SE(log(Ŝ(t)))
estimator_start(::Type{KaplanMeier}) = 1.0 # Estimator starting point
stderr_start(::Type{KaplanMeier}) = 0.0 # StdErr starting point

times = T[] # The set of unique event times
nevents = Int[] # Total observed events at each time
ncensor = Int[] # Total censored events at each time
natrisk = Int[] # Number at risk at each time
survival = Float64[] # Survival estimates
stderr = Float64[] # Pointwise standard errors for log(Ŝ(t))

t_prev = zero(T)

@inbounds for i = 1:nobs
t = tte[i]
s = status[i]
# Aggregate over tied times
if t == t_prev
dᵢ += s
cᵢ += !s
continue
elseif !iszero(t_prev)
km *= 1 - dᵢ / nᵢ
gw += dᵢ / (nᵢ * (nᵢ - dᵢ))
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(survival, km)
push!(stderr, sqrt(gw))
end
nᵢ -= dᵢ + cᵢ
dᵢ = s
cᵢ = !s
t_prev = t
end

# We need to do this one more time to capture the last time
# since everything in the loop is lagged
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(survival, km)
push!(stderr, sqrt(gw))

return KaplanMeier{T}(times, nevents, ncensor, natrisk, survival, stderr)
end

"""
fit(::Type{KaplanMeier}, times, status)
Given a vector of times to events and a corresponding vector of indicators that
dictate whether each time is an observed event or is right censored, compute the
Kaplan-Meier estimate of the survivor function. Returns a [`KaplanMeier`](@ref)
object.
"""
function StatsBase.fit(::Type{KaplanMeier},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {T}
nobs = length(times)
if length(status) != nobs
throw(DimensionMismatch("there must be as many event statuses as times"))
end
if nobs == 0
throw(ArgumentError("the sample must be nonempty"))
end
p = sortperm(times)
t = times[p]
s = BitVector(status[p])
return _km(t, s)
end

function StatsBase.fit(::Type{KaplanMeier}, ets::AbstractVector{<:EventTime})
length(ets) > 0 || throw(ArgumentError("the sample must be nonempty"))
x = sort(ets)
# TODO: Refactor, since iterating over the EventTime objects directly in
# the _km loop may actually be easier/more efficient than working with
# the times and statuses as separate vectors. Plus it might be nice to
# make this method the One True Method™ so that folks are encouraged to
# use EventTimes instead of raw values.
return _km(map(t->t.time, x), BitVector(map(t->t.status, x)))
end
estimator_update(::Type{KaplanMeier}, es, dᵢ, nᵢ) = es * (1 - dᵢ / nᵢ) # Estimator update rule
stderr_update(::Type{KaplanMeier}, gw, dᵢ, nᵢ) = gw + dᵢ / (nᵢ * (nᵢ - dᵢ)) # StdErr update rule

"""
confint(km::KaplanMeier, α=0.05)
Expand Down
43 changes: 43 additions & 0 deletions src/nelsonaalen.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
NelsonAalen
An immutable type containing cumulative hazard function estimates computed
using the Nelson-Aalen method.
The type has the following fields:
* `times`: Distinct event times
* `nevents`: Number of observed events at each time
* `ncensor`: Number of right censored events at each time
* `natrisk`: Size of the risk set at each time
* `chaz`: Estimate of the cumulative hazard at each time
* `stderr`: Standard error of the cumulative hazard
Use `fit(NelsonAalen, ...)` to compute the estimates and construct
this type.
"""
struct NelsonAalen{T<:Real} <: NonparametricEstimator
times::Vector{T}
nevents::Vector{Int}
ncensor::Vector{Int}
natrisk::Vector{Int}
chaz::Vector{Float64}
stderr::Vector{Float64}
end

estimator_start(::Type{NelsonAalen}) = 0.0 # Estimator starting point
stderr_start(::Type{NelsonAalen}) = 0.0 # StdErr starting point

estimator_update(::Type{NelsonAalen}, es, dᵢ, nᵢ) = es + dᵢ / nᵢ # Estimator update rule
stderr_update(::Type{NelsonAalen}, gw, dᵢ, nᵢ) = gw + dᵢ * (nᵢ - dᵢ) / (nᵢ^3) # StdErr update rule

"""
confint(na::NelsonAalen, α=0.05)
Compute the pointwise confidence intervals for the cumulative hazard
function as a vector of tuples.
"""
function StatsBase.confint(na::NelsonAalen, α::Float64=0.05)
q = quantile(Normal(), 1 - α/2)
return map(na.chaz, na.stderr) do srv, se
srv - q * se, srv + q * se
end
end
39 changes: 38 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Survival
using Compat
using Compat.Test
using Distributions
using DataFrames, StatsModels, StatsBase, CSV

@testset "Event times" begin
Expand Down Expand Up @@ -145,6 +146,42 @@ end
@test_throws ArgumentError fit(KaplanMeier, EventTime{Int}[])
end

@testset "Nelson-Aalen" begin
t = [
310, 361, 654, 728, 61, 81, 520, 473, 107, 122, 965, 731, 153, 433, 145, 95, 765,
735, 5, 687, 345, 444, 60, 208, 821, 305, 226, 426, 705, 363, 167, 641, 740, 245,
588, 166, 559, 450, 529, 351, 201, 524, 199, 550, 551, 543, 293, 511, 511, 371, 201,
62, 356, 340, 315, 182, 364, 376, 384, 268, 266, 194, 348, 382, 296, 186, 145, 269,
350, 272, 292, 332, 285, 243, 276, 79, 240, 202, 235, 224, 239, 173, 252, 92, 192,
211, 175, 203, 105, 177,
]
s = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0,
]

na = fit(NelsonAalen, t, s)
km = fit(KaplanMeier, t, s)

@test na.times == km.times
@test na.nevents == km.nevents
@test na.ncensor == km.ncensor
@test na.natrisk == km.natrisk
@test exp.(-na.chaz[1:50]) km.survival[1:50] rtol = 1e-2
@test na.stderr[1:50] km.stderr[1:50] rtol = 2e-2
na_conf = confint(na)
na_lower, na_upper = getindex.(na_conf, 1), getindex.(na_conf, 2)
@test cdf.(Normal.(na.chaz, na.stderr), na_lower) fill(0.025, length(na.chaz)) rtol = 1e-8
@test cdf.(Normal.(na.chaz, na.stderr), na_upper) fill(0.975, length(na.chaz)) rtol = 1e-8
na_conf = confint(na, 0.01)
na_lower, na_upper = getindex.(na_conf, 1), getindex.(na_conf, 2)
@test cdf.(Normal.(na.chaz, na.stderr), na_lower) fill(0.005, length(na.chaz)) rtol = 1e-8
@test cdf.(Normal.(na.chaz, na.stderr), na_upper) fill(0.995, length(na.chaz)) rtol = 1e-8
end


@testset "Cox" begin
filepath = joinpath(@__DIR__, "rossi.csv")
rossi = CSV.read(filepath)
Expand Down Expand Up @@ -220,4 +257,4 @@ end
end
@test_throws ErrorException Survival.newton_raphson(wrong_fgh!, [2.2])

end
end

0 comments on commit efecd69

Please sign in to comment.