In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import check_random_state
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")


In [41]:
# Fix NumPy's global seed so the same candidate seeds are produced on every run,
# then keep the first 1400 entries of a shuffled 0..4999 range; each entry is
# later used as the seed of one simulated dataset.
np.random.seed(0)
seeds = np.random.permutation(5000)[:1400]

# Functions

### Covariates

In [3]:
from scipy.stats import norm

def sample_norm_covariates(n_datapoints, random_seed):
    """Draw normally distributed covariate values, floored at zero.

    Values come from a normal distribution with mean 1 and standard
    deviation 0.3; any negative draw is replaced by 0.

    Parameters
    ----------
    n_datapoints : int
        Number of values to draw.
    random_seed : int, numpy.random.RandomState or None
        Passed unchanged to ``scipy.stats.norm.rvs`` as ``random_state``.

    Returns
    -------
    numpy.ndarray
        The drawn values, none of them negative.
    """
    draws = norm.rvs(loc=1, scale=0.3, size=n_datapoints, random_state=random_seed)
    # Only strictly negative draws are altered; everything else passes through.
    return np.where(draws < 0, 0, draws)
    
def sample_uniform_covariates(n_datapoints,rng):
    """Draw covariate values uniformly from [0, 1), rounded to 6 decimals.

    Parameters
    ----------
    n_datapoints : int
        Number of values to draw.
    rng : numpy.random.RandomState
        Generator whose ``uniform`` method produces the draws.

    Returns
    -------
    numpy.ndarray
        The rounded draws.
    """
    raw_draws = rng.uniform(low=0, high=1.0, size=n_datapoints)
    return np.round(raw_draws, decimals=6)

def sample_categorical_covariates(n_datapoints, rng, cat_number):
    """Draw categorical covariate values with ``rng.choice``.

    Parameters
    ----------
    n_datapoints : int
        Number of values to draw.
    rng : numpy.random.RandomState
        Generator whose ``choice`` method produces the draws.
    cat_number : int
        Forwarded as the first argument of ``rng.choice``; with an integer
        the draws are category codes ``0 .. cat_number - 1``.

    Returns
    -------
    numpy.ndarray
        The drawn category codes.
    """
    return rng.choice(cat_number, size=n_datapoints)

def sample_dataframe(n_datapoints, n_norm, n_unif, n_cat, random_seed):
    """Build a covariate table with shuffled, generically named columns.

    Draws ``n_norm`` normal, ``n_unif`` uniform and ``n_cat`` three-level
    categorical covariates (in that order, all from one ``RandomState``),
    shuffles the column order and names the columns ``X0 .. X{k-1}``.

    Any of the three counts may be zero; that group then contributes no
    columns. (Previously an empty group became a one-column, zero-row frame,
    which made the column renaming fail with a length mismatch.)

    Parameters
    ----------
    n_datapoints : int
        Number of rows.
    n_norm, n_unif, n_cat : int
        Number of normal, uniform and categorical columns.
    random_seed : int or None
        Seeds the ``RandomState`` used for the draws and is also passed as
        ``random_state`` to the column shuffle.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n_datapoints`` rows and ``n_norm + n_unif + n_cat``
        columns.
    """
    rng = np.random.RandomState(random_seed)

    def _draw_group(sampler, count, *extra_args):
        # One array per covariate, stacked so rows are datapoints. The explicit
        # reshape keeps an empty group as an (n_datapoints, 0) block instead of
        # a shapeless empty array.
        columns = [sampler(n_datapoints, rng, *extra_args) for _ in range(count)]
        block = np.asarray(columns).reshape(count, n_datapoints).T
        return pd.DataFrame(block)

    # The draw order (normal, uniform, categorical) determines the random
    # stream consumed by each group, so it must not be rearranged.
    norm_df = _draw_group(sample_norm_covariates, n_norm)
    unif_df = _draw_group(sample_uniform_covariates, n_unif)
    cat_df = _draw_group(sample_categorical_covariates, n_cat, 3)

    df = pd.concat([norm_df, unif_df, cat_df], axis=1)
    # Shuffle column positions so the covariate type cannot be read off the name.
    df = df.sample(frac=1, axis=1, random_state=random_seed)
    df.columns = [f'X{i}' for i in range(n_norm + n_unif + n_cat)]

    return df

### Hazards 

In [6]:
def weibull_hazard(t, k=1., s=1., t_shift=100, base_rate=1e2):
    """Scaled Weibull hazard evaluated at the shifted time ``t + t_shift``.

    Computes ``base_rate * (k / s) * ((t + t_shift) / s) ** (k - 1)``.

    Parameters
    ----------
    t : float or numpy.ndarray
        Time point(s).
    k : float
        Shape parameter.
    s : float
        Scale parameter.
    t_shift : float
        Offset added to ``t`` before evaluation.
    base_rate : float
        Multiplicative factor applied to the hazard.

    Returns
    -------
    float or numpy.ndarray
        Hazard value(s), same shape as ``t``.
    """
    # The shift keeps the base strictly positive at t = 0, where a shape
    # k < 1 would otherwise raise zero to a negative power.
    shifted_t = t + t_shift
    scale_factor = base_rate * (k / s)
    return scale_factor * (shifted_t / s) ** (k - 1.)

def type1_hazards(df, t):
    """Per-individual hazards of event type 1 over the time grid ``t``.

    The shared baseline is ``weibull_hazard(t, k=0.003)``. Each individual
    scales it by ``mean(X0, X1, X2) * (X3 + X4 + X5) / 3``.

    Parameters
    ----------
    df : pandas.DataFrame
        Covariates; columns ``X0`` .. ``X5`` are read.
    t : numpy.ndarray
        Time grid.

    Returns
    -------
    numpy.ndarray
        Array with one row per individual and one column per time point.
    """
    baseline = weibull_hazard(t, k=0.003)
    first_mean = (df["X0"] + df["X1"] + df["X2"]) / 3
    second_sum = df["X3"] + df["X4"] + df["X5"]
    # Evaluated left to right: (first_mean * second_sum) / 3.
    scale = (first_mean * second_sum / 3).to_numpy()
    # Outer product: every individual's factor times every baseline value.
    return np.outer(scale, baseline)

def type2_hazards(df, t):
    """Per-individual hazards of event type 2 over the time grid ``t``.

    The shared baseline is ``weibull_hazard(t, k=3, s=8e3)``; with shape 3
    it varies with time. Each individual scales it by
    ``(mean(X6, X7, X8) * (X9 + X10 + X11) / 3 + 0.001) * (X12 + X13) / 2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Covariates; columns ``X6`` .. ``X13`` are read.
    t : numpy.ndarray
        Time grid.

    Returns
    -------
    numpy.ndarray
        Array with one row per individual and one column per time point.
    """
    baseline = weibull_hazard(t, k=3, s=8e3)
    first_mean = (df["X6"] + df["X7"] + df["X8"]) / 3
    second_sum = df["X9"] + df["X10"] + df["X11"]
    third_sum = df["X12"] + df["X13"]
    # Evaluated left to right: ((first_mean * second_sum) / 3 + .001) * third_sum, then / 2.
    scale = ((first_mean * second_sum / 3 + .001) * third_sum / 2).to_numpy()
    # Outer product: every individual's factor times every baseline value.
    return np.outer(scale, baseline)

def type3_hazards(df, t):
    """Per-individual hazards of event type 3 over the time grid ``t``.

    Unlike the other event types, the Weibull shape differs per individual:
    it is ``6 * mean(X14, X15, X16)`` (scale ``4e3``). The resulting curve is
    multiplied by ``0.5`` and by ``mean(X17, X18, X19)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Covariates; columns ``X14`` .. ``X19`` are read.
    t : numpy.ndarray
        Time grid.

    Returns
    -------
    numpy.ndarray
        One stacked row of hazards per individual.
    """
    shape_driver = (df["X14"] + df["X15"] + df["X16"]) / 3
    multiplier = (df["X17"] + df["X18"] + df["X19"]) / 3

    hazard_rows = []
    for x, y in zip(shape_driver, multiplier):
        individual_curve = 0.5 * weibull_hazard(t, k=6 * x, s=4e3) * y
        hazard_rows.append(individual_curve)
    return np.vstack(hazard_rows)

### Event Times

In [7]:
from scipy.stats import bernoulli

def sample_events_by_type(hazards,total_days, random_state=None):
    """Simulate the first occurrence time of one event type per individual.

    Each entry of ``hazards`` is used as the success probability of an
    independent Bernoulli trial (rows are individuals, columns are time
    steps). The duration of an individual is the column index of their first
    success, or ``total_days`` when no trial succeeds, plus a uniform jitter
    in [0, 1).

    Parameters
    ----------
    hazards : numpy.ndarray
        2-D array of per-step event probabilities.
    total_days : int
        Duration assigned to individuals without any event.
    random_state : int, numpy.random.RandomState or None
        Resolved through ``check_random_state``.

    Returns
    -------
    pandas.DataFrame
        Columns ``event`` (bool) and ``duration`` (float).
    """
    rng = check_random_state(random_state)
    trials = bernoulli.rvs(hazards, random_state=rng)
    has_event = trials.any(axis=1)

    # Coordinates of every success, ordered by row and then by column, so the
    # first entry of each row group is that individual's earliest success.
    # ex: trials = [[0, 0, 1, 0, 1]] -> duration = 2
    hit_rows, hit_cols = np.nonzero(trials)
    _, first_hit_positions = np.unique(hit_rows, return_index=True)

    whole_days = np.full(trials.shape[0], fill_value=total_days)
    whole_days[has_event] = hit_cols[first_hit_positions]

    # Drawn after the Bernoulli trials, from the same generator.
    jitter = rng.rand(whole_days.shape[0])
    return pd.DataFrame(dict(event=has_event, duration=whole_days + jitter))


def first_event(event_frames, event_ids, random_seed=None):
    """Combine per-type occurrences into the earliest event per individual.

    For every individual the event type with the smallest duration among the
    frames where ``event`` is true wins; on exact ties the frame listed first
    is kept. Individuals without an event in any frame get event ``0`` and
    the largest duration found across all frames.

    An event whose duration equals that overall maximum is reported as an
    event. (Previously the output was initialised to the maximum and compared
    with a strict ``<``, so such an event was returned as event ``0``.)

    Parameters
    ----------
    event_frames : sequence of pandas.DataFrame
        One frame per event type, each with a boolean ``event`` column and a
        ``duration`` column, all sharing the same index.
    event_ids : sequence of int
        Identifier written to ``event`` for the matching frame.
    random_seed : object, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``event`` (integer id, ``0`` for no event) and ``duration``.
    """
    n_individuals = event_frames[0].shape[0]
    max_duration = np.max([ef["duration"].max() for ef in event_frames])

    out = pd.DataFrame(
        {
            "event": np.zeros(n_individuals, dtype=np.int32),
            # +inf guarantees that the first real event always wins the
            # comparison below, even when it is the latest time in the data.
            "duration": np.full(n_individuals, np.inf),
        }
    )
    has_event = np.zeros(n_individuals, dtype=bool)
    for event_id, ef in zip(event_ids, event_frames):
        mask = ef["event"] & (ef["duration"] < out["duration"])
        out.loc[mask, "event"] = event_id
        out.loc[mask, "duration"] = ef.loc[mask, "duration"]
        has_event |= mask.to_numpy(dtype=bool)

    # Individuals that never experienced an event keep the overall maximum.
    out.loc[~has_event, "duration"] = max_duration
    return out



### Censoring

In [8]:
def uniform_censoring(occurrences, censoring_weight=0.5, frac = 1, offset=0, random_state=None):
    """Censor occurrences at integer times drawn uniformly at random.

    A censoring time is drawn for every row from the integers in
    ``[offset, max_duration / frac)``. For a random subset of rows (each row
    is selected when a uniform draw exceeds ``censoring_weight``) the
    censoring time is reset to the maximum duration instead. Rows whose
    duration exceeds their censoring time get ``event = 0`` and the censoring
    time as duration.

    Parameters
    ----------
    occurrences : pandas.DataFrame
        Frame with ``event`` and ``duration`` columns; it is not modified.
    censoring_weight : float
        Threshold of the uniform draw; larger values censor more rows.
    frac : float
        Divides the maximum duration to set the upper bound of the draw.
    offset : int
        Lower bound of the censoring times.
    random_state : int, numpy.random.RandomState or None
        Resolved through ``check_random_state``.

    Returns
    -------
    pandas.DataFrame
        Censored copy of ``occurrences``.
    """
    rng = check_random_state(random_state)
    n_rows = occurrences.shape[0]
    longest = occurrences["duration"].max()

    censor_times = rng.randint(low=offset, high=longest / frac, size=n_rows)

    # Rows picked here are exempt from random censoring: their censoring time
    # is pushed to the maximum. The array holds integers, so the maximum is
    # stored without its fractional part.
    exempt = rng.rand(n_rows) > censoring_weight
    censor_times[exempt] = longest

    is_censored = occurrences["duration"] > censor_times
    censored = occurrences.copy()
    censored.loc[is_censored, "event"] = 0
    censored.loc[is_censored, "duration"] = censor_times[is_censored]
    return censored

### Generate Data

In [9]:
def sample_competing_events(data,total_days, uniform_censoring_weight=1.0, frac=1, max_observation_duration=2000, random_seed=None):
    """Simulate competing events of three types, then censor them.

    Steps: evaluate the three hazard functions on a time grid, sample the
    first occurrence of each event type, keep the earliest per individual,
    apply random uniform censoring and finally censor everything beyond
    ``max_observation_duration``.

    Parameters
    ----------
    data : pandas.DataFrame
        Covariates handed to the hazard functions.
    total_days : int
        Number of grid points; the grid is ``np.linspace(0, total_days, total_days)``.
    uniform_censoring_weight : float
        Passed to ``uniform_censoring`` as ``censoring_weight``.
    frac : float
        Passed to ``uniform_censoring``.
    max_observation_duration : float or None
        Durations above this value are clipped to it and marked as event 0;
        ``None`` disables this step.
    random_seed : int, numpy.random.RandomState or None
        Resolved through ``check_random_state``; the resulting generator is
        shared by the occurrence sampling and the censoring.

    Returns
    -------
    tuple
        ``(censored_occurrences, occurrences, all_hazards)`` where
        ``all_hazards`` has shape (n_event_types, n_observations, n_timesteps).
    """
    rng = check_random_state(random_seed)
    time_grid = np.linspace(0, total_days, total_days)

    hazard_models = (type1_hazards, type2_hazards, type3_hazards)
    all_hazards = np.asarray([model(data, time_grid) for model in hazard_models])
    # Event ids start at 1; 0 is reserved for "no event / censored".
    event_ids = np.arange(1, len(hazard_models) + 1)

    # Event types are sampled one after another from the same generator, so
    # the iteration order determines the random stream each type receives.
    occurrences_by_type = []
    for type_hazards in all_hazards:
        occurrences_by_type.append(
            sample_events_by_type(type_hazards, total_days, random_state=rng)
        )
    occurrences = first_event(occurrences_by_type, event_ids)

    censored_occurrences = uniform_censoring(
        occurrences,
        censoring_weight=uniform_censoring_weight,
        frac=frac,
        random_state=rng,
    )

    if max_observation_duration is not None:
        # Administrative censoring: nothing is observed past the cut-off.
        beyond_cutoff = censored_occurrences["duration"] > max_observation_duration
        censored_occurrences.loc[beyond_cutoff, "duration"] = max_observation_duration
        censored_occurrences.loc[beyond_cutoff, "event"] = 0

    return censored_occurrences, occurrences, all_hazards

In [14]:
def sample_data(n_datapoints,total_days,frac, seed):
    """Generate one simulated dataset with a binary event indicator.

    Draws 20 covariates (8 normal, 6 uniform, 6 categorical), simulates the
    censored competing events with the same seed and attaches the outcome as
    ``event`` and ``duration`` columns. Event types 2 and 3 are merged into
    type 1, so ``event`` is 1 for any event and 0 otherwise.

    Parameters
    ----------
    n_datapoints : int
        Number of rows.
    total_days : int
        Forwarded to ``sample_competing_events``.
    frac : float
        Forwarded to ``sample_competing_events``.
    seed : int
        Seed used for both the covariates and the events.

    Returns
    -------
    pandas.DataFrame
        Covariate columns followed by ``event`` and ``duration``.
    """
    data = sample_dataframe(n_datapoints, 8, 6, 6, seed)
    # Only the censored outcomes are used; the uncensored occurrences and the
    # hazards are discarded.
    censored_events, _uncensored, _hazards = sample_competing_events(
        data, total_days, frac=frac, random_seed=seed
    )
    # Collapse the three event types into a single "any event" code.
    data['event'] = censored_events['event'].replace(2, 1).replace(3, 1)
    data['duration'] = censored_events['duration']

    return data

# Number of Samples

In [15]:
# Number of rows per simulated dataset (passed to sample_data as n_datapoints).
N = 1000

In [39]:
# Sanity check on a single dataset: first seed, total_days=1200, frac=0.2.
data = sample_data(int(N),1200,0.2,seeds[0])
# Percentage of rows whose event code is 0 (no event recorded).
perc = np.sum(data['event']==0)*100/len(data['event'])
perc


50.1

In [42]:
# Simulate one dataset per candidate seed and keep only those in which
# 48%-52% of the rows have event code 0. Kept datasets are pickled to
# numfeatdata_20_<k>.pkl, where k counts the kept datasets starting at 1.
sd = 0 
for seed_ in seeds:
    data = sample_data(int(N),1200,0.2,seed_)
    perc = np.sum(data['event']==0)*100/len(data['event'])
    if (48. <= perc <= 52.):
        sd = sd+1
        data.to_pickle(f"numfeatdata_20_{sd}.pkl")


# sd now holds the number of datasets written to disk.
print(f'Number of simulation: {sd}')
print('')



Number of simulation: 101



In [43]:
import os

# The generation loop above wrote 101 files; delete the last one. The index is
# hard-coded, so this raises FileNotFoundError if that file does not exist
# (e.g. when this cell is run a second time).
os.remove(f'numfeatdata_20_101.pkl') 