In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm, jarque_bera, kstest, gamma, expon

# 1. Parametric: Bootstrapping

In [2]:
df = pd.read_csv("ScanRecords.csv")
df['Date'] = pd.to_datetime(df['Date'])
print("Shape of df: ", df.shape)
df.head()

Shape of df:  (618, 4)


Unnamed: 0,Date,Time,Duration,PatientType
0,2023-08-01,8.23,0.949176,Type 2
1,2023-08-01,8.49,0.479593,Type 1
2,2023-08-01,9.12,0.496112,Type 2
3,2023-08-01,10.26,0.691947,Type 2
4,2023-08-01,10.64,0.345412,Type 1


In [3]:
df1 = df[df["PatientType"] == "Type 1"]
df2 = df[df["PatientType"] == "Type 2"]

In [4]:
print(df.groupby('PatientType')['Duration'].describe())  

             count      mean       std       min       25%       50%  \
PatientType                                                            
Type 1       379.0  0.432661  0.097774  0.093731  0.368123  0.435888   
Type 2       239.0  0.669339  0.187286  0.220708  0.523962  0.646603   

                  75%       max  
PatientType                      
Type 1       0.496907  0.708922  
Type 2       0.796523  1.146789  


## Type 1

## 1.1. Duration (Normal)

In [5]:
df1.head()

Unnamed: 0,Date,Time,Duration,PatientType
1,2023-08-01,8.49,0.479593,Type 1
4,2023-08-01,10.64,0.345412,Type 1
5,2023-08-01,11.07,0.42227,Type 1
6,2023-08-01,11.13,0.356129,Type 1
8,2023-08-01,11.56,0.423303,Type 1


In [7]:
def bootstrap_Type1(df1, B1, alpha):
    
    #np.random.seed(515)

    n = len(df1)
    X_bar = np.mean(df1['Duration'])  # sample mean
    St_Dev = np.std(df1['Duration'], ddof=1)  # sample variance

    # Initialize empty arrays 
    X_star_bar = np.empty(B1) # bootstap sample mean
    X_star_sd = np.empty(B1) # bootstap sample variance
    Q_star = np.empty(B1)  # store the bootstrap quantities (bootstrap t-statistic)

    # Bootstrapping
    for b in range(B1):
        J = np.random.choice(np.arange(n), size=n, replace=True)  # Resampling with replacement
        X_star = df1['Duration'].iloc[J]  # Construct the bootstrap sample
        X_star_bar[b] = np.mean(X_star)  # Calculate the bootstrap sample mean
        X_star_sd[b] = np.std(X_star, ddof=1)  # Calculate the bootstrap sample std (unbiased)
        Q_star[b] = np.sqrt(n) * (X_star_bar[b] - X_bar) / X_star_sd[b]  # T-statistic


    # cv for mean
    cv_mean_LB = np.quantile(X_star_bar, alpha / 2)
    cv_mean_UB = np.quantile(X_star_bar, 1 - alpha / 2)

    # ci for the mean
    CI_mean_LB = X_bar - cv_mean_UB * St_Dev / np.sqrt(n)
    CI_mean_UP = X_bar - cv_mean_LB * St_Dev / np.sqrt(n)

    # ci for std
    ci_std_LB = np.quantile(X_star_sd, alpha / 2)
    ci_std_UB = np.quantile(X_star_sd, 1 - alpha / 2)


    results = {
        "Bootstrap mean of Duration": np.mean(X_star_bar),
        "95% CI for Mean Duration": (CI_mean_LB, CI_mean_UP),
        "Bootstrap std of Duration": np.mean(X_star_sd),
        "95% CI for Std Duration": (ci_std_LB, ci_std_UB)
    }
    
    return results

results = bootstrap_Type1(df1, B1=499, alpha=0.05)
for key, value in results.items():
    print(f"{key}: {value}")


Bootstrap mean of Duration: 0.43251061296850873
95% CI for Mean Duration: (np.float64(0.43043568196570087), np.float64(0.43053810075044235))
Bootstrap std of Duration: 0.09751270930223084
95% CI for Std Duration: (np.float64(0.08941876370936568), np.float64(0.10429982796802262))


In [8]:
print("Bootstrap mean of Duration: ", 0.4326680574092081*60)
print("95% CI for Mean Duration: (", 0.43043568196570087*60, ",", 0.43053810075044235*60,")")
print("Bootstrap std of Duration: ", 0.09758786441071532*60)
print("95% CI for Mean Duration: (", 0.09076566779046391*60, ",", 0.10474547648320777*60, ")")

Bootstrap mean of Duration:  25.960083444552488
95% CI for Mean Duration: ( 25.826140917942052 , 25.832286045026542 )
Bootstrap std of Duration:  5.855271864642919
95% CI for Mean Duration: ( 5.445940067427834 , 6.284728588992466 )


## 1.2. Mean Interarrival time (exponential)

In [24]:
numRows = len(df1)
interArrivals1 = []

for i in range(numRows - 1):
    if df1.iloc[i]["Date"] == df1.iloc[i + 1]["Date"]:
        # interaarival time of arrivals on the same day
        interval = df1.iloc[i + 1]["Time"] - df1.iloc[i]["Time"]
    else:
        interval = (17 - df1.iloc[i]["Time"]) + (df1.iloc[i + 1]["Time"] - 8)
    interArrivals1.append(interval)

interArrivals1 = np.array(interArrivals1)

print("Mean Inter-arrival time: ", np.mean(interArrivals1))

Mean Inter-arrival time:  0.5453439153439154


In [25]:
def bootstrap_Type1(interArrivals, B1, alpha):

    np.random.seed(515)

    n1 = len(interArrivals)
    X_bar = np.mean(interArrivals)
    St_Dev = np.std(interArrivals, ddof=1)  

    # Empty arrays for bootstrap quantities
    X_star_bar = np.empty(B1)
    X_star_sd = np.empty(B1)
    Q_star_mean = np.empty(B1)
    Q_star_sd = np.empty(B1)

    # Bootstrapping
    for b in range(B1):
        J1 = np.random.choice(np.arange(n1), size=n1, replace=True)  # Resampling with replacement
        X_star = interArrivals[J1]  # Construct the bootstrap sample
        X_star_bar[b] = np.mean(X_star)  # Calculate the bootstrap sample mean
        X_star_sd[b] = np.std(X_star, ddof=1)  # Calculate the bootstrap sample std (unbiased)
        Q_star_mean[b] = np.sqrt(n1) * (X_star_bar[b] - X_bar) / St_Dev  # Bootstrap quantity for the mean
        Q_star_sd[b] = (X_star_sd[b] - St_Dev) / (St_Dev / np.sqrt(n1))  # Bootstrap quantity for std

    # cv for mean
    cv_mean_LB = np.quantile(Q_star_mean, alpha / 2)
    cv_mean_UB = np.quantile(Q_star_mean, 1 - alpha / 2)

    # cv for std
    cv_sd_LB = np.quantile(Q_star_sd, alpha / 2)
    cv_sd_UB = np.quantile(Q_star_sd, 1 - alpha / 2)

    # ci for the mean
    CI_mean_LB = X_bar - cv_mean_UB * St_Dev / np.sqrt(n1)
    CI_mean_UP = X_bar - cv_mean_LB * St_Dev / np.sqrt(n1)

    # ci for the standard deviation
    CI_sd_LB = St_Dev + cv_sd_LB * (St_Dev / np.sqrt(n1))
    CI_sd_UP = St_Dev + cv_sd_UB * (St_Dev / np.sqrt(n1))

    results = {
        "Bootstrap mean of interArrivals": np.mean(X_star_bar),
        "95% CI for Mean interArrivals": (CI_mean_LB, CI_mean_UP),
        "Bootstrap std of interArrivals": np.mean(X_star_sd),
        "95% CI for std interArrivals": (CI_sd_LB, CI_sd_UP)
    }

    return results

# Example usage
results = bootstrap_Type1(interArrivals1, B1=499, alpha=0.05)
for key, value in results.items():
    print(f"{key}: {value}")


Bootstrap mean of interArrivals: 0.5455232687597417
95% CI for Mean interArrivals: (np.float64(0.481760582010582), np.float64(0.6028756613756614))
Bootstrap std of interArrivals: 0.5812950397873389
95% CI for std interArrivals: (np.float64(0.5094733560774076), np.float64(0.6537034883756188))


In [26]:
print("Bootstrap mean of interArrivals: ",0.5455232687597417*60)
print("95% CI for Mean interArrivals: (", 0.481760582010582*60, ",", 0.6028756613756614*60,")")
print("Bootstrap std of interArrivals: ", 0.5812950397873389*60)
print("95% CI for Mean interArrivals: (", 0.5094733560774076*60, ",", 0.6537034883756188*60, ")")

Bootstrap mean of interArrivals:  32.7313961255845
95% CI for Mean interArrivals: ( 28.905634920634917 , 36.172539682539686 )
Bootstrap std of interArrivals:  34.87770238724033
95% CI for Mean interArrivals: ( 30.56840136464446 , 39.222209302537124 )


In [None]:
def bootstrap_Type1(interArrivals, B1, alpha):

    n1 = len(interArrivals)
    X_bar = np.mean(interArrivals)
    St_Dev = np.std(interArrivals)

    # empty arrays for bootstrap quantities
    X_star_bar = np.empty(B1)
    X_star_sd = np.empty(B1)
    Q_star = np.empty(B1)

    # bootstrapping
    for b in range(B1):
        J1 = np.random.choice(np.arange(n1), size=n1, replace=True) # Resampling with replacement
        X_star = interArrivals[J1] # Construct the bootstrap sample
        X_star_bar[b] = np.mean(X_star) # Calculate the bootstrap sample mean
        X_star_sd[b] = np.std(X_star) # Calculate the bootstrap sample variance
        Q_star[b] = np.sqrt(n1) * (X_star_bar[b] - X_bar) / X_star_sd[b] # Calculate the bootstrap quantity

    # Critical values
    cv_LB = np.quantile(Q_star, alpha / 2)
    cv_UB = np.quantile(Q_star, 1 - alpha / 2)

    # Confidence interval
    CI_LB = X_bar - cv_LB * St_Dev / np.sqrt(n1)
    CI_UP = X_bar - cv_UB * St_Dev / np.sqrt(n1)

    results = {
        "Bootstrap mean of interArrivals": np.mean(X_star_bar),
        "95% CI for Mean interArrivals": (CI_LB, CI_UP),
        "Bootstrap std of interArrivals": np.mean(X_star_sd),
        "95% CI for std interArrivals": (cv_LB, cv_UB)
    }
    
    return results

results = bootstrap_Type1(interArrivals1, B1=499, alpha=0.05)
for key, value in results.items():
    print(f"{key}: {value}")

Bootstrap mean of interArrivals: 0.5446245400854618
95% CI for Mean interArrivals: (np.float64(0.6203758653854785), np.float64(0.4895444229586992))
Bootstrap std of interArrivals: 0.5795309007316473
95% CI for std interArrivals: (np.float64(-2.50422752248658), np.float64(1.8623349718944266))


# 2. Type 2

## 2.1. Duration (Gamma)

In [11]:
shape_b, loc_b, scale_b = gamma.fit(df2['Duration'], floc=0) 
print(shape_b)
print(scale_b)

12.584814582926688
0.05318623207452434


In [34]:
def mc_bootstrap_gamma(df, nr_sim=1000, B=499, alpha=0.05):

    np.random.seed(515)

    n2 = len(df)  # Sample size
    reject = np.zeros(nr_sim)  # Vector to store rejections

    # Initial statistics
    xbar = np.mean(df)
    std = np.std(df, ddof=1)

    # Pre-allocate arrays for bootstrap results
    gamma_shape = np.empty(B)
    gamma_scale = np.empty(B)

    # Monte Carlo simulation
    for i in range(nr_sim):
        X = np.random.choice(df, size=n2, replace=True)  # Bootstrap sample
        X_bar = np.mean(X)
        St_Dev = np.std(X, ddof=1)
        Q = np.sqrt(n2) * (X_bar - xbar) / St_Dev

        Q_star = np.empty(B)

        # Bootstrap procedure
        for b in range(B):
            J = np.random.choice(n2, size=n2, replace=True)
            X_star = X[J]
            X_bar_star = np.mean(X_star)
            St_Dev_star = np.std(X_star, ddof=1)
            Q_star[b] = np.sqrt(n2) * (X_bar_star - X_bar) / St_Dev_star

            # Fit gamma distribution to bootstrap sample
            shape_b, loc_b, scale_b = gamma.fit(X_star, floc=0)
            gamma_shape[b] = shape_b
            gamma_scale[b] = scale_b

        # Critical values
        cv_LB = np.quantile(Q_star, alpha / 2)
        cv_UB = np.quantile(Q_star, 1 - alpha / 2)

        # Confidence interval for the mean
        CI_LB = X_bar - cv_UB * std / np.sqrt(n2)
        CI_UP = X_bar - cv_LB * std / np.sqrt(n2)

        # Hypothesis test
        if Q < cv_LB or Q > cv_UB:
            reject[i] = 1

    # Calculate empirical rejection frequency
    ERF = np.mean(reject)

    # Bootstrap confidence intervals for gamma parameters
    shape_CI = np.percentile(gamma_shape, [alpha / 2 * 100, (1 - alpha / 2) * 100])
    scale_CI = np.percentile(gamma_scale, [alpha / 2 * 100, (1 - alpha / 2) * 100])

    # Output results
    results = {
        "Rejection Percentage": 100 * ERF,
        "Confidence Interval for the Mean": (CI_LB, CI_UP),
        "Bootstrap CI for the Shape": shape_CI,
        "Bootstrap Gamma Shape ": np.mean(gamma_shape),
        "Bootstrap CI for the Scale": scale_CI,
        "Bootstrap Gamma Scale ": np.mean(gamma_scale),
    }
    

    return results


results = mc_bootstrap(df2['Duration'].values)
for key, value in results.items():
    print(f"{key}: {value}")


Rejection Percentage: 3.5999999999999996
Confidence Interval for the Mean: (np.float64(0.6585128688697752), np.float64(0.7080340902257598))
Bootstrap CI for the Shape: [10.64875932 14.54114995]
Bootstrap Gamma Shape : 12.390540738953662
Bootstrap CI for the Scale: [0.0468788  0.06449381]
Bootstrap Gamma Scale : 0.05559457514412516


## 2.2. Interarrival (Normal)

In [37]:
numRows = len(df2)
interArrivals2 = []

for i in range(numRows - 1):
    if df2.iloc[i]["Date"] == df2.iloc[i + 1]["Date"]:
        # interaarival time of arrivals on the same day
        interval = df2.iloc[i + 1]["Time"] - df2.iloc[i]["Time"]
    else:
        interval = (17 - df2.iloc[i]["Time"]) + (df2.iloc[i + 1]["Time"] - 8)
    interArrivals2.append(interval)

interArrivals2 = np.array(interArrivals2)

print(np.mean(interArrivals2))
print(np.std(interArrivals2))

0.8666386554621849
0.3101335502810958


In [44]:
def mc_bootstrap_normal(data, nr_sim=1000, B=499, alpha=0.05):

    np.random.seed(515)

    n2 = len(data)  # Sample size
    reject = np.zeros(nr_sim)  # Vector to store rejections

    # Initial statistics
    xbar = np.mean(data)
    std = np.std(data, ddof=1)

    # to store bootstrap results
    means = np.empty(B)
    stds = np.empty(B)

    # mc simulation
    for i in range(nr_sim):
        X = np.random.choice(data, size=n2, replace=True)  # Bootstrap sample
        X_bar = np.mean(X)
        St_Dev = np.std(X, ddof=1)
        Q = np.sqrt(n2) * (X_bar - xbar) / St_Dev

        Q_star = np.empty(B)

        # Bootstrap procedure
        for b in range(B):
            J = np.random.choice(n2, size=n2, replace=True)
            X_star = X[J]
            X_bar_star = np.mean(X_star)
            St_Dev_star = np.std(X_star, ddof=1)
            Q_star[b] = np.sqrt(n2) * (X_bar_star - X_bar) / St_Dev_star

            # Fit normal distribution to bootstrap sample
            mean_b, std_b = norm.fit(X_star)
            means[b] = mean_b
            stds[b] = std_b

        # Critical values
        cv_LB = np.quantile(Q_star, alpha / 2)
        cv_UB = np.quantile(Q_star, 1 - alpha / 2)

        # Confidence interval for the mean
        CI_LB = X_bar - cv_UB * std / np.sqrt(n2)
        CI_UP = X_bar - cv_LB * std / np.sqrt(n2)

        # Hypothesis test
        if Q < cv_LB or Q > cv_UB:
            reject[i] = 1

    # Calculate empirical rejection frequency
    ERF = np.mean(reject)

    # Bootstrap confidence intervals for parameters
    mean_CI = np.percentile(means, [alpha / 2 * 100, (1 - alpha / 2) * 100])
    std_CI = np.percentile(stds, [alpha / 2 * 100, (1 - alpha / 2) * 100])

    # Output results
    results = {
        "Rejection Percentage": 100 * ERF,
        "Bootstrap CI for the Mean": mean_CI,
        "Bootstrap Mean": np.mean(means),
        "Bootstrap CI for the STD": std_CI,
        "Bootstrap STD": np.mean(stds),
    }
    return results


# Example Usage for Normally Distributed Data
normal_results = mc_bootstrap_normal(interArrivals2)
for key, value in normal_results.items():
    print(f"{key}: {value}")


Rejection Percentage: 4.3999999999999995
Bootstrap CI for the Mean: [0.8345042  0.91065336]
Bootstrap Mean: 0.8728765093211635
Bootstrap CI for the STD: [0.26310298 0.32246551]
Bootstrap STD: 0.2919576914357435
