# Bootstrap-t Method

In [36]:
# Reading the input file

observedSamples = []
with open('uniform_df_15.txt', 'r') as file:
    for line in file:
        num_strings = line.strip().split(", ")
        nums = [round(float(num), 5) for num in num_strings]
        observedSamples.extend(nums)
BootstrapedSamples, MeanOfBootstrapedSamples = bootstrap(observedSamples, 1000) #No. of bootsraped samples is 1000.

Standard Error Calculation.

In [37]:
valToBeSubtracted = sum(MeanOfBootstrapedSamples)/len(MeanOfBootstrapedSamples)
standardError = math.sqrt(sum((item - valToBeSubtracted) ** 2 for item in MeanOfBootstrapedSamples) / (len(MeanOfBootstrapedSamples) - 1))
print(standardError)

0.17297060288684662


The confidence interval can be written as:
$$
\hat{\theta} \pm \hat{t}^{(1-\alpha)} \cdot \hat{\text{se}}_B\\

\implies \theta \in \left[ \hat{\theta} - \hat{t}^{(1-\alpha)} \cdot \hat{\text{se}}_B, \; \hat{\theta} - \hat{t}^{\alpha} \cdot \hat{\text{se}}_B \right]
$$
where, $\newline$
$\hat{t}^{(1-\alpha)} = -\hat{t}^{\alpha}$ $\newline$
$\hat{t}^{\alpha}$ represents the 100.$\alpha^{th}$ percentile point of bootstrap-t distribution. $\newline$
$\hat{\theta}$ represents the plug-in estimate (mean of the given sample).$\newline$
$\theta$ represents the true mean of the underlying distribution.$\newline$
$\hat{\text{se}}_B$ is the standard error calculated above.

In [38]:
plugInEstimate = statistics.mean(observedSamples)

t_statistics = bootstrap_t_percentile(observedSamples)
bootstrap_t_5 = np.percentile(t_statistics, 5)
bootstrap_t_95 = np.percentile(t_statistics, 95)
bootstrap_t_values = [bootstrap_t_5, bootstrap_t_95]

print(f"5th percentile at {bootstrap_t_5:.2f} and 95th percentile at {bootstrap_t_95:.2f}.")

5th percentile at -1.80 and 95th percentile at 1.74.


Finding Confidence Intervals.

In [39]:
# Calculate confidence intervals
bootstrap_t_confidence_interval = [
    plugInEstimate - bootstrap_t_values[0] * standardError,
    plugInEstimate - bootstrap_t_values[1] * standardError
]
print("95% Confidence Interval (Standard Normal Distribution):", bootstrap_t_confidence_interval)

95% Confidence Interval (Standard Normal Distribution): [np.float64(0.3575694198028659), np.float64(-0.255137324924904)]


Plotting

In [40]:
# Plotting with Plotly
fig = go.Figure()

# Calculate KDE using scipy
kde = gaussian_kde(t_statistics, bw_method='scott')
x = np.linspace(min(t_statistics), max(t_statistics), 1000)
y = kde(x)

# Plot the KDE of the t-statistics
fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name='Bootstrap T-Statistic Distribution'))

# Mark the confidence interval
fig.add_trace(go.Scatter(x=[bootstrap_t_confidence_interval[0], bootstrap_t_confidence_interval[0]], y=[0, max(y)], mode='lines', name='95% CI Lower Bound', line=dict(color='red', dash='dash')))
fig.add_trace(go.Scatter(x=[bootstrap_t_confidence_interval[1], bootstrap_t_confidence_interval[1]], y=[0, max(y)], mode='lines', name='95% CI Upper Bound', line=dict(color='red', dash='dash')))
fig.add_trace(go.Scatter(x=[bootstrap_t_confidence_interval[0], bootstrap_t_confidence_interval[1]], y=[0, 0], mode='lines', fill='tonexty', fillcolor='rgba(255,0,0,0.1)', showlegend=False))

# Add permanent labels for the confidence interval points with values
fig.add_trace(go.Scatter(
    x=[bootstrap_t_confidence_interval[0]],
    y=[0],  # Position the label slightly below the top of the curve
    mode='markers+text',
    text=[f'[{bootstrap_t_confidence_interval[0]:.2f}]'],
    textposition='top left',
    marker=dict(color='red', size=3),
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=[bootstrap_t_confidence_interval[1]],
    y=[0.0001],  # Position the label slightly below the top of the curve
    mode='markers+text',
    text=[f'[{bootstrap_t_confidence_interval[1]:.2f}]'],
    textposition='top right',
    marker=dict(color='red', size=3),
    showlegend=False
))

# Update layout with labeled x-axis
fig.update_layout(
    title='Bootstrap T-Statistic Distribution with 95% Confidence Interval',
    xaxis_title='T-Statistic Value',
    yaxis_title='Density'
)

# Save and open the figure in a web browser
import plotly.io as pio
pio.write_html(fig, 'bootstrap_t_uniform.html')

Doing the same procedure, but for given data of type log normal; so as to observe how well bootstrap-t performs in case of skewed distributions.

# Defining a boot function  
This function will take input file, and output file as the parameters; and display the resultant pdf with the required confidence intervals in the output file.  
Function 'boot' is made to deploy code reusability, so that the same code can be executed for different input files.

In [41]:
def boot(inputf, outputf):
    import numpy as np
    import math
    import random
    import statistics
    import seaborn as sns
    import matplotlib.pyplot as plt
    import plotly.graph_objects as go
    from scipy.stats import gaussian_kde
    from scipy.stats import norm
    import plotly.io as pio

    def bootstrap(zdata,nreal):
        realisations = []
        zreal = []                            # declare an empty list to store the bootstrap repliations which contain mean of the 
                                            # bootstraped samples.
        for l in range(0,nreal):              # loop over the L bootstrap realizations
            samples = random.choices(zdata, k=len(zdata)) # n Monte Carlo simulations, sample with replacement
            realisations.append(samples)
            zreal.append(statistics.mean(samples))       # calculate the realization of the statistic and append to list
        return [samples, zreal]                          # return the list of realizations of the statistic

    def bootstrap_t_percentile(observed_samples, n_bootstrap=1000):
        """
        Perform bootstrap resampling on the observed_samples and return the ith percentile points of the t-statistic.
        
        Parameters:
        - i (int): The percentile to compute (e.g., 95 for the 95th percentile).
        - observed_samples (list or np.array): The list of observed samples.
        - n_bootstrap (int): The number of bootstrap resamples to perform.
        
        Returns:
        - float: The ith percentile point of the bootstrap t-statistics.
        """
        observed_samples = np.array(observed_samples)
        n = len(observed_samples)
        means = np.empty(n_bootstrap)
        stds = np.empty(n_bootstrap)
        
        for j in range(n_bootstrap):
            sample = np.random.choice(observed_samples, size=n, replace=True)
            means[j] = np.mean(sample)
            stds[j] = np.std(sample, ddof=1)
        
        # Calculate t-statistics
        t_statistics = (means - np.mean(observed_samples)) / (stds / np.sqrt(n))
        
        return t_statistics, statistics.mean(means)


    # Reading the input file

    observedSamples = []
    with open(inputf, 'r') as file:
        for line in file:
            num_strings = line.strip().split(", ")
            nums = [round(float(num), 5) for num in num_strings]
            observedSamples.extend(nums)
    BootstrapedSamples, MeanOfBootstrapedSamples = bootstrap(observedSamples, 1000) #No. of bootsraped samples is 1000.

    plugInEstimate = statistics.mean(observedSamples)

    t_statistics, sampledMean = bootstrap_t_percentile(observedSamples)
    bootstrap_t_5 = np.percentile(t_statistics, 5)
    bootstrap_t_95 = np.percentile(t_statistics, 95)
    bootstrap_t_values = [bootstrap_t_5, bootstrap_t_95]

    print(f"5th percentile at {bootstrap_t_5:.2f} and 95th percentile at {bootstrap_t_95:.2f}.")

    # Calculate confidence intervals
    bootstrap_t_confidence_interval = [
        sampledMean + bootstrap_t_values[0] * standardError,
        sampledMean + bootstrap_t_values[1] * standardError
    ]
    print("95% Confidence Interval (Standard Normal Distribution):", bootstrap_t_confidence_interval)

    # Plotting with Plotly
    fig = go.Figure()

    # Calculate KDE using scipy
    kde = gaussian_kde(t_statistics, bw_method='scott')
    x = np.linspace(min(t_statistics), max(t_statistics), 1000)
    y = kde(x)

    # Plot the KDE of the t-statistics
    fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name='Bootstrap T-Statistic Distribution'))

    # Mark the confidence interval
    fig.add_trace(go.Scatter(x=[bootstrap_t_confidence_interval[0], bootstrap_t_confidence_interval[0]], y=[0, max(y)], mode='lines', name='95% CI Lower Bound', line=dict(color='red', dash='dash')))
    fig.add_trace(go.Scatter(x=[bootstrap_t_confidence_interval[1], bootstrap_t_confidence_interval[1]], y=[0, max(y)], mode='lines', name='95% CI Upper Bound', line=dict(color='red', dash='dash')))
    fig.add_trace(go.Scatter(x=[bootstrap_t_confidence_interval[0], bootstrap_t_confidence_interval[1]], y=[0, 0], mode='lines', fill='tonexty', fillcolor='rgba(255,0,0,0.1)', showlegend=False))

    # Add permanent labels for the confidence interval points with values
    fig.add_trace(go.Scatter(
        x=[bootstrap_t_confidence_interval[0]],
        y=[0],  # Position the label slightly below the top of the curve
        mode='markers+text',
        text=[f'[{bootstrap_t_confidence_interval[0]:.2f}]'],
        textposition='top left',
        marker=dict(color='red', size=3),
        showlegend=False
    ))

    fig.add_trace(go.Scatter(
        x=[bootstrap_t_confidence_interval[1]],
        y=[0.0001],  # Position the label slightly below the top of the curve
        mode='markers+text',
        text=[f'[{bootstrap_t_confidence_interval[1]:.2f}]'],
        textposition='top right',
        marker=dict(color='red', size=3),
        showlegend=False
    ))

    # Update layout with labeled x-axis
    fig.update_layout(
        title='Bootstrap T-Statistic Distribution with 95% Confidence Interval',
        xaxis_title='T-Statistic Value',
        yaxis_title='Density'   
    )

    # Save and open the figure in a web browser
    import plotly.io as pio
    pio.write_html(fig, outputf)

In [49]:
# Values drawn from an gamma pdf.
# Distribution's Mean = 2; Sample's Mean = 2.16
boot('input_files/gamma_df_15.txt', 'outputs/bootstrap_t_gamma.html')

# Values drawn from an beta pdf.
# Distribution's Mean = 1; Sample's Mean = 0.89
boot('input_files/beta_df_15.txt', 'outputs/bootstrap_t_beta.html')

# Values drawn from an exponential pdf.
# with mean 1. 
# [Mean can't be -ve for exponential functions]
boot('input_files/exponential_df_15.txt', 'outputs/bootstrap_t_exponential.html')
# Values drawn from a log-normal pdf, with mean 1. Mean can't be -ve for exponential functions.
boot('input_files/log_normal_df_15.txt', 'outputs/bootstrap_t_log_normal.html')


5th percentile at -3.02 and 95th percentile at 1.42.
95% Confidence Interval (Standard Normal Distribution): [np.float64(1.620547934881268), np.float64(2.38872828007821)]
5th percentile at -1.58 and 95th percentile at 2.62.
95% Confidence Interval (Standard Normal Distribution): [np.float64(0.6254140703381181), np.float64(1.3516260172758585)]
5th percentile at -2.83 and 95th percentile at 1.35.
95% Confidence Interval (Standard Normal Distribution): [np.float64(0.41616318082788206), np.float64(1.138902550528159)]
5th percentile at -3.44 and 95th percentile at 1.33.
95% Confidence Interval (Standard Normal Distribution): [np.float64(1.8435945649115175), np.float64(2.6689354133533714)]


In [56]:
l = [0.94112885, 0.62052257, 1.88190003, 1.63611996, 1.2675438, 6.1795139, 5.33595219, 0.86413477, 0.66357437, 3.05580574, 1.85401612, 2.09942304, 2.02025857, 3.03101126, 1.0314522]
print (statistics.mean(l))
print(statistics.stdev(l)**2)

2.1654904913333333
2.7226247232650658
