<a href="https://colab.research.google.com/github/bettytan123/Sample-Size-Calculation/blob/main/Python_SampleSize_ConfidenceInterval_ContinuousMean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
############################################
## This script simulates data from known statistical distributions, then fits a GAN to the data and samples from it
##
## Authors: Betty and Chris, feat David
## Date: February 2023
############################################

# Sample Size Confidence Interval

In [3]:
! pip install sdv --quiet
! pip install --upgrade scipy --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.0/140.0 KB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 KB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m107.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
########################
## Import dependency packages
########################

## Import SDV for GANs and sampling mechanisms, etc.
from sdv.tabular import CTGAN

## Import pandas for data structures
import pandas as pd

## Import numpy for numerical computing
import numpy as np

## Import scipy for statistical distribution functions
import scipy

## For plotting
import matplotlib.pyplot as plt

## For timing
from time import time

## Random Numbers
import random

# For progress bars
from tqdm.notebook import tqdm, trange

# Do not display warnings (bad practice)
import warnings
warnings.filterwarnings("ignore")

# PyTorch, used to speed up the process
import torch


In [5]:
## Set seeds
# Every simulation in this notebook draws from NumPy's global random state
# (np.random.normal), and the GAN is fitted/sampled through PyTorch. Seeding
# only Python's `random` module therefore left all results unreproducible,
# so seed all three generators from one constant.
SEED = 12345
random.seed(SEED)
np.random.seed(SEED)
_ = torch.manual_seed(SEED)  # assign the returned Generator so the cell shows no output

In [6]:
#####################################
##
## Sample Size for Precision of a Continuous Normal Random Variable --- by Mathematical Theory
##
#####################################

In [7]:
# Lower-tail 2.5% quantile of the standard normal distribution (about -1.96)
scipy.stats.norm.ppf(q=0.025, loc=0, scale=1)

-1.9599639845400545

In [8]:
## Sample size by math
def samp_size_cont_ci(alpha, width, sigma):
    """Closed-form sample size for a confidence interval on a normal mean.

    Evaluates ``4 * z**2 * sigma**2 / width**2``, where ``z`` is the
    ``alpha / 2`` quantile of the standard normal distribution.

    Parameters
    ----------
    alpha : float
        Halved and passed to the standard normal quantile function.
    width : float
        Interval width appearing (squared) in the denominator.
    sigma : float
        Standard deviation appearing (squared) in the numerator.

    Returns
    -------
    float
        The unrounded sample size; callers in this notebook round it up
        with ``np.ceil``.
    """
    # Standard normal quantile (loc=0 and scale=1 are the defaults of ppf)
    z_quantile = scipy.stats.norm.ppf(alpha / 2)
    numerator = 4 * (z_quantile**2) * (sigma**2)
    return numerator / (width**2)

In [9]:
# Expected value is about 42.68288; later cells round it up with np.ceil
samp_size_cont_ci(alpha=0.05, width=0.6, sigma=1) # 42.68288

42.6828757854903

In [10]:
#################################
##
## Sample Size for Precision of a Continuous Normal Random Variable --- by numerical simulation
##
################################

## Sample Size for Precision of a Continuous Normal Random Variable --- by numerical simulation


In [11]:
# Parameters of the normal distribution used for this single demonstration draw
mean = 0
sd = 1
# Theoretical sample size, rounded up to a whole number of observations
n = int(np.ceil(samp_size_cont_ci(alpha=0.05, width=0.6, sigma=1)))

## Generate random data
x = np.random.normal(loc=mean, scale=sd, size=n)
## Analyze generated/simulated data
# Two-sided one-sample t-test of the draw against the mean it was generated with
res = scipy.stats.ttest_1samp(x, popmean=mean, axis=0, 
                        nan_policy='propagate', 
                        alternative='two-sided')

In [12]:
# Display the t-test result object computed in the previous cell
res

TtestResult(statistic=1.067948122506726, pvalue=0.2916439828881932, df=42)

In [13]:
def norm_ci_samp_size(n, mean, sd):
    """Simulate one normal sample and summarise its mean with a 95% CI.

    Draws ``n`` values from a normal distribution with location ``mean`` and
    scale ``sd`` using NumPy's global random state, runs a two-sided
    one-sample t-test against ``mean``, and reads the 95% confidence
    interval off the test result.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    mean : float
        Location of the normal distribution; also the t-test's ``popmean``.
    sd : float
        Scale of the normal distribution.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``mean``, ``ll95_mean`` and ``ul95_mean``.
    """
    # Simulated sample
    draws = np.random.normal(loc=mean, scale=sd, size=n)

    # One-sample t-test on the simulated sample
    ttest = scipy.stats.ttest_1samp(
        draws,
        popmean=mean,
        axis=0,
        nan_policy='propagate',
        alternative='two-sided',
    )

    # Point estimate and 95% interval limits
    lower, upper = ttest.confidence_interval(0.95)
    estimate = np.mean(draws)

    # Wrap each scalar in a list so the frame has exactly one row
    summary = {
        'mean': [estimate],
        'll95_mean': [lower],
        'ul95_mean': [upper],
    }
    return pd.DataFrame(summary)


In [14]:
## Replicate above function number of simulation replicate times

# Parameters: theoretical sample size (rounded up) and the normal distribution to draw from
n = int(np.ceil(samp_size_cont_ci(alpha=0.05, width=0.6, sigma=1)))
mean = 0
sd = 1

## Number simulation replicates
n_rep = 10000

# Record start time
t0 = time()

## Simulate n_rep independent trials; each call returns a one-row DataFrame
sim_out = [norm_ci_samp_size(n=n, mean=mean, sd=sd) for _ in range(n_rep)]

# Record end time
t1 = time()

# Calculate runtime (seconds)
runtime = t1 - t0
print(f'runtime is {runtime}')

### Aggregate results into one dataframe, one row per replicate.
### ignore_index gives the rows a clean 0..n_rep-1 index instead of n_rep copies of 0.
sim_df = pd.concat(sim_out, ignore_index=True)

# Column-wise means (axis=0) of the estimate and both interval limits
sim_means = sim_df.mean(axis=0)
print(f'sim_means is {sim_means}')

# Average width of the confidence interval.
# Look the limits up by label: integer keys on a label-indexed Series are
# treated as positions only through a deprecated fallback in pandas 2.x.
ci_width = sim_means['ul95_mean'] - sim_means['ll95_mean']
print(f'ci_width is {ci_width}')

runtime is 9.1525559425354
sim_means is mean         0.001715
ll95_mean   -0.304443
ul95_mean    0.307874
dtype: float64
ci_width is 0.6123173531216268


## Sample Size for Precision of a Continuous Normal Random Variable --- by GAN (generative adversarial network) just fitting once (i.e., one input dataset)


In [24]:
# Simulate a large normal "population", fit a CTGAN to it once, and time the fit.
# np.random.seed(42)
# NOTE(review): the seed above is disabled, so NumPy's RNG is not seeded in this cell.

# parameter available 
pop_n = 200_000 ## This is the most interesting parameter to vary...
pop_mu = pop_mean = 0
pop_sd = 1
# Theoretical sample size, rounded up; number of rows to draw from the GAN per replicate
sim_n = int(np.ceil(samp_size_cont_ci(alpha=0.05, width=0.6, sigma=1)))

# Parameters of normal parent distribution
# NOTE(review): mu, sd and n are not read anywhere else in this cell, and sd / n
# overwrite the values assigned by earlier cells — confirm whether they are still needed.
mu = 0
sd = 0.1
n = 1000

## Simulate data as input to GAN
x = np.random.normal(loc=pop_mu, scale=pop_sd, size=pop_n)

## Convert numpy vector to pandas Series and plot the histogram/density
# pd.Series(x).hist(bins=100)

## Convert vector to pandas dataFrame
# Single column named "x"
x_pd = pd.DataFrame({"x": x})
# x_pd


## Feed the simulated data into SDV and sample synthetic data from the fitted GAN
## Instantiate an SDV class object
# cuda=True requests GPU training; presumably this needs a CUDA-capable runtime — TODO confirm
model = CTGAN(
    epochs=300,
    cuda=True,
    batch_size=10_000,
    verbose=True
)

## Fit a GAN to the simulated data from above
t0 = time()
model.fit(x_pd)
t1 = time()
# Wall-clock fitting time in seconds (stored, but not printed in this cell)
runtime = t1-t0

def sim_gan_data2(pop_mu, pop_sd, pop_n, sim_n, model):
    """Draw ``sim_n`` synthetic rows from an already-fitted model.

    ``pop_mu``, ``pop_sd`` and ``pop_n`` are accepted but not used; the
    function only evaluates ``model.sample(num_rows=sim_n)`` and returns
    whatever that call produces.
    """
    return model.sample(num_rows=sim_n)

Epoch 1, Loss G: -0.0333,Loss D:  0.0824
Epoch 2, Loss G: -0.0702,Loss D:  0.1741
Epoch 3, Loss G:  0.1126,Loss D: -0.0835
Epoch 4, Loss G:  0.1710,Loss D: -0.1624
Epoch 5, Loss G:  0.0738,Loss D: -0.0987
Epoch 6, Loss G: -0.1755,Loss D:  0.0700
Epoch 7, Loss G: -0.0864,Loss D: -0.0879
Epoch 8, Loss G: -0.2201,Loss D:  0.0153
Epoch 9, Loss G: -0.2559,Loss D:  0.0530
Epoch 10, Loss G: -0.2320,Loss D: -0.0001
Epoch 11, Loss G: -0.2339,Loss D: -0.0055
Epoch 12, Loss G: -0.2521,Loss D: -0.0242
Epoch 13, Loss G: -0.2654,Loss D: -0.0464
Epoch 14, Loss G: -0.3108,Loss D:  0.0231
Epoch 15, Loss G: -0.3412,Loss D: -0.0051
Epoch 16, Loss G: -0.3371,Loss D: -0.0445
Epoch 17, Loss G: -0.4108,Loss D: -0.0052
Epoch 18, Loss G: -0.4297,Loss D: -0.0258
Epoch 19, Loss G: -0.4824,Loss D: -0.0555
Epoch 20, Loss G: -0.5591,Loss D: -0.0191
Epoch 21, Loss G: -0.5404,Loss D: -0.0492
Epoch 22, Loss G: -0.5752,Loss D: -0.0400
Epoch 23, Loss G: -0.5245,Loss D: -0.0482
Epoch 24, Loss G: -0.5539,Loss D: -0.0308
E

In [25]:
def norm_ci_samp_size_gan2(pop_n, pop_mean, pop_sd, sim_n, model):
    """Sample from a fitted GAN and summarise the sample mean with a 95% CI.

    Parameters
    ----------
    pop_n, pop_sd :
        Forwarded unchanged to ``sim_gan_data2``.
    pop_mean : float
        Forwarded to ``sim_gan_data2`` and used as the t-test's ``popmean``.
    sim_n : int
        Number of synthetic rows to draw.
    model :
        Fitted model, passed through to ``sim_gan_data2``.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``mean``, ``ll95_mean`` and ``ul95_mean``.
    """
    ## Generate synthetic data from the fitted model
    sampled = sim_gan_data2(pop_mu=pop_mean,
                            pop_sd=pop_sd,
                            pop_n=pop_n,
                            sim_n=sim_n,
                            model=model)

    # Reduce a single-column table to a 1-D float vector so the t-test yields
    # scalars instead of length-1 arrays; wider input keeps per-column results.
    values = np.asarray(sampled, dtype=float)
    if values.ndim == 2 and values.shape[1] == 1:
        values = values[:, 0]

    ## Analyze generated/simulated data.
    ## Test against the `pop_mean` argument rather than a notebook-level
    ## `mean` variable that may be undefined or hold a different value.
    res = scipy.stats.ttest_1samp(values, popmean=pop_mean, axis=0,
                                  nan_policy='propagate',
                                  alternative='two-sided')

    ## Extract the estimate and CI
    p_ll, p_ul = res.confidence_interval(0.95)
    p_hat = np.mean(values, axis=0)

    # Return the estimate and the CI to the user
    # [] wraps each value so the frame has exactly one row
    out = pd.DataFrame({'mean': [p_hat],
                        'll95_mean': [p_ll],
                        'ul95_mean': [p_ul]})

    return out

In [26]:
## Replicate above function number of simulation replicate times

# parameter available
# NOTE: pop_n is only passed through to sim_gan_data2, which does not use it;
# the model sampled here was already fitted in an earlier cell.
pop_n = 1000 ## This is the most interesting parameter to vary...
pop_mean = 0
pop_sd = 1
# Theoretical sample size, rounded up; rows drawn from the GAN per replicate
sim_n = int(np.ceil(samp_size_cont_ci(alpha=0.05, width=0.6, sigma=1)))

## Number simulation replicates
n_rep = 2000

## Simulate n_rep copies of sample size trials
sim_out = []

# Record start time
t0 = time()

## Loop over number simulation replicates (with a progress bar), storing results in list
for _ in trange(0, n_rep):
    sim_out.append(norm_ci_samp_size_gan2(pop_n=pop_n, pop_mean=pop_mean, pop_sd=pop_sd, sim_n=sim_n, model=model))

# Record end time
t1 = time()

# Calculate runtime (seconds)
runtime = t1 - t0
print(f'runtime is {runtime}')

### Aggregate results into one dataframe, one row per replicate.
### ignore_index gives the rows a clean 0..n_rep-1 index instead of n_rep copies of 0.
sim_df = pd.concat(sim_out, ignore_index=True)

# Column-wise means (axis=0) of the estimate and both interval limits
sim_means = np.mean(sim_df, axis=0)
print(f'sim_means is {sim_means} +/- {np.std(sim_df, axis=0)}')

# Average width of the confidence interval.
# Look the limits up by label: integer keys on a label-indexed Series are
# treated as positions only through a deprecated fallback in pandas 2.x.
ci_width = sim_means['ul95_mean'] - sim_means['ll95_mean']
print(f'ci_width is {ci_width}')

  0%|          | 0/2000 [00:00<?, ?it/s]

runtime is 58.94261646270752
sim_means is mean        -0.013714
ll95_mean   -0.309848
ul95_mean    0.282421
dtype: float64 +/- mean         0.145635
ll95_mean    0.151294
ul95_mean    0.146620
dtype: float64
ci_width is 0.5922695625321005


- runtime is 17.135186433792114
- sim_means is mean        -0.000508
- ll95_mean   -0.306213
- ul95_mean    0.305197
- dtype: float64
- ci_width is 0.6114101213569088

In [None]:
##############################################
##
## Properties/info on the Jupyter Notebook session
##
#############################################