<a href="https://colab.research.google.com/github/bettytan123/Sample-Size-Calculation/blob/main/binomial_Simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
############################################
## This script will simulate data from known statistical distirbutions; then estimate GAN and sample from it
##
## Authors: Betty and Chris
## Date: February 2023
############################################

In [3]:
! pip install sdv --quiet
! pip install --upgrade scipy --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 KB[0m [31m529.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.0/140.0 KB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
########################
## Import dependency packages
########################

## Import SDV for GANs and sampling mechanisms, etc.
from sdv.tabular import CTGAN

## Import pandas for data structures
import pandas as pd

## Import numpy for numerical computing
import numpy as np

## Import scipy for statistical distirbution function 
import scipy
from scipy import stats

## For plotting
import matplotlib.pyplot as plt

## For timing
from time import time

## Random Numbers
import random

# For progress bars
from tqdm.notebook import tqdm, trange

# Do not display warnings (bad practice)
import warnings
warnings.filterwarnings("ignore")

#able to faster the process 
import torch

In [5]:
## Set seed 
random.seed(12345)
np.random.seed(42)

In [6]:
#####################################
##
## Sample Size for Precision of a binanry Random Variable --- by Mathemtical Theory
##
#####################################

In [7]:

import scipy
def samp_size_bin_ci(alpha, proportion, width):
    Number = (4 * (scipy.stats.norm.ppf(q=alpha/2, loc=0, scale=1)**2) * proportion*(1-proportion) / (width**2))
    return Number


In [8]:
samp_size_bin_ci(alpha=0.05, proportion=0.2, width=0.1) ##245.8534

245.8533645244241

## Sample Size for Precision of a binaral Random Variable --- by numerical simulation


In [24]:
alpha= 0.05
n = int(np.ceil(samp_size_bin_ci(alpha=0.05, proportion=0.2, width=0.1)))
k = 100 #he number of successes.
n_size =1000
p = k/n
## Generate random data
x = np.random.binomial(n= n, p = p, size = n_size)
## Analyze generated/simulated data
res = scipy.stats.binomtest(k=k, n =n, p=p, alternative='two-sided')


In [25]:
res

BinomTestResult(k=100, n=246, alternative='two-sided', statistic=0.4065040650406504, pvalue=1.0)

In [26]:
# n, p = 10, .5  # number of trials, probability of each trial
# s = np.random.binomial(n, p, 1000)
# # np.random.binomial(n= 10, p = 0.5, size=2 )

In [27]:
def bin_ci_samp_size(n, p ,size):
    ## Generate random data
    x = np.random.binomial(n= n, p = p, size= n_size)
    ## Analyze generated/simulated data
    res = scipy.stats.binomtest(k=k, n =n, p=p, alternative='two-sided')
    
    ## Extract the estimate and CI
    p_ll, p_ul = res.proportion_ci(confidence_level=0.95)
    p_hat = np.mean(x)
    
    # Return the estimate and the CI to the user
    # [] make it a list instead of scalar
    out = pd.DataFrame({'mean': [p_hat],
                        'll95_mean': [p_ll],
                        'ul95_mean': [p_ul]})
    
    return(out)


In [29]:
## Replicate above function number of simulation replicate times

# parameter available  
n = int(np.ceil(samp_size_bin_ci(alpha=0.05, proportion=0.2, width=0.1)))

k = 100 
p = k/n
## Number simulation replicates
n_rep = 10000

## Simulate n_rep copies of sample size trials
sim_out = []

#Record start time
t0 = time()

## Loop over number simulation replicates, storing results in list
for i in range(0, n_rep):
    sim_out.append(bin_ci_samp_size(n, p, size= n_size))

#Record end time
t1 = time()

#Calculate runtime
runtime = t1 - t0
print(f'runtime is {runtime}')

### Aggregate results into dataframe
sim_df = pd.concat(sim_out)

#Calculate means of each column in the array #axis means column mean 
sim_means = np.mean(sim_df, axis=0)
print(f'sim_means is {sim_means}')

#Calculate the width of the confidence interval
ci_width = sim_means[2] - sim_means[1]
print(f'ci_width is {ci_width}')

runtime is 64.52251148223877
sim_means is mean         100.002039
ll95_mean      0.344552
ul95_mean      0.470738
dtype: float64
ci_width is 0.12618563749466377


## Sample Size for Precision of a Continuous Normal Random Variable --- by GAN (generative adversarial network) just fitting once (i.e., one input

In [30]:
# np.random.seed(42)

# parameter available 

sim_n = int(np.ceil(samp_size_bin_ci(alpha=0.05, proportion=0.2, width=0.1)))
n =1000
k = 100 
p = k/n

# Parameters of binomal parent distribution

k = 100 
n_size = 1000

## Simulate data as input to GAN
x = np.random.binomial(n= n, p = p, size = n_size)

## Convert numpy vector to pandas Series and plot the histogram/density
# pd.Series(x).hist(bins=100)

## Convert vector to pandas dataFrame
x_pd = pd.DataFrame({"x": x})
# x_pd


## Feed the simulated data into SDV and sample synthetic data from the fitted GAN
## Instantiate an SDV class object
model = CTGAN(
    epochs=300,
    cuda=True,
    batch_size=10_000,
    verbose=True
)

## Fit a GAN to the simulated data from above
t0 = time()
model.fit(x_pd)
t1 = time()
runtime = t1-t0

def sim_gan_data2(n, p, k, sim_n, model):
    ## Sample synthetic data from the trained GAN
    sampled = model.sample(num_rows=sim_n)
    
    return(sampled)

Epoch 1, Loss G:  0.0578,Loss D:  0.0014
Epoch 2, Loss G:  0.0579,Loss D:  0.0143
Epoch 3, Loss G:  0.0607,Loss D:  0.0121
Epoch 4, Loss G:  0.0585,Loss D:  0.0135
Epoch 5, Loss G:  0.0562,Loss D:  0.0206
Epoch 6, Loss G:  0.0585,Loss D:  0.0194
Epoch 7, Loss G:  0.0587,Loss D:  0.0232
Epoch 8, Loss G:  0.0553,Loss D:  0.0300
Epoch 9, Loss G:  0.0508,Loss D:  0.0367
Epoch 10, Loss G:  0.0474,Loss D:  0.0351
Epoch 11, Loss G:  0.0455,Loss D:  0.0481
Epoch 12, Loss G:  0.0413,Loss D:  0.0469
Epoch 13, Loss G:  0.0434,Loss D:  0.0462
Epoch 14, Loss G:  0.0413,Loss D:  0.0620
Epoch 15, Loss G:  0.0355,Loss D:  0.0597
Epoch 16, Loss G:  0.0389,Loss D:  0.0668
Epoch 17, Loss G:  0.0395,Loss D:  0.0733
Epoch 18, Loss G:  0.0383,Loss D:  0.0850
Epoch 19, Loss G:  0.0335,Loss D:  0.0808
Epoch 20, Loss G:  0.0325,Loss D:  0.0893
Epoch 21, Loss G:  0.0468,Loss D:  0.0883
Epoch 22, Loss G:  0.0398,Loss D:  0.0881
Epoch 23, Loss G:  0.0552,Loss D:  0.0820
Epoch 24, Loss G:  0.0576,Loss D:  0.0929
E

In [31]:
def bin_ci_samp_size_gan2(n,p, k, sim_n, model):
    ## Generate random data

    x = sim_gan_data2(n=n, 
                      p=p, 
                      k=k, 
                      sim_n=sim_n, 
                      model=model)
    
    ## Analyze generated/simulated data
    res = scipy.stats.binomtest(k=k, n =n, p=p, alternative='two-sided')
    
    
    ## Extract the estimate and CI
    p_ll, p_ul = res.proportion_ci(confidence_level=0.95)
    p_hat = np.mean(x)
    
    # Return the estimate and the CI to the user
    # [] make it a list instead of scalar
    out = pd.DataFrame({'mean': [p_hat],
                        'll95_mean': [p_ll],
                        'ul95_mean': [p_ul]})
    
    return(out)



In [33]:
## Replicate above function number of simulation replicate times

# parameter available 
sim_n = int(np.ceil(samp_size_bin_ci(alpha=0.05, proportion=0.2, width=0.1)))
n =1000
k = 100 
p = k/n
n_size = 1000



## Number simulation replicates
n_rep = 2000

## Simulate n_rep copies of sample size trials
sim_out = []

#Record start time
t0 = time()

## Loop over number simulation replicates, storing results in list
for i in trange(0, n_rep):
    sim_out.append(bin_ci_samp_size_gan2(n=n,p=p, k=k, sim_n=sim_n, model=model))
    

#Record end time
t1 = time()

#Calculate runtime
runtime = t1 - t0
print(f'runtime is {runtime}')

### Aggregate results into dataframe
sim_df = pd.concat(sim_out)

#Calculate means of each column in the array #axis means column mean 
sim_means = np.mean(sim_df, axis=0)
print(f'sim_means is {sim_means} +/- {np.std(sim_df, axis=0)}')

#Calculate the width of the confidence interval
ci_width = sim_means[2] - sim_means[1]
print(f'ci_width is {ci_width}')

  0%|          | 0/2000 [00:00<?, ?it/s]

runtime is 80.92124938964844
sim_means is mean         96.641205
ll95_mean     0.082105
ul95_mean     0.120288
dtype: float64 +/- mean         7.619357e-01
ll95_mean    1.387779e-17
ul95_mean    0.000000e+00
dtype: float64
ci_width is 0.03818260216311377
