# Sample Dataset

Now that we can generate a stratified sample, let's generate a random dataset that we can use to test the adaptive script.

In [157]:
import os

import numpy as np
import pandas as pd
from scipy import stats

In [158]:
def load_age_race():
    """Read the age and race count tables from disk and display both.

    The files 'csv/age.csv' and 'csv/race.csv' are parsed with ',' as the
    thousands separator. Each table is shown with ``display`` (expected to be
    provided by the IPython/Jupyter session) right after it is read.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_age, df_race)``, in that order.
    """
    loaded = []
    # Age first, then race: the same read/display order as the notebook output.
    for csv_path in ('csv/age.csv', 'csv/race.csv'):
        table = pd.read_csv(csv_path, thousands=',')
        display(table)
        loaded.append(table)
    age_table, race_table = loaded
    return age_table, race_table


In [159]:
def get_age_race_matrix(df_age=[],df_race=[],gender='male'):
    """Build the joint age x race probability table for one gender.

    Each cell is ``age_count * race_count / total_gender**2``, i.e. the product
    of the two marginal fractions, so age and race are treated as independent
    within the selected gender.

    Parameters
    ----------
    df_age : pandas.DataFrame
        Must contain a 'gender' column and a 'Grand Total' column; every other
        column is used as an age-band count.
    df_race : pandas.DataFrame
        Must contain the columns 'gender', 'All Veterans',
        'Hispanic or Latino (of any race)' and
        'White alone, Not Hispanic or Latino'; every other column is used as a
        race count. The last row's 'All Veterans' value is used as the total
        over all genders.
    gender : str
        Value of the 'gender' column selecting the row to use in both tables.

    Returns
    -------
    df_g : pandas.DataFrame
        Joint probabilities, indexed by age-band column name with one column
        per race column name.
    frac_gender : float
        This gender's 'All Veterans' count divided by the overall total.
    frac_gender_Hisp : float
        This gender's Hispanic count divided by its 'All Veterans' count.

    Raises
    ------
    IndexError
        If no row matches ``gender``.
    """
    # Select this gender's row once per table instead of re-filtering.
    age_row = df_age.loc[df_age['gender']==gender]
    race_row = df_race.loc[df_race['gender']==gender]

    dfa_g = age_row.drop(columns=['gender','Grand Total'])
    age_counts = np.asarray(dfa_g.values.tolist()[0],dtype=float)

    total_vets = df_race['All Veterans'].iloc[-1]
    total_gender = race_row['All Veterans'].values[0]
    frac_gender = 1.0*total_gender / total_vets
    print('Total number of {0} : {1} of {2} total vets, {3:0.2f}%'.format(gender,total_gender,total_vets,100*frac_gender))

    total_Hisp = race_row['Hispanic or Latino (of any race)'].values[0]
    frac_gender_Hisp = 1.0*total_Hisp / total_gender
    print('Total number of {0} Hispanics : {1} of {2} {3}, {4:0.2f}%'
          .format(gender,total_Hisp,total_gender,gender,100*frac_gender_Hisp))

    # Keep only the per-race count columns: the total and the two
    # ethnicity-based columns are not race categories.
    dfr_g = race_row.drop(columns=['gender',
                                   'All Veterans',
                                   'Hispanic or Latino (of any race)',
                                   'White alone, Not Hispanic or Latino'])
    race_counts = np.asarray(dfr_g.values.tolist()[0],dtype=float)

    # Outer product of the two marginals, normalised by the gender total
    # squared (one factor per marginal).
    XY = np.outer(age_counts,race_counts) / float(total_gender)**2

    df_g = pd.DataFrame(data=XY,index=list(dfa_g),columns=list(dfr_g))
    return df_g,frac_gender,frac_gender_Hisp

In [160]:
def gen_xk(pk):
    """Return a grid of distinct labels 0, 1, ..., N-1 shaped like ``pk``.

    ``N`` is the product of the first two dimensions of ``pk``. The labels are
    floats (as produced by ``np.linspace``) laid out in row-major order.
    """
    dims = pk.shape
    n_cells = dims[0] * dims[1]
    labels = np.linspace(0, n_cells - 1, num=n_cells)
    return labels.reshape(dims)

In [161]:
def convert_sample(df_g=[],xk=0,str_sample=0):
    """Tally drawn cell labels into a table laid out like ``df_g``.

    Parameters
    ----------
    df_g : pandas.DataFrame
        Supplies the index and columns of the result.
    xk : numpy.ndarray
        Grid of cell labels; each drawn value is matched against it.
    str_sample : array-like
        Drawn labels to count.

    Returns
    -------
    pandas.DataFrame
        Integer counts per cell; cells whose label was never drawn hold 0.
    """
    labels, occurrences = np.unique(str_sample, return_counts=True)
    tallies = np.zeros(xk.shape)
    for label, n_hits in zip(labels, occurrences):
        # Boolean mask picks the grid cell(s) carrying this label.
        tallies[xk == label] = n_hits
    tally_table = pd.DataFrame(data=tallies, index=df_g.index, columns=df_g.columns)
    return tally_table.astype(int)


In [162]:
def append_sample(df=[],df_app=[],gender='male',ethnicity='Hispanic'):
    """Append one row per sampled individual in ``df_app`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to extend. Its columns receive, in order: the age label, the
        race label, ``ethnicity``, ``gender``, a stroke flag and an afib flag.
    df_app : pandas.DataFrame
        Integer counts indexed by age label with one column per race; a cell
        value of ``k`` produces ``k`` new rows.
    gender : str
        Value written to every new row's gender column.
    ethnicity : str
        Value written to every new row's ethnicity column.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the new rows, with a fresh 0..n-1 index. The stroke
        and afib flags are the strings 'True' or 'False', each chosen with
        ``np.random.randint``.
    """
    stroke = ['True','False']
    afib = ['True','False']
    column_names = list(df)

    # Collect plain records and build a single frame at the end; growing the
    # DataFrame inside the loop re-copies all earlier rows on every concat.
    new_rows = []
    for age,row in df_app.iterrows():
        for c_idx in row.to_numpy().nonzero()[0]:
            nb_entries = row.iloc[c_idx]
            race = row.index[c_idx]
            for _ in range(nb_entries):
                # Draw order (stroke, then afib) is kept so that a seeded run
                # yields the same flags as the row-by-row implementation.
                sidx = np.random.randint(0,2,1)[0]
                aidx = np.random.randint(0,2,1)[0]
                new_rows.append([age,race,ethnicity,gender,stroke[sidx],afib[aidx]])

    if new_rows:
        df = pd.concat([df,pd.DataFrame(new_rows,columns=column_names)])
    return df.reset_index(drop=True)
            

            


In [163]:
def generate_sample_dataset(nb_samples=500):
    """Generate a random dataset of ``nb_samples`` rows stratified by gender,
    ethnicity, age and race.

    For each gender ('female', then 'male') the function:

    1. gets the joint age x race probabilities and the gender / Hispanic
       fractions from ``get_age_race_matrix``;
    2. allots ``round(frac_gender * nb_samples)`` rows to the gender, except
       that the last gender receives whatever is left so the counts add up to
       exactly ``nb_samples``;
    3. splits that count into Hispanic and non-Hispanic rows;
    4. draws (age, race) cells for each group from the joint distribution and
       appends the corresponding rows with ``append_sample``.

    A summary table of the gender x ethnicity counts is displayed at the end.

    Parameters
    ----------
    nb_samples : int
        Total number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        The generated rows with columns
        'Age', 'Race', 'Ethinicity', 'Gender', 'Stroke', 'Afib'.
    """
    df_age, df_race = load_age_race()
    nb_actual_total = 0
    genders = ['female','male']
    gender_Hisp = np.zeros((len(genders),2))

    # NOTE(review): 'Ethinicity' looks like a misspelling of 'Ethnicity'; it is
    # kept because the header is written to the saved CSV files and anything
    # reading them may rely on it - confirm before renaming.
    columns = ['Age','Race','Ethinicity','Gender','Stroke','Afib']
    df_sample = pd.DataFrame([],columns=columns)

    for idx,gender in enumerate(genders):
        df_g,frac_gender,frac_gender_Hisp = get_age_race_matrix(df_age=df_age,df_race=df_race,gender=gender)

        nb_gender = int(np.round(frac_gender*nb_samples))

        # Fail safe: independent rounding can leave the per-gender counts one
        # off from nb_samples, so the last gender takes the remainder. This has
        # to happen BEFORE the Hispanic / non-Hispanic split below, otherwise
        # the correction never reaches the numbers that are actually drawn.
        if idx == len(genders) - 1:
            nb_remaining = nb_samples - nb_actual_total
            if nb_gender != nb_remaining:
                print('Whoops check to make sure this is correct!')
                nb_gender = nb_remaining
        nb_actual_total = nb_actual_total + nb_gender

        nb_gender_Hisp = int(np.round(frac_gender_Hisp*nb_gender))
        nb_gender_nonHisp = nb_gender - nb_gender_Hisp
        gender_Hisp[idx,:] = [nb_gender_Hisp,nb_gender_nonHisp]

        print('We will select {} {} of total {} samples'.format(nb_gender,gender,nb_samples))
        print('From the {} {}, we will sample (Hispanics,non-Hispanics) :: ({},{}) '
              .format(nb_gender,gender,nb_gender_Hisp,nb_gender_nonHisp))

        # Discrete distribution over the flattened age x race grid: xk labels
        # each cell, pk is that cell's probability.
        pk = df_g.to_numpy()
        xk = gen_xk(pk)
        custm = stats.rv_discrete(name='custm',values=(xk,pk))

        # Same draw / convert / append sequence for both ethnicity groups,
        # Hispanic first.
        groups = [
            ('Hispanic',nb_gender_Hisp,
             '\n\nStratified sample for >>>{} {}<<< across race and age for those of Hispanic or Latino ethnicity'),
            ('non_Hispanic',nb_gender_nonHisp,
             '\n\nStratified sample for >>>{} {}<<< across race and age for those NOT of Hispanic or Latino ethnicity'),
        ]
        for ethnicity,nb_group,heading in groups:
            str_sample = custm.rvs(size=nb_group)
            df_group = convert_sample(df_g=df_g,xk=xk,str_sample=str_sample)
            print(heading.format(nb_group,gender))
            df_sample = append_sample(df=df_sample,df_app=df_group,gender=gender,ethnicity=ethnicity)

    print('The total number of {} samples is broken down as follows:'.format(nb_samples))
    df_gender_hisp = pd.DataFrame(gender_Hisp,columns=['Hispanic','Not Hispanic'],index = genders)
    df_gender_hisp = df_gender_hisp.astype(int)
    display(df_gender_hisp)
    return df_sample

_

''

In [164]:
# Base value for the per-file random seeds; change it to get a different, but
# still reproducible, family of datasets.
BASE_SEED = 0

for nb_samples in [500,1000,5000]:
    for v in range(3):
        # Seed NumPy's global random state so every (size, version) file can be
        # regenerated after a kernel restart, while the three versions of one
        # size still differ. append_sample draws from np.random directly, and
        # scipy's rvs uses the same global state when no random_state is given.
        np.random.seed(BASE_SEED + nb_samples + v)
        df = generate_sample_dataset(nb_samples=nb_samples)
        fn = 'csv/eg/dataset_nb{}_v{}.csv'.format(nb_samples,v)
        # to_csv does not create missing directories, so create the target
        # directory up front instead of failing after the data is generated.
        os.makedirs(os.path.dirname(fn),exist_ok=True)
        print('Saving sample dataset to : {}'.format(fn))
        df.to_csv(fn)

Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 52 female of total 500 samples
From the 52 female, we will sample (Hispanics,non-Hispanics) :: (5,47) 


Stratified sample for >>>5 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>47 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 448 male of total 500 samples
From the 448 male, we will sample (Hispanics,non-Hispanics) :: (35,413) 


Stratified sample for >>>35 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>413 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 500 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,5,47
male,35,413


Saving sample dataset to : csv/eg/dataset_nb500_v0.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 52 female of total 500 samples
From the 52 female, we will sample (Hispanics,non-Hispanics) :: (5,47) 


Stratified sample for >>>5 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>47 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 448 male of total 500 samples
From the 448 male, we will sample (Hispanics,non-Hispanics) :: (35,413) 


Stratified sample for >>>35 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>413 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 500 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,5,47
male,35,413


Saving sample dataset to : csv/eg/dataset_nb500_v1.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 52 female of total 500 samples
From the 52 female, we will sample (Hispanics,non-Hispanics) :: (5,47) 


Stratified sample for >>>5 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>47 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 448 male of total 500 samples
From the 448 male, we will sample (Hispanics,non-Hispanics) :: (35,413) 


Stratified sample for >>>35 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>413 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 500 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,5,47
male,35,413


Saving sample dataset to : csv/eg/dataset_nb500_v2.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 104 female of total 1000 samples
From the 104 female, we will sample (Hispanics,non-Hispanics) :: (10,94) 


Stratified sample for >>>10 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>94 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 896 male of total 1000 samples
From the 896 male, we will sample (Hispanics,non-Hispanics) :: (69,827) 


Stratified sample for >>>69 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>827 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 1000 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,10,94
male,69,827


Saving sample dataset to : csv/eg/dataset_nb1000_v0.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 104 female of total 1000 samples
From the 104 female, we will sample (Hispanics,non-Hispanics) :: (10,94) 


Stratified sample for >>>10 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>94 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 896 male of total 1000 samples
From the 896 male, we will sample (Hispanics,non-Hispanics) :: (69,827) 


Stratified sample for >>>69 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>827 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 1000 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,10,94
male,69,827


Saving sample dataset to : csv/eg/dataset_nb1000_v1.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 104 female of total 1000 samples
From the 104 female, we will sample (Hispanics,non-Hispanics) :: (10,94) 


Stratified sample for >>>10 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>94 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 896 male of total 1000 samples
From the 896 male, we will sample (Hispanics,non-Hispanics) :: (69,827) 


Stratified sample for >>>69 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>827 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 1000 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,10,94
male,69,827


Saving sample dataset to : csv/eg/dataset_nb1000_v2.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 519 female of total 5000 samples
From the 519 female, we will sample (Hispanics,non-Hispanics) :: (51,468) 


Stratified sample for >>>51 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>468 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 4481 male of total 5000 samples
From the 4481 male, we will sample (Hispanics,non-Hispanics) :: (346,4135) 


Stratified sample for >>>346 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>4135 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 5000 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,51,468
male,346,4135


Saving sample dataset to : csv/eg/dataset_nb5000_v0.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 519 female of total 5000 samples
From the 519 female, we will sample (Hispanics,non-Hispanics) :: (51,468) 


Stratified sample for >>>51 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>468 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 4481 male of total 5000 samples
From the 4481 male, we will sample (Hispanics,non-Hispanics) :: (346,4135) 


Stratified sample for >>>346 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>4135 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 5000 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,51,468
male,346,4135


Saving sample dataset to : csv/eg/dataset_nb5000_v1.csv


Unnamed: 0,gender,< 20,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85+,Grand Total
0,female,1982,52510,122488,176050,206062,201715,189245,216411,236853,219081,155285,97883,51287,36608,49301,2012760
1,male,4939,191021,525440,768082,875081,867717,1015059,1374473,1589911,1637797,1593892,2538943,1814368,1135534,1452927,17385184
2,total,6921,243531,647928,944132,1081143,1069432,1204304,1590884,1826765,1856878,1749177,2636826,1865655,1172142,1502227,19397944


Unnamed: 0,gender,All Veterans,"White, alone","Black or African American, alone","American Indian and Alaska Native, alone","Asian, alone","Native Hawaiian and Other Pacific Islander, alone","Some other race, alone",Two or more races,Hispanic or Latino (of any race),"White alone, Not Hispanic or Latino"
0,female,2012760,1395496,402563,20334,49386,8193,43481,93307,197270,1277917
1,male,17385184,14140396,1987685,129825,293706,34393,287645,511534,1341362,13299234
2,total,19397944,15535892,2390248,150159,343092,42586,331126,604840,1538632,14577151


Total number of female : 2012760 of 19397944 total vets, 10.38%
Total number of female Hispanics : 197270 of 2012760 female, 9.80%
We will select 519 female of total 5000 samples
From the 519 female, we will sample (Hispanics,non-Hispanics) :: (51,468) 


Stratified sample for >>>51 female<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>468 female<<< across race and age for those NOT of Hispanic or Latino ethnicity
Total number of male : 17385184 of 19397944 total vets, 89.62%
Total number of male Hispanics : 1341362 of 17385184 male, 7.72%
We will select 4481 male of total 5000 samples
From the 4481 male, we will sample (Hispanics,non-Hispanics) :: (346,4135) 


Stratified sample for >>>346 male<<< across race and age for those of Hispanic or Latino ethnicity


Stratified sample for >>>4135 male<<< across race and age for those NOT of Hispanic or Latino ethnicity
The total number of 5000 samples is broken down as follows:


Unnamed: 0,Hispanic,Not Hispanic
female,51,468
male,346,4135


Saving sample dataset to : csv/eg/dataset_nb5000_v2.csv
