In [6]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
def load_age_race():
    """Read the age and race count tables from disk and show each one.

    Both CSV files are parsed with ``,`` treated as a thousands separator,
    and each table is passed to ``display`` right after it has been read.

    Returns:
        tuple: ``(df_age, df_race)`` loaded from ``csv/age.csv`` and
        ``csv/race.csv`` respectively.
    """
    tables = []
    # Age first, race second: the order fixes both the display order and
    # the order of the returned pair.
    for csv_path in ('csv/age.csv', 'csv/race.csv'):
        table = pd.read_csv(csv_path, thousands=',')
        display(table)
        tables.append(table)
    return tables[0], tables[1]


In [None]:
def get_age_race_matrix(df_age=[],df_race=[],gender='male'):
    """Build the joint age-by-race probability table for one gender.

    The age and race counts of the selected gender are combined as an outer
    product, i.e. every cell is ``age_count * race_count / total_gender**2``
    where ``total_gender`` is the 'All Veterans' count of that gender.
    Two summary lines (gender share and Hispanic share) are printed.

    Args:
        df_age: DataFrame with a 'gender' column, a 'Grand Total' column and
            one count column per age group. (The ``[]`` default is only a
            placeholder; a DataFrame must be supplied.)
        df_race: DataFrame with a 'gender' column, an 'All Veterans' column,
            the 'Hispanic or Latino (of any race)' and
            'White alone, Not Hispanic or Latino' columns, and one count
            column per race category. The 'All Veterans' value of the LAST
            row is used as the overall total. (Same placeholder default.)
        gender: value of the 'gender' column selecting the row to use.

    Returns:
        tuple: ``(df_g, frac_gender, frac_gender_Hisp)`` where ``df_g`` is a
        DataFrame indexed by age-group column names with race column names as
        columns, ``frac_gender`` is the gender's share of the overall total,
        and ``frac_gender_Hisp`` is the Hispanic share within that gender.

    Raises:
        ValueError: if ``gender`` matches no row of ``df_age`` or ``df_race``.
    """
    age_rows = df_age.loc[df_age['gender']==gender]
    race_rows = df_race.loc[df_race['gender']==gender]
    if age_rows.empty or race_rows.empty:
        # Fail with a readable message instead of an IndexError further down.
        raise ValueError(
            "gender '{}' not found in the age and/or race table".format(gender))

    # Per-age-group counts for this gender (first matching row).
    dfa_g = age_rows.drop(columns=['gender','Grand Total'])
    age_counts = dfa_g.iloc[0].to_numpy(dtype=float)

    total_vets = df_race['All Veterans'].iloc[-1]
    total_gender = race_rows['All Veterans'].values[0]
    frac_gender = 1.0*total_gender / total_vets
    print('Total number of {0} : {1} of {2} total vets, {3:0.2f}%'.format(gender,total_gender,total_vets,100*frac_gender))

    total_Hisp = race_rows['Hispanic or Latino (of any race)'].values[0]
    frac_gender_Hisp = 1.0*total_Hisp / total_gender
    print('Total number of {0} Hispanics : {1} of {2} {3}, {4:0.2f}%'
          .format(gender,total_Hisp,total_gender,gender,100*frac_gender_Hisp))

    # Per-race counts: the total and the two ethnicity columns are dropped so
    # only the race categories remain.
    dfr_g = race_rows.drop(columns=['gender',
                                    'All Veterans',
                                    'Hispanic or Latino (of any race)',
                                    'White alone, Not Hispanic or Latino'])
    race_counts = dfr_g.iloc[0].to_numpy(dtype=float)

    # Outer product of the two count vectors, scaled by the gender total squared.
    XY = np.outer(age_counts,race_counts) / float(total_gender)**2

    df_g = pd.DataFrame(data=XY,index=list(dfa_g),columns=list(dfr_g))
    return df_g,frac_gender,frac_gender_Hisp

In [None]:
def gen_xk(pk):
    """Return a float array shaped like ``pk`` holding 0, 1, 2, ... row by row.

    Args:
        pk: array whose ``shape`` is used; the element count is taken as the
            product of its first two dimensions.

    Returns:
        numpy.ndarray: the values ``0 .. n-1`` (as floats) reshaped to
        ``pk.shape``.
    """
    dims = pk.shape
    n_cells = dims[0] * dims[1]
    return np.arange(n_cells, dtype=float).reshape(dims)

In [None]:
def convert_sample(df_g=[],xk=0,str_sample=0):
    """Tally drawn cell ids into a count table laid out like ``df_g``.

    Args:
        df_g: DataFrame whose index and columns label the output table.
        xk: array of cell ids with the same shape as the output table.
        str_sample: drawn cell ids; each distinct value is counted.

    Returns:
        pandas.DataFrame: integer counts, one per cell of ``xk``; cells whose
        id never occurs in ``str_sample`` stay at 0.
    """
    drawn_ids, tallies = np.unique(str_sample, return_counts=True)
    cell_counts = np.zeros(xk.shape)
    for cell_id, tally in zip(drawn_ids, tallies):
        # Write the tally into every position of xk carrying this id.
        cell_counts[xk == cell_id] = tally
    return pd.DataFrame(
        data=cell_counts, index=df_g.index, columns=df_g.columns
    ).astype(int)


In [None]:
def compute_sampling(nb_samples=500,random_state=None):
    """Draw and display a random age-by-race count table per gender and ethnicity.

    For 'female' and then 'male', ``nb_samples`` is split by the gender
    fraction and then by the Hispanic fraction returned by
    ``get_age_race_matrix``. Each group size is drawn from that gender's
    joint age-by-race distribution and shown as a table of counts. A summary
    table of the four group sizes is displayed at the end. Nothing is returned.

    Args:
        nb_samples: total number of samples to distribute over all groups.
        random_state: optional seed or random generator for reproducible
            draws. An integer seeds ONE generator shared by every draw;
            ``None`` (the default) keeps scipy's default random source.
    """
    df_age, df_race = load_age_race()

    # Build a single generator up front: passing the same integer seed to
    # every rvs() call would replay the same random stream for each group.
    if isinstance(random_state,(int,np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    nb_actual_total = 0
    gender_Hisp = np.zeros((2,2))
    for idx,gender in enumerate(['female','male']):
        df_g,frac_gender,frac_gender_Hisp = get_age_race_matrix(df_age=df_age,df_race=df_race,gender=gender)

        nb_gender = int(np.round(frac_gender*nb_samples))
        nb_actual_total = nb_actual_total + nb_gender

        # fail safe catch to make sure we have the correct number of samples.
        # This has to happen BEFORE the Hispanic split below, otherwise the
        # correction never reaches the group sizes that are actually drawn.
        if gender == 'male' and nb_actual_total != nb_samples:
            print('Whoops check to make sure this is correct!')
            nb_gender = nb_gender + (nb_samples - nb_actual_total)
            nb_actual_total = nb_samples

        nb_gender_Hisp = int(np.round(frac_gender_Hisp*nb_gender))
        nb_gender_nonHisp = nb_gender - nb_gender_Hisp

        gender_Hisp[idx,:] = [nb_gender_Hisp,nb_gender_nonHisp]

        print('We will select {} {} of total {} samples'.format(nb_gender,gender,nb_samples))
        print('From the {} {}, we will sample (Hispanics,non-Hispanics) :: ({},{}) '
              .format(nb_gender,gender,nb_gender_Hisp,nb_gender_nonHisp))

        # Discrete distribution over cell ids (xk) weighted by the joint
        # age-by-race probabilities (pk).
        pk = df_g.to_numpy()
        xk = gen_xk(pk)
        custm = stats.rv_discrete(name='custm',values=(xk,pk))

        str_sample_Hisp = custm.rvs(size=nb_gender_Hisp,random_state=rng)
        df_sample_Hisp = convert_sample(df_g=df_g,xk=xk,str_sample=str_sample_Hisp)
        print('\n\nStratified sample for >>>{} {}<<< across race and age for those of Hispanic or Latino ethnicity'.format(nb_gender_Hisp,gender))
        display(df_sample_Hisp)

        str_sample_nonHisp = custm.rvs(size=nb_gender_nonHisp,random_state=rng)
        df_sample_nonHisp = convert_sample(df_g=df_g,xk=xk,str_sample=str_sample_nonHisp)
        print('\n\nStratified sample for >>>{} {}<<< across race and age for those NOT of Hispanic or Latino ethnicity'.format(nb_gender_nonHisp,gender))
        display(df_sample_nonHisp)

    print('The total number of {} samples is broken down as follows:'.format(nb_samples))
    df_gender_hisp = pd.DataFrame(gender_Hisp,columns=['Hispanic','Not Hispanic'],index = ['female','male'])
    df_gender_hisp = df_gender_hisp.astype(int)
    display(df_gender_hisp)






In [7]:
def get_gender_hisp(nb_samples=500):
    """Split ``nb_samples`` into gender x ethnicity group sizes.

    The age and race tables are loaded with ``load_age_race`` and, for
    'female' and then 'male', the gender fraction and Hispanic fraction
    returned by ``get_age_race_matrix`` are applied to ``nb_samples``.

    Rounding each gender independently can leave the gender totals short of
    (or above) ``nb_samples``; the last gender absorbs that remainder so the
    table always sums to ``nb_samples`` (same fail safe as ``compute_sampling``).

    Args:
        nb_samples: total number of samples to distribute.

    Returns:
        pandas.DataFrame: integer table indexed by ['female', 'male'] with
        columns ['Hispanic', 'Not Hispanic'].
    """
    df_age, df_race = load_age_race()
    genders = ['female','male']
    gender_hisp = np.zeros((2,2))
    nb_assigned = 0
    for idx,gender in enumerate(genders):
        # Only the two fractions are needed here; the joint table is ignored.
        _,frac_gender,frac_gender_Hisp = get_age_race_matrix(df_age=df_age,df_race=df_race,gender=gender)

        nb_gender = int(np.round(frac_gender*nb_samples))
        nb_assigned = nb_assigned + nb_gender

        # Give the rounding remainder to the last gender, before splitting by
        # ethnicity, so the whole table adds up to nb_samples.
        if idx == len(genders) - 1 and nb_assigned != nb_samples:
            print('Whoops check to make sure this is correct!')
            nb_gender = nb_gender + (nb_samples - nb_assigned)
            nb_assigned = nb_samples

        nb_gender_Hisp = int(np.round(frac_gender_Hisp*nb_gender))
        nb_gender_nonHisp = nb_gender - nb_gender_Hisp

        gender_hisp[idx,:] = [nb_gender_Hisp,nb_gender_nonHisp]

    df_gender_hisp = pd.DataFrame(gender_hisp,columns=['Hispanic','Not Hispanic'],index = genders)
    df_gender_hisp = df_gender_hisp.astype(int)
    return df_gender_hisp



In [8]:
def get_adaptive_sample(nb_samples=500,df_avail=[]):
    """Show the target gender/ethnicity breakdown and hand back ``df_avail``.

    The breakdown for ``nb_samples`` is obtained from ``get_gender_hisp``,
    announced with a printed header and displayed. No rows are selected yet:
    ``df_avail`` is returned exactly as it was passed in.

    Args:
        nb_samples: number of samples the breakdown is computed for.
        df_avail: the available dataset.

    Returns:
        The same ``df_avail`` object, unchanged.
    """
    breakdown = get_gender_hisp(nb_samples=nb_samples)
    header = 'The total number of {} samples is broken down as follows:'.format(nb_samples)
    print(header)
    display(breakdown)
    return df_avail

In [11]:
def get_stratified_sample(nb_samples = 1000,fn=''):
    """Load the available dataset and return a sample of it.

    Args:
        nb_samples: number of rows requested.
        fn: path of the CSV file holding the available dataset; its first
            column is used as the index.

    Returns:
        pandas.DataFrame: the whole dataset when ``nb_samples`` is greater
        than or equal to its row count (two notices are printed), otherwise
        whatever ``get_adaptive_sample`` returns.
    """
    df_avail = pd.read_csv(fn,index_col=0)
    nb_avail = df_avail.shape[0]

    request_covers_all = nb_samples >= nb_avail
    if not request_covers_all:
        return get_adaptive_sample(nb_samples=nb_samples,df_avail=df_avail)

    notices = (
        'Number of samples {} is larger or equal to available dataset {}'.format(nb_samples,nb_avail),
        'Not enough available data so will use the entire available dataset',
    )
    for notice in notices:
        print(notice)
    return df_avail




In [12]:

# Number of rows requested from the available dataset.
nb_samples = 1000
# Path to the CSV file holding the available dataset.
fn = 'csv/eg/dataset_nb5000_v0.csv'

# get_stratified_sample returns the whole dataset when nb_samples is greater
# than or equal to the number of rows in the file; otherwise it delegates to
# get_adaptive_sample.
df_sample = get_stratified_sample(nb_samples=nb_samples,fn=fn)

# Render the resulting table in the notebook.
display(df_sample)

Number of samples 1000 is larger or equal to available dataset 1000
Not enough available data so will use the entire available dataset


Unnamed: 0,Age,Race,Ethinicity,Gender,Stroke,Afib
0,35-39,"White, alone",Hispanic,female,False,False
1,50-54,"White, alone",Hispanic,female,False,True
2,50-54,"White, alone",Hispanic,female,False,True
3,50-54,"American Indian and Alaska Native, alone",Hispanic,female,True,False
4,55-59,"Black or African American, alone",Hispanic,female,False,True
...,...,...,...,...,...,...
995,85+,"Some other race, alone",non-Hispanic,male,True,False
996,85+,"Some other race, alone",non-Hispanic,male,False,False
997,85+,"Some other race, alone",non-Hispanic,male,True,False
998,85+,Two or more races,non-Hispanic,male,True,False
