In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import seaborn as sns
sns.axes_style("whitegrid")
from matplotlib.lines import Line2D
import os
from datetime import datetime
import nbimporter
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
import matplotlib

# load demographic data

In [31]:
# Folder holding the demographic exports, relative to this notebook.
data_dir = os.path.join('..', '..', 'data')

# Every file is read as text first; only the age column is converted to numbers.

# core sample
core_data = pd.read_csv(os.path.join(data_dir, 'core_dem.csv'), dtype=str)
core_data = core_data.assign(prlfc_dem_age=pd.to_numeric(core_data['prlfc_dem_age']))

# excluded sample
excluded_data = pd.read_csv(os.path.join(data_dir, 'excluded_dem.csv'), dtype=str)
excluded_data = excluded_data.assign(prlfc_dem_age=pd.to_numeric(excluded_data['prlfc_dem_age']))

# low completion sample
lowCompl_data = pd.read_csv(os.path.join(data_dir, 'lowCompl_dem.csv'), dtype=str)
lowCompl_data = lowCompl_data.assign(prlfc_dem_age=pd.to_numeric(lowCompl_data['prlfc_dem_age']))

# Functions to extract demographics



In [4]:

def get_age_sex_data(list_of_ids, survey_data,column_extention):
    """
    Computes the percentage of participants per 5-year age bracket and sex.

    Args:
        list_of_ids (iterable): participant IDs to include; IDs that do not occur in
                                survey_data['PROLIFIC_PID'] are ignored
        survey_data (pd.DataFrame): survey data with the columns 'PROLIFIC_PID', 'prlfc_dem_age'
                                    and 'DemC5' (1.0 is counted as male, 2.0 as female)
        column_extention (str): suffix appended to the 'Male_' / 'Female_' column names

    Returns:
        df (pd.DataFrame): indexed by age bracket label; columns 'Male_<ext>' and 'Female_<ext>'
                           hold the share of all counted participants in percent, rounded to
                           two decimals (both columns are non-negative)
        ages (list): labels of the age brackets containing at least one selected participant,
                     in ascending order
    """
    # build the lookup once; re-creating a list for every ID made the filter quadratic
    known_ids = set(survey_data['PROLIFIC_PID'])
    ids = [value for value in list_of_ids if value in known_ids]
    survey_data = survey_data.set_index('PROLIFIC_PID')

    survey = survey_data[['prlfc_dem_age', 'DemC5']].loc[ids].astype(float)
    survey.columns = ['Age', 'Sex']
    bins = [i*5 for i in range(21)]
    labels = [str(i*5) + '-' + str((i+1)*5-1) for i in range(20)]

    # left-closed brackets: [0, 5), [5, 10), ..., [95, 100)
    age_bracket = pd.cut(survey['Age'], bins=bins, labels=labels, right=False)
    occupied = set(age_bracket.dropna())
    ages = [value for value in labels if value in occupied]

    df_dict = {'Age': ages, 'Male': [], 'Female': []}
    for age in ages:
        # positional mask: works for one or many matches and for repeated IDs alike
        sex_in_bracket = survey.loc[(age_bracket == age).to_numpy(), 'Sex']
        df_dict['Male'].append(int((sex_in_bracket == 1.0).sum()))
        df_dict['Female'].append(int((sex_in_bracket == 2.0).sum()))

    df = pd.DataFrame(df_dict)
    df = df.set_index('Age')
    # percentages are relative to all participants counted as male or female
    total = sum(df['Male']) + sum(df['Female'])
    df['Male'] = (df['Male']/total * 100).round(decimals=2)
    df['Female'] = (df['Female']/total * 100).round(decimals=2)
    df = df.rename(columns = {'Male': 'Male_'+column_extention, 'Female': 'Female_'+column_extention})

    return df, ages

def rand_dist_age_sex(nIDs, all_sub_data, raw_data, column):
    """
    Computes the euclidean distance between the age distribution of a reference sample and
    that of a randomly drawn sub-sample of defined size

    Args:
        nIDs (int): number of participants to draw at random from raw_data
        all_sub_data (pd.DataFrame): reference age distribution, indexed by age bracket, with
                                     the column '<column>_good'
        raw_data (pd.DataFrame): data to draw the random sub-sample from
        column (string): sex to compute the distance on, "Male" or "Female"

    Returns:
        euclid_dist (float): euclidean distance between the random sample and the reference
    """
    ids = raw_data['PROLIFIC_PID'].sample(nIDs)
    age_sex,_ = get_age_sex_data(ids, raw_data, '')
    # outer join keeps brackets present in only one of the two distributions (filled with 0)
    age_sex = all_sub_data.join(age_sex, how='outer')
    age_sex = age_sex.fillna(0)
    # both columns come from the joined frame: subtracting from the un-joined all_sub_data
    # would align to NaN for any bracket that only occurs in the random sample
    euclid_dist = np.linalg.norm(age_sex[column+'_good']-age_sex[column+'_'])
    return euclid_dist

def rand_euclDistance_dist_age(compareIds, true_sample, rawData , column , nSample, col_name_extension):
    """
    Generates a distribution of euclidean distances between the age distribution of randomly
    drawn samples of given size and the true sample

    Args:
        compareIds (list): sub-sample to compare against (either excluded subjects or subjects
                           with low completion rate); only its length is used
        true_sample (df): age distribution in the overall sample
        rawData (df): data to draw the random sub-samples from
        column (string): sex to compute the distance on, "Male" or "Female"
        nSample (int): number of random samples
        col_name_extension (string): suffix appended to the output column name

    Returns:
        rand_dist (df): one row per random sample, single float column
                        'rand_eucl_dist<col_name_extension>'
    """
    sample_size = len(compareIds)
    distances = [rand_dist_age_sex(sample_size, true_sample, rawData, column)
                 for _ in range(nSample)]
    # build the frame in one step as a float column instead of enlarging it row by row
    rand_dist = pd.DataFrame({'rand_eucl_dist'+ col_name_extension: distances}, dtype=float)

    return rand_dist


def poli_affil(list_of_ids, survey_data, col_name_extention):
    """
    Computes the percentage of participants per party and per political leaning.

    Args:
        list_of_ids (iterable): participant IDs to include; IDs that do not occur in
                                survey_data['PROLIFIC_PID'] are ignored
        survey_data (pd.DataFrame): survey data with the columns 'PROLIFIC_PID', 'DemM6'
                                    (party code) and 'DemM7' (political leaning code)
        col_name_extention (string): suffix appended to the output column names

    Returns:
        party (pd.DataFrame): indexed by party label, column 'party<ext>' in percent
        polit_lean (pd.DataFrame): indexed by leaning label, column 'polit. lean.<ext>' in percent
    """
    # build the lookup once instead of re-creating a list for every ID
    known_ids = set(survey_data['PROLIFIC_PID'])
    ids = [value for value in list_of_ids if value in known_ids]
    survey_data = survey_data.set_index('PROLIFIC_PID')
    survey = survey_data[['DemM6', 'DemM7']].loc[ids].astype(str)
    party_col_name = 'party'+ col_name_extention
    lean_col_name = 'polit. lean.'+col_name_extention
    party_names={'1.0':'Republican', '2.0':'Democrat', '3.0':'Independant', '4.0':'Other'}
    lean_names = {'1.0':'strongly liberal', '2.0':'moderately liberal', '3.0':'slightly liberal',
                 '4.0':'neutral', '5.0':'slightly conservative', '6.0':'moderately conservative', '7.0':'strongly conservative'}
    # codes without an entry in the mapping (e.g. 'nan') are kept as their own category
    party_labels = survey['DemM6'].replace(party_names)
    lean_labels = survey['DemM7'].replace(lean_names)

    # Name the result explicitly: pandas >= 2.0 calls the value_counts result 'proportion',
    # so looking the column up by the survey column name would raise a KeyError.
    party = (party_labels.value_counts(normalize=True) * 100).round(decimals=2)
    party = party.rename(party_col_name).rename_axis(None).to_frame()
    polit_lean = (lean_labels.value_counts(normalize=True) * 100).round(decimals=2)
    polit_lean = polit_lean.rename(lean_col_name).rename_axis(None).to_frame()
    return party, polit_lean



def rand_dist_party(nIDs, all_sub_data, raw_data):
    """
    Computes the euclidean distance in party identity between a reference sample and a
    randomly drawn sub-sample of defined size

    Args:
        nIDs (int): number of participants to draw at random from raw_data
        all_sub_data (pd.DataFrame): reference party distribution, indexed by party label,
                                     with the column 'party_good'
        raw_data (pd.DataFrame): data to draw the random sub-sample from

    Returns:
        euclid_dist (float): euclidean distance between the random sample and the reference
    """
    ids = raw_data['PROLIFIC_PID'].sample(nIDs)
    party,_ = poli_affil(ids, raw_data, '')
    # outer join keeps parties present in only one of the two distributions (filled with 0)
    party = all_sub_data.join(party, how='outer')
    party = party.fillna(0)
    # both columns come from the joined frame: subtracting from the un-joined all_sub_data
    # would align to NaN for any party that only occurs in the random sample
    euclid_dist = np.linalg.norm(party['party_good']-party['party'])
    return euclid_dist


def rand_euclDistance_dist_party(compareIds, true_sample, rawData, nSample,col_name_extension):
    """
    Generates a distribution of euclidean distances between the party distribution of randomly
    drawn samples of given size and the true sample

    Args:
        compareIds (list): sub-sample to compare against (either excluded subjects or subjects
                           with low completion rate); only its length is used
        true_sample (df): party distribution in the overall sample
        rawData (df): data to draw the random sub-samples from
        nSample (int): number of random samples
        col_name_extension (string): suffix appended to the output column name

    Returns:
        rand_dist (df): one row per random sample, single float column
                        'rand_eucl_dist<col_name_extension>'
    """
    sample_size = len(compareIds)
    distances = [rand_dist_party(sample_size, true_sample, rawData) for _ in range(nSample)]
    # build the frame in one step as a float column instead of enlarging it row by row
    rand_dist = pd.DataFrame({'rand_eucl_dist'+ col_name_extension: distances}, dtype=float)

    return rand_dist

def income_brac(list_of_ids, survey_data, col_name_extention):
    """
    Computes the percentage of participants per income bracket.

    Args:
        list_of_ids (iterable): participant IDs to include; IDs that do not occur in
                                survey_data['PROLIFIC_PID'] are ignored
        survey_data (pd.DataFrame): survey data with the columns 'PROLIFIC_PID' and 'DemW18_R1'
                                    (income bracket code)
        col_name_extention (string): suffix appended to the output column name

    Returns:
        income (pd.DataFrame): indexed by income bracket label, column 'income<ext>' in percent,
                               rounded to two decimals
    """
    # build the lookup once instead of re-creating a list for every ID
    known_ids = set(survey_data['PROLIFIC_PID'])
    ids = [value for value in list_of_ids if value in known_ids]
    survey_data = survey_data.set_index('PROLIFIC_PID')
    survey = survey_data[['DemW18_R1']].loc[ids].astype(str)
    income_col_name = 'income'+ col_name_extention
    income_names = {'1.0': 'Less than $249', '2.0' : '$250 - $499', '3.0': '$500 - $999', '4.0':'$1000 -$1499', '5.0':'$1500 - $2999', '6.0':'more than $3000', '7.0' :'Dont know'}
    # codes without an entry in the mapping (e.g. 'nan') are kept as their own category
    income_labels = survey['DemW18_R1'].replace(income_names)

    # Name the result explicitly: pandas >= 2.0 calls the value_counts result 'proportion',
    # so looking the column up by the survey column name would raise a KeyError.
    income = (income_labels.value_counts(normalize=True) * 100).round(decimals=2)
    income = income.rename(income_col_name).rename_axis(None).to_frame()
    return income



def rand_dist_income(nIDs, all_sub_data, raw_data):
    """
    Euclidean distance between the income distribution of a reference sample and that of a
    random sub-sample drawn from raw_data.

    Args:
        nIDs (int): number of participants to draw at random
        all_sub_data (pd.DataFrame): reference income distribution, indexed by income bracket,
                                     providing the column 'income_good'
        raw_data (pd.DataFrame): data the random sub-sample is drawn from

    Returns:
        euclid_dist (float): euclidean distance between the random sample and the reference
    """
    sampled_ids = raw_data['PROLIFIC_PID'].sample(nIDs)
    sampled_income = income_brac(sampled_ids, raw_data, '')
    # brackets missing from either distribution count as 0 percent
    merged = all_sub_data.join(sampled_income, how='outer').fillna(0)
    difference = merged['income_good'] - merged['income']
    return np.linalg.norm(difference)


def rand_euclDistance_dist_income(compareIds, true_sample, rawData, nSample,col_name_extension):
    """
    Generates a distribution of euclidean distances between the income distribution of randomly
    drawn samples of given size and the true sample

    Args:
        compareIds (list): sub-sample to compare against (either excluded subjects or subjects
                           with low completion rate); only its length is used
        true_sample (df): income distribution in the overall sample
        rawData (df): data to draw the random sub-samples from
        nSample (int): number of random samples
        col_name_extension (string): suffix appended to the output column name

    Returns:
        rand_dist (df): one row per random sample, single float column
                        'rand_eucl_dist<col_name_extension>'
    """
    sample_size = len(compareIds)
    distances = [rand_dist_income(sample_size, true_sample, rawData) for _ in range(nSample)]
    # build the frame in one step as a float column instead of enlarging it row by row
    rand_dist = pd.DataFrame({'rand_eucl_dist'+col_name_extension: distances}, dtype=float)

    return rand_dist




def highest_education(list_of_ids, survey_data, col_name_extention):
    """
    Computes the percentage of participants per highest education level.

    Args:
        list_of_ids (iterable): participant IDs to include; IDs that do not occur in
                                survey_data['PROLIFIC_PID'] are ignored
        survey_data (pd.DataFrame): survey data with the columns 'PROLIFIC_PID' and 'DemC23'
                                    (education level code)
        col_name_extention (string): suffix appended to the output column name

    Returns:
        education (pd.DataFrame): indexed by education label, column 'education<ext>' in
                                  percent, rounded to two decimals
    """
    # build the lookup once instead of re-creating a list for every ID
    known_ids = set(survey_data['PROLIFIC_PID'])
    ids = [value for value in list_of_ids if value in known_ids]
    survey_data = survey_data.set_index('PROLIFIC_PID')
    survey = survey_data[['DemC23']].loc[ids].astype(str)
    education_col_name = 'education'+ col_name_extention
    education_names = {'1.0': 'Some high school', '2.0' : 'High school', '3.0': 'Some college',
                    '4.0':'Associate degree', '5.0':'Bachelor degree', '6.0':'Some graduate education',
                    '7.0' :'Master degree', '8.0': 'PhD', '9.0': 'Professional degree', '10.0': 'other'}
    # codes without an entry in the mapping (e.g. 'nan') are kept as their own category
    education_labels = survey['DemC23'].replace(education_names)

    # Name the result explicitly: pandas >= 2.0 calls the value_counts result 'proportion',
    # so looking the column up by the survey column name would raise a KeyError.
    education = (education_labels.value_counts(normalize=True) * 100).round(decimals=2)
    education = education.rename(education_col_name).rename_axis(None).to_frame()
    return education



def rand_dist_education(nIDs, all_sub_data, raw_data):
    """
    Euclidean distance between the education distribution of a reference sample and that of a
    random sub-sample drawn from raw_data.

    Args:
        nIDs (int): number of participants to draw at random
        all_sub_data (pd.DataFrame): reference education distribution, indexed by education
                                     level, providing the column 'education_good'
        raw_data (pd.DataFrame): data the random sub-sample is drawn from

    Returns:
        euclid_dist (float): euclidean distance between the random sample and the reference
    """
    sampled_ids = raw_data['PROLIFIC_PID'].sample(nIDs)
    sampled_education = highest_education(sampled_ids, raw_data, '')
    # levels missing from either distribution count as 0 percent
    merged = all_sub_data.join(sampled_education, how='outer').fillna(0)
    difference = merged['education_good'] - merged['education']
    return np.linalg.norm(difference)


def rand_euclDistance_dist_education(compareIds, true_sample, rawData, nSample,col_name_extension):
    """
    Generates a distribution of euclidean distances between the education distribution of
    randomly drawn samples of given size and the true sample

    Args:
        compareIds (list): sub-sample to compare against (either excluded subjects or subjects
                           with low completion rate); only its length is used
        true_sample (df): education distribution in the overall sample
        rawData (df): data to draw the random sub-samples from
        nSample (int): number of random samples
        col_name_extension (string): suffix appended to the output column name

    Returns:
        rand_dist (df): one row per random sample, single float column
                        'rand_eucl_dist<col_name_extension>'
    """
    sample_size = len(compareIds)
    distances = [rand_dist_education(sample_size, true_sample, rawData) for _ in range(nSample)]
    # build the frame in one step as a float column instead of enlarging it row by row
    rand_dist = pd.DataFrame({'rand_eucl_dist'+col_name_extension: distances}, dtype=float)

    return rand_dist



def raceEthnicity(list_of_ids , survey_data , col_name_extention):
    """
    Extract and merge race and ethnicity data
    Args:
        list_of_ids (list): subject IDs for which to return race and ethnicity information;
                            IDs that do not occur in survey_data['PROLIFIC_PID'] are ignored
        survey_data (pd.Dataframe): qualtrics data with the columns 'PROLIFIC_PID', 'DemC9'
                                    (race code) and 'DemC8' (ethnicity code)
        col_name_extention (string): column name extension to characterize the group of
                                     subjects in list_of_ids

    Returns:
        raceEth (pd.DataFrame): indexed by race/ethnicity label, column 'raceEth<ext>' in
                                percent, rounded to two decimals
    """
    # build the lookup once instead of re-creating a list for every ID
    known_ids = set(survey_data['PROLIFIC_PID'])
    ids = [value for value in list_of_ids if value in known_ids]
    survey_data = survey_data.set_index('PROLIFIC_PID')
    survey = survey_data[['DemC9']].loc[ids].astype(str)
    race_col_name = 'raceEth'+ col_name_extention
    survey = survey.rename(columns = {'DemC9': race_col_name})
    race_names = {'1.0': 'American Indian/Alaska Native', '2.0' : 'Asian', '3.0': 'Native Hawaiian or Other Pacific Islander',
                    '4.0':'Black or African American', '5.0':'White', '6.0':'Multiracial',
                    '7.0' :'Other', '8.0': 'Prefer not to disclose'}
    survey = survey.replace({race_col_name: race_names})

    # add ethnicity to race column
    ethnicity = survey_data[['DemC8']].loc[ids].astype(str)
    ethnic_names = {'1.0':  'Hispanic or Latino',  '2.0' : 'Not Hispanic or Latino', '3.0':'Prefer not to disclose'}
    ethnicity = ethnicity.replace({'DemC8': ethnic_names})
    hisp_idx = ethnicity.index[ethnicity['DemC8']=='Hispanic or Latino'].tolist()
    # single .loc call: the chained form survey[col].loc[idx] = ... may write to a temporary
    # copy and leave `survey` unchanged (always the case under pandas copy-on-write)
    survey.loc[hisp_idx, race_col_name] = 'Hispanic or Latino'

    # Name the result explicitly: pandas >= 2.0 calls the value_counts result 'proportion',
    # so looking the column up by race_col_name would raise a KeyError.
    raceEth = (survey[race_col_name].value_counts(normalize=True) * 100).round(decimals=2)
    raceEth = raceEth.rename(race_col_name).rename_axis(None).to_frame()
    return raceEth

def rand_dist_raceEthnicity(nIDs, all_sub_data, raw_data):
    """
    Euclidean distance between the race/ethnicity distribution of a reference sample and that
    of a random sub-sample drawn from raw_data.

    Args:
        nIDs (int): number of participants to draw at random
        all_sub_data (pd.DataFrame): reference race/ethnicity distribution, indexed by label,
                                     providing the column 'raceEth_good'
        raw_data (pd.DataFrame): data the random sub-sample is drawn from

    Returns:
        euclid_dist (float): euclidean distance between the random sample and the reference
    """
    sampled_ids = raw_data['PROLIFIC_PID'].sample(nIDs)
    sampled_raceEth = raceEthnicity(sampled_ids, raw_data, '')
    # labels missing from either distribution count as 0 percent
    merged = all_sub_data.join(sampled_raceEth, how='outer').fillna(0)
    difference = merged['raceEth_good'] - merged['raceEth']
    return np.linalg.norm(difference)


def rand_euclDistance_dist_raceEthnicity(compareIds, true_sample, rawData, nSample,col_name_extension):
    """
    Generates a distribution of euclidean distances between the race/ethnicity distribution of
    randomly drawn samples of given size and the true sample

    Args:
        compareIds (list): sub-sample to compare against (either excluded subjects or subjects
                           with low completion rate); only its length is used
        true_sample (df): race/ethnicity distribution in the overall sample
        rawData (df): data to draw the random sub-samples from
        nSample (int): number of random samples
        col_name_extension (string): suffix appended to the output column name

    Returns:
        rand_dist (df): one row per random sample, single float column
                        'rand_eucl_dist<col_name_extension>'
    """
    sample_size = len(compareIds)
    distances = [rand_dist_raceEthnicity(sample_size, true_sample, rawData) for _ in range(nSample)]
    # build the frame in one step as a float column instead of enlarging it row by row
    rand_dist = pd.DataFrame({'rand_eucl_dist'+col_name_extension: distances}, dtype=float)

    return rand_dist

# plot functions

In [5]:
def age_sex_butterfly_plot(data, col_label_loss, ax):
    """
    Plots a population age pyramid for core data with overlayed excluded or low completion data

    Args:
        data (pd.DataFrame): indexed by age bracket, with the columns 'Female_good', 'Male_good',
                             'Female_<col_label_loss>' and 'Male_<col_label_loss>'
        col_label_loss (string): column suffix of the overlayed sample, also used in the legend
        ax (axes): axes to plot the pyramid in

    Returns:
        ax: axes of the chart
    """
    bar_plot1 = sns.barplot(x='Female_good', y = data.index, data = data,  lw=0, color='indianred', alpha=0.7,ax=ax)
    sns.barplot(x='Male_good', y=data.index, data = data,  lw=0, color='royalblue', alpha=0.7,ax=ax)
    sns.barplot(x='Female_' + col_label_loss, y=data.index,data = data, lw=0, color='indianred', alpha=0.4,ax=ax)
    sns.barplot(x='Male_'+col_label_loss, y=data.index, data = data, lw=0, color='royalblue', alpha=0.4,ax=ax).invert_yaxis()

    ax.set_ylabel('ages')
    ax.set_xlabel('percent')
    # symmetric limits from the frame that was passed in; this used to read the notebook
    # global `age_sex_plot`, which fails or gives wrong limits for any other input
    max_val = abs(data).max().max() * 1.05
    ax.set_xlim((-max_val, max_val))
    custom_labels = ['female core', 'male core', 'female '+ col_label_loss, 'male '+ col_label_loss]
    custom_lines = [Line2D([0], [0], color='indianred', alpha=0.7, linewidth=25),
                Line2D([0], [0], color='royalblue', alpha=0.7, linewidth=25),
                Line2D([0], [0], color='indianred', alpha=0.4, linewidth=25),
                Line2D([0], [0], color='royalblue', alpha=0.4, linewidth=25)]
    ax.legend(custom_lines, custom_labels, loc='best')
    # tick labels show absolute values, so both sides of zero read as positive percentages
    ax.set_xticklabels(np.round(abs(bar_plot1.get_xticks()), 4))
    ax.set_yticklabels(data.index)


    return ax


def hist_sampledEuclidDist(sampled_dist, true_dist, ax, title, linewidth):
    """
    Draws the histogram of sampled euclidean distances and marks the 2.5th / 97.5th
    percentiles (dotted black) as well as the observed distance (red).

    Args:
        sampled_dist (pd.DataFrame): euclidean distances between random subsamples and good
                                     subjects; the first column is used for the percentiles
        true_dist (float): euclidean distance between attrition sample and good sample
        ax (axis handle): axes to draw the chart in
        title (string): plot title
        linewidth (float): width of the three vertical marker lines

    Returns:
        ax: axes of the chart
    """
    sns.set_style("whitegrid")
    lower_bound, _, upper_bound = np.percentile(sampled_dist.iloc[:, 0], [2.5, 50, 97.5])

    ax = sampled_dist.plot.hist(bins=100, ax=ax)
    ax.axvline(true_dist, color='r', linewidth=linewidth)
    for bound in (lower_bound, upper_bound):
        ax.axvline(bound, color='black', linestyle=':', linewidth=linewidth)

    ax.set_xlabel('eucl. dist.')
    ax.set_ylabel('frequency')
    ax.set_title(title)
    ax.get_legend().remove()

    # fixed y-range with ticks at 0, 200 and 400
    y_max = 400
    ax.set_ylim([0, y_max])
    ax.set_yticks(np.arange(0, y_max + 1, 200.0))

    # one major x tick every 5 distance units
    ax.xaxis.set_major_locator(plt.MultipleLocator(5))

    return ax


def nested_pie(data, col_outer, col_inner, ax, leg_title, colors):
    """
    Plots a nested pie chart

    Args:
        data (pd.Dataframe): data to plot as pie charts; its index provides the legend entries
        col_outer (string): column name in data for outer pie
        col_inner (string): column name in data for inner pie
        ax (axes): axes to plot the pie in
        leg_title (string): legend title
        colors (list): wedge colors of the outer pie; the inner pie uses the same colors with
                       the lightness set to 0.7

    Returns:
        ax: axes of the pie chart
    """

    exp_colors_outer = colors
    exp_colors_inner = [sns.set_hls_values(matplotlib.colors.to_rgb(color), l=0.7)
                        for color in exp_colors_outer]

    # outer pie
    mypie, mylables,prct = ax.pie(data[col_outer], radius = 0.8, autopct='%1.1f%%',pctdistance=1.1, colors = exp_colors_outer)
    plt.setp(mypie,  edgecolor='white')
    plt.setp(mylables, color = 'black')
    plt.setp(prct, color = 'black')
    # inner pie
    mypieInner,mylablesInner, prctInner =  ax.pie(data[col_inner], radius = 0.5, colors = exp_colors_inner,autopct='%1.1f%%',pctdistance=0.7)
    plt.setp(prctInner, color = 'black')
    plt.setp(mypieInner,  edgecolor='black')
    # legend
    ax.legend(data.index,loc = 'upper right', title = leg_title)

    return ax

# post-hoc comparisons

age

In [25]:
# Ages per sample, split by the sex code in DemC5 ('2.0' -> *_fem_age, '1.0' -> *_male_age).
core_data_fem_age = core_data.loc[core_data['DemC5'] == '2.0', 'prlfc_dem_age']
excluded_data_fem_age = excluded_data.loc[excluded_data['DemC5'] == '2.0', 'prlfc_dem_age']
lowCompl_data_fem_age = lowCompl_data.loc[lowCompl_data['DemC5'] == '2.0', 'prlfc_dem_age']

core_data_male_age = core_data.loc[core_data['DemC5'] == '1.0', 'prlfc_dem_age']
excluded_data_male_age = excluded_data.loc[excluded_data['DemC5'] == '1.0', 'prlfc_dem_age']
lowCompl_data_male_age = lowCompl_data.loc[lowCompl_data['DemC5'] == '1.0', 'prlfc_dem_age']

# Lower edges of the 5-year age brackets: 15, 20, ..., 100.
bins=  [i*5 for i in range(3, 21)]
age_count = pd.DataFrame(columns=['core','excl'])
age_count['bins'] = bins
age_count = age_count.set_index('bins')
age_count_fem = age_count.copy()
age_count_male = age_count.copy()

age_samples_fem = {'core': core_data_fem_age,
                   'excl': excluded_data_fem_age,
                   'low_compl': lowCompl_data_fem_age}
age_samples_male = {'core': core_data_male_age,
                    'excl': excluded_data_male_age,
                    'low_compl': lowCompl_data_male_age}

# Count the participants of every sample that fall into [i_bin, i_bin + 5).
for i_bin in bins:
    for bracket_counts, samples in ((age_count_fem, age_samples_fem),
                                    (age_count_male, age_samples_male)):
        for sample_name, sample_ages in samples.items():
            in_bracket = (sample_ages >= i_bin) & (sample_ages < i_bin+5)
            bracket_counts.loc[i_bin, sample_name] = sum(in_bracket)

# Brackets starting at 85 and above are left out of the comparison.
age_count_male = age_count_male.drop([85, 90, 95, 100])
age_count_fem = age_count_fem.drop([85, 90, 95, 100])

# Proportion of each bracket within its sample, plus absolute differences to the core sample.
age_prop_fem = pd.DataFrame()
age_prop_male = pd.DataFrame()
for props, counts in ((age_prop_fem, age_count_fem), (age_prop_male, age_count_male)):
    for sample_name in ('core', 'excl', 'low_compl'):
        props[sample_name] = counts[sample_name].div(sum(counts[sample_name]))
    props['abs diff. core - excl.'] = abs(props['core'] - props['excl'])
    props['abs diff. core - low compl.'] = abs(props['core'] - props['low_compl'])

# Two-sample proportion z-test per bracket: core vs. excluded, then core vs. low completion.
for props, counts in ((age_prop_male, age_count_male), (age_prop_fem, age_count_fem)):
    for sample_name, tag in (('excl', 'core/excl.'), ('low_compl', 'core/low compl.')):
        for i in counts.index:
            count = np.array([counts.loc[i, 'core'], counts.loc[i, sample_name]])
            nobs = np.array([counts['core'].sum(), counts[sample_name].sum()])
            stat, pval = proportions_ztest(count, nobs)
            props.loc[i, 'z-val (' + tag + ')'] = stat
            props.loc[i, 'p-val (' + tag + ')'] = pval

age_prop_fem = age_prop_fem.astype(float).round(decimals=3)
age_prop_male = age_prop_male.astype(float).round(decimals=3)

# Readable bracket labels; the bracket starting at 15 is shown as '18-19'.
labels = [str(i*5) + '-' + str(i*5 + 4) for i in range(3,17)]
labels[0] = '18-19'

age_prop_male['age'] = labels
age_prop_fem['age'] = labels

age_prop_fem = age_prop_fem.set_index('age', drop = True)
age_prop_male = age_prop_male.set_index('age', drop = True)

age_prop_male.to_csv('attrition_male_age_bracket_proportion.csv')
age_prop_fem.to_csv('attrition_female_age_bracket_proportion.csv')


  zstat = value / std


education

In [32]:
# Post-hoc comparison of education levels: per-level counts and proportions for the core,
# excluded and low-completion samples, plus two-sample proportion z-tests against core.
education_names = {'1.0': 'Some high school', '2.0' : 'High school', '3.0': 'Some college', 
                    '4.0':'Associate degree', '5.0':'Bachelor degree', '6.0':'Some graduate education', 
                    '7.0' :'Master degree', '8.0': 'PhD', '9.0': 'Professional degree', '10.0': 'other'}
# replace the codes by readable labels (re-running this is harmless: labels are not codes)
core_data.DemC23 = core_data.DemC23.replace(education_names)
excluded_data.DemC23 = excluded_data.DemC23.replace(education_names)
lowCompl_data.DemC23 = lowCompl_data.DemC23.replace(education_names)

# Name the value_counts results explicitly. pandas >= 2.0 calls them 'count', so renaming a
# column called 'DemC23' would silently do nothing and the 'core' lookups below would fail.
core_ed_count = core_data['DemC23'].value_counts().rename('core').rename_axis(None).to_frame()
core_ed = core_ed_count / core_data['DemC23'].count()

excl_ed_count = excluded_data['DemC23'].value_counts().rename('excl.').rename_axis(None).to_frame()
excl_ed = excl_ed_count / excluded_data['DemC23'].count()

lowcompl_ed_count = lowCompl_data['DemC23'].value_counts().rename('low compl.').rename_axis(None).to_frame()
lowcompl_ed = lowcompl_ed_count / lowCompl_data['DemC23'].count()

# row order used for both output tables
education_order = ['PhD', 'Master degree', 'Some graduate education',
                   'Bachelor degree', 'Professional degree','Associate degree', 
                   'Some college', 'High school', 'Some high school', 'other']

education_count = core_ed_count.merge(excl_ed_count, left_index = True, right_index=True, how = 'outer')
education_count  = education_count.merge(lowcompl_ed_count, left_index = True, right_index=True, how = 'outer')
education_count = education_count.reindex(education_order)

education_prop = core_ed.merge(excl_ed, left_index = True, right_index=True, how = 'outer')
education_prop  = education_prop.merge(lowcompl_ed, left_index = True, right_index=True, how = 'outer')
education_prop['abs diff. core - excl.'] = abs(education_prop['core'] - education_prop['excl.'])
education_prop['abs diff. core - low compl.'] = abs(education_prop['core'] - education_prop['low compl.'])
education_prop = education_prop.reindex(education_order)

education_count.to_csv('attrition_education_bracket_count.csv')

# sample sizes do not change from row to row
nobs_excl = np.array([education_count.core.sum(), education_count['excl.'].sum()])
nobs_lowcompl = np.array([education_count.core.sum(), education_count['low compl.'].sum()])

for i in education_count.index:
    count = np.array([education_count.loc[i, 'core'], education_count.loc[i, 'excl.']])
    stat, pval = proportions_ztest(count, nobs_excl)
    education_prop.loc[i, 'z-val (core/excl.)'] = stat
    education_prop.loc[i, 'p-val (core/excl.)'] = pval
    
    count = np.array([education_count.loc[i, 'core'], education_count.loc[i, 'low compl.']])
    stat, pval = proportions_ztest(count, nobs_lowcompl)
    education_prop.loc[i, 'z-val (core/low compl.)'] = stat
    education_prop.loc[i, 'p-val (core/low compl.)'] = pval
    
education_prop = education_prop.astype(float).round(decimals=3)
education_prop.to_csv('attrition_education_bracket_proportion.csv')


Unnamed: 0,core,excl.,low compl.,abs diff. core - excl.,abs diff. core - low compl.,z-val (core/excl.),p-val (core/excl.),z-val (core/low compl.),p-val (core/low compl.)
PhD,0.015,0.018,0.005,0.003,0.01,-0.308,0.758,1.585,0.113
Master degree,0.138,0.086,0.088,0.052,0.05,2.099,0.036,2.606,0.009
Some graduate education,0.033,0.05,0.045,0.017,0.012,-1.222,0.222,-1.107,0.268
Bachelor degree,0.332,0.339,0.291,0.007,0.041,-0.207,0.836,1.533,0.125
Professional degree,0.02,0.023,0.018,0.002,0.003,-0.214,0.831,0.354,0.723
Associate degree,0.109,0.09,0.123,0.018,0.014,0.809,0.418,-0.768,0.442
Some college,0.232,0.24,0.323,0.008,0.091,-0.254,0.8,-3.618,0.0
High school,0.111,0.136,0.095,0.024,0.016,-1.045,0.296,0.896,0.37
Some high school,0.008,0.014,0.01,0.005,0.002,-0.722,0.47,-0.281,0.779
other,0.001,0.005,0.003,0.004,0.002,-1.326,0.185,-0.803,0.422


income

In [34]:
# Post-hoc comparison of income brackets: per-bracket counts and proportions for the core,
# excluded and low-completion samples, plus two-sample proportion z-tests against core.
income_names = {'1.0': 'Less than $249', '2.0' : '$250 - $499', '3.0': '$500 - $999', '4.0':'$1000 -$1499', '5.0':'$1500 - $2999', '6.0':'more than $3000', '7.0' :'Dont know'}

# replace the codes by readable labels (re-running this is harmless: labels are not codes)
core_data.DemW18_R1 = core_data.DemW18_R1.replace(income_names)
excluded_data.DemW18_R1 = excluded_data.DemW18_R1.replace(income_names)
lowCompl_data.DemW18_R1 = lowCompl_data.DemW18_R1.replace(income_names)

# Name the value_counts results explicitly. pandas >= 2.0 calls them 'count', so renaming a
# column called 'DemW18_R1' would silently do nothing and the 'core' lookups below would fail.
core_income_count = core_data['DemW18_R1'].value_counts().rename('core').rename_axis(None).to_frame()
core_income = core_income_count / core_data['DemW18_R1'].count()

excl_income_count = excluded_data['DemW18_R1'].value_counts().rename('excl.').rename_axis(None).to_frame()
excl_income = excl_income_count / excluded_data['DemW18_R1'].count()

lowcompl_income_count = lowCompl_data['DemW18_R1'].value_counts().rename('low compl.').rename_axis(None).to_frame()
lowcompl_income = lowcompl_income_count / lowCompl_data['DemW18_R1'].count()

# row order used for both output tables
income_order = ['more than $3000', '$1500 - $2999',  
                '$1000 -$1499','$500 - $999', 
                '$250 - $499', 'Less than $249', 'Dont know']

income_count = core_income_count.merge(excl_income_count, left_index = True, right_index=True, how = 'outer')
income_count  = income_count.merge(lowcompl_income_count, left_index = True, right_index=True, how = 'outer')
income_count = income_count.reindex(income_order)

income_prop = core_income.merge(excl_income, left_index = True, right_index=True, how = 'outer')
income_prop  = income_prop.merge(lowcompl_income, left_index = True, right_index=True, how = 'outer')
income_prop['abs diff. core - excl.'] = abs(income_prop['core'] - income_prop['excl.'])
income_prop['abs diff. core - low compl.'] = abs(income_prop['core'] - income_prop['low compl.'])
income_prop = income_prop.reindex(income_order)

income_count.to_csv('attrition_income_bracket_count.csv')

# sample sizes do not change from row to row
nobs_excl = np.array([income_count.core.sum(), income_count['excl.'].sum()])
nobs_lowcompl = np.array([income_count.core.sum(), income_count['low compl.'].sum()])

for i in income_count.index:
    count = np.array([income_count.loc[i, 'core'], income_count.loc[i, 'excl.']])
    stat, pval = proportions_ztest(count, nobs_excl)
    income_prop.loc[i, 'z-val (core/excl.)'] = stat
    income_prop.loc[i, 'p-val (core/excl.)'] = pval
    
    count = np.array([income_count.loc[i, 'core'], income_count.loc[i, 'low compl.']])
    stat, pval = proportions_ztest(count, nobs_lowcompl)
    income_prop.loc[i, 'z-val (core/low compl.)'] = stat
    income_prop.loc[i, 'p-val (core/low compl.)'] = pval

income_prop = income_prop.astype(float).round(decimals=3)
income_prop.to_csv('attrition_income_bracket_proportion.csv')


income_prop

Unnamed: 0,core,excl.,low compl.,abs diff. core - excl.,abs diff. core - low compl.,z-val (core/excl.),p-val (core/excl.),z-val (core/low compl.),p-val (core/low compl.)
more than $3000,0.034,0.027,0.02,0.007,0.014,0.523,0.601,1.4,0.162
$1500 - $2999,0.095,0.054,0.083,0.041,0.012,1.96,0.05,0.744,0.457
$1000 -$1499,0.128,0.122,0.09,0.006,0.038,0.25,0.802,2.032,0.042
$500 - $999,0.169,0.176,0.185,0.007,0.016,-0.268,0.788,-0.748,0.455
$250 - $499,0.15,0.154,0.083,0.004,0.067,-0.165,0.869,3.401,0.001
Less than $249,0.393,0.434,0.514,0.041,0.12,-1.142,0.253,-4.206,0.0
Dont know,0.031,0.032,0.025,0.001,0.006,-0.086,0.932,0.566,0.571


race/ethnicity

In [35]:
# Survey codes in column DemC9 (read as strings) -> race labels.
race_names = {'1.0': 'American Indian/Alaska Native', '2.0' : 'Asian', '3.0': 'Native Hawaiian or Other Pacific Islander', 
              '4.0':'Black or African American', '5.0':'White', '6.0':'Multiracial', '7.0' :'Other',
              '8.0' :'Prefer not to disclose'}

# Output column label -> sample; 'core' is the group the other two are tested against.
raceEth_samples = {'core': core_data, 'excl.': excluded_data, 'low compl.': lowCompl_data}

for sample in raceEth_samples.values():
    # Recode in place. Re-running is harmless: recoded labels match none of the keys.
    sample['DemC9'] = sample['DemC9'].replace(race_names)
    # Respondents with DemC8 == '1.0' are relabelled 'Hispanic/Latinx', whatever their DemC9 answer.
    sample.loc[sample['DemC8'] == '1.0', 'DemC9'] = 'Hispanic/Latinx'

# Counts per category, one column per sample. The column names come from `keys`, so the
# result does not depend on how pandas names the output of value_counts() (that name
# changed in pandas 2.0). A category absent from a sample is a count of 0, not NaN.
raceEth_count = (
    pd.concat([sample['DemC9'].value_counts() for sample in raceEth_samples.values()],
              axis=1, keys=list(raceEth_samples))
    .rename_axis(None)
    .sort_index()
    .fillna(0)
    .astype(int)
)

# Proportions use the number of non-missing DemC9 answers as denominator. The previous
# version divided by the DemW18_R1 (income) count, a copy-paste slip that also disagreed
# with the sample sizes used in the z-tests below.
raceEth_totals = pd.Series({label: sample['DemC9'].count()
                            for label, sample in raceEth_samples.items()})
raceEth_prop = raceEth_count.div(raceEth_totals, axis='columns')
raceEth_prop['abs diff. core - excl.'] = (raceEth_prop['core'] - raceEth_prop['excl.']).abs()
raceEth_prop['abs diff. core - low compl.'] = (raceEth_prop['core'] - raceEth_prop['low compl.']).abs()


raceEth_count.to_csv('attrition_raceEth_bracket_count.csv')

# Two-sample z-test of proportions, core vs. each comparison sample, one test per category.
# The sample sizes do not change between categories, so they are computed once per sample.
n_core = raceEth_count['core'].sum()
for other in ('excl.', 'low compl.'):
    n_other = raceEth_count[other].sum()
    tests = [
        proportions_ztest(
            np.array([raceEth_count.loc[category, 'core'], raceEth_count.loc[category, other]]),
            np.array([n_core, n_other]),
        )
        for category in raceEth_count.index
    ]
    raceEth_prop['z-val (core/{})'.format(other)] = [z_stat for z_stat, _p in tests]
    raceEth_prop['p-val (core/{})'.format(other)] = [p_value for _z, p_value in tests]

raceEth_prop = raceEth_prop.astype(float).round(decimals=3)
raceEth_prop.to_csv( 'attrition_raceEth_bracket_proportion.csv')

raceEth_prop

Unnamed: 0,core,excl.,low compl.,abs diff. core - excl.,abs diff. core - low compl.,z-val (core/excl.),p-val (core/excl.),z-val (core/low compl.),p-val (core/low compl.)
American Indian/Alaska Native,0.001,0.014,0.005,0.013,0.004,-3.25,0.001,-1.649,0.099
Asian,0.101,0.167,0.088,0.066,0.013,-2.873,0.004,0.778,0.436
Black or African American,0.071,0.054,0.083,0.017,0.011,0.921,0.357,-0.747,0.455
Hispanic/Latinx,0.094,0.072,0.11,0.022,0.016,1.04,0.298,-0.926,0.355
Multiracial,0.031,0.041,0.045,0.009,0.014,-0.71,0.478,-1.286,0.198
Native Hawaiian or Other Pacific Islander,0.001,0.005,0.003,0.004,0.002,-1.326,0.185,-0.803,0.422
Other,0.004,0.014,0.003,0.009,0.002,-1.687,0.092,0.488,0.625
Prefer not to disclose,0.003,0.018,0.003,0.015,0.001,-2.658,0.008,0.274,0.784
White,0.692,0.615,0.662,0.077,0.031,2.254,0.024,1.144,0.253


# Bootstrap

In [None]:


# Keep only the rows of `data` whose PROLIFIC_PID belongs to the core sample, in the
# order of `core_sample_PID`; PROLIFIC_PID ends up as an ordinary column again.
# Assumes `data` and `core_sample_PID` were created by an earlier cell.
good_ids = core_sample_PID
good_sub_data = (
    data.set_index('PROLIFIC_PID')
        .loc[list(good_ids)]
        .reset_index()
)


In [None]:
# Participant IDs of the three samples being compared.
good_ids = core_sample_PID
low_compl_ids = low_compl_sample_PID
excl_ids = excl_sample_PID

# Rows of `data` belonging to the core ("good") sample; passed to every
# rand_euclDistance_dist_* helper below.
good_sub_data = data.set_index('PROLIFIC_PID').loc[list(good_ids)].reset_index()


###### age distribution ######
column_extention = 'good'
age_sex_good, _ = get_age_sex_data(good_ids, data, column_extention)
column_extention = 'excl'
age_sex_excl, _ = get_age_sex_data(excl_ids, data, column_extention)
column_extention = 'low_compl'
age_sex_low_comp, _ = get_age_sex_data(low_compl_ids, data, column_extention)
# One table with the columns of all three samples; rows missing from a sample become 0.
age_sex = (age_sex_good
           .join(age_sex_excl, how='outer')
           .join(age_sex_low_comp, how='outer')
           .fillna(0))

# Randomly draw nSamples samples to create a distribution of Euclidean distances between
# a sample of the same size as the excluded sample (or the low-completion sample) and the
# sample of good subjects.
age_rand_dist_male_excl = rand_euclDistance_dist_age(excl_ids, age_sex, good_sub_data, 'Male', nSamples, '_excl')
age_rand_dist_male_low_comp = rand_euclDistance_dist_age(low_compl_ids, age_sex, good_sub_data, 'Male', nSamples, '_low_compl')
age_rand_dist_female_excl = rand_euclDistance_dist_age(excl_ids, age_sex, good_sub_data, 'Female', nSamples, '_excl')
age_rand_dist_female_low_comp = rand_euclDistance_dist_age(low_compl_ids, age_sex, good_sub_data, 'Female', nSamples, '_low_compl')



###### political identity distribution ######
party_exclude, _ = poli_affil(excl_ids, data, '_excl')
party_low_comp_rate, _ = poli_affil(low_compl_ids, data, '_low_compl')
party_good, _ = poli_affil(good_ids, data, '_good')
party = (party_good
         .join(party_exclude, how='outer')
         .join(party_low_comp_rate, how='outer')
         .fillna(0))

# Same bootstrap as for age, drawing from the sample of good subjects.
# The suffix used to be 'excl' (no underscore), unlike every other call in this cell;
# it is now '_excl' so the helper sees the same suffix convention everywhere.
# NOTE(review): confirm nothing downstream relied on the old, underscore-less suffix.
party_rand_dist_excl = rand_euclDistance_dist_party(excl_ids, party_good, good_sub_data, nSamples, '_excl')
party_rand_dist_low_comp = rand_euclDistance_dist_party(low_compl_ids, party_good, good_sub_data, nSamples, '_low_compl')



######  income bracket distribution ###### 
income_exclude = income_brac(excl_ids, data, '_excl')
income_low_comp_rate = income_brac(low_compl_ids, data, '_low_compl')
income_good = income_brac(good_ids, data, '_good')
income = (income_good
          .join(income_exclude, how='outer')
          .join(income_low_comp_rate, how='outer')
          .fillna(0))

# Same bootstrap as for age, drawing from the sample of good subjects (good_sub_data).
income_rand_dist_excl = rand_euclDistance_dist_income(excl_ids, income_good, good_sub_data, nSamples, '_excl')
income_rand_dist_low_comp = rand_euclDistance_dist_income(low_compl_ids, income_good, good_sub_data, nSamples, '_low_compl')



###### highest education level ######
education_exclude = highest_education(excl_ids, data, '_excl')
education_low_comp_rate = highest_education(low_compl_ids, data, '_low_compl')
education_good = highest_education(good_ids, data, '_good')
education = (education_good
             .join(education_exclude, how='outer')
             .join(education_low_comp_rate, how='outer')
             .fillna(0))

# Same bootstrap as for age, drawing from the sample of good subjects (good_sub_data).
education_rand_dist_excl = rand_euclDistance_dist_education(excl_ids, education_good, good_sub_data, nSamples, '_excl')
education_rand_dist_low_comp = rand_euclDistance_dist_education(low_compl_ids, education_good, good_sub_data, nSamples, '_low_compl')



###### race and ethnicity ######
raceEth_exclude = raceEthnicity(excl_ids, data, '_excl')
raceEth_low_comp_rate = raceEthnicity(low_compl_ids, data, '_low_compl')
raceEth_good = raceEthnicity(good_ids, data, '_good')
raceEth = (raceEth_good
           .join(raceEth_exclude, how='outer')
           .join(raceEth_low_comp_rate, how='outer')
           .fillna(0))

# Same bootstrap as for age, drawing from the sample of good subjects (good_sub_data).
raceEth_rand_dist_excl = rand_euclDistance_dist_raceEthnicity(excl_ids, raceEth_good, good_sub_data, nSamples, '_excl')
raceEth_rand_dist_low_comp = rand_euclDistance_dist_raceEthnicity(low_compl_ids, raceEth_good, good_sub_data, nSamples, '_low_compl')




# save bootstrapped data

In [None]:
# Every section below does the same three things:
#   1. joins the two bootstrapped distance distributions (low completion, excluded),
#   2. stores the observed ("true") Euclidean distance between the good sample and each
#      comparison sample in row 0 of two extra columns,
#   3. writes the distances and the underlying distribution table to fdir_validation.

###### age distribution ######
# save to csv
eucl_dist = age_rand_dist_female_low_comp.join(age_rand_dist_female_excl, how = 'outer')
eucl_dist.loc[0,'true_eucl_dist_low_compl'] = np.linalg.norm(age_sex['Female_good']-age_sex['Female_low_compl'])
eucl_dist.loc[0,'true_eucl_dist_excl'] = np.linalg.norm(age_sex['Female_good']-age_sex['Female_excl'])
eucl_dist.to_csv(fdir_validation + 'attrition_age_female_distance_measures.csv')

eucl_dist = age_rand_dist_male_low_comp.join(age_rand_dist_male_excl, how = 'outer')
eucl_dist.loc[0,'true_eucl_dist_low_compl'] = np.linalg.norm(age_sex['Male_good']-age_sex['Male_low_compl'])
eucl_dist.loc[0,'true_eucl_dist_excl'] = np.linalg.norm(age_sex['Male_good']-age_sex['Male_excl'])
eucl_dist.to_csv(fdir_validation + 'attrition_age_male_distance_measures.csv')

age_sex.to_csv(fdir_validation + 'attrition_age_sex.csv')




###### political identity distribution ######
# save to csv
eucl_dist = party_rand_dist_low_comp.join(party_rand_dist_excl, how = 'outer')
eucl_dist.loc[0,'true_eucl_dist_low_compl'] = np.linalg.norm(party['party_good']-party['party_low_compl'])
eucl_dist.loc[0,'true_eucl_dist_excl'] = np.linalg.norm(party['party_good']-party['party_excl'])

eucl_dist.to_csv(fdir_validation + 'attrition_party_distance_measures.csv')
party.to_csv(fdir_validation + 'attrition_party.csv')


######  income bracket distribution ###### 
# save to csv
# This section previously reused the race/ethnicity distance frames, the 'raceEth_*'
# column names and the raceEth file name, so the income distances were never saved.
# The column names assume income_brac labels its output 'income' + suffix, like the
# party/education/raceEth helpers do - TODO confirm.
eucl_dist = income_rand_dist_low_comp.join(income_rand_dist_excl, how = 'outer')
eucl_dist.loc[0,'true_eucl_dist_low_compl'] = np.linalg.norm(income['income_good']-income['income_low_compl'])
eucl_dist.loc[0,'true_eucl_dist_excl'] = np.linalg.norm(income['income_good']-income['income_excl'])

eucl_dist.to_csv(fdir_validation + 'attrition_income_distance_measures.csv')
income.to_csv(fdir_validation + 'attrition_income.csv')



###### highest education level###### 
# save to csv
eucl_dist = education_rand_dist_low_comp.join(education_rand_dist_excl, how = 'outer')
eucl_dist.loc[0,'true_eucl_dist_low_compl'] = np.linalg.norm(education['education_good']-education['education_low_compl'])
eucl_dist.loc[0,'true_eucl_dist_excl'] = np.linalg.norm(education['education_good']-education['education_excl'])

eucl_dist.to_csv(fdir_validation + 'attrition_education_distance_measures.csv')
education.to_csv(fdir_validation + 'attrition_education.csv')


###### race and ethnicity ###### 
# save to csv
eucl_dist = raceEth_rand_dist_low_comp.join(raceEth_rand_dist_excl, how = 'outer')
eucl_dist.loc[0,'true_eucl_dist_low_compl'] = np.linalg.norm(raceEth['raceEth_good']-raceEth['raceEth_low_compl'])
eucl_dist.loc[0,'true_eucl_dist_excl'] = np.linalg.norm(raceEth['raceEth_good']-raceEth['raceEth_excl'])

eucl_dist.to_csv(fdir_validation + 'attrition_raceEth_distance_measures.csv')
raceEth.to_csv(fdir_validation + 'attrition_raceEth.csv')



# plot

In [None]:

# Global matplotlib font sizes for the large multi-panel attrition figure.
plt.rcParams.update({
    'axes.titlesize': 30,    # axes title
    'axes.labelsize': 30,    # x and y labels
    'xtick.labelsize': 30,   # x tick labels
    'ytick.labelsize': 30,   # y tick labels
    'legend.fontsize': 30,   # legend entries
    'figure.titlesize': 50,  # figure title
    'font.size': 30,         # default text size
})
# Line width handed to the histogram helper in the plotting cell.
linewidth = 5

In [None]:
# One figure with a 6 x 4 grid.
#   rows 0-1: age/sex   (ax1-ax6; the butterfly plots ax3/ax6 span both rows)
#   row 2   : party ID  (ax7-ax10)
#   row 3   : education (ax11-ax14)
#   row 4   : income    (ax15-ax18)
#   row 5   : race/eth. (ax19-ax22)
# In rows 2-5 the columns are: distance histogram (excluded), pie (excluded),
# distance histogram (low completion), pie (low completion).
fig = plt.figure(figsize=(40,60))


gs = fig.add_gridspec(6,4)
ax1 = fig.add_subplot(gs[0,0])
ax2 = fig.add_subplot(gs[1,0])
ax3 = fig.add_subplot(gs[0:2, 1])
ax4 = fig.add_subplot(gs[0,2])
ax5 = fig.add_subplot(gs[1, 2])
ax6 = fig.add_subplot(gs[0:2, 3])


ax7 = fig.add_subplot(gs[2,0])
ax8 = fig.add_subplot(gs[2,1])
ax9 = fig.add_subplot(gs[2,2])
ax10 = fig.add_subplot(gs[2,3])
ax11 = fig.add_subplot(gs[3,0])
ax12 = fig.add_subplot(gs[3,1])
ax13 = fig.add_subplot(gs[3,2])
ax14 = fig.add_subplot(gs[3,3])
ax15 = fig.add_subplot(gs[4,0])
ax16 = fig.add_subplot(gs[4,1])
ax17 = fig.add_subplot(gs[4,2])
ax18 = fig.add_subplot(gs[4,3])
ax19 = fig.add_subplot(gs[5,0])
ax20 = fig.add_subplot(gs[5,1])
ax21 = fig.add_subplot(gs[5,2])
ax22 = fig.add_subplot(gs[5,3])

##### AGE SEX ########
# reshape age sex data: the female columns are negated in the copy handed to the butterfly plot
age_sex_plot = age_sex.copy()
age_sex_plot['Female_good'] = -age_sex_plot['Female_good']
age_sex_plot['Female_excl']= -age_sex_plot['Female_excl']
age_sex_plot['Female_low_compl']= -age_sex_plot['Female_low_compl']

# plot random distribution of age distances and true distance
# male excluded
hist_sampledEuclidDist(age_rand_dist_male_excl, np.linalg.norm(age_sex['Male_good']-age_sex['Male_excl']), ax1, '',linewidth)
# female excluded
hist_sampledEuclidDist(age_rand_dist_female_excl, np.linalg.norm(age_sex['Female_good']-age_sex['Female_excl']), ax2, '',linewidth)
# male low completion rate
hist_sampledEuclidDist(age_rand_dist_male_low_comp, np.linalg.norm(age_sex['Male_good']-age_sex['Male_low_compl']), ax4, '',linewidth)
# female low completion rate
hist_sampledEuclidDist(age_rand_dist_female_low_comp, np.linalg.norm(age_sex['Female_good']-age_sex['Female_low_compl']), ax5, '', linewidth)

age_sex_butterfly_plot(age_sex_plot, 'excl', ax3)

age_sex_butterfly_plot(age_sex_plot, 'low_compl', ax6)


# party excluded
party_colors=['royalblue','mediumpurple','indianred','yellow']
hist_sampledEuclidDist(party_rand_dist_excl, np.linalg.norm(party['party_good']-party['party_excl']), ax7, 'party excl.', linewidth)
# party low completion rate
hist_sampledEuclidDist(party_rand_dist_low_comp, np.linalg.norm(party['party_good']-party['party_low_compl']), ax9, '', linewidth)
# party exclude pie
nested_pie(party, 'party_excl', 'party_good', ax8, 'party ID', party_colors)
# party low completion rate pie
nested_pie(party, 'party_low_compl', 'party_good', ax10, 'party ID', party_colors)

# education excluded
cmap = plt.get_cmap('Set3')
ed_colors = [cmap(i) for i in np.linspace(0, 1, len(education))]
hist_sampledEuclidDist(education_rand_dist_excl, np.linalg.norm(education['education_good']-education['education_excl']), ax11, '',linewidth)
# education low completion rate
hist_sampledEuclidDist(education_rand_dist_low_comp, np.linalg.norm(education['education_good']-education['education_low_compl']), ax13, '',linewidth)
# education exclude pie
nested_pie(education, 'education_excl', 'education_good', ax12, 'highest education',ed_colors)
# education low completion rate pie
nested_pie(education, 'education_low_compl', 'education_good', ax14, 'highest education',ed_colors)


# income excluded
# The income panels previously plotted the race/ethnicity bootstrap distributions and
# looked up 'raceEth_*' columns in the income table. They now use the income
# distributions and columns. The column names assume income_brac labels its output
# 'income' + suffix, like the other helpers - TODO confirm.
cmap = plt.get_cmap('Set2')
income_colors = [cmap(i) for i in np.linspace(0, 1, len(income))]
hist_sampledEuclidDist(income_rand_dist_excl, np.linalg.norm(income['income_good']-income['income_excl']), ax15, '',linewidth)
# income low completion rate
hist_sampledEuclidDist(income_rand_dist_low_comp, np.linalg.norm(income['income_good']-income['income_low_compl']), ax17, '',linewidth)
# income exclude pie
nested_pie(income, 'income_excl', 'income_good', ax16, '',income_colors)
# income low completion rate pie
nested_pie(income, 'income_low_compl', 'income_good', ax18, '',income_colors)

# race/ethnicity excluded
race_colors = ['MediumVioletRed','DarkViolet', 'SteelBlue','Gold','FireBrick','MediumPurple','Coral','MediumSeaGreen','DarkCyan']
hist_sampledEuclidDist(raceEth_rand_dist_excl, np.linalg.norm(raceEth['raceEth_good']-raceEth['raceEth_excl']), ax19, '',linewidth)
# race/ethnicity low completion rate
hist_sampledEuclidDist(raceEth_rand_dist_low_comp, np.linalg.norm(raceEth['raceEth_good']-raceEth['raceEth_low_compl']), ax21, '',linewidth)
# race/ethnicity exclude pie
nested_pie(raceEth, 'raceEth_excl', 'raceEth_good', ax20, '',race_colors)
# race/ethnicity low completion rate pie
nested_pie(raceEth, 'raceEth_low_compl', 'raceEth_good', ax22, '',race_colors)


# NOTE(review): absolute, user-specific path - this only works on the original author's
# machine; consider a configurable output directory.
fig.savefig('/Users/trusch/Box/COVID-19 Adolphs Lab/Visualization/attrition/attrition_all.svg',dpi=300)



In [None]:
###### age distribution ######
# Recompute the age/sex tables of the three samples. The second return value of
# get_age_sex_data is discarded each time.
column_extention = 'good'
age_sex_good, _ = get_age_sex_data(
    list_of_ids=good_ids, survey_data=data, column_extention=column_extention)

column_extention = 'excl'
age_sex_excl, _ = get_age_sex_data(
    list_of_ids=excl_ids, survey_data=data, column_extention=column_extention)

column_extention = 'low_compl'
age_sex_low_comp, _ = get_age_sex_data(
    list_of_ids=low_compl_ids, survey_data=data, column_extention=column_extention)

In [None]:
# Scratch check: display the suffix currently stored in `column_extention`.
column_extention


In [None]:
# Put the age/sex tables of the three samples side by side; entries that exist in only
# some of the tables are filled with 0.
age_sex_frames = [age_sex_good, age_sex_excl, age_sex_low_comp]
age_sex_all = pd.concat(age_sex_frames, axis=1).fillna(0)


# Chi-square goodness-of-fit test: excluded males (observed) against good males (expected).
# NOTE(review): stats.chisquare expects frequencies with matching totals, and rows where
# the expected value is 0 (possible after the fillna above) make the statistic undefined -
# confirm what get_age_sex_data returns before relying on this result.
f_obs = age_sex_all['Male_excl']
f_exp = age_sex_all['Male_good']
chisq, p_val = stats.chisquare(f_obs=f_obs, f_exp=f_exp)


In [None]:
# Display the chi-square statistic returned by stats.chisquare in the previous cell.
chisq


In [None]:
# Chi-square test of independence on the two-column table Male_good vs. Male_excl.
male_table = age_sex_all.loc[:, ['Male_good', 'Male_excl']]
# Rows that are 0 in both columns (created by the fillna(0) upstream) give an expected
# frequency of zero, which chi2_contingency rejects with a ValueError. They carry no
# information for the test, so they are dropped first.
male_table = male_table[male_table.sum(axis=1) > 0]
chi2, p, dof, ex = stats.chi2_contingency(male_table.values)

In [None]:
# Display the Male_good / Male_excl columns as a plain NumPy array.
age_sex_all[['Male_good', 'Male_excl']].to_numpy()

In [None]:
# Labels for the twenty 5-year age bins: '0-4', '5-9', ..., '95-99'.
labels = ['{}-{}'.format(start, start + 4) for start in range(0, 100, 5)]
labels