In [2]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from os.path import sep

In [7]:
# Load data

# Assemble the workbook location from its path components, joined with the
# platform's path separator. The path is relative, so it is resolved against
# the kernel's current working directory.
home_folder = sep.join(['Documents', 'CEGIS', 'DiscSim'])
filename = sep.join([home_folder, 'sample ecd child data ADQ.xlsx'])

data = pd.read_excel(filename)  # Parse the Excel sheet into a DataFrame
variables = data.columns        # Column labels; each column holds one measured variable
n_variables = data.shape[1]     # Number of columns, i.e. number of variables

print('The following variables were found in this dataset:')
for column_name in variables:
    print('    {0}'.format(column_name))

The following variables were found in this dataset:
    child_id
    stunting_aww
    wasting_aww
    underweight_aww
    gender_cat_ss
    weight_ss
    height_ss
    age_in_months_ss
    haz06_ss
    waz06_ss
    whz06_ss
    bmiz06_ss
    stunting_ss
    wasting_ss
    underweight_ss
    height_aww
    weight_aww
    measurement_date_aww


In [8]:
# Any variable (column) can be pulled out of `data` by name with data['<variable name>'];
# data.<variable name> is a shorthand for the same lookup -- for example:
data['child_id']

0      2998424
1      3250679
2      3297596
3      3300262
4      3300591
        ...   
480    6469093
481    6469166
482    6470304
483    6472322
484    6474014
Name: child_id, Length: 485, dtype: int64

In [9]:
# A single sample of a variable is addressed by its row label and the variable name,
# data.loc[sample number, '<variable name>'] (equivalent to data.<variable name>[sample number])
# -- for example:
data.loc[53, 'child_id']

4124209

In [16]:
# Identify variables that are supervisor and subordinate data entries.
# The role is read from the column-name suffix: names ending in '_aww' are
# collected as subordinate entries, names ending in '_ss' as supervisor entries.
supervisor_variables = []
subordinate_variables = []

for v in variables: # Loop over variables (in each loop iteration, 'v' will take on the value of the next variable)
    # endswith() simply returns False for a name that has no '_' at all, whereas
    # locating the suffix with v.rindex('_') would raise ValueError and abort the
    # whole cell on the first such column.
    if v.endswith('_aww'):
        subordinate_variables.append(v)
    elif v.endswith('_ss'):
        supervisor_variables.append(v)
    # Any other column (e.g. child_id) belongs to neither group and is skipped.

print('Subordinate variables: {0}'.format(subordinate_variables))
print('Supervisor_variables: {0}'.format(supervisor_variables))

Subordinate variables: ['stunting_aww', 'wasting_aww', 'underweight_aww', 'height_aww', 'weight_aww', 'measurement_date_aww']
Supervisor_variables: ['gender_cat_ss', 'weight_ss', 'height_ss', 'age_in_months_ss', 'haz06_ss', 'waz06_ss', 'whz06_ss', 'bmiz06_ss', 'stunting_ss', 'wasting_ss', 'underweight_ss']


In [18]:
# Find variables that are measured by both subordinate and supervisor

# Base names of the supervisor columns: the characters before the last '_',
# kept in column order.
supervisor_variable_names = [v[:v.rindex('_')] for v in supervisor_variables]

subordinate_only = []
subordinate_and_supervisor = []

# Sort every subordinate column by whether a supervisor column shares its base name.
for v in subordinate_variables:
    variable_name = v[:v.rindex('_')] # Characters in v before the last occurrence of '_'
    if variable_name in supervisor_variable_names:
        subordinate_and_supervisor.append(variable_name)
    else:
        subordinate_only.append(variable_name)

# A supervisor name is supervisor-only when no subordinate column shares it.
# Checking against subordinate_and_supervisor is sufficient: subordinate_only
# only ever receives names that are absent from supervisor_variable_names.
supervisor_only = [name for name in supervisor_variable_names
                   if name not in subordinate_and_supervisor]

# Print the three groups, separated by a blank line.
report_sections = [
    ('The following variables are measured by both subordinate and supervisor:',
     subordinate_and_supervisor),
    ('The following variables are measured only by subordinate:',
     subordinate_only),
    ('The following variables are measured only by supervisor:',
     supervisor_only),
]

for section_number, (heading, names) in enumerate(report_sections):
    if section_number > 0:
        print('')
    print(heading)
    for v in names:
        print('    {0}'.format(v))

The following variables are measured by both subordinate and supervisor:
    stunting
    wasting
    underweight
    height
    weight

The following variables are measured only by subordinate:
    measurement_date

The following variables are measured only by supervisor:
    gender_cat
    age_in_months
    haz06
    waz06
    whz06
    bmiz06
