In [1]:
# THE PURPOSE OF THIS SCRIPT IS TO:

# 1) TAKE THE INDIVIDUAL DATASET (CONTAINING A ROW FOR EVERY CHILD) AND DERIVE A NEW TABLE WITH ONE ROW 
#    PER AGE-SEX GROUP (WHERE EACH CHILD IS CLASSIFIED BY THEIR AGE-SEX GROUP AT FIRST CHECK-UP) THAT SHOWS PROGRESS 
#    FOR EACH AGE-SEX GROUP AS THE AVERAGE RATE OF CHANGE IN HEIGHT-FOR-AGE Z-SCORE

# 2) RUN A STATISTICAL TEST (TUKEY'S HSD) THAT INVESTIGATES WHETHER ANY OF THE AGE-SEX GROUPS DIFFER FROM EACH OTHER IN THEIR AVERAGE 
#    RATE OF CHANGE IN HEIGHT-FOR-AGE Z-SCORE

In [2]:
# IMPORTING PACKAGES
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.stats.multicomp import MultiComparison
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# LOADING THE INDIVIDUAL DATASET (ONE ROW PER CHILD) FROM THE EXCEL WORKBOOK
# NOTE: THIS IS AN ABSOLUTE, MACHINE-SPECIFIC PATH; IT MUST BE ADJUSTED TO RUN THE NOTEBOOK ELSEWHERE
MM_DATA_PATH = "/Users/ajarbuckle/Desktop/MM PROJECT/mm_data_redo/mm_data_individual_table.xlsx"
mm_data = pd.read_excel(MM_DATA_PATH)
mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,z_ult,observaciones,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,-3.135741,10,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,-2.904418,11,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,-2.724794,9,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,-0.961131,16,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,-3.094706,4,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,-1.382273,2,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,-3.178239,2,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,-1.795524,2,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,-1.082725,2,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False


In [3]:
# DERIVING AN AGE GROUP COLUMN FROM EACH CHILD'S AGE (IN YEARS) AT FIRST CHECK-UP

# AGE AT FIRST CHECK-UP, PULLED OUT ONCE SO THE BRACKET TESTS BELOW STAY SHORT
age_at_first = mm_data['monitoreo_prim_edad']

# LABELS FOR THE FIVE ONE-YEAR BRACKETS: '0-1', '1-2', '2-3', '3-4', '4-5'
values = [f'{lower}-{lower + 1}' for lower in range(5)]

# THE FIRST BRACKET CATCHES EVERY AGE BELOW 1; EACH LATER BRACKET IS THE HALF-OPEN INTERVAL [LOWER, LOWER + 1)
conditions = [age_at_first < 1] + [
    (age_at_first >= lower) & (age_at_first < lower + 1) for lower in range(1, 5)
]

# ROWS THAT MATCH NO BRACKET RECEIVE THE DEFAULT LABEL "5+"
# NOTE: A MISSING AGE FAILS EVERY COMPARISON ABOVE, SO IT WOULD ALSO BE LABELLED "5+"
mm_data['age_group'] = np.select(conditions, values, default = "5+")
mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,...,observaciones,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end,age_group
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,...,10,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True,1-2
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,...,11,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True,2-3
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,...,9,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True,1-2
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,...,16,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False,0-1
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,...,4,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True,0-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,...,2,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False,2-3
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,...,2,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True,3-4
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,...,2,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False,4-5
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,...,2,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False,3-4


In [4]:
# BUILDING THE AGE-SEX GROUP LABEL BY JOINING THE EXISTING SEX AND AGE GROUP COLUMNS, E.G. 'F 1-2 Years'
sex_label = mm_data['sexo']
age_label = mm_data['age_group']
mm_data['age_sex_group'] = sex_label + ' ' + age_label + ' Years'
mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,...,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end,age_group,age_sex_group
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,...,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True,1-2,F 1-2 Years
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,...,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True,2-3,M 2-3 Years
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,...,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True,1-2,F 1-2 Years
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,...,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False,0-1,M 0-1 Years
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,...,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True,0-1,M 0-1 Years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,...,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False,2-3,F 2-3 Years
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,...,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True,3-4,M 3-4 Years
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,...,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False,4-5,F 4-5 Years
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,...,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False,3-4,M 3-4 Years


In [5]:
# BUILDING THE SUMMARY TABLE WITH ONE ROW PER AGE-SEX GROUP
#   total_kids     -> NUMBER OF DISTINCT CHILD IDs IN THE GROUP
#   avg_z_dif_rate -> GROUP MEAN OF z_dif_rate, ROUNDED TO 2 DECIMALS SO THE FINAL TABLE LOOKS NICER
age_sex_table = (
    mm_data
    .groupby('age_sex_group')
    .agg(total_kids = ('ID', 'nunique'), avg_z_dif_rate = ('z_dif_rate', 'mean'))
    .round({'avg_z_dif_rate': 2})
)

age_sex_table

Unnamed: 0_level_0,total_kids,avg_z_dif_rate
age_sex_group,Unnamed: 1_level_1,Unnamed: 2_level_1
F 0-1 Years,93,-0.36
F 1-2 Years,60,-0.2
F 2-3 Years,35,-0.05
F 3-4 Years,21,0.13
F 4-5 Years,12,0.16
M 0-1 Years,101,-0.25
M 1-2 Years,51,0.14
M 2-3 Years,27,-0.03
M 3-4 Years,18,-0.14
M 4-5 Years,6,-0.12


In [6]:
# RETURNING TO THE DATASET WITH ONE ROW PER CHILD TO TEST WHETHER THE RATE OF CHANGE OF HEIGHT-FOR-AGE Z-SCORE OVER TIME 
# DIFFERS SIGNIFICANTLY BETWEEN AGE-SEX GROUPS  

# DISPLAYING THE PER-CHILD TABLE (INCLUDING THE age_group AND age_sex_group COLUMNS) BEFORE RUNNING THE TEST
mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,...,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end,age_group,age_sex_group
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,...,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True,1-2,F 1-2 Years
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,...,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True,2-3,M 2-3 Years
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,...,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True,1-2,F 1-2 Years
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,...,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False,0-1,M 0-1 Years
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,...,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True,0-1,M 0-1 Years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,...,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False,2-3,F 2-3 Years
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,...,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True,3-4,M 3-4 Years
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,...,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False,4-5,F 4-5 Years
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,...,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False,3-4,M 3-4 Years


In [7]:
# TUKEY'S HSD TEST: COMPARES THE MEAN z_dif_rate OF EVERY PAIR OF AGE-SEX GROUPS
z_rate_per_child = mm_data['z_dif_rate']
group_per_child = mm_data['age_sex_group']

# THE MultiComparison OBJECT PAIRS EACH CHILD'S RATE WITH THAT CHILD'S GROUP LABEL
mc = MultiComparison(data = z_rate_per_child, groups = group_per_child)

# alpha = 0.05 IS THE FAMILY-WISE ERROR RATE (THE LIBRARY DEFAULT, STATED EXPLICITLY HERE)
result = mc.tukeyhsd(alpha = 0.05)
print(result)

     Multiple Comparison of Means - Tukey HSD, FWER=0.05     
   group1      group2   meandiff p-adj   lower  upper  reject
-------------------------------------------------------------
F 0-1 Years F 1-2 Years   0.1558 0.9854 -0.2984   0.61  False
F 0-1 Years F 2-3 Years   0.3044 0.7475 -0.2395 0.8483  False
F 0-1 Years F 3-4 Years   0.4837 0.3771  -0.179 1.1464  False
F 0-1 Years F 4-5 Years   0.5168 0.6319 -0.3246 1.3582  False
F 0-1 Years M 0-1 Years   0.1041 0.9979 -0.2901 0.4983  False
F 0-1 Years M 1-2 Years   0.4933 0.0367  0.0154 0.9713   True
F 0-1 Years M 2-3 Years   0.3256 0.7795 -0.2741 0.9252  False
F 0-1 Years M 3-4 Years   0.2196 0.9928 -0.4867  0.926  False
F 0-1 Years M 4-5 Years   0.2368 0.9997 -0.9186 1.3921  False
F 1-2 Years F 2-3 Years   0.1486 0.9984 -0.4348  0.732  False
F 1-2 Years F 3-4 Years   0.3279 0.8919 -0.3676 1.0234  False
F 1-2 Years F 4-5 Years    0.361 0.9477 -0.5064 1.2284  False
F 1-2 Years M 0-1 Years  -0.0517    1.0 -0.4988 0.3954  False
F 1-2 Ye

In [8]:
# ALL VALUES FOR 'REJECT' ARE FALSE EXCEPT FOR THE COMPARISON BETWEEN 0-1 YEAR OLD FEMALES AND 1-2 YEAR OLD MALES (ADJUSTED P-VALUE 0.0367); THIS MEANS THAT THERE ARE NO
# STATISTICALLY SIGNIFICANT DIFFERENCES IN THE RATE OF CHANGE OF HEIGHT-FOR-AGE Z-SCORES BETWEEN THE AGE-SEX GROUPS ASIDE FROM 0-1 YEAR OLD FEMALES AND 
# 1-2 YEAR OLD MALES, WITH THE 1-2 YEAR OLD MALES SHOWING MUCH BETTER PROGRESS THAN THE 0-1 YEAR OLD FEMALES (MEAN DIFFERENCE OF 0.4933)