In [1]:
# THE PURPOSE OF THIS SCRIPT IS TO:

# 1) TAKE THE INDIVIDUAL DATASET (CONTAINING A ROW FOR EVERY CHILD) AND DERIVE A NEW TABLE WITH A ROW FOR EVERY STARTING-YEAR COHORT
#    (WHERE AN INDIVIDUAL CHILD IS CLASSIFIED INTO A COHORT BASED ON THE YEAR OF THEIR FIRST CHECK-UP) THAT SUMMARIZES PROGRESS FOR EACH COHORT THROUGH
#    THE NUMBER OF CHILDREN AND THE AVERAGE RATE OF CHANGE IN HEIGHT-FOR-AGE Z-SCORE (THE 'z_dif_rate' COLUMN)

# 2) RUN A STATISTICAL TEST (TUKEY'S HSD) THAT INVESTIGATES WHETHER ANY OF THE COHORTS DIFFER FROM EACH OTHER IN THEIR AVERAGE RATE OF CHANGE IN HEIGHT-FOR-AGE Z-SCORE

In [2]:
# IMPORTING PACKAGES
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.stats.multicomp import MultiComparison
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# LOADING THE INDIVIDUAL TABLE (ONE ROW PER CHILD) FROM THE EXCEL WORKBOOK
# NOTE(REVIEW): THIS IS AN ABSOLUTE, MACHINE-SPECIFIC PATH -- THE CELL ONLY RUNS WHERE THE FILE EXISTS AT THIS EXACT LOCATION
individual_table_path = "/Users/ajarbuckle/Desktop/MM PROJECT/mm_data_redo/mm_data_individual_table.xlsx"
mm_data = pd.read_excel(io=individual_table_path)

# DISPLAYING THE LOADED TABLE
mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,z_ult,observaciones,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,-3.135741,10,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,-2.904418,11,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,-2.724794,9,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,-0.961131,16,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,-3.094706,4,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,-1.382273,2,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,-3.178239,2,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,-1.795524,2,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,-1.082725,2,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False


In [3]:
# ASSIGNING EACH CHILD TO A COHORT: THE CALENDAR YEAR OF THEIR FIRST CHECK-UP ('monitoreo_prim')
first_checkup_dates = mm_data['monitoreo_prim']
mm_data['cohort'] = first_checkup_dates.dt.year

# DISPLAYING THE TABLE WITH THE NEW 'cohort' COLUMN
mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,...,observaciones,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end,cohort
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,...,10,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True,2017
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,...,11,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True,2017
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,...,9,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True,2018
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,...,16,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False,2019
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,...,4,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,...,2,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False,2019
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,...,2,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True,2019
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,...,2,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False,2019
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,...,2,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False,2019


In [4]:
# BUILDING THE PER-COHORT SUMMARY TABLE (ONE ROW PER COHORT):
#   total_kids     -> NUMBER OF DISTINCT CHILD IDs IN THE COHORT
#   avg_z_dif_rate -> MEAN OF 'z_dif_rate' ACROSS THE COHORT, ROUNDED TO 2 DECIMALS FOR PRESENTATION
cohort_table = (
    mm_data
    .groupby('cohort')
    .agg(
        total_kids=('ID', 'nunique'),
        avg_z_dif_rate=('z_dif_rate', 'mean'),
    )
    .round({'avg_z_dif_rate': 2})
)

cohort_table

Unnamed: 0_level_0,total_kids,avg_z_dif_rate
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,2,0.19
2015,4,-0.16
2016,13,-0.06
2017,54,-0.04
2018,85,-0.22
2019,144,-0.1
2020,67,-0.18
2021,55,-0.28


In [5]:
# RETURNING TO THE PER-CHILD DATASET (ONE ROW PER CHILD) TO TEST WHETHER THE RATE OF CHANGE OF
# HEIGHT-FOR-AGE Z-SCORE OVER TIME DIFFERS SIGNIFICANTLY BETWEEN COHORTS.
# THE TABLE IS DISPLAYED AGAIN HERE BEFORE THE TEST IS RUN.

mm_data

Unnamed: 0,ID,nombre_del_niño,fecha_de_nacimiento,comunidad,sexo,monitoreo_prim,monitoreo_ult,talla_prim,talla_ult,z_prim,...,observaciones,talla_dif,z_dif,monitoreo_prim_edad,duración,z_dif_rate,frecuencia,stunted_at_first,stunted_at_end,cohort
0,1,Maybelin Alicia Aju López,2016-02-22,Los Planes,F,2017-08-15,2019-10-25,70.5,87.3,-3.219024,...,10,16.8,0.083283,1.478439,2.193018,0.037976,4.559925,True,True,2017
1,2,Samuel David Alva Alva,2015-09-19,Pahaj,M,2017-12-06,2020-01-15,76.4,92.5,-3.906698,...,11,16.1,1.002280,2.214921,2.108145,0.475432,5.217857,True,True,2017
2,3,Miriam Tulul Bac,2017-01-25,Pahaj,F,2018-10-19,2021-12-08,72.5,95.6,-3.386637,...,9,23.1,0.661844,1.730322,3.137577,0.210941,2.868455,True,True,2018
3,4,Cristian Anibal Avila Chávez,2019-03-21,Xesampual,M,2019-07-03,2021-12-08,62.0,90.0,0.279202,...,16,28.0,-1.240333,0.284736,2.433949,-0.509597,6.573678,False,False,2019
4,5,Diego Alejandro Tzaj Chox,2019-01-08,Xesampual,M,2019-05-15,2019-07-31,56.4,61.0,-3.597655,...,4,4.6,0.502948,0.347707,0.210815,2.385739,18.974026,True,True,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,535,Erikca Maria Rosalina Leal Quisquina,2016-09-28,Ciénaga Grande,F,2019-03-13,2019-04-10,84.7,85.8,-1.493608,...,2,1.1,0.111335,2.453114,0.076660,1.452326,26.089286,False,False,2019
420,536,Rudy Gabriel Ajú López,2015-12-22,Ciénaga Grande,M,2019-03-13,2019-04-10,85.6,85.8,-3.100616,...,2,0.2,-0.077623,3.222450,0.076660,-1.012568,26.089286,True,True,2019
421,538,Nataly Ileana Leal Quisquina,2014-08-06,Ciénaga Grande,F,2019-03-13,2019-04-10,99.2,99.0,-1.646368,...,2,-0.2,-0.149155,4.599589,0.076660,-1.945679,26.089286,False,False,2019
422,544,Eduardo Valeriano Chávez,2015-07-13,Nikajkim,M,2019-03-13,2019-04-24,97.0,97.2,-0.861285,...,2,0.2,-0.221440,3.665982,0.114990,-1.925733,17.392857,False,False,2019


In [6]:
# COMPARING EVERY PAIR OF COHORTS WITH TUKEY'S HSD TEST:
# THE VALUES BEING COMPARED ARE EACH CHILD'S 'z_dif_rate'; THE GROUP LABELS ARE EACH CHILD'S 'cohort'
rate_of_change = mm_data['z_dif_rate']
cohort_labels = mm_data['cohort']

mc = MultiComparison(data=rate_of_change, groups=cohort_labels)
result = mc.tukeyhsd(alpha=0.05)  # SAME FAMILY-WISE ERROR RATE (FWER=0.05) AS THE DEFAULT CALL
print(result)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
  2014   2015  -0.3549 0.9998 -2.6616 1.9518  False
  2014   2016  -0.2524 0.9999 -2.2755 1.7708  False
  2014   2017  -0.2378 0.9999 -2.1558 1.6802  False
  2014   2018  -0.4162 0.9978 -2.3217 1.4892  False
  2014   2019   -0.299 0.9997 -2.1955 1.5974  False
  2014   2020  -0.3743 0.9989 -2.2856  1.537  False
  2014   2021  -0.4744 0.9952 -2.3917  1.443  False
  2015   2016   0.1026    1.0 -1.4204 1.6255  False
  2015   2017   0.1171    1.0 -1.2631 1.4974  False
  2015   2018  -0.0613    1.0 -1.4241 1.3014  False
  2015   2019   0.0559    1.0 -1.2943  1.406  False
  2015   2020  -0.0194    1.0 -1.3903 1.3516  False
  2015   2021  -0.1194    1.0 -1.4988 1.2599  False
  2016   2017   0.0146    1.0 -0.8083 0.8374  False
  2016   2018  -0.1639 0.9985 -0.9571 0.6293  False
  2016   2019  -0.0467    1.0 -0.8181 0.7247  False
  2016   202

In [None]:
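# ALL VALUES FOR 'REJECT' ARE FALSE, MEANING THAT THE TUKEY HSD TEST (FWER = 0.05) FINDS NO STATISTICALLY SIGNIFICANT DIFFERENCE IN THE AVERAGE RATE OF CHANGE
# OF HEIGHT-FOR-AGE Z-SCORE BETWEEN ANY TWO COHORTS.
# NOTE: THE 2014 AND 2015 COHORTS CONTAIN ONLY 2 AND 4 CHILDREN, SO COMPARISONS INVOLVING THEM HAVE VERY WIDE CONFIDENCE INTERVALS;
# FAILING TO REJECT IS NOT, BY ITSELF, EVIDENCE THAT THE COHORTS ARE THE SAME.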