### Imports

In [1]:
import sys
sys.version

'3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]'

In [2]:
# libraries importation
from collections import Counter
import pandas as pd
import pandas_schema
from pandas_schema import Column
from pandas_schema.validation import CustomElementValidation
from decimal import *
import numpy as np
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
from pandas_profiling import ProfileReport
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
from dfply import *
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from scipy import stats
from statsmodels.graphics.gofplots import qqplot
from sklearn import preprocessing
#pd.options.display.max_rows = None

### Read csv. files

In [3]:
# Read clinical data
df = pd.read_csv('filemaker_data_nov2022.csv.gz')
df = df.rename(columns={'Codigo': 'Code'})
df = df.set_index("Code")

In [4]:
# Copy original clinical data
df_original = df.copy(deep = True)

### Drop EA cases 

In [5]:
# Index of samples that are not considered autism cases 
df = df.reset_index()

In [6]:
# Index of samples with code as EA 
cod_ea = df[(df['Code'].str.contains('EA', regex=False, case=False, na=False))].index
cod_ea

Int64Index([617, 641, 650, 651, 653, 655, 656, 660, 661, 674, 689, 690, 691,
            693, 696, 700, 701, 708, 710, 712, 715, 716, 720, 722, 724, 731,
            734, 740, 741, 745, 748, 757, 763, 770, 771, 777, 784, 788, 795,
            796, 797, 802, 805, 815, 819, 821, 822, 823, 826, 827, 839, 844,
            845, 846, 848, 857, 860, 866, 867, 868, 872, 875, 876, 877, 881],
           dtype='int64')

In [7]:
# Drop samples with code as EA
df.drop(cod_ea, inplace = True)

In [8]:
# Index of samples that were annulled
cod_null = df[(df['Code'].str.contains('anulad', regex=False, case=False, na=False))].index
cod_null

Int64Index([284, 549], dtype='int64')

In [9]:
# Drop samples with code as EA
df.drop(cod_null, inplace = True)

In [10]:
# Reset index
df = df.set_index("Code")

In [11]:
# Drop the sample (was highlighted has not being viable)
df.drop('AA913', inplace = True)

###  Adjust Columns Names and Values 

#### Change Columns' Names

In [12]:
# rename ADIR columns
df = df.rename(columns={'ADIR_B_10':'ADIR_Soc', 'ADIR_D_3': 'ADIR_RRB', 'ADIR_AD_1': 'ADIR_AbDev'})
df_original = df_original.rename(columns={'ADIR_B_10': 'ADIR_Soc', 'ADIR_D_3': 'ADIR_RRB', 'ADIR_AD_1': 'ADIR_AbDev'})

In [13]:
# rename VABS columns
df = df.rename(columns={'CP_Comunicação_Vin': 'VABS_Com', 'CP_Socialização_Vin': 'VABS_Soc', 'CP_Autonomia_Vin': 'VABS_Aut'})
df_original = df_original.rename(columns={'CP_Comunicação_Vin': 'VABS_Com', 'CP_Socialização_Vin': 'VABS_Soc', 'CP_Autonomia_Vin': 'VABS_Aut'})

In [14]:
# rename QD columns
df = df.rename(columns ={'QD_P/S': 'QD_PS', 'QD_O/M': 'QD_EH', 'QD_R/P': 'QD_PR',
                         'QDAFreav': 'QDLreav', 'QDOMreav': 'QDEHreav'})
df_original = df_original.rename(columns={'QD_P/S': 'QD_PS', 'QD_O/M': 'QD_EH','QD_R/P': 'QD_PR',
                                          'QDAFreav': 'QDLreav', 'QDOMreav': 'QDEHreav'})

In [15]:
# rename columns
df = df.rename(columns={'Apgar_1º': 'Apgar_1', 'Apgar_5º': 'Apgar_5',
                        'Andar': 'Walk_Age','Primeiras_Palavras': 'First_Words_Age',
                        'Primeiras_Frases': 'First_Phrases_Age', 'Idade_Diag.': 'Diag_Age',
                        'PC': 'HC', 'ADOS_resultado': 'ADOS_Sev','ADIR_cotação': 'ADIR_quot',
                        'Dismorfismos': 'Dysmorphysm', 'Regressão_Linguagem': 'Language_Regr', 
                        'Atraso_DPM_sempre': 'PMD_Delay','História_Familiar_Pos': 'Psyc_Family_Hist',
                        'Regressão_DPM': 'PMD_Regression', 'Sexo': 'Gender', 
                        'Audição': 'Audition', 'Visão': 'Vision', 'Raça': 'Race', 
                        })
df_original = df_original.rename(columns={'Apgar_1º': 'Apgar_1', 'Apgar_5º': 'Apgar_5',
                        'Andar': 'Walk_Age','Primeiras_Palavras': 'First_Words_Age',
                        'Primeiras_Frases': 'First_Phrases_Age', 'Idade_Diag.': 'Diag_Age',
                        'PC': 'HC', 'ADOS_resultado': 'ADOS_Sev','ADIR_cotação': 'ADIR_quot',
                        'Dismorfismos': 'Dysmorphysm', 'Regressão_Linguagem': 'Language_Regr', 
                        'Atraso_DPM_sempre': 'PMD_Delay','História_Familiar_Pos': 'Psyc_Family_Hist',
                        'Regressão_DPM': 'PMD_Regression', 'Sexo': 'Gender', 
                        'Audição': 'Audition', 'Visão': 'Vision', 'Raça': 'Race', 
                        'Diagnostico_secundario': 'ID'})

In [16]:
df = df.rename(columns={'Diagnótico_principal':'Main_Diagnosis',
                        'Diagnostico_secundario': 'Secondary_Diagnosis',
                        'Diagnóstico_clinico_3': 'Clinical_Diagnosis', 
                        'Cariótipo_Resultado':'Karyotype_Result',
                        'FRAXA_Resultado': 'FRAXA_Result',
                        'FRAXE_Resultado': 'FRAXE_Result',
                        'IdadeQDQI': 'QDQI_Age',
                        'DN': 'BD', 
                        'Data_observação': 'Observation_Date'})
df_original = df_original.rename(columns={'Diagnótico_principal':'Main_Diagnosis',
                                          'Diagnostico_secundario': 'Secondary_Diagnosis',
                                          'Diagnóstico_clinico_3': 'Clinical_Diagnosis',
                                          'Cariótipo_Resultado':'Karyotype_Result',
                                          'FRAXA_Resultado': 'FRAXA_Result',
                                          'FRAXE_Resultado': 'FRAXE_Result',
                                          'IdadeQDQI': 'QDQI_Age',
                                          'DN': 'BD',
                                          'Data_observação': 'Observation_Date'})

#### Change Data Type

In [17]:
# see columns data type
Counter(df.dtypes)

Counter({dtype('float64'): 90, dtype('O'): 215, dtype('int64'): 1})

In [18]:
# column of object data type
obj_col = list(df.select_dtypes(['O', 'object']).columns)

In [19]:
# select columns of object data type that should be int type
obj_col = ['ADIR_C_7', 'ADIR_C_8',
           'Ano_Diag.','Apgar_1','Apgar_5',
           'VABS_Aut','VABS_Com', 'CP_Global_Vin','VABS_Soc',
           'Griffiths_QD_Global', 
           'IM_G','IM_L','IM_M','IM_O/M','IM_P/S','IM_R','IM_R/P', 
           'Diag_Age', 'Idade_Gestacional_','Idade_de_início', 'QDQI_Age',
           'HC','PC_(ALTA)','PC__P', 'Peso_','Peso_(ALTA)','Peso_P',
           'First_Words_Age','First_Phrases_Age',
           'QD/QILing','QD_L','QD_M', 'QD_EH','QD_PS','QD_R',
           'QDLreav','QDMreav', 'QDEHreav','QDPSreav','QDRreav',
           'Vineland_abr_Autonomia_DP-P','Vineland_abr_Socialização_DP-P',
           'Vineland_abr_comunicação_DP-_P',
           'Vineland_abr_global_DP-P','Vineland_abr_motricidade_DP-P',
           'Vineland_ext_autonomia_DP-P',
           'Vineland_ext_comunicação_DP-P','Vineland_ext_global_DP-P',
           'Vineland_ext_socialização_DP-P',
           'WISCIII_Pcv','WISCIII_Pop',
           'WISCIII_Pvp','WPPSI-R_QIec',
           'WPPSI-R_QIr','WPPSI-R_QIv',
           'nivel_autonomia_V']

In [20]:
# Transform selected columns of object data type to int type
for v in obj_col:
    df[v] = df[v].apply(pd.to_numeric, errors='coerce')
    df_original[v] = df_original[v].apply(pd.to_numeric, errors='coerce')

#### Change Values' Names

##### Race

In [21]:
df['Race'].unique()

array(['Caucasiano', 'Africano', nan, 'Asiático', 'Hispânico', 'Brasil'],
      dtype=object)

In [22]:
df['Race'] = df['Race'].replace(['Hispânico', 'Brasil', np.nan, 'nan'], pd.NA)
df_original['Race'] = df_original['Race'].replace(['Hispânico', 'Brasil', np.nan], pd.NA)

In [23]:
df['Race'] = df['Race'].replace(['Caucasiano'], 'White')
df_original['Race'] = df_original['Race'].replace(['Caucasiano'], 'White')

In [24]:
df['Race'] = df['Race'].replace(['Africano'], 'African American')
df_original['Race'] = df_original['Race'].replace(['Africano'], 'African American')

In [25]:
df['Race'] = df['Race'].replace(['Asiático'], 'Asian')
df_original['Race'] = df_original['Race'].replace(['Asiático'], 'Asian')

##### ADOS Severity

In [26]:
df.ADOS_Sev.unique()

array([nan, 'Autismo', 'Pos', 'PEA', 'Ñ Autismo'], dtype=object)

In [27]:
df['ADOS_Sev'] = df['ADOS_Sev'].replace(['nan', np.nan], pd.NA)
df_original['ADOS_Sev'] = df_original['ADOS_Sev'].replace(['nan', np.nan], pd.NA)

In [28]:
df['ADOS_Sev'] = df['ADOS_Sev'].replace(['Autismo', 'Pos'], 'Autism')
df_original['ADOS_Sev'] = df_original['ADOS_Sev'].replace(['Autismo', 'Pos'], 'Autism')

In [29]:
df['ADOS_Sev'] = df['ADOS_Sev'].replace(['PEA'], 'ASD')
df_original['ADOS_Sev'] = df_original['ADOS_Sev'].replace(['PEA'], 'ASD')

In [30]:
df['ADOS_Sev'] = df['ADOS_Sev'].replace(['Ñ Autismo'], 'Non Spectrum')
df_original['ADOS_Sev'] = df_original['ADOS_Sev'].replace(['Ñ Autismo'], 'Non Spectrum')

##### ADIR Quotation

In [31]:
df.ADIR_quot.unique()

array(['Pos', 'pos', nan, 'Neg', 'Duv'], dtype=object)

In [32]:
df['ADIR_quot'] = df['ADIR_quot'].replace(['nan', np.nan], pd.NA)
df_original['ADIR_quot'] = df_original['ADIR_quot'].replace(['nan', np.nan], pd.NA)

In [33]:
df['ADIR_quot'] = df['ADIR_quot'].replace(['Pos', 'pos'], 'Positive')
df_original['ADIR_quot'] = df_original['ADIR_quot'].replace(['Pos', 'pos'], 'Positive')

In [34]:
df['ADIR_quot'] = df['ADIR_quot'].replace(['Neg', 'Duv'], 'Negative')
df_original['ADIR_quot'] = df_original['ADIR_quot'].replace(['Neg', 'Duv'], 'Negative')

#### Verbal

In [35]:
df.verbal.unique()

array(['n', 's', 'N', nan], dtype=object)

In [36]:
df['verbal'] = df['verbal'].replace(['nan', np.nan], pd.NA)
df_original['verbal'] = df_original['verbal'].replace(['nan', np.nan], pd.NA)

In [37]:
df['verbal'] = df['verbal'].replace(['n','N'], 'No')
df_original['verbal'] = df_original['verbal'].replace(['n','N'], 'No')

In [38]:
df['verbal'] = df['verbal'].replace(['s'], 'Yes')
df_original['verbal'] = df_original['verbal'].replace(['s'], 'Yes')

### Initial Observations

In [39]:
df.Main_Diagnosis.unique()

array(['Autismo1', 'Autismoatipico', 'Autismoduvidoso',
       'Evoluçãonãoautismo', 'Problemas Comportamento', 'Autismo'],
      dtype=object)

In [40]:
# index from samples with Main_Diagnosis Evoluçãonãoautismo
main_diag_ev = df[(df['Main_Diagnosis'] == 'Evoluçãonãoautismo')].index

In [41]:
main_diag_ev

Index(['AA783', 'AA839'], dtype='object', name='Code')

In [42]:
# index from samples with a doubtful Main_Diagnosis
main_diag_doub = df[(df['Main_Diagnosis'] == 'Autismoduvidoso')].index

In [43]:
# index from samples with a doubtful Main_Diagnosis and the respective ADOS result
ados_main_diag_doub = {}
for code in main_diag_doub:
    ados_main_diag_doub[code] = df.ADOS_Sev[code]
print(ados_main_diag_doub)

{'AA676': 'ASD', 'AA715': 'Non Spectrum', 'AA626': 'ASD', 'AA652': 'Autism', 'AA752': 'ASD', 'AA713': 'ASD', 'AA708': 'ASD', 'AA740': 'Autism', 'AA702': 'Autism', 'AA685': 'Autism', 'AA732': 'ASD', 'AA792': 'ASD'}


In [44]:
# index from samples with a doubtful Main_Diagnosis and the respective ADIR result
adir_main_diag_doub = {}
for code in main_diag_doub:
    adir_main_diag_doub[code] = df.ADIR_quot[code]
print(adir_main_diag_doub)

{'AA676': 'Negative', 'AA715': 'Positive', 'AA626': 'Negative', 'AA652': 'Negative', 'AA752': 'Negative', 'AA713': 'Negative', 'AA708': 'Negative', 'AA740': 'Negative', 'AA702': 'Negative', 'AA685': 'Negative', 'AA732': 'Negative', 'AA792': 'Negative'}


In [45]:
# profile report of dataset
profile = ProfileReport(df, title='Initial Dataset Profile', minimal = True)
# export profile report 
profile.to_file("Initial Dataset Profile")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Exclusion of Samples with  Negative ADIR/ADOS  Results

In [46]:
# index from null samples for both ADIR and ADOS
adir_ados_null = df[(df['ADIR_quot'].isnull())
                    &(df['ADOS_Sev'].isnull())].index

In [47]:
adir_ados_null

Index(['AA4', 'AA24', 'AA30', 'AA44', 'AA76', 'AA179', 'AA188', 'AA609',
       'AA770', 'AA940', 'AA811', 'AA833', 'AA875', 'AA869', 'AA857', 'AA863',
       'AA918', 'AA926', 'AA939', 'AA933', 'AA948', 'AA949', 'AA947', 'AA952',
       'AA950', 'AA951'],
      dtype='object', name='Code')

In [48]:
# index from null samples for ADIR and ADOS and the respective result of Diagnostico_principal
adir_ados_null_dic = {}
for code in adir_ados_null:
    adir_ados_null_dic[code] = df.Main_Diagnosis[code]
print(adir_ados_null_dic)

{'AA4': 'Autismo1', 'AA24': 'Autismo1', 'AA30': 'Autismo1', 'AA44': 'Autismo1', 'AA76': 'Autismo1', 'AA179': 'Autismo1', 'AA188': 'Autismo1', 'AA609': 'Autismo1', 'AA770': 'Autismo1', 'AA940': 'Autismo1', 'AA811': 'Autismo1', 'AA833': 'Autismo1', 'AA875': 'Autismo1', 'AA869': 'Autismo1', 'AA857': 'Autismo1', 'AA863': 'Autismo1', 'AA918': 'Autismo1', 'AA926': 'Autismo1', 'AA939': 'Autismo1', 'AA933': 'Autismo1', 'AA948': 'Autismo1', 'AA949': 'Autismo1', 'AA947': 'Autismo1', 'AA952': 'Autismo1', 'AA950': 'Autismo1', 'AA951': 'Autismo1'}


In [49]:
# index from samples with ADIR negative 
adir_neg = df[(df['ADIR_quot'] == 'Negative')].index

In [50]:
# index from samples with ADIR negative and the respective ADOS result
adir_neg_dic = {}
for code in adir_neg:
    adir_neg_dic[code] = df.ADOS_Sev[code]
print(adir_neg_dic)

{'AA71': <NA>, 'AA108': <NA>, 'AA162': <NA>, 'AA340': 'Autism', 'AA349': 'ASD', 'AA805': 'ASD', 'AA475': 'Autism', 'AA428': 'Autism', 'AA409': 'ASD', 'AA547': 'Autism', 'AA540': 'ASD', 'AA584': 'Autism', 'AA552': 'ASD', 'AA627': 'Autism', 'AA698': 'ASD', 'AA671': <NA>, 'AA595': 'Autism', 'AA546': <NA>, 'AA570': 'Autism', 'AA810': 'Autism', 'AA541': 'ASD', 'AA558': 'Autism', 'AA556': <NA>, 'AA585': 'Autism', 'AA676': 'ASD', 'AA579': 'ASD', 'AA610': <NA>, 'AA548': 'Autism', 'AA582': <NA>, 'AA561': 'Autism', 'AA566': 'Autism', 'AA743': 'Autism', 'AA925': 'ASD', 'AA611': 'Autism', 'AA800': 'Autism', 'AA626': 'ASD', 'AA651': 'ASD', 'AA597': 'ASD', 'AA622': 'Autism', 'AA666': <NA>, 'AA813': 'Non Spectrum', 'AA637': 'Autism', 'AA612': <NA>, 'AA642': 'ASD', 'AA628': 'Non Spectrum', 'AA662': 'Non Spectrum', 'AA654': 'Autism', 'AA652': 'Autism', 'AA667': 'Autism', 'AA673': 'ASD', 'AA703': 'Autism', 'AA752': 'ASD', 'AA665': 'ASD', 'AA663': 'ASD', 'AA713': 'ASD', 'AA658': 'Autism', 'AA684': 'Autis

In [51]:
# index from samples with ADOS negative
ados_neg = df[(df['ADOS_Sev'] == 'Non Spectrum')].index

In [52]:
# index from samples with ADOS negative and the respective ADIR result
ados_neg_dic = {}
for code in ados_neg:
    ados_neg_dic[code] = df.ADIR_quot[code]
print(ados_neg_dic)

{'AA109': 'Positive', 'AA751': 'Positive', 'AA587': 'Positive', 'AA715': 'Positive', 'AA813': 'Negative', 'AA628': 'Negative', 'AA662': 'Negative', 'AA736': 'Negative', 'AA775': 'Negative', 'AA893': 'Negative', 'AA919': <NA>, 'AA886': <NA>, 'AA839': <NA>, 'AA876': 'Positive', 'AA889': <NA>, 'AA900': 'Positive', 'AA929': <NA>}


In [53]:
# index from samples with ADIR or ADOS negative
adir_neg_ados_neg = df[(df['ADIR_quot'] == 'Negative')&(df['ADOS_Sev'] == 'Non Spectrum')
                       |(df['ADIR_quot'] == 'Negative')&(df['ADOS_Sev'].isnull())
                       |(df['ADIR_quot'].isnull())&(df['ADOS_Sev'] == 'Non Spectrum')].index

In [54]:
# how many cases of samples with ADIR or ADOS negative
len(adir_neg_ados_neg)

21

In [55]:
# drop samples with ADIR or ADOS negative
df.drop(adir_neg_ados_neg, inplace = True)

In [56]:
# index of null ADOS samples
ados_null = df[(df['ADOS_Sev'].isnull())].index

In [57]:
# how many cases of null ADOS samples
len(ados_null)

217

In [58]:
# index of null ADIR samples 
adir_null = df[(df['ADIR_quot'].isnull())].index

In [59]:
# how many cases of null ADIR samples
len(adir_null)

77

In [60]:
# index of null ADIR and ADOS samples
adir_ados_null = df[(df['ADIR_quot'].isnull())
                    &(df['ADOS_Sev'].isnull())].index

In [61]:
# how many cases of null ADIR and ADOS samples
len(adir_ados_null)

26

In [62]:
# index of positive samples for ADIR and null for ADOS
adir_pos_ados_null = df[((df['ADIR_quot'] == 'Positive')
                         &(df['ADOS_Sev'].isnull()))].index

In [63]:
# how many cases of positive samples for ADIR and null for ADOS
len(adir_pos_ados_null)

191

In [64]:
# index of positve samples for ADOS and null for ADIR
adir_null_ados_pos = df[((df['ADIR_quot'].isnull())&(df['ADOS_Sev'] == 'Autism'))
                        |((df['ADIR_quot'].isnull())&(df['ADOS_Sev'] == 'ASD'))].index

In [65]:
# how many cases of positve samples for ADOS and null for ADIR
len(adir_null_ados_pos)

51

In [66]:
# see values in column Main Diagnosis
Counter(df['Main_Diagnosis'])

Counter({'Autismo1': 805,
         'Autismoatipico': 109,
         'Autismoduvidoso': 12,
         'Evoluçãonãoautismo': 1,
         'Autismo': 1})

In [67]:
# index of samples with Diagnostico_principal duvidoso
main_diag_doubt = df[(df['Main_Diagnosis'] == 'Autismoduvidoso')].index

In [68]:
# index from samples with Diagnostico_principal duvidoso and the respective ADOS result
main_diag_doubt_dic = {}
for code in main_diag_doubt:
    main_diag_doubt_dic[code] = df.ADOS_Sev[code]
print(main_diag_doubt_dic)

{'AA676': 'ASD', 'AA715': 'Non Spectrum', 'AA626': 'ASD', 'AA652': 'Autism', 'AA752': 'ASD', 'AA713': 'ASD', 'AA708': 'ASD', 'AA740': 'Autism', 'AA702': 'Autism', 'AA685': 'Autism', 'AA732': 'ASD', 'AA792': 'ASD'}


In [69]:
# index from samples with Diagnostico_principal duvidoso and the respective ADIR result
main_diag_doubt_dic = {}
for code in main_diag_doubt:
    main_diag_doubt_dic[code] = df.ADIR_quot[code]
print(main_diag_doubt_dic)

{'AA676': 'Negative', 'AA715': 'Positive', 'AA626': 'Negative', 'AA652': 'Negative', 'AA752': 'Negative', 'AA713': 'Negative', 'AA708': 'Negative', 'AA740': 'Negative', 'AA702': 'Negative', 'AA685': 'Negative', 'AA732': 'Negative', 'AA792': 'Negative'}


In [70]:
len(df)

928

### Exclusion of Samples with Altered Karyotype and FRAXE/FRAXE Results

#### Karyotype

In [71]:
# see values of column Cariótipo_Resultado
df.Karyotype_Result.unique()

array(['Normal ',
       'Cariótipo alterado_x001D_FISH cr15-N_x001D_CMV gain ANXA1',
       'Normal _x001D_Fish 16 N / Fish cr 15 N',
       'Normal _x001D_FISH cr15-N',
       'Normal _x001D_FISH cr15 e subteloméricas -N',
       'Normal _x001D_FISH cr15-N_x001D_Pack Autism  kit P343 (ch15,16,22) e P245 (Microdeletion S) - N  _x001D_Subtelomericas - N',
       'Normal _x001D_Pack Autism  kit P343 (ch15,16,22) e P245 (Microdeletion S) - N  _x001D_Subtelomericas - N',
       'alterado_x001D_47xyy, 45x', 'alterado_x001D_FISH cr15 alterado',
       'Normal _x001D_MLPA cr15 e subteloméricas - N',
       'Cariótipo normal ', nan, 'Normal _x001D_MLPA cr15 - N',
       'Normal _x001D_subteloméricas e FISH cr15 - N',
       'Normal _x001D_FISH cr15-N_x001D_Array CGH- N (46,XX.arr(1-22,)x2(,XX)x1',
       'Array CGH alterado_x001D_FISH cr15-N_x001D_KCNQ3 gene on chromosome 8q24 encoding the voltage-gated potassium channel KV7.3 subunit ',
       'FISH cr15-N normal_x001D_subteloméricas-N',
   

In [72]:
# index of samples with altered Cariótipo_Resultado, but do not contain the word normal and array
index_alt = df[(df['Karyotype_Result'].str.contains('alterado', regex=False, case=False, na=False)) 
               & ~(df['Karyotype_Result'].str.contains('array', regex=False, case=False, na=False)) 
               & ~(df['Karyotype_Result'].str.contains('normal', regex=False, case=False, na=False))].index

In [73]:
# how many samples with altered Cariótipo_Resultado, but do not contain the word normal and array
len(index_alt)

12

In [74]:
# drop samples with altered Cariótipo_Resultado, but do not contain the word normal and array
df.drop(index_alt, inplace = True)

In [75]:
# see potential samples that were not droped
df_car_alt = df[(df['Karyotype_Result'].str.contains('cariótipo alterado', regex=False, case=False, na=False))]
df_car_alt.Karyotype_Result.unique()

array(['FISH cr15-N normal_x001D_Regiões subteloméricas-N_x001D_Cariótipo alterado_x001D_Array CGH alterado',
       "Cariótipo alterado_x001D_trissomia parcial 6qter_x001D_10 Mb duplication 6q25.3-q27, de novo, 46,XX.ish der(22)t(6;22)(6q25.3;p11.2)pat(6qtel+)10 Mb duplication 6q25.3-q27, de novo, identified with SNP array. High resolution karyotype was normal; subtelomere FISH analysis revealed a 6q terminal duplication arising from a paternal balanced translocation, 46,XY, t(6;22)(q25.3;p11.2). The father is healthy. The patient's karyotype is: 46,XX.ish der(22)t(6;22)(6q25.3;p11.2)pat(6qtel+)_x000B_",
       'Array CGH_x001D_Cariótipo alterado_x001D_deleção 7(q21.11)',
       'Array CGH_x001D_Cariótipo alterado_x001D_duplicação terminal 3(q29)',
       'Array CGH_x001D_Cariótipo alterado_x001D_duplicação no braço curto do cromossoma 4(p16.1); triplicação no barço longo do cromossoma 11(q14.2); duplicação no braço longo do cromossoma 8(q22.1q.22.2)',
       'Array CGH_x001D_Cariótip

In [76]:
# index of samples which contain an alteread Cariótipo_Resultado
car_alt = df[(df['Karyotype_Result'].str.contains('cariótipo alterado', regex=False, case=False, na=False))].index

In [77]:
# how many samples which contain an alteread Cariótipo_Resultado
len(car_alt)

9

In [78]:
# drop samples which contain an alteread Cariótipo_Resultado
df.drop(car_alt, inplace = True)

#### FRAXA 

In [79]:
# see values in column FRAXA_Resultado
df.FRAXA_Result.unique()

array(['Normal', nan, 'não cultivou', 'aguarda', 'Repetir',
       'alterado , aumento nº tripletos CGG, compatível mutações mais frequentes ',
       'alterado', 'amostra insuficiente',
       'pre-mutação do gene FMR1 (57 repetições)',
       'mutação completa >200 repetições',
       'alterado alelo intermedio 43 repetições CGG',
       'alelo intermedio 43 repetições CGG',
       'alterado um alelo normal 28 e um intermedio 48 repetições CGG ',
       'alterado alelo intermedio 41 repetições CGG',
       'Normal 28 repetições CGG',
       'alterado alelo intermedio 48 repetições CGG', 'Normal (fora)',
       'alterado alelo intermedio 45 repetições CGG', 'Normal (exterior)'],
      dtype=object)

In [80]:
# index from samples with altered FRAXA result
fraxa_alt = df[(df['FRAXA_Result'].str.contains('alterado', regex=False, case=False, na=False))].index

In [81]:
# how many samples with altered FRAXA result
len(fraxa_alt)

9

In [82]:
# drop samples with altered FRAXA result
df.drop(fraxa_alt, inplace = True)

In [83]:
# see values in column FRAXA_Resultado
Counter(df.FRAXA_Result)

Counter({'Normal': 777,
         nan: 109,
         'não cultivou': 1,
         'aguarda': 1,
         'Repetir': 3,
         'amostra insuficiente': 1,
         'pre-mutação do gene FMR1 (57 repetições)': 1,
         'mutação completa >200 repetições': 1,
         'alelo intermedio 43 repetições CGG': 1,
         'Normal 28 repetições CGG': 1,
         'Normal (fora)': 1,
         'Normal (exterior)': 1})

In [84]:
# index of samples that also contain an altered FRAXA result
fraxa_alt_2 = df[df['FRAXA_Result'].isin(['pre-mutação do gene FMR1 (57 repetições)', 
                                             'mutação completa >200 repetições', 
                                             'alelo intermedio 43 repetições CGG'])].index

In [85]:
# how many samples with altered FRAXA result
len(fraxa_alt_2)

3

In [86]:
# drop samples with altered FRAXA result
df.drop(fraxa_alt_2, inplace = True)

#### FRAXE

In [87]:
# see values in column FRAXE_Resultado
Counter(df.FRAXE_Result)

Counter({'Normal': 384,
         nan: 505,
         'repetir': 4,
         'hipermetilação e >200 repetições': 1,
         'amostra insuficiente': 1})

In [88]:
# index from samples with altered FRAXE results
fraxe_alt = df[(df['FRAXE_Result'].str.contains('hipermetilação', regex=False, case=False, na=False))].index

In [89]:
# how many samples with altered FRAXE results
len(fraxe_alt)

1

In [90]:
# drop samples with altered FRAXE results
df.drop(fraxe_alt, inplace = True)

### Exclusion of Samples with severe Intelectual Deficiency (ID) 

In [91]:
# see cases of severee ID (DM and AGDPM)
Counter(df.Clinical_Diagnosis)

Counter({nan: 840,
         'Surdez': 2,
         'Epilepsia': 11,
         'Autismoatipico': 1,
         'Asperger': 28,
         'Evoluçãonãoautismo': 4,
         'Perturbação tiques': 2,
         'Sem deficiênciamental (>70)': 2,
         'DM ligeira (50-69)': 2,
         'AGDPM': 1,
         'DM moderada (35-49)': 1})

In [92]:
# index of samples with severe ID
sev_id = df[(df['Clinical_Diagnosis'] == 'AGDPM')].index

In [93]:
# drop samples with severe ID
df.drop(sev_id, inplace = True)

In [94]:
# see cases of severe ID (DM and AGDPM)
Counter(df.Secondary_Diagnosis)

Counter({'DM severa a profunda': 25,
         'DM severa a profunda (<34)': 28,
         'DM moderada (35-49)': 99,
         'DM ligeira a moderada': 53,
         'Sem deficiênciamental (>70)': 308,
         'DM ligeira (50-69)': 178,
         'DM moderada': 1,
         'Sem deficiênciamental': 83,
         'Nivel funcional desc': 9,
         'DM ligeira a moderada_x001D__x001D_FALECIDO': 1,
         'DM severa a profunda (<34)_x001D__x001D_Surdez NS profunda': 1,
         'Sem deficiênciamental_x001D_ ': 1,
         'DM ligeira a moderada ': 4,
         'Nivel funcional desc_x001D__x001D_???': 1,
         'DM ligeira a moderada _x001D__x001D_???': 1,
         'Sem deficiênciamental ': 21,
         'DM ligeira (50-69)_x001D__x001D_???': 1,
         'Nivel intelectual borderline': 14,
         'Sem deficiênciamental (>70)_x001D__x001D_Disfunçao cerebral mínima': 1,
         'Défice Intelectual': 9,
         'DM': 4,
         'AGDPM': 11,
         nan: 35,
         'P Desenv Linguagem': 

In [95]:
# index of samples with severe ID
sev_id_2 = df[((df['Secondary_Diagnosis'] == 'DM') | (df['Secondary_Diagnosis'] == 'AGDPM'))].index

In [96]:
# drop samples with severe ID
df.drop(sev_id_2, inplace = True)

In [97]:
len(df)

878

In [98]:
# profile report of dataset
profile = ProfileReport(df, title='Dataset After Samples Removal Profile', minimal = True)
# export profile report 
profile.to_file("Dataset After Samples Removal Profile")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Create New Verbal Column Based on ADIR Communication Values

In [99]:
# see values in column verbal
Counter(df.verbal)

Counter({'No': 383, 'Yes': 173, <NA>: 322})

In [100]:
# index of non verbal samples 
not_verbal = df[ (df['verbal'] == 'No') | (df['verbal'].isnull())].index.tolist()

In [101]:
# how many non verbal samples
len(not_verbal)

705

In [102]:
# index of verbal samples
verbal = df[(df['verbal'] == 'Yes')|(df['verbal'].isnull())].index.tolist()

In [103]:
# how many verbal samples
len(verbal)

495

In [104]:
# see values in column ADIR_C_7
df.ADIR_C_7.unique()

array([14., 12., 13., nan, 11.,  8.,  9., 10., 18.,  7.,  6.,  5., 16.,
        4.,  3.,  1.,  2.])

In [105]:
# see values in column ADIR_C_8
df.ADIR_C_8.unique()

array([nan, 24.,  8., 20., 18., 14., 13., 22., 17., 23., 26., 12.,  9.,
       21., 15., 16., 19., 25., 11., 10.,  2.,  4.,  3.,  5.,  6.,  7.,
        0.])

In [106]:
# make True if there are values for verbal (ADIR_C_8) and false if there are values for non verbal (ADIR_C_7)
def set_verbal(verbal, nverbal):
    if (~(pd.isnull(verbal)) & pd.isnull(nverbal)):
        verbal = True
    elif (pd.isnull(verbal) & pd.isnull(nverbal)):
        verbal = pd.NA
    else:
        verbal= False
    return verbal

In [107]:
# creat new column Verbal 
df['Verbal'] = df.apply(lambda x: set_verbal(x['ADIR_C_8'], x['ADIR_C_7']), axis=1)
df_original['Verbal'] = df.apply(lambda x: set_verbal(x['ADIR_C_8'], x['ADIR_C_7']), axis=1)

In [108]:
# see values of column Verbal
Counter(df.Verbal)

Counter({False: 433, True: 369, <NA>: 76})

In [109]:
# Index of samples classified as verbal in ADIR_C_8
verbal_true = df.Verbal[df.Verbal == True].index.tolist()

In [110]:
# Index of samples classified as non verbal in ADIR_C_7
verbal_false = df.Verbal[df.Verbal == False].index.tolist()

In [111]:
# check if lists of non verabl samples are in agreenment 
check_not_verbal =  all(item in not_verbal for item in verbal_false)
 
if check_not_verbal is True:
    print('Yes, not_verbal contains all elements of verbal_false')    
else :
    print('No, not_verbal doesn\'t have all elements of verbal_false.')

Yes, not_verbal contains all elements of verbal_false


In [112]:
# check of lists with information for verabl samples are in agreenment  
check_verbal =  all(item in verbal for item in verbal_true)
 
if check_verbal is True:
    print("Yes, verbal contains all elements of verbal_true.")    
else :
    print("No, verbal doesn't have all elements of the verbal_true.")

No, verbal doesn't have all elements of the verbal_true.


In [113]:
# create list with codes from verbal samples that do not agree 
main_list = list(set(verbal_true) - set(verbal))
main_list

['AA285', 'AA934', 'AA252', 'AA292', 'AA300', 'AA510', 'AA261']

In [114]:
df.loc[['AA300', 'AA510', 'AA292', 'AA934', 'AA285', 'AA261', 'AA252'],
       ['QDMreav', 'QDPSreav', 'QDLreav', 'QDEHreav','QDRreav',
        'QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R',
        'Diag_Age', 'QDQI_Age', 'Verbal', 'verbal']]

Unnamed: 0_level_0,QDMreav,QDPSreav,QDLreav,QDEHreav,QDRreav,QD_M,QD_PS,QD_L,QD_EH,QD_R,Diag_Age,QDQI_Age,Verbal,verbal
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AA300,116.0,116.0,120.0,130.0,136.0,138.0,74.0,42.0,60.0,79.0,2.0,32.0,True,No
AA510,,,,,,157.0,102.0,72.0,104.0,104.0,2.0,22.0,True,No
AA292,,,,,,106.0,95.0,54.0,78.0,82.0,4.0,58.0,True,No
AA934,,,,,,58.0,40.0,20.0,31.0,34.0,,86.0,True,No
AA285,,,,,,83.0,71.0,71.0,81.0,68.0,4.0,66.0,True,No
AA261,,,,,,108.0,102.0,95.0,119.0,95.0,2.0,59.0,True,No
AA252,,,,,,76.0,57.0,59.0,69.0,67.0,2.0,29.0,True,No


In [115]:
# replace values of column Verbal
df.Verbal = df.Verbal.replace({True: 'Yes', False: 'No'})
df_original.Verbal = df_original.Verbal.replace({True: 'Yes', False: 'No'})

### Update QD Columns with Reavaluation Values 

In [116]:
df.loc[['AA605', 'AA625', 'AA300', 'AA719', 'AA689', 'AA179'],
       ['QDMreav', 'QDPSreav', 'QDLreav', 'QDEHreav','QDRreav',
        'QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R']]

Unnamed: 0_level_0,QDMreav,QDPSreav,QDLreav,QDEHreav,QDRreav,QD_M,QD_PS,QD_L,QD_EH,QD_R
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AA605,82.0,68.0,68.0,80.0,80.0,,57.0,70.0,54.0,60.0
AA625,91.0,88.0,76.0,102.0,132.0,68.0,63.0,,77.0,86.0
AA300,116.0,116.0,120.0,130.0,136.0,138.0,74.0,42.0,60.0,79.0
AA719,66.0,34.0,21.0,34.0,59.0,58.0,58.0,58.0,58.0,58.0
AA689,90.0,81.0,77.0,63.0,86.0,85.0,59.0,52.0,69.0,59.0
AA179,,,,,,33.0,24.0,,23.0,34.0


In [117]:
df['QD_M'] = df.apply(lambda x: x['QD_M'] if pd.isna(x['QDMreav']) else x['QDMreav'], axis=1)

In [118]:
df['QD_PS'] = df.apply(lambda x: x['QD_PS'] if pd.isna(x['QDPSreav']) else x['QDPSreav'], axis=1)

In [119]:
df['QD_EH'] = df.apply(lambda x: x['QD_EH'] if pd.isna(x['QDEHreav']) else x['QDEHreav'], axis=1)

In [120]:
df['QD_R'] = df.apply(lambda x: x['QD_R'] if pd.isna(x['QDRreav']) else x['QDRreav'], axis=1)

In [121]:
df['QD_L'] = df.apply(lambda x: x['QD_L'] if pd.isna(x['QDLreav']) else x['QDLreav'], axis=1)

### Exclusion of Samples with ages > 8 When Griffiths Was Performed

In [122]:
# Convert age values from months to years
df.QDQI_Age = df.QDQI_Age.div(12).round(0)
df_original.QDQI_Age = df_original.QDQI_Age.div(12).round(0)

In [123]:
df.QDQI_Age.unique()

array([ 8.000e+00,  7.000e+00,  1.000e+01,  9.000e+00,  1.600e+01,
        1.100e+01,        nan,  4.000e+00,  6.000e+00,  3.000e+00,
        1.200e+01,  5.000e+00,  1.400e+01,  1.300e+01,  2.000e+00,
        1.900e+01, -1.996e+03, -2.006e+03,  0.000e+00, -2.008e+03])

In [124]:
# Indexes with strange age values
strange_ages = df[df.QDQI_Age <= 0].index
strange_ages

Index(['AA514', 'AA573', 'AA663', 'AA768', 'AA859'], dtype='object', name='Code')

In [125]:
# Drop indexes with strange ages that do not have QD information
strange_ages_withou_qd = ['AA573', 'AA663', 'AA514', 'AA859']
df.drop(strange_ages_withou_qd, inplace = True)

In [126]:
# Create dataframe with QD and Age values 
qd_age = df[['QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R', 
             'QDMreav', 'QDEHreav', 'QDPSreav', 'QDRreav', 'QDLreav',
             'Diag_Age', 'QDQI_Age']].copy()

In [127]:
# Number of samples with ages less or equal to 8 at the time of Griffiths evaluation
len(df[(df.QDQI_Age <= 8)])

670

In [128]:
# Number of samples with ages greater thab 8 at the time of Griffiths evaluation
age_qd_h8 = df[(df.QDQI_Age > 8)].index
len(age_qd_h8)

102

In [129]:
# Drop of all samples with ages greater thab 8 at the time of Griffiths evaluation
df.drop(age_qd_h8, inplace = True)

In [130]:
# Index of samples with ages less or equal to 8 at the time of Griffiths evaluation, but do not have any value for QD
age_qd_l8 = df[(df.QDQI_Age <= 8) 
               & (df.QD_M.isnull()) & (df.QD_PS.isnull()) 
               & (df.QD_EH.isnull()) & (df.QD_R.isnull())
               & (df.QD_L.isnull())].index
age_qd_l8

Index(['AA925'], dtype='object', name='Code')

In [131]:
# Number of samples with diagnostic age greater than 8
len(df[(df.Diag_Age > 8)])

18

In [132]:
# Index of samples with diagnostic age greater than 8 but with no values for age at the time of Griffiths evaluation 
# and with QD values
qd_null_da_h8 = df[(df.Diag_Age > 8) & (df.QDQI_Age.isnull()) 
               & (~df.QD_M.isnull()) & (~df.QD_PS.isnull()) 
               & (~df.QD_EH.isnull()) & (~df.QD_R.isnull())
               & (~df.QD_L.isnull())].index
qd_null_da_h8

Index(['AA158'], dtype='object', name='Code')

In [133]:
df.drop(qd_null_da_h8, inplace = True)

In [134]:
# Index of samples with diagnostic age greater than 8 but less or equal to 8 at the time of Griffiths evaluation 
# and with QD values
da_h8_qd_l8 = df[(df.Diag_Age > 8) & (df.QDQI_Age <= 8)  
               & (~df.QD_M.isnull()) & (~df.QD_PS.isnull()) 
               & (~df.QD_EH.isnull()) & (~df.QD_R.isnull())
               & (~df.QD_L.isnull())].index
da_h8_qd_l8

Index(['AA136'], dtype='object', name='Code')

In [135]:
df.drop(da_h8_qd_l8, inplace = True)

In [136]:
# Index of samples with diagnostic age greater than 8 but with no values for age at the time of Griffiths evaluation
# and with no QD values
diag_age_h8 = df[(df.Diag_Age > 8) & (df.QDQI_Age.isnull())
               & (df.QD_M.isnull()) & (df.QD_PS.isnull()) 
               & (df.QD_EH.isnull()) & (df.QD_R.isnull())
               & (df.QD_L.isnull())].index
diag_age_h8

Index(['AA16', 'AA85', 'AA101', 'AA190', 'AA496', 'AA349', 'AA328', 'AA322',
       'AA265', 'AA406', 'AA335', 'AA401', 'AA374', 'AA461', 'AA462', 'AA604'],
      dtype='object', name='Code')

In [137]:
# Drop of samples with diagnostic age greater than 8 but with no values for age at the time of Griffiths evaluation
# and with no QD values
df.drop(diag_age_h8, inplace = True)

In [138]:
qd_age

Unnamed: 0_level_0,QD_M,QD_PS,QD_L,QD_EH,QD_R,QDMreav,QDEHreav,QDPSreav,QDRreav,QDLreav,Diag_Age,QDQI_Age
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AA1,47.0,35.0,13.0,28.0,38.0,,,,,,2.0,8.0
AA34,43.0,27.0,18.0,25.0,31.0,,,,,,7.0,8.0
AA36,52.0,35.0,8.0,31.0,45.0,,,,,,6.0,7.0
AA72,29.0,21.0,23.0,21.0,20.0,,,,,,,7.0
AA86,30.0,26.0,12.0,18.0,29.0,,,,,,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...
AA946,,,,,,,,,,,,
AA943,,,,,,,,,,,,
AA944,,,,,,,,,,,,
AA950,,,,,,,,,,,,


In [139]:
# Index of samples that do not have values for QD 
qd_na = df[(df.QD_M.isnull()) & (df.QD_PS.isnull()) 
                   & (df.QD_EH.isnull()) & (df.QD_R.isnull())
                   & (df.QD_L.isnull())].index
qd_na

Index(['AA161', 'AA97', 'AA141', 'AA125', 'AA305', 'AA226', 'AA235', 'AA284',
       'AA388', 'AA273', 'AA312', 'AA296', 'AA478', 'AA301', 'AA481', 'AA367',
       'AA454', 'AA763', 'AA415', 'AA521', 'AA577', 'AA500', 'AA434', 'AA547',
       'AA516', 'AA584', 'AA549', 'AA635', 'AA743', 'AA925', 'AA800', 'AA603',
       'AA598', 'AA688', 'AA923', 'AA725', 'AA785', 'AA849', 'AA802', 'AA756',
       'AA896', 'AA891', 'AA833', 'AA866', 'AA831', 'AA857', 'AA937', 'AA843',
       'AA870', 'AA848', 'AA882', 'AA898', 'AA853', 'AA876', 'AA860', 'AA918',
       'AA888', 'AA808', 'AA911', 'AA906', 'AA907', 'AA884', 'AA926', 'AA917',
       'AA910', 'AA939', 'AA920', 'AA931', 'AA924', 'AA948', 'AA941', 'AA936',
       'AA949', 'AA935', 'AA947', 'AA946', 'AA943', 'AA944', 'AA950', 'AA951'],
      dtype='object', name='Code')

In [140]:
# Index of samples that do not have values for age or QD 
qd_age_na = df[(df.Diag_Age.isnull()) & (df.QDQI_Age.isnull())
                   & (df.QD_M.isnull()) & (df.QD_PS.isnull()) 
                   & (df.QD_EH.isnull()) & (df.QD_R.isnull())
                   & (df.QD_L.isnull())].index
qd_age_na

Index(['AA97', 'AA603', 'AA688', 'AA923', 'AA725', 'AA785', 'AA849', 'AA802',
       'AA756', 'AA896', 'AA891', 'AA833', 'AA866', 'AA831', 'AA857', 'AA937',
       'AA843', 'AA870', 'AA848', 'AA882', 'AA898', 'AA853', 'AA876', 'AA860',
       'AA918', 'AA888', 'AA808', 'AA911', 'AA906', 'AA907', 'AA884', 'AA926',
       'AA917', 'AA910', 'AA939', 'AA920', 'AA931', 'AA924', 'AA948', 'AA941',
       'AA936', 'AA949', 'AA935', 'AA947', 'AA946', 'AA943', 'AA944', 'AA950',
       'AA951'],
      dtype='object', name='Code')

In [141]:
# Samples that do not have values for age or QD and their date of birth or/and date of observation
qd_age_na_date = df.loc[['AA97', 'AA603', 'AA688', 'AA923', 'AA725', 'AA785', 'AA849', 'AA802',
       'AA756', 'AA896', 'AA891', 'AA833', 'AA866', 'AA831', 'AA857', 'AA937',
       'AA843', 'AA870', 'AA848', 'AA882', 'AA898', 'AA853', 'AA876', 'AA860',
       'AA918', 'AA888', 'AA808', 'AA911', 'AA906', 'AA907', 'AA884', 'AA926',
       'AA917', 'AA910', 'AA939', 'AA920', 'AA931', 'AA924', 'AA948', 'AA941',
       'AA936', 'AA949', 'AA935', 'AA947', 'AA946', 'AA943', 'AA944', 'AA950',
       'AA951'],
       ['BD', 'Observation_Date']]
qd_age_na_date

Unnamed: 0_level_0,BD,Observation_Date
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
AA97,1996-02-01 00:00:00,
AA603,2003-04-14 00:00:00,2011-03-01 00:00:00
AA688,2008-09-10 00:00:00,2012-07-05 00:00:00
AA923,2009.04.14,2013.08.21
AA725,1997-12-30 00:00:00,2013-11-29 00:00:00
AA785,2004-07-31 00:00:00,2014-02-17 00:00:00
AA849,2010.01.01,2014.03.11
AA802,2009.10.03,2014-03-25 00:00:00
AA756,2010.10.05,2014-05-13 00:00:00
AA896,2011-10-12 00:00:00,2015-09-21 00:00:00


In [142]:
# Convert BD column to datatime type
qd_age_na_date['BD'] = qd_age_na_date['BD'].astype('datetime64[s]')

In [143]:
# Convert Observation_Date column to datatime type
qd_age_na_date['Observation_Date'] = qd_age_na_date['Observation_Date'].astype('datetime64[s]')

In [144]:
# Calculate the age at the time of observation
qd_age_na_date['Difference'] = (qd_age_na_date['Observation_Date'] - qd_age_na_date['BD'])

In [145]:
# Convert age from days to years
qd_age_na_date.Difference = qd_age_na_date.Difference.div(365)

In [146]:
qd_age_na_date

Unnamed: 0_level_0,BD,Observation_Date,Difference
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA97,1996-02-01,NaT,NaT
AA603,2003-04-14,2011-03-01,7 days 21:14:18.082191780
AA688,2008-09-10,2012-07-05,3 days 19:39:36.986301369
AA923,2009-04-14,2013-08-21,4 days 08:32:52.602739726
AA725,1997-12-30,2013-11-29,15 days 22:13:28.767123287
AA785,2004-07-31,2014-02-17,9 days 13:20:52.602739726
AA849,2010-01-01,2014-03-11,4 days 04:36:09.863013698
AA802,2009-10-03,2014-03-25,4 days 11:26:27.945205479
AA756,2010-10-05,2014-05-13,3 days 14:31:53.424657534
AA896,2011-10-12,2015-09-21,3 days 22:41:05.753424657


In [147]:
# Index of samples that do not have values for age of observation
qd_age_unk = qd_age_na_date[qd_age_na_date.Difference.isnull()].index

In [148]:
# Drop index of samples that do not have values for age of observation
df.drop(qd_age_unk, inplace = True)

In [149]:
df.loc[['AA125', 'AA296', 'AA785'],
       ['QDMreav', 'QDPSreav', 'QDLreav', 'QDEHreav','QDRreav',
        'QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R', 'WISCIII',
        'Diag_Age', 'QDQI_Age']]

Unnamed: 0_level_0,QDMreav,QDPSreav,QDLreav,QDEHreav,QDRreav,QD_M,QD_PS,QD_L,QD_EH,QD_R,WISCIII,Diag_Age,QDQI_Age
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AA125,,,,,,,,,,,N,8.0,
AA296,,,,,,,,,,,N,7.0,
AA785,,,,,,,,,,,S,,


In [150]:
# Index of samples that that have an age of observation greater than 8
obs_age_h8 = ['AA918', 'AA937', 'AA857', 'AA725']

In [151]:
df.drop(obs_age_h8, inplace = True)

In [152]:
# Highleted samples that have updated values in Filemaker
df.loc[['AA179','AA518','AA636', 'AA925', 'AA935', 'AA946'], 
       ['QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R','Diag_Age', 'QDQI_Age']]

Unnamed: 0_level_0,QD_M,QD_PS,QD_L,QD_EH,QD_R,Diag_Age,QDQI_Age
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AA179,33.0,24.0,,23.0,34.0,5.0,6.0
AA518,86.0,73.0,95.0,108.0,92.0,4.0,6.0
AA636,91.0,61.0,69.0,68.0,100.0,2.0,2.0
AA925,,,,,,4.0,5.0
AA935,,,,,,,
AA946,,,,,,,


In [153]:
# Update highlited samples values
df.loc['AA636'] = {'QD_M':90, 'QD_PS':61, 'QD_L':69, 'QD_EH':68, 'QD_R':100, 'QDQI_Age': 44/12}
df.loc['AA925'] = {'QD_M':106, 'QD_PS':82, 'QD_L':86, 'QD_EH':88, 'QD_R':78, 'QDQI_Age':64/12}
df.loc['AA946'] = {'QD_M':120, 'QD_PS':72, 'QD_L':68, 'QD_EH':68, 'QD_R':108, 'QDQI_Age':50/12}
df.loc['AA935'] = {'QD_M':78, 'QD_PS':66, 'QD_L':40, 'QD_EH':86, 'QD_R':115, 'QDQI_Age':69/12}
df.loc['AA179'] = {'QD_M':33, 'QD_PS':24, 'QD_L':0, 'QD_EH':23, 'QD_R':34, 'QDQI_Age': 6}

In [154]:
# Create dataframe with Griffiths columns only
qd = df[['QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R']]

In [155]:
# Check if there are samples that have values for only some Griffith columns 
nan_rows = qd[qd.isnull().any(axis=1)]
nan_rows

Unnamed: 0_level_0,QD_M,QD_PS,QD_L,QD_EH,QD_R
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA161,,,,,
AA141,,,,,
AA125,,,,,
AA305,,,,,
AA226,,,,,
...,...,...,...,...,...
AA948,,,,,
AA936,,,,,
AA949,,,,,
AA950,,,,,


In [156]:
# Check if there are samples that are non verbal but have values for QD_L or QDLreav
qdl_na_non_verbal = df[(df.Verbal == 'No')
                   & (df.QD_L.isnull()) & (df.QDLreav.isnull())]
qdl_na_non_verbal[['Verbal', 'QD_L', 'QDLreav']]

Unnamed: 0_level_0,Verbal,QD_L,QDLreav
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA226,No,,
AA235,No,,
AA284,No,,
AA273,No,,
AA635,No,,
AA603,No,,
AA688,No,,
AA896,No,,
AA870,No,,
AA853,No,,


In [157]:
df.loc[qdl_na_non_verbal.index, 'QD_L'] = 0

In [158]:
# Samples that are non verbal but have QD_L values greater than 79
qdl_non_verbal = df[(df.Verbal == 'No')
                   & (df.QD_L > 79) & (df.QDLreav.isnull())]
qdl_non_verbal[['Verbal', 'QD_L', 'QDLreav']]

Unnamed: 0_level_0,Verbal,QD_L,QDLreav
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA219,No,140.0,
AA236,No,92.0,
AA225,No,103.0,
AA262,No,81.0,
AA281,No,81.0,
AA318,No,89.0,
AA466,No,88.0,
AA352,No,129.0,
AA358,No,112.0,
AA385,No,123.0,


In [159]:
# Index of samples that are non verbal but have QD_L values greater than 79
qdl_non_verbal_index = df[(df.Verbal == 'No')
                   & (df.QD_L > 79) & (df.QDLreav.isnull())].index

In [160]:
# Drop samples that are non verbal but have QD_L values greater than 79
df.drop(qdl_non_verbal_index, inplace = True)

### Exclusion of Missing Data Variables

In [161]:
# Creat new dataset with only variables of interess 
df_cluster = df[['QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R',
                 'VABS_Com', 'VABS_Soc', 'VABS_Aut',
                 'ADIR_Soc', 'ADIR_RRB', 'ADIR_AbDev']].copy()

In [162]:
df_cluster

Unnamed: 0_level_0,QD_M,QD_PS,QD_L,QD_EH,QD_R,VABS_Com,VABS_Soc,VABS_Aut,ADIR_Soc,ADIR_RRB,ADIR_AbDev
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA1,47.0,35.0,13.0,28.0,38.0,,,,28.0,9.0,2.0
AA34,43.0,27.0,18.0,25.0,31.0,,,,30.0,8.0,5.0
AA36,52.0,35.0,8.0,31.0,45.0,,,,30.0,8.0,5.0
AA72,29.0,21.0,23.0,21.0,20.0,,,,26.0,4.0,5.0
AA91,28.0,19.0,11.0,18.0,26.0,,,,28.0,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
AA837,80.0,62.0,64.0,83.0,83.0,72.0,59.0,58.0,15.0,3.0,5.0
AA935,78.0,66.0,40.0,86.0,115.0,,,,,,
AA946,120.0,72.0,68.0,68.0,108.0,,,,,,
AA950,,,,,,,,,,,


In [163]:
# Drop samples that have missing data in all selected columns
df_cluster = df_cluster.dropna(subset=['ADIR_Soc','ADIR_RRB','ADIR_AbDev',
                                       'QD_M','QD_PS','QD_L','QD_EH','QD_R'], how='all')

In [164]:
# Drop samples that have missing data in all selected columns
df_cluster = df_cluster.dropna(subset=['ADIR_Soc','ADIR_RRB', 'ADIR_AbDev',
                                       'VABS_Com', 'VABS_Soc', 'VABS_Aut'], how = 'all')

In [165]:
# Drop samples that have missing data in all selected columns
df_cluster = df_cluster.dropna(subset=['VABS_Com', 'VABS_Soc', 'VABS_Aut',
                                       'QD_M','QD_PS','QD_L','QD_EH','QD_R'], how = 'all')

In [166]:
# Copy from dataset with only variables of interess 
df_cluster = df_cluster[['QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R',
                         'VABS_Com', 'VABS_Soc', 'VABS_Aut',
                         'ADIR_Soc', 'ADIR_RRB', 'ADIR_AbDev']].copy()

In [167]:
# See how missing data for each column
df_cluster.isnull().sum()

QD_M           33
QD_PS          33
QD_L           21
QD_EH          33
QD_R           33
VABS_Com      137
VABS_Soc      135
VABS_Aut      137
ADIR_Soc       23
ADIR_RRB       24
ADIR_AbDev     32
dtype: int64

In [168]:
df_cluster

Unnamed: 0_level_0,QD_M,QD_PS,QD_L,QD_EH,QD_R,VABS_Com,VABS_Soc,VABS_Aut,ADIR_Soc,ADIR_RRB,ADIR_AbDev
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA1,47.0,35.0,13.0,28.0,38.0,,,,28.0,9.0,2.0
AA34,43.0,27.0,18.0,25.0,31.0,,,,30.0,8.0,5.0
AA36,52.0,35.0,8.0,31.0,45.0,,,,30.0,8.0,5.0
AA72,29.0,21.0,23.0,21.0,20.0,,,,26.0,4.0,5.0
AA91,28.0,19.0,11.0,18.0,26.0,,,,28.0,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
AA927,76.0,55.0,84.0,81.0,84.0,79.0,74.0,79.0,,,
AA938,97.0,73.0,65.0,77.0,97.0,78.0,68.0,77.0,11.0,3.0,
AA934,58.0,40.0,20.0,31.0,34.0,47.0,63.0,55.0,10.0,3.0,25.0
AA942,115.0,85.0,47.0,89.0,85.0,63.0,77.0,69.0,,,


In [169]:
# profile report of dataset with only variables of interess
profile = ProfileReport(df_cluster, title='Cluster Variables Profile')
# export profile report 
profile.to_file("Cluster Variables Profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Selection and Treatment of Cluster Categorization Variables

In [170]:
# create new dataframe with categorical variables 
df_var_cat = df[['Gender',
                 'ADOS_Sev','ADIR_quot',
                 'Dysmorphysm','Language_Regr',
                 'Audition', 'Vision',
                 'Verbal',
                 'Psyc_Family_Hist',
                 'PMD_Regression','PMD_Delay',
                 'HC','Apgar_1', 'Apgar_5',
                 'Diag_Age','Walk_Age',
                 'First_Words_Age','First_Phrases_Age', 
                 'ADIR_Soc', 'ADIR_RRB', 'ADIR_AbDev',
                 'VABS_Com', 'VABS_Soc', 'VABS_Aut',
                 'QD_M', 'QD_PS', 'QD_L', 'QD_EH', 'QD_R']].copy()

In [171]:
# reset index of categorical variables dataset
df_var_cat = df_var_cat.reset_index()
df_cluster = df_cluster.reset_index()
df_var_cat = df_var_cat[df_var_cat.Code.isin(df_cluster.Code)]

In [172]:
df_var_cat = df_var_cat.set_index("Code")

In [173]:
df_var_cat.Dysmorphysm.unique()

array(['N', 'S', nan], dtype=object)

In [174]:
# replace values from column Dysmorphysm 
df_var_cat['Dysmorphysm'] = df_var_cat['Dysmorphysm'].replace(['S'],'Yes')
df_original['Dysmorphysm'] = df_original['Dysmorphysm'].replace(['S'],'Yes')
df_var_cat['Dysmorphysm'] = df_var_cat['Dysmorphysm'].replace(['N'],'No')
df_original['Dysmorphysm'] = df_original['Dysmorphysm'].replace(['N'],'No')

In [175]:
df_var_cat.Language_Regr.unique()

array(['N', 'S', nan], dtype=object)

In [176]:
# replace values from column Language_Regr 
df_var_cat['Language_Regr'] = df_var_cat['Language_Regr'].replace(['S'],'Yes')
df_original['Language_Regr'] = df_original['Language_Regr'].replace(['S'],'Yes')
df_var_cat['Language_Regr'] = df_var_cat['Language_Regr'].replace(['N'],'No')
df_original['Language_Regr'] = df_original['Language_Regr'].replace(['N'],'No')

In [177]:
# see values of column PMD_Delay
df_var_cat.PMD_Delay.unique()

array(['S', '2A', 'N', '3 anos', nan, '3 Ano_x000B_', '4 anos', '3A'],
      dtype=object)

In [178]:
# replace values from column PMD_Delay 
df_var_cat['PMD_Delay'] = df_var_cat['PMD_Delay'].replace(['2A', '3 anos', '3 Ano_x000B_', '4 anos', '3A', 'S'],'Yes')
df_original['PMD_Delay'] = df_original['PMD_Delay'].replace(['2A', '3 anos', '3 Ano_x000B_', '4 anos', '3A','S'],'Yes')
df_var_cat['PMD_Delay'] = df_var_cat['PMD_Delay'].replace(['N'],'No')
df_original['PMD_Delay'] = df_original['PMD_Delay'].replace(['N'],'No')

In [179]:
df_var_cat.PMD_Regression.unique()

array(['N', 'S', nan], dtype=object)

In [180]:
df_var_cat['PMD_Regression'] = df_var_cat['PMD_Regression'].replace(['S'],'Yes')
df_original['PMD_Regression'] = df_original['PMD_Regression'].replace(['S'],'Yes')
df_var_cat['PMD_Regression'] = df_var_cat['PMD_Regression'].replace(['N'],'No')
df_original['PMD_Regression'] = df_original['PMD_Regression'].replace(['N'],'No')

In [181]:
# see value for Psyc_Family_Hist
df_var_cat.Psyc_Family_Hist.unique()

array(['S', 'N', nan, 'outro'], dtype=object)

In [182]:
# replace values from column Psyc_Family_Hist
df_var_cat['Psyc_Family_Hist'] = df_var_cat['Psyc_Family_Hist'].replace(['outro'],pd.NA)
df_original['Psyc_Family_Hist'] = df_original['Psyc_Family_Hist'].replace(['outro'],pd.NA)
df_var_cat['Psyc_Family_Hist'] = df_var_cat['Psyc_Family_Hist'].replace(['S'],'Yes')
df_original['Psyc_Family_Hist'] = df_original['Psyc_Family_Hist'].replace(['S'],'Yes')
df_var_cat['Psyc_Family_Hist'] = df_var_cat['Psyc_Family_Hist'].replace(['N'],'No')
df_original['Psyc_Family_Hist'] = df_original['Psyc_Family_Hist'].replace(['N'],'No')

In [183]:
# see values of column Audition 
df_var_cat['Audition'].unique()

array(['Normal_x001D_Normal- ORL', 'Normal C',
       'Surdez( especificar)_x001D_anormal- ORL', 'Normal- ORL',
       'Surdez( especificar)_x001D_surdez ligeira transmissão-Eda',
       'Normal', 'Normal-segundo inf. mãe', nan,
       'Não avaliada_x001D_Normal- ORL',
       'surdez ligeira transmissão-Eda_x001D_anormal- ORL',
       'PEA - normal', 'em estudo',
       'Clinicamente normal_x001D_Não avaliada por ORL',
       'Clinicamente normal', 'Normal_x001D_PEA - normal',
       'Normal- ORL_x001D_PEA - normal', 'PEA - normal_x001D_Normal- ORL',
       'PEA - Normal', 'surdez ligeira transmissão-Eda',
       'ORL-défice auditivo ligeiro', 'anormal- ORL_x001D_Alteração Dta.',
       'aguarda avaliação ORL',
       'Défice auditivo 20-30Db_x001D_anormal- ORL', 'anormal- ORL',
       'alterado - ORL', 'Vai ser avaliado', 'Aguarda ORL',
       'Normal - ORL', 'Duvidoso',
       'anormal- ORL_x001D_surdez transmissão OD', 'Normal C ?',
       'Normal ORL', 'Normal- ORL_x001D_Normal',
 

In [184]:
# replace values of column Audition
df_var_cat['Audition'] = df_var_cat['Audition'].replace(['Normal- ORL', 'Normal C', 'Normal',
                                                       'Normal-segundo inf. mãe', 'Não avaliada_x001D_Normal- ORL',
                                                       'PEA - normal','Clinicamente normal_x001D_Não avaliada por ORL',
                                                       'Clinicamente normal', 'Normal_x001D_PEA - normal',
                                                       'Normal- ORL_x001D_PEA - normal', 'PEA - normal_x001D_Normal- ORL', 
                                                       'PEA - Normal','Normal C_x001D_Normal- ORL', 'Normal - ORL', 
                                                       'Normal C_x001D_Pedida ORL','Normal C ?','Normal ORL', 
                                                       'Normal_x001D_Normal- ORL','Normal- ORL_x001D_Normal'],'Normal')
df_original['Audition'] = df_original['Audition'].replace(['Normal- ORL', 'Normal C', 'Normal',
                                                       'Normal-segundo inf. mãe', 'Não avaliada_x001D_Normal- ORL',
                                                       'PEA - normal','Clinicamente normal_x001D_Não avaliada por ORL',
                                                       'Clinicamente normal', 'Normal_x001D_PEA - normal',
                                                       'Normal- ORL_x001D_PEA - normal', 'PEA - normal_x001D_Normal- ORL', 
                                                       'PEA - Normal','Normal C_x001D_Normal- ORL', 'Normal - ORL', 
                                                       'Normal C_x001D_Pedida ORL','Normal C ?','Normal ORL', 
                                                       'Normal_x001D_Normal- ORL','Normal- ORL_x001D_Normal'],'Normal')

In [185]:
# replace values of column Audition
df_var_cat['Audition'] = df_var_cat['Audition'].replace(['Surdez NS profunda_x001D_anormal- ORL',
                                                       'Surdez( especificar)_x001D_surdez ligeira transmissão-Eda',
                                                       'surdez ligeira transmissão-Eda_x001D_anormal- ORL',
                                                       'Cirurgia ORL_x001D_Otite serosa',
                                                       'surdez ligeira transmissão-Eda', 'ORL-défice auditivo ligeiro',
                                                       'anormal- ORL_x001D_Alteração Dta.',
                                                       'Défice auditivo 20-30Db_x001D_anormal- ORL', 'anormal- ORL',
                                                       'alterado - ORL','anormal- ORL_x001D_surdez transmissão OD', 
                                                       'anormal- ORL_x001D_surdez ligeira transmissão-Eda',
                                                       'em estudo_x001D_surdez ligeira transmissão-Eda',
                                                       'Surdez( especificar)_x001D_anormal- ORL',
                                                       'anormal- ORL_x001D_surdez NS profunda',
                                                       'Surdez NS profunda',
                                                       'surdez ligeira transmissão-Eda_x001D_Normal- ORL'],'Abnormal')
# replace values of column Audition
df_original['Audition'] = df_original['Audition'].replace(['Surdez NS profunda_x001D_anormal- ORL',
                                                       'Surdez( especificar)_x001D_surdez ligeira transmissão-Eda',
                                                       'surdez ligeira transmissão-Eda_x001D_anormal- ORL',
                                                       'Cirurgia ORL_x001D_Otite serosa',
                                                       'surdez ligeira transmissão-Eda', 'ORL-défice auditivo ligeiro',
                                                       'anormal- ORL_x001D_Alteração Dta.',
                                                       'Défice auditivo 20-30Db_x001D_anormal- ORL', 'anormal- ORL',
                                                       'alterado - ORL','anormal- ORL_x001D_surdez transmissão OD', 
                                                       'anormal- ORL_x001D_surdez ligeira transmissão-Eda',
                                                       'em estudo_x001D_surdez ligeira transmissão-Eda',
                                                       'Surdez( especificar)_x001D_anormal- ORL',
                                                       'anormal- ORL_x001D_surdez NS profunda',
                                                       'Surdez NS profunda',
                                                       'surdez ligeira transmissão-Eda_x001D_Normal- ORL'],'Abnormal')

In [186]:
# replace values of column Audition
df_var_cat['Audition'] = df_var_cat['Audition'].replace(['em estudo','aguarda avaliação ORL','Duvidoso', 
                                                       'Vai ser avaliado', 'Aguarda ORL', np.nan], pd.NA)
# replace values of column Audition
df_original['Audition'] = df_original['Audition'].replace(['em estudo','aguarda avaliação ORL','Duvidoso', 
                                                       'Vai ser avaliado', 'Aguarda ORL', np.nan], pd.NA)

In [187]:
# see values of column Audition
df_var_cat['Audition'].unique()

array(['Normal', 'Abnormal', <NA>], dtype=object)

In [188]:
# see values of column Vision 
df_var_cat['Vision'].unique()

array(['Normal C', 'Anormal/of_x001D_Estrabismo', 'Normal-Oftal',
       'Estrabismo_x001D_Anormal/of', nan, 'Anormal/of_x001D_Miopia',
       'Anormal/of_x001D_Miopia_x001D_Astigmatismo',
       'Anormal/of_x001D_usa oculos', 'Anormal/of',
       'Anormal_x001D_microftalmia/albinismo ocular', 'Normal',
       'Estrabismo_x001D_Anormal', 'Anormal_x001D_Miopia',
       'Anormal/of_x001D_Astigmatismo_x001D_Miopia', 'Anormal',
       'Estrabismo_x001D_usa oculos_x001D_Anormal',
       'Normal-Oftal_x001D_Miopia_x001D_Usa Óculos',
       'Miopia_x001D_Anormal',
       'Usa Óculos_x001D_Astigmatismo_x001D_Anormal/of',
       'Anormal/of_x001D_Usa Óculos_x001D_Miopia', 'Clinicamente normal',
       'Estrabismo', 'Hipermetropia_x001D_Estrabismo_x001D_usa oculos',
       'Normal_x001D_Normal C', 'Estrabismo???',
       'Estrabismo_x001D_vai à consulta_x001D_Anormal/of',
       'Normal - Oftal_x001D_Miopia', 'Miopia_x001D_Anormal/of',
       'Usa Óculos', 'Alterada - Oftal._x001D_Astigmatismo',

In [189]:
# replace values of column Vision
df_var_cat['Vision'] = df_var_cat['Vision'].replace(['Normal C','Normal-Oftal',
                                                   'Normal','Normal-Oftal_x001D_Miopia_x001D_Usa àculos',
                                                   'Clinicamente normal','Normal_x001D_Normal C',
                                                   'Normal - Oftal_x001D_Miopia','Normal Oftal',
                                                   'Normal teste', 'Normal - Oftal', 
                                                   'Normal-Oftal_x001D_Miopia_x001D_Usa Óculos'],'Normal')
df_original['Vision'] = df_original['Vision'].replace(['Normal C','Normal-Oftal',
                                                   'Normal','Normal-Oftal_x001D_Miopia_x001D_Usa àculos',
                                                   'Clinicamente normal','Normal_x001D_Normal C',
                                                   'Normal - Oftal_x001D_Miopia','Normal Oftal',
                                                   'Normal teste', 'Normal - Oftal', 
                                                   'Normal-Oftal_x001D_Miopia_x001D_Usa Óculos'],'Normal')

In [190]:
# replace values of column Vision
df_var_cat['Vision'] = df_var_cat['Vision'].replace(['Anormal/of_x001D_Miopia',
                                                   'Anormal/of_x001D_Estrabismo',
                                                   'Anormal/of_x001D_Hipermetropia',
                                                   'Anormal/of_x001D_Miopia_x001D_Astigmatismo',
                                                   'Anormal/of_x001D_usa oculos',
                                                   'Anormal_x001D_microftalmia/albinismo ocular',
                                                   'Estrabismo_x001D_Anormal',
                                                   'Estrabismo (??)_x001D_Anormal', 'Anormal_x001D_Miopia',
                                                   'Anormal/of_x001D_Astigmatismo_x001D_Miopia', 'Anormal',
                                                   'Estrabismo_x001D_usa oculos_x001D_Anormal',
                                                   'Anormal_x001D_Estrabismo_x001D_Astigmatismo', 'Estrabismo',
                                                   'Miopia_x001D_Anormal', 'usa oculos_x001D_Anormal',
                                                   'Miopia_x001D_Astigmatismo_x001D_Anormal/of',
                                                   'Usa àculos_x001D_Astigmatismo_x001D_Anormal/of',
                                                   'Anormal/of_x001D_Usa àculos_x001D_Miopia',
                                                   'Hipermetropia_x001D_Estrabismo_x001D_usa oculos',
                                                   'Estrabismo???','Estrabismo_x001D_vai … consulta_x001D_Anormal/of',
                                                   'Astigmatismo_x001D_Miopia_x001D_usa oculos',
                                                   'Alterada - Oftal._x001D_Usa àculos', 'Miopia_x001D_Anormal/of',
                                                   'Usa àculos', 'Alterada - Oftal._x001D_Astigmatismo',
                                                   'Usa àculos_x001D_Miopia_x001D_Alterada - Oftal.',
                                                   'Anormal/of_x001D_Astigmatismo',
                                                   'Estrabismo_x001D_Hipermetropia_x001D_Usa àculos',
                                                   'Anormal_x001D_nistagmus',
                                                   'Alterada - Oftal._x001D_Astigmatismo_x001D_Miopia',
                                                   'Anormal/of_x001D_ROP sem defice visual',
                                                   'Estrabismo_x001D_Anormal/of', 'Anormal/of',
                                                   'Usa Óculos_x001D_Astigmatismo_x001D_Anormal/of',
                                                   'Anormal/of_x001D_Usa Óculos_x001D_Miopia',
                                                   'Estrabismo_x001D_vai à consulta_x001D_Anormal/of',
                                                   'Alterada - Oftal._x001D_Usa Óculos', 'Usa Óculos',
                                                   'Usa Óculos_x001D_Miopia_x001D_Alterada - Oftal.',
                                                   'Estrabismo_x001D_Hipermetropia_x001D_Usa Óculos',
                                                   'Estrabismo_x001D_Seguido em oftamologia_x001D_Alterada - Oftal.',
                                                   'Hipermetropia',
                                                   'Hipermetropia_x001D_Usa Óculos'],'Abnormal')
df_original['Vision'] = df_original['Vision'].replace(['Anormal/of_x001D_Miopia',
                                                   'Anormal/of_x001D_Estrabismo',
                                                   'Anormal/of_x001D_Hipermetropia',
                                                   'Anormal/of_x001D_Miopia_x001D_Astigmatismo',
                                                   'Anormal/of_x001D_usa oculos',
                                                   'Anormal_x001D_microftalmia/albinismo ocular',
                                                   'Estrabismo_x001D_Anormal',
                                                   'Estrabismo (??)_x001D_Anormal', 'Anormal_x001D_Miopia',
                                                   'Anormal/of_x001D_Astigmatismo_x001D_Miopia', 'Anormal',
                                                   'Estrabismo_x001D_usa oculos_x001D_Anormal',
                                                   'Anormal_x001D_Estrabismo_x001D_Astigmatismo', 'Estrabismo',
                                                   'Miopia_x001D_Anormal', 'usa oculos_x001D_Anormal',
                                                   'Miopia_x001D_Astigmatismo_x001D_Anormal/of',
                                                   'Usa àculos_x001D_Astigmatismo_x001D_Anormal/of',
                                                   'Anormal/of_x001D_Usa àculos_x001D_Miopia',
                                                   'Hipermetropia_x001D_Estrabismo_x001D_usa oculos',
                                                   'Estrabismo???','Estrabismo_x001D_vai … consulta_x001D_Anormal/of',
                                                   'Astigmatismo_x001D_Miopia_x001D_usa oculos',
                                                   'Alterada - Oftal._x001D_Usa àculos', 'Miopia_x001D_Anormal/of',
                                                   'Usa àculos', 'Alterada - Oftal._x001D_Astigmatismo',
                                                   'Usa àculos_x001D_Miopia_x001D_Alterada - Oftal.',
                                                   'Anormal/of_x001D_Astigmatismo',
                                                   'Estrabismo_x001D_Hipermetropia_x001D_Usa àculos',
                                                   'Anormal_x001D_nistagmus',
                                                   'Alterada - Oftal._x001D_Astigmatismo_x001D_Miopia',
                                                   'Anormal/of_x001D_ROP sem defice visual',
                                                   'Estrabismo_x001D_Anormal/of', 'Anormal/of',
                                                   'Usa Óculos_x001D_Astigmatismo_x001D_Anormal/of',
                                                   'Anormal/of_x001D_Usa Óculos_x001D_Miopia',
                                                   'Estrabismo_x001D_vai à consulta_x001D_Anormal/of',
                                                   'Alterada - Oftal._x001D_Usa Óculos', 'Usa Óculos',
                                                   'Usa Óculos_x001D_Miopia_x001D_Alterada - Oftal.',
                                                   'Estrabismo_x001D_Hipermetropia_x001D_Usa Óculos',
                                                   'Estrabismo_x001D_Seguido em oftamologia_x001D_Alterada - Oftal.',
                                                   'Hipermetropia',
                                                   'Hipermetropia_x001D_Usa Óculos'],'Abnormal')

In [191]:
# replace values of column Vision
df_var_cat['Vision'] = df_var_cat['Vision'].replace(['Não avaliada', np.nan],pd.NA)
df_original['Vision'] = df_original['Vision'].replace(['Não avaliada', np.nan],pd.NA)

In [192]:
# see values of column Vision
df_var_cat['Vision'].unique()

array(['Normal', 'Abnormal', <NA>], dtype=object)

In [193]:
# see values of column ADIR_quot 
df_var_cat['ADIR_quot'].unique()

array(['Positive', 'Negative', <NA>], dtype=object)

In [194]:
# check HC values 
df_var_cat.HC.unique()

array([ nan, 34. , 33.8, 34.5, 35.4, 36.5, 35.8, 35.5, 32. , 33. , 36. ,
       35. , 38. , 33.5, 37. , 30. , 34.9, 31. , 29.5, 32.8, 38.5, 37.5,
       39. , 33.6, 32.5, 29. , 10. , 75. , 37.2, 28.5, 35.7, 31.5, 35.3,
       35.2, 25. , 34.3, 36.4, 50. , 37.3, 37.8, 33.9, 36.3, 36.1, 39.5,
       27.8, 32.2, 33.4, 36.2, 26. , 34.2, 30.5, 36.8, 34.7, 36.9, 32.6,
       51. , 33.2])

In [195]:
Counter(df_var_cat.ADOS_Sev)

Counter({<NA>: 104, 'Autism': 443, 'ASD': 108, 'Non Spectrum': 6})

In [196]:
# transform strange value into NA
df_var_cat['HC'] = df_var_cat['HC'].replace({2532.0:np.nan})
df_var_cat['HC'] = df_var_cat['HC'].apply(pd.to_numeric, errors='coerce')
df_original['HC'] = df_original['HC'].replace({2532.0:np.nan})
df_original['HC'] = df_original['HC'].apply(pd.to_numeric, errors='coerce')

In [197]:
# profile report of dataset with only variables of interess
profile = ProfileReport(df_var_cat, title='Descriptive Variables Profile')
# export profile report 
profile.to_file("Descriptive Variables Profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Save Datasets Into csv. files

In [198]:
# reset indx of original dataset
df_original = df_original.reset_index()

In [199]:
# reset indx of categorization variables dataset
df_var_cat = df_var_cat.reset_index()

In [200]:
# create dataset that contains only the samples that will go to cluster analysis 
df_included = df_original[df_original.Code.isin(df_cluster.Code)]

In [201]:
df_included.set_index('Code', inplace=True)
df_cluster.set_index('Code', inplace=True)

In [202]:
df_included.update(df_cluster)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [203]:
df_included = df_included.reset_index()
df_cluster = df_cluster.reset_index()

In [204]:
# create dataset that contains only the samples that were excluded from the cluster analysis 
df_excluded = df_original[~df_original.Code.isin(df_cluster.Code)]

In [205]:
# save original dataset to csv file
df_original.to_csv("original_dataset.csv", index = False)

In [206]:
# save dataset to csv file
df_cluster.to_csv("clusters_variables.csv", index = False)

In [207]:
# save categorization variables dataset to csv file
df_var_cat.to_csv("descriptive_variables.csv", index = False)

In [208]:
# save included dataset to csv file
df_included.to_csv("df_included_samples.csv", index = False)

In [209]:
# save excluded dataset to csv file
df_excluded.to_csv("df_excluded_samples.csv", index = False)