The first section focuses on removing "bad" values from the quantitative (float) columns: obvious outliers and nonsensical values, most notably in the BIA data.

In [228]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [229]:
#This is the starting data: the raw training table, read from train.csv in the working directory.
#train.info() gives a first look at each column's non-null count and dtype.

train=pd.read_csv('train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3168 non-null   object 
 1   Basic_Demos-Enroll_Season               3168 non-null   object 
 2   Basic_Demos-Age                         3168 non-null   int64  
 3   Basic_Demos-Sex                         3168 non-null   int64  
 4   CGAS-Season                             2049 non-null   object 
 5   CGAS-CGAS_Score                         1948 non-null   float64
 6   Physical-Season                         2658 non-null   object 
 7   Physical-BMI                            2433 non-null   float64
 8   Physical-Height                         2437 non-null   float64
 9   Physical-Weight                         2475 non-null   float64
 10  Physical-Waist_Circumference            703 non-null    floa

In [230]:
#Outlier screening only makes sense for quantitative variables, so the first step is to collect the float-typed columns.
#Note that this also picks up coded columns (e.g. the FGC *_Zone and PCIAT item columns), which are stored as float --
#presumably because they contain missing values.

float_columns = train.select_dtypes(include=['float']).columns
print("Float columns:", float_columns)
print(float_columns.size)

Float columns: Index(['CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
       'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
       'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
       'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
       'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
       'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
       'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
       'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
       'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
  

In [231]:
#Collect the row positions of "bad" values across all float columns: extreme outliers (|z| > threshold)
#and negative entries in variables that should never be negative.
#In particular, BIA_BIA_Fat, which measures body fat percentage, has a bunch of negative values.
threshold = 10
index_set = []
for column in float_columns:
    values = train[[column]]
    #NaN entries are omitted from the z-score statistics; comparisons against NaN are False, so they are never flagged.
    abs_z = np.abs(stats.zscore(values, nan_policy='omit'))
    rows_large_z = np.where(np.all(abs_z > threshold, axis=1))[0].tolist()
    rows_negative = np.where(np.all(values < 0, axis=1))[0].tolist()
    index_set.extend(rows_large_z)
    index_set.extend(rows_negative)

#The same row can be flagged in several columns, so de-duplicate before reporting.
index_set_neg_largez = list(set(index_set))
print(index_set_neg_largez)
print(len(index_set_neg_largez))

[1665, 2434, 1028, 2695, 649, 2825, 2699, 147, 1306, 1179, 2715, 2589, 417, 806, 294, 1192, 171, 2987, 1709, 2350, 1071, 2101, 3126, 1719, 2613, 1465, 1850, 1211, 1468, 317, 3135, 3143, 73, 2766, 2384, 465, 3157, 1239, 2135, 729, 2649, 987, 860, 861, 2017, 2533, 873, 618, 238, 2419, 760, 2172, 1405]
53


In [232]:
#Build a second dataframe in which every flagged case (row) is removed entirely, i.e. any row that contains
#an extreme outlier or a nonsensical negative value. The flagged values are row positions, which coincide with
#the row labels here because train still has its default 0..n-1 index.
train_no_outliers_no_negatives = train.drop(index=index_set_neg_largez)

#Quick look at the effect of this cleaning: for each sii category, the number of rows that were dropped.

sii_before = train['sii'].value_counts()
sii_after = train_no_outliers_no_negatives['sii'].value_counts()
print(sii_before - sii_after)

sii
0.0    28
1.0    11
2.0    10
3.0     0
Name: count, dtype: int64


In [233]:
#This takes extreme outliers and nonsensical negative values and replaces them with NaN (the rows themselves are kept).
#Work on a copy so that the raw `train` frame is not modified as a side effect of building the cleaned one.

train_outliers_neg_to_NaN=train.copy()
threshold = 10
for column in float_columns:
    values=train[[column]]
    #NaN entries are omitted from the z-score statistics; NaN > threshold is False, so missing values are never flagged.
    z=np.abs(stats.zscore(values,nan_policy='omit'))
    is_large_z=np.asarray(np.all(z > threshold, axis=1))
    is_negative=np.asarray(np.all(values<0, axis=1))
    bad_rows=is_large_z | is_negative
    #Blank out exactly the flagged rows of this column. (The earlier version used the loop counter as the row
    #label, which overwrote rows 0..k-1 instead of the k rows that were actually flagged.)
    train_outliers_neg_to_NaN.loc[bad_rows, column] = np.nan


In [234]:
#Now I replace the original train dataframe with the new cleaned data frame (where I've replaced bad values with NaN).
#From here on `train` refers to the NaN-replaced frame; the row-dropped alternative stays available as train_no_outliers_no_negatives.

train=train_outliers_neg_to_NaN
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3168 non-null   object 
 1   Basic_Demos-Enroll_Season               3168 non-null   object 
 2   Basic_Demos-Age                         3168 non-null   int64  
 3   Basic_Demos-Sex                         3168 non-null   int64  
 4   CGAS-Season                             2049 non-null   object 
 5   CGAS-CGAS_Score                         1947 non-null   float64
 6   Physical-Season                         2658 non-null   object 
 7   Physical-BMI                            2433 non-null   float64
 8   Physical-Height                         2437 non-null   float64
 9   Physical-Weight                         2475 non-null   float64
 10  Physical-Waist_Circumference            703 non-null    floa

This section of the file focuses on imputing missing values for the PCIAT questionnaire, based on each participant's other responses to the questionnaire, using k-nearest neighbors.

In [235]:
#First we identify the columns of interest to search for NaN values: the individual PCIAT items,
#i.e. every PCIAT column except the season and the total score.

pciats = train.columns[train.columns.str.contains('PCIAT')].tolist()
for excluded in ('PCIAT-Season', 'PCIAT-PCIAT_Total'):
    pciats.remove(excluded)

# Add 20 indicator variables (1 = answered, 0 = NaN), one for each of PCIAT-PCIAT_01 through PCIAT-PCIAT_20.

for question in pciats:
    answered = train[question].notna()
    train[question + '_isnotna'] = answered.astype(int)

#Per participant, count how many of the questions were answered.

pciatsnotna = train.columns[train.columns.str.contains('isnotna')].tolist()
train['pciatsnotna_sum'] = train[pciatsnotna].sum(axis=1)

#The counts printed below show how many questions each participant answered. Almost 1000 participants did not
#respond to any PCIAT question; these participants are eliminated.
#Filtering rows keeps the original row labels, so the index is reset to 0..n-1 afterwards.
#(This step is important to make sure indices match up when the NaN entries are later replaced with their imputed values.)

print("NaN rates in original data: ",train['pciatsnotna_sum'].value_counts())
responded = train['pciatsnotna_sum'] != 0
train = train.loc[responded].reset_index(drop=True)
print("NaN rates after the non-responders have been removed:",train['pciatsnotna_sum'].value_counts())



NaN rates in original data:  pciatsnotna_sum
20    2136
0      984
19      38
18       7
15       1
17       1
10       1
Name: count, dtype: int64
NaN rates after the non-responders have been removed: pciatsnotna_sum
20    2136
19      38
18       7
15       1
17       1
10       1
Name: count, dtype: int64


In [236]:
#Next we use KNN to impute the missing values.

from sklearn.impute import KNNImputer

# Configure the imputer: Number_Neighbors neighbours, uniformly weighted, with distances measured by the
# 'nan_euclidean' metric.
Number_Neighbors=5
imputer = KNNImputer(
    n_neighbors=Number_Neighbors,
    weights='uniform',
    metric='nan_euclidean',
)

#fit_transform returns a plain numpy array, so the result is wrapped in a pandas dataframe that carries the item names again.

imputations = imputer.fit_transform(train[pciats])
pciats_imputed = pd.DataFrame(data=imputations, columns=pciats)

#Write the imputed items into a copy of train. The new frame has a fresh 0..n-1 index, which matches train's
#index only because that index was reset after the non-responders were removed.
train_imp = train.copy()
train_imp[pciats] = pciats_imputed[pciats]

In [237]:
#I recalculate the PCIAT total score from the imputed item responses. I also drop the variables that were added
#to detect NaN in the PCIAT data, just to tidy up a bit.

train_imp['PCIAT_Total_Imputed'] = train_imp[pciats].sum(axis=1)
train_imp = train_imp.drop(columns=list(pciatsnotna) + ['pciatsnotna_sum'])


#Now we can calculate a new sii score with the imputed values.
#The edges 30/49/79/100 are the *upper* ends of the severity bands, so the intervals have to be closed on the right:
#[0, 30] -> 0, (30, 49] -> 1, (49, 79] -> 2, (79, 100] -> 3.
#With right=False a total of exactly 30, 49 or 79 landed in the next band up, and a total of 100 fell outside
#every bin and became NaN.
#NOTE(review): assumes the sii bands are 0-30 / 31-49 / 50-79 / 80-100 -- confirm against the data dictionary.
#Imputed totals can be fractional; a value such as 30.4 is assigned to the higher band (1).

bins = [0, 30, 49,79,100]
labels = [0,1,2,3]
train_imp['sii_Imputed'] = pd.cut(train_imp['PCIAT_Total_Imputed'], bins=bins, labels=labels, right=True, include_lowest=True)

#info() prints its report itself and returns None, so it is called directly instead of being wrapped in print().
train_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2184 entries, 0 to 2183
Data columns (total 84 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   id                                      2184 non-null   object  
 1   Basic_Demos-Enroll_Season               2184 non-null   object  
 2   Basic_Demos-Age                         2184 non-null   int64   
 3   Basic_Demos-Sex                         2184 non-null   int64   
 4   CGAS-Season                             1879 non-null   object  
 5   CGAS-CGAS_Score                         1878 non-null   float64 
 6   Physical-Season                         2077 non-null   object  
 7   Physical-BMI                            2029 non-null   float64 
 8   Physical-Height                         2031 non-null   float64 
 9   Physical-Weight                         2063 non-null   float64 
 10  Physical-Waist_Circumference            371 non-

In [238]:
#Write the imputed dataframe to train_imp.csv; index=False leaves the row index out of the file.
train_imp.to_csv('train_imp.csv', index=False)

In [239]:
#Quick check of the new sii_Imputed column: its non-null count and dtype.
train_imp['sii_Imputed'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2184 entries, 0 to 2183
Series name: sii_Imputed
Non-Null Count  Dtype   
--------------  -----   
2184 non-null   category
dtypes: category(1)
memory usage: 2.5 KB
