The first section focuses on removing "bad" values: obvious outliers and nonsensical values from the BIA data.

In [191]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
#Here I am splitting the data into a train and test set. I want to stratify by age.

from sklearn.model_selection import train_test_split
train, test = train_test_split(train_original, test_size=0.2, stratify=train_original['Basic_Demos-Age'])

In [192]:
#This is the starting data.

train=pd.read_csv('train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3168 non-null   object 
 1   Basic_Demos-Enroll_Season               3168 non-null   object 
 2   Basic_Demos-Age                         3168 non-null   int64  
 3   Basic_Demos-Sex                         3168 non-null   int64  
 4   CGAS-Season                             2049 non-null   object 
 5   CGAS-CGAS_Score                         1948 non-null   float64
 6   Physical-Season                         2658 non-null   object 
 7   Physical-BMI                            2433 non-null   float64
 8   Physical-Height                         2437 non-null   float64
 9   Physical-Weight                         2475 non-null   float64
 10  Physical-Waist_Circumference            703 non-null    floa

In [193]:
#I only want to include quantitative variables for possible analysis for outliers, so first I identify the float variables.

float_columns = train.select_dtypes(include=['float']).columns
print("Float columns:", float_columns)
print(len(float_columns))

Float columns: Index(['CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
       'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
       'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
       'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
       'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
       'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
       'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
       'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
       'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
  

In [194]:
#This generates a list of indices for extreme outliers or negative values that should not be negative. 
#In particular, BIA_BIA_Fat, which measures body fat percentage, has a bunch of negative values.
threshold=5
index_set=[]
for i in range(len(float_columns)):
    #print(float_columns[i])
    z=np.abs(stats.zscore(train[[float_columns[i]]],nan_policy='omit'))
    indices_large_z = np.where(np.all(z > threshold, axis=1))[0].flatten().tolist()
    indices_negative=np.where(np.all(train[[float_columns[i]]]<0, axis=1))[0].flatten().tolist()
    index_set=index_set+indices_large_z+indices_negative
    #print(indices_large_z)
    #print(indices_negative)
    i+=1

index_set_neg_largez=list(set(index_set))
print(index_set_neg_largez)
print(len(index_set_neg_largez))

[1028, 2069, 1558, 2589, 1071, 3122, 2101, 3126, 2613, 56, 3135, 3143, 73, 3157, 1622, 2135, 2649, 1126, 1640, 618, 123, 2172, 1665, 1154, 131, 2695, 649, 2699, 147, 148, 1179, 2715, 1192, 1705, 171, 1709, 2225, 1719, 1211, 2750, 206, 2766, 214, 1239, 729, 238, 760, 2825, 1302, 2328, 1306, 292, 806, 294, 2858, 2350, 1849, 1338, 1850, 317, 2885, 2384, 860, 861, 873, 1391, 883, 2419, 1405, 2434, 387, 1411, 417, 2978, 936, 2987, 1465, 1468, 452, 465, 2514, 1491, 3033, 987, 2017, 482, 2533, 1510]
88


In [195]:
#This defines a new dataframe with the the entire case removed, where there is an extreme outlier or a nonsensical negative value.
train_no_outliers_no_negatives=train.drop(index_set_neg_largez)

#Here's a quick look at the effect of this cleaning. The first entry shows how many rows have been affected in each sii category. 

print(train['sii'].value_counts()-train_no_outliers_no_negatives['sii'].value_counts())

sii
0.0    38
1.0    19
2.0    19
3.0     4
Name: count, dtype: int64


In [196]:
#This takes extreme outliers and nonsensical negative values and replaces them with NaN.

train_outliers_neg_to_NaN=train
for i in range(len(float_columns)):
    index_set=[]
    z=np.abs(stats.zscore(train[[float_columns[i]]],nan_policy='omit'))
    indices_large_z = np.where(np.all(z > threshold, axis=1))[0].flatten().tolist()
    indices_negative=np.where(np.all(train[[float_columns[i]]]<0, axis=1))[0].flatten().tolist()
    index_set=indices_large_z+indices_negative
    #print(len(index_set))
    for j in index_set:
        train_outliers_neg_to_NaN.at[j, float_columns[i]] = np.nan
        j+=1
    i+=1


In [197]:
#Now I replace the original train dataframe with the new cleaned data frame (where I've replaced bad values with NaN.)

train=train_outliers_neg_to_NaN
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3168 non-null   object 
 1   Basic_Demos-Enroll_Season               3168 non-null   object 
 2   Basic_Demos-Age                         3168 non-null   int64  
 3   Basic_Demos-Sex                         3168 non-null   int64  
 4   CGAS-Season                             2049 non-null   object 
 5   CGAS-CGAS_Score                         1947 non-null   float64
 6   Physical-Season                         2658 non-null   object 
 7   Physical-BMI                            2429 non-null   float64
 8   Physical-Height                         2437 non-null   float64
 9   Physical-Weight                         2475 non-null   float64
 10  Physical-Waist_Circumference            703 non-null    floa

In [198]:
#I'll export the dataframe to a csv.

train_cleaned=train
train_cleaned.to_csv('train_cleaned.csv', index=False)

Next I am going to impute input variables. 
I'm doing this before I remove the cases for which we can't compute sii scores, so that we have all data available.
I am doing this in groups: For example, I will use only physical data to impute physical data values. This seems reasonable to do, although perhaps we might get more accurate results if we used more variables?

In [199]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# define a pipe that first scales the variables and then does a KNN imputation. Note that when there is a case with no values at all, KNNImputer replaces fills in each variable with the group average.

Number_Neighbors=5
impute_pipe = Pipeline([('scale', StandardScaler()),
                 ('KNN_impute', KNNImputer(n_neighbors=Number_Neighbors, weights='uniform', metric='nan_euclidean'))])




In [200]:
#We have complete information for the basic demographics variables, age and gender.

Basic_Demos = [col for col in train.columns if 'Basic' in col]
Basic_Demos.remove('Basic_Demos-Enroll_Season')
train['Basic_nan_count'] = train[Basic_Demos].isna().sum(axis=1)
train['Basic_nan_count'].value_counts()

Basic_nan_count
0    3168
Name: count, dtype: int64

In [201]:
#Next we'll consider the physical variables. There are many missing values here, including 688 cases with no values at all. We will do imputation.

Physical = [col for col in train.columns if 'Physical' in col]
Physical.remove('Physical-Season')
train['Physical_nan_count'] = train[Physical].isna().sum(axis=1)
print(train['Physical_nan_count'].value_counts())
print(len(Physical))

Physical_nan_count
1    1688
7     688
0     674
6      42
4      28
2      24
3      23
5       1
Name: count, dtype: int64
7


In [202]:
#Now I will impute values for these variables. First I'll define a new dataframe to work on.

df=train[Physical]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.

impute_pipe.fit(df)

imputation_physical=impute_pipe.transform(df)
df2 = pd.DataFrame(imputation_physical, columns=Physical)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train[Physical]=df2[Physical]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Physical-BMI                  3168 non-null   float64
 1   Physical-Height               3168 non-null   float64
 2   Physical-Weight               3168 non-null   float64
 3   Physical-Waist_Circumference  3168 non-null   float64
 4   Physical-Diastolic_BP         3168 non-null   float64
 5   Physical-HeartRate            3168 non-null   float64
 6   Physical-Systolic_BP          3168 non-null   float64
dtypes: float64(7)
memory usage: 173.4 KB


In [203]:
#Next we'll consider the fitness test variables. 
# There are many missing values here, although it looks like we have at least some values for every case.

Fitness = [col for col in train.columns if 'Fitness' in col or 'FGC']
Fitness.remove('Fitness_Endurance-Season')
Fitness.remove('FGC-Season')
train['Fitness_nan_count'] = train[Fitness].isna().sum(axis=1)
print(train['Fitness_nan_count'].value_counts())
print(len(Fitness))

Fitness_nan_count
5     272
6     198
8     179
36    174
69    169
     ... 
60      3
46      3
58      2
17      2
48      1
Name: count, Length: 66, dtype: int64
82


In [204]:
#Now I will impute values for these variables. First I'll define a new dataframe to work on.

df=train[Fitness]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.

impute_pipe.fit(df)

imputation_fitness=impute_pipe.transform(df)
df2 = pd.DataFrame(imputation_fitness, columns=Fitness)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train[Fitness]=df2[Fitness]

ValueError: could not convert string to float: '1b5fb26f'

This section of the file focus on imputing values for the PCIAT questionaire, based on other responses to the questionaire, using k nearest neighbors.

In [96]:
#First we identify the columns of interest to search for NaN variables.

pciats = [col for col in train.columns if 'PCIAT' in col]
pciats.remove('PCIAT-Season')
pciats.remove('PCIAT-PCIAT_Total')

# Create 20 new variables that indicate whether or not PCIAT-PCIAT_01 through PCIAT-PCIAT_20 were NaN

for pciat in pciats:
    train[pciat + '_isnotna'] = train[pciat].notna().astype(int)

    

#Create new variable that represents the sum of the questions answered.

pciatsnotna = [col for col in train.columns if 'isnotna' in col]
train['pciatsnotna_sum'] = train[pciatsnotna].sum(axis=1)

#Here, we can see NaN rates. Note that there are almost 1000 cases where the participants did not respond to any PCIAT questions. 
#These participants are eliminated. Note that when rows are eliminated, the original row indexing is preserved; therefore, I reset the index. 
#(This step is important to make sure indices match up when I later replace NaN entries with their imputed values.)

print("NaN rates in original data: ",train['pciatsnotna_sum'].value_counts())
train = train[train['pciatsnotna_sum'] != 0]
train.reset_index(drop=True, inplace=True)
print("NaN rates after the non-responders have been removed:",train['pciatsnotna_sum'].value_counts())



NaN rates in original data:  pciatsnotna_sum
20    2136
0      984
19      38
18       7
15       1
17       1
10       1
Name: count, dtype: int64
NaN rates after the non-responders have been removed: pciatsnotna_sum
20    2136
19      38
18       7
15       1
17       1
10       1
Name: count, dtype: int64


In [97]:
#Next we use KNN to impute the missing values.

from sklearn.impute import KNNImputer

# define imputer
Number_Neighbors=5
imputer = KNNImputer(n_neighbors=Number_Neighbors, weights='uniform', metric='nan_euclidean')

#The imputer.fit_transform function outputs a numpy array. So first I do the fitting, then convert the output back to a pandas dataframe.

imputations=imputer.fit_transform(train[pciats])
df2 = pd.DataFrame(imputations, columns=pciats)

#Next take the result and insert into the original dataframe. 
train_imp=train.copy()
train_imp[pciats]=df2[pciats]

In [98]:
#I recalculate the PCIAT total score. I also drop the variables that were added to detect NaN in the PCIAT data, just to tidy up a bit.

train_imp['PCIAT_Total_Imputed'] = train_imp[pciats].sum(axis=1)
train_imp = train_imp.drop(columns=pciatsnotna)
train_imp = train_imp.drop(columns=['pciatsnotna_sum'])


#Now we can calculate a new sii score with the imputed values. 

bins = [0, 30, 49,79,100]
labels = [0,1,2,3]
train_imp['sii_Imputed'] = pd.cut(train_imp['PCIAT_Total_Imputed'], bins=bins, labels=labels, right=False)

print(train_imp.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2184 entries, 0 to 2183
Data columns (total 87 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   id                                      2184 non-null   object  
 1   Basic_Demos-Enroll_Season               2184 non-null   object  
 2   Basic_Demos-Age                         2184 non-null   int64   
 3   Basic_Demos-Sex                         2184 non-null   int64   
 4   CGAS-Season                             1879 non-null   object  
 5   CGAS-CGAS_Score                         1879 non-null   float64 
 6   Physical-Season                         2077 non-null   object  
 7   Physical-BMI                            2184 non-null   float64 
 8   Physical-Height                         2184 non-null   float64 
 9   Physical-Weight                         2184 non-null   float64 
 10  Physical-Waist_Circumference            2184 non

In [99]:
train_imp.to_csv('train_imp.csv', index=False)

In [100]:
train_imp['BIA-BIA_Fat'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2184 entries, 0 to 2183
Series name: BIA-BIA_Fat
Non-Null Count  Dtype  
--------------  -----  
1428 non-null   float64
dtypes: float64(1)
memory usage: 17.2 KB
