The first section focuses on removing "bad" values: obvious outliers and nonsensical values from the BIA data.

In [325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [326]:
#The original data has been named train_original
#Here I am splitting the data into a train and test set. I want to stratify by age. 
# Then I export as csv files, since we are working over multiple jupiter notebooks.

from sklearn.model_selection import train_test_split

train_original=pd.read_csv('train_original.csv')

seed=1275
train, test = train_test_split(train_original, test_size=0.2, random_state=seed, stratify=train_original['Basic_Demos-Age'])

train.to_csv('train.csv', index=False)
test.to_csv('test.csv',index=False)

In [327]:
#This is the starting data.

train=pd.read_csv('train.csv')
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3168 non-null   object 
 1   Basic_Demos-Enroll_Season               3168 non-null   object 
 2   Basic_Demos-Age                         3168 non-null   int64  
 3   Basic_Demos-Sex                         3168 non-null   int64  
 4   CGAS-Season                             2065 non-null   object 
 5   CGAS-CGAS_Score                         1951 non-null   float64
 6   Physical-Season                         2642 non-null   object 
 7   Physical-BMI                            2401 non-null   float64
 8   Physical-Height                         2404 non-null   float64
 9   Physical-Weight                         2446 non-null   float64
 10  Physical-Waist_Circumference            695 non-null    floa

In [328]:
#I only want to include quantitative variables for possible analysis for outliers, so first I identify the float variables.

float_columns = train.select_dtypes(include=['float']).columns
print("Float columns:", float_columns)
print(len(float_columns))

Float columns: Index(['CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
       'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
       'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
       'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
       'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
       'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
       'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
       'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
       'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
  

In [329]:
#This generates a list of indices for extreme outliers or negative values that should not be negative. 
#In particular, BIA_BIA_Fat, which measures body fat percentage, has a bunch of negative values.
threshold=5
index_set=[]
for i in range(len(float_columns)):
    #print(float_columns[i])
    z=np.abs(stats.zscore(train[[float_columns[i]]],nan_policy='omit'))
    indices_large_z = np.where(np.all(z > threshold, axis=1))[0].flatten().tolist()
    indices_negative=np.where(np.all(train[[float_columns[i]]]<0, axis=1))[0].flatten().tolist()
    index_set=index_set+indices_large_z+indices_negative
    #print(indices_large_z)
    #print(indices_negative)
    i+=1

index_set_neg_largez=list(set(index_set))
print(index_set_neg_largez)
print(len(index_set_neg_largez))

[515, 1541, 1542, 3080, 2061, 2078, 1054, 1058, 1067, 3120, 1076, 573, 2116, 1092, 3142, 73, 76, 78, 3150, 1621, 92, 1119, 2656, 610, 2658, 101, 107, 2171, 1663, 127, 1671, 2185, 657, 1683, 2720, 2723, 1188, 1205, 2249, 1234, 2265, 2276, 1766, 2287, 2799, 2804, 2296, 1274, 1791, 1287, 777, 786, 790, 793, 797, 2853, 1320, 1836, 2862, 2350, 2352, 306, 2870, 2875, 2363, 2366, 2881, 329, 1354, 1867, 2387, 1893, 872, 2412, 371, 1399, 2425, 2954, 2446, 1427, 2966, 411, 1441, 1955, 2486, 962, 3012, 2518, 2525, 992, 997, 3047, 488, 498, 2042]
95


In [330]:
#This defines a new dataframe with the the entire case removed, where there is an extreme outlier or a nonsensical negative value.
train_no_outliers_no_negatives=train.drop(index_set_neg_largez)

#Here's a quick look at the effect of this cleaning. The first entry shows how many rows have been affected in each sii category. 

print(train['sii'].value_counts()-train_no_outliers_no_negatives['sii'].value_counts())

sii
0.0    38
1.0    21
2.0    20
3.0     5
Name: count, dtype: int64


In [331]:
#This takes extreme outliers and nonsensical negative values and replaces them with NaN.

train_outliers_neg_to_NaN=train
for i in range(len(float_columns)):
    index_set=[]
    z=np.abs(stats.zscore(train[[float_columns[i]]],nan_policy='omit'))
    indices_large_z = np.where(np.all(z > threshold, axis=1))[0].flatten().tolist()
    indices_negative=np.where(np.all(train[[float_columns[i]]]<0, axis=1))[0].flatten().tolist()
    index_set=indices_large_z+indices_negative
    #print(len(index_set))
    for j in index_set:
        train_outliers_neg_to_NaN.at[j, float_columns[i]] = np.nan
        j+=1
    i+=1


In [332]:
#Now I replace the original train dataframe with the new cleaned data frame (where I've replaced bad values with NaN.)

train=train_outliers_neg_to_NaN
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3168 non-null   object 
 1   Basic_Demos-Enroll_Season               3168 non-null   object 
 2   Basic_Demos-Age                         3168 non-null   int64  
 3   Basic_Demos-Sex                         3168 non-null   int64  
 4   CGAS-Season                             2065 non-null   object 
 5   CGAS-CGAS_Score                         1950 non-null   float64
 6   Physical-Season                         2642 non-null   object 
 7   Physical-BMI                            2395 non-null   float64
 8   Physical-Height                         2404 non-null   float64
 9   Physical-Weight                         2445 non-null   float64
 10  Physical-Waist_Circumference            695 non-null    floa

In [333]:
#I'll export the dataframe to a csv.

train_cleaned=train
train_cleaned.to_csv('train_cleaned.csv', index=False)

Next I am going to impute input variables. 
I'm doing this before I remove the cases for which we can't compute sii scores, so that we have all data available.
I am doing this in groups: For example, I will use only physical data to impute physical data values. This seems reasonable to do, although perhaps we might get more accurate results if we used more variables?

In [334]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#Because we will be using multiple imputation strategies, 
# I am going to define a new dataframe that will record all of the imputations using KNN.

train_imp_KNN=train_cleaned.copy()

# define a pipe that first scales the variables and then does a KNN imputation. 
# Note that when there is a case with no values at all, KNNImputer replaces fills in each variable with the group average.

Number_Neighbors=5
impute_pipe = Pipeline([('scale', StandardScaler()),
                 ('KNN_impute', KNNImputer(n_neighbors=Number_Neighbors, weights='uniform', metric='nan_euclidean'))])




In [335]:
#We have complete information for the basic demographics variables, age and gender.

Basic_Demos = [col for col in train_imp_KNN.columns if 'Basic' in col]
Basic_Demos.remove('Basic_Demos-Enroll_Season')
train_imp_KNN['Basic_nan_count'] = train_imp_KNN[Basic_Demos].isna().sum(axis=1)
train_imp_KNN['Basic_nan_count'].value_counts()

Basic_nan_count
0    3168
Name: count, dtype: int64

In [336]:
#Next we'll consider the physical variables. There are many missing values here, including 688 cases with no values at all. We will do imputation.
#Because age and gender are likely to be related to the Physical variables, I add these to the mix for imputation.
#Note also that I have removed the season variable. I did this because it is not quantitative, so I can't easily run the imputation using this variable. 
#But this might be something to go back to later.

Physical = [col for col in train_imp_KNN.columns if 'Physical' in col]
Physical.remove('Physical-Season')
Physical=Physical+Basic_Demos
train_imp_KNN['Physical_nan_count'] = train_imp_KNN[Physical].isna().sum(axis=1)
print(train_imp_KNN['Physical_nan_count'].value_counts())
print(len(Physical))

Physical_nan_count
1    1664
7     716
0     665
6      45
4      30
3      27
2      20
5       1
Name: count, dtype: int64
9


In [337]:
#Now I will impute values for these variables. First I'll define a new dataframe to work on.

df=train_imp_KNN[Physical]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.
#Also, I reverse-transformed the data. My reasoning for doing this is that we want it in terms of the original scale to be able to make sense of things. 
#But since we are scaling twice, more rounding issues arise.

impute_pipe.fit(df)

imputation_physical=impute_pipe.transform(df)
imputation_physical=impute_pipe.named_steps['scale'].inverse_transform(imputation_physical)
df2 = pd.DataFrame(imputation_physical, columns=Physical)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train_imp_KNN[Physical]=train_imp_KNN[Physical].fillna(df2[Physical])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Physical-BMI                  3168 non-null   float64
 1   Physical-Height               3168 non-null   float64
 2   Physical-Weight               3168 non-null   float64
 3   Physical-Waist_Circumference  3168 non-null   float64
 4   Physical-Diastolic_BP         3168 non-null   float64
 5   Physical-HeartRate            3168 non-null   float64
 6   Physical-Systolic_BP          3168 non-null   float64
 7   Basic_Demos-Age               3168 non-null   float64
 8   Basic_Demos-Sex               3168 non-null   float64
dtypes: float64(9)
memory usage: 222.9 KB


In [338]:
#Next we'll consider the fitness test variables. 
# There are many missing values here, although it looks like we have at least some values for every case.
#I kept in all of the zone variables, which means they have the same weight as the actual measurements. It seems like I shouldn't do this.

Fitness = [col for col in train_imp_KNN.columns if 'Fitness' in col]+[col for col in train_imp_KNN.columns if 'FGC' in col]
Fitness.remove('Fitness_Endurance-Season')
Fitness.remove('FGC-Season')
Fitness=Fitness+Basic_Demos
train_imp_KNN['Fitness_nan_count'] = train_imp_KNN[Fitness].isna().sum(axis=1)
print(train_imp_KNN['Fitness_nan_count'].value_counts())
print(len(Fitness))

Fitness_nan_count
17    1306
3      650
7      549
4      435
0      152
5       21
9       18
8       10
11       7
12       6
2        5
14       3
6        2
13       2
16       1
15       1
Name: count, dtype: int64
19


In [339]:
#Now I will impute values for these variables. First I'll define a new dataframe to work on.

df=train_imp_KNN[Fitness]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.

impute_pipe.fit(df)

imputation_fitness=impute_pipe.transform(df)
imputation_fitness=impute_pipe.named_steps['scale'].inverse_transform(imputation_fitness)
df2 = pd.DataFrame(imputation_fitness, columns=Fitness)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train_imp_KNN[Fitness]=train_imp_KNN[Fitness].fillna(df2[Fitness])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Fitness_Endurance-Max_Stage  3168 non-null   float64
 1   Fitness_Endurance-Time_Mins  3168 non-null   float64
 2   Fitness_Endurance-Time_Sec   3168 non-null   float64
 3   FGC-FGC_CU                   3168 non-null   float64
 4   FGC-FGC_CU_Zone              3168 non-null   float64
 5   FGC-FGC_GSND                 3168 non-null   float64
 6   FGC-FGC_GSND_Zone            3168 non-null   float64
 7   FGC-FGC_GSD                  3168 non-null   float64
 8   FGC-FGC_GSD_Zone             3168 non-null   float64
 9   FGC-FGC_PU                   3168 non-null   float64
 10  FGC-FGC_PU_Zone              3168 non-null   float64
 11  FGC-FGC_SRL                  3168 non-null   float64
 12  FGC-FGC_SRL_Zone             3168 non-null   float64
 13  FGC-FGC_SRR       

In [340]:
#Next we'll consider the BIA variables. 

BIA = [col for col in train_imp_KNN.columns if 'BIA' in col]
BIA.remove('BIA-Season')
BIA=BIA+Basic_Demos
train_imp_KNN['BIA_nan_count'] = train_imp_KNN[BIA].isna().sum(axis=1)
print(train_imp_KNN['BIA_nan_count'].value_counts())
print(len(BIA))


BIA_nan_count
16    1575
0     1537
1       27
2       21
3        7
12       1
Name: count, dtype: int64
18


In [341]:
#Now I will impute values for these variables. First I'll define a new dataframe to work on.

df=train_imp_KNN[BIA]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.

impute_pipe.fit(df)

imputation_BIA=impute_pipe.transform(df)
imputation_BIA=impute_pipe.named_steps['scale'].inverse_transform(imputation_BIA)
df2 = pd.DataFrame(imputation_BIA, columns=BIA)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train_imp_KNN[BIA]=train_imp_KNN[BIA].fillna(df2[BIA])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   BIA-BIA_Activity_Level_num  3168 non-null   float64
 1   BIA-BIA_BMC                 3168 non-null   float64
 2   BIA-BIA_BMI                 3168 non-null   float64
 3   BIA-BIA_BMR                 3168 non-null   float64
 4   BIA-BIA_DEE                 3168 non-null   float64
 5   BIA-BIA_ECW                 3168 non-null   float64
 6   BIA-BIA_FFM                 3168 non-null   float64
 7   BIA-BIA_FFMI                3168 non-null   float64
 8   BIA-BIA_FMI                 3168 non-null   float64
 9   BIA-BIA_Fat                 3168 non-null   float64
 10  BIA-BIA_Frame_num           3168 non-null   float64
 11  BIA-BIA_ICW                 3168 non-null   float64
 12  BIA-BIA_LDM                 3168 non-null   float64
 13  BIA-BIA_LST                 3168 

In [342]:
#Next we consider CGAS (Children's Global Assessment Score). This measure comes from an evaluation by a trained professional. 
#Looking at the description, it seems reasonable that it is related to gender and age, so I am going to do KNN with those variables. 

CGAS = ['CGAS-CGAS_Score']+Basic_Demos
train_imp_KNN['CGAS_nan_count'] = train_imp_KNN[CGAS].isna().sum(axis=1)
print(train_imp_KNN['CGAS_nan_count'].value_counts())
print(len(CGAS))
print(CGAS)

CGAS_nan_count
0    1950
1    1218
Name: count, dtype: int64
3
['CGAS-CGAS_Score', 'Basic_Demos-Age', 'Basic_Demos-Sex']


In [343]:
#Now I will impute values for this variable. First I'll define a new dataframe to work on.

df=train_imp_KNN[CGAS]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.

impute_pipe.fit(df)

imputation_CGAS=impute_pipe.transform(df)
imputation_CGAS=impute_pipe.named_steps['scale'].inverse_transform(imputation_CGAS)
df2 = pd.DataFrame(imputation_CGAS, columns=CGAS)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train_imp_KNN[CGAS]=train_imp_KNN[CGAS].fillna(df2[CGAS])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CGAS-CGAS_Score  3168 non-null   float64
 1   Basic_Demos-Age  3168 non-null   float64
 2   Basic_Demos-Sex  3168 non-null   float64
dtypes: float64(3)
memory usage: 74.4 KB


In [344]:
#Next we consider  PreInt_EduHx-computerinternet_hoursday. 
#It seems reasonable that it is related to gender and age, so I am going to do KNN with those variables. 

IntHrs = ['PreInt_EduHx-computerinternet_hoursday']+Basic_Demos
train_imp_KNN['IntHrs_nan_count'] = train_imp_KNN[IntHrs].isna().sum(axis=1)
print(train_imp_KNN['IntHrs_nan_count'].value_counts())
print(len(IntHrs))


IntHrs_nan_count
0    2633
1     535
Name: count, dtype: int64
3


In [345]:
#Now I will impute values for this variable. First I'll define a new dataframe to work on.

df=train_imp_KNN[IntHrs]

#Now I run the impute pipe on this dataframe. First I fit the pipe to the data. I record the transform of the dataframe as imputation. 
# Imputation is a numpy array, so it needs to be converted back to a pandas dataframe.

impute_pipe.fit(df)

imputation_IntHrs=impute_pipe.transform(df)
imputation_IntHrs=impute_pipe.named_steps['scale'].inverse_transform(imputation_IntHrs)
df2 = pd.DataFrame(imputation_IntHrs, columns=IntHrs)
df2.info()

#Lastly, I replace the original values in the dataframe with the newly imputed values.

train_imp_KNN[IntHrs]=train_imp_KNN[IntHrs].fillna(df2[IntHrs])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 3 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   PreInt_EduHx-computerinternet_hoursday  3168 non-null   float64
 1   Basic_Demos-Age                         3168 non-null   float64
 2   Basic_Demos-Sex                         3168 non-null   float64
dtypes: float64(3)
memory usage: 74.4 KB


This section of the file focus on imputing values for the PCIAT questionaire, based on other responses to the questionaire, using k nearest neighbors.

In [346]:
#First we identify the columns of interest to search for NaN variables.

pciats = [col for col in train_imp_KNN.columns if 'PCIAT' in col]
pciats.remove('PCIAT-Season')
pciats.remove('PCIAT-PCIAT_Total')

# Create 20 new variables that indicate whether or not PCIAT-PCIAT_01 through PCIAT-PCIAT_20 were NaN

for pciat in pciats:
    train_imp_KNN[pciat + '_isnotna'] = train_imp_KNN[pciat].notna().astype(int)

    

#Create new variable that represents the sum of the questions answered.

pciatsnotna = [col for col in train_imp_KNN.columns if 'isnotna' in col]
train_imp_KNN['pciatsnotna_sum'] = train_imp_KNN[pciatsnotna].sum(axis=1)

#Here, we can see NaN rates. Note that there are almost 1000 cases where the participants did not respond to any PCIAT questions. 
#These participants are eliminated. Note that when rows are eliminated, the original row indexing is preserved; therefore, I reset the index. 
#(This step is important to make sure indices match up when I later replace NaN entries with their imputed values.)

print("NaN rates in original data: ",train_imp_KNN['pciatsnotna_sum'] != 0)
train_imp_KNN.reset_index(drop=True, inplace=True)
print("NaN rates after the non-responders have been removed:",train_imp_KNN['pciatsnotna_sum'].value_counts())



NaN rates in original data:  0        True
1       False
2       False
3        True
4       False
        ...  
3163     True
3164     True
3165    False
3166     True
3167    False
Name: pciatsnotna_sum, Length: 3168, dtype: bool
NaN rates after the non-responders have been removed: pciatsnotna_sum
20    2130
0      974
19      52
18       9
10       1
15       1
17       1
Name: count, dtype: int64


In [347]:
#Next we use KNN to impute the missing values.


# define imputer
Number_Neighbors=5
imputer = KNNImputer(n_neighbors=Number_Neighbors, weights='uniform', metric='nan_euclidean')

#The imputer.fit_transform function outputs a numpy array. So first I do the fitting, then convert the output back to a pandas dataframe.

imputations=imputer.fit_transform(train_imp_KNN[pciats])
df2 = pd.DataFrame(imputations, columns=pciats)

#Next take the result and insert into the original dataframe. 

train_imp_KNN[pciats]=train_imp_KNN[pciats].fillna(df2[pciats])

In [348]:
#I recalculate the PCIAT total score. I also drop the variables that were added to detect NaN in the PCIAT data, just to tidy up a bit.

train_imp_KNN['PCIAT_Total_Imputed'] = train_imp_KNN[pciats].sum(axis=1)
train_imp_KNN = train_imp_KNN.drop(columns=pciatsnotna)
train_imp_KNN = train_imp_KNN.drop(columns=['pciatsnotna_sum'])


#Now we can calculate a new sii score with the imputed values. 

bins = [0, 30, 49,79,100]
labels = [0,1,2,3]
train_imp_KNN['sii_Imputed'] = pd.cut(train_imp_KNN['PCIAT_Total_Imputed'], bins=bins, labels=labels, right=False)

print(train_imp_KNN.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 90 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   id                                      3168 non-null   object  
 1   Basic_Demos-Enroll_Season               3168 non-null   object  
 2   Basic_Demos-Age                         3168 non-null   int64   
 3   Basic_Demos-Sex                         3168 non-null   int64   
 4   CGAS-Season                             2065 non-null   object  
 5   CGAS-CGAS_Score                         3168 non-null   float64 
 6   Physical-Season                         2642 non-null   object  
 7   Physical-BMI                            3168 non-null   float64 
 8   Physical-Height                         3168 non-null   float64 
 9   Physical-Weight                         3168 non-null   float64 
 10  Physical-Waist_Circumference            3168 non

In [349]:
train_imp_KNN.to_csv('train_imp_KNN.csv', index=False)

Next I'm working on imputation using MICE.

In [350]:
# I am going to define a new dataframe that will record all of the imputations using MICE. I only want to apply MICE to the input variables, so I separate those out.
#Also, MICE doesn't like categorical variables. I have just removed those--the seasons--for now.

train_imp_MICE=train_cleaned.copy()

print(train_imp_MICE.columns)

features=['Basic_Demos-Age', 'Basic_Demos-Sex',
        'CGAS-CGAS_Score', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
        'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
        'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
       'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
       'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
       'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 
       'PAQ_C-PAQ_C_Total',
       'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 
       'PreInt_EduHx-computerinternet_hoursday']

df=train_imp_MICE[features]

Index(['id', 'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
       'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
       'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
       'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
       'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST'

In [351]:
#New packages needed.

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import make_regression

In [352]:
imputer = IterativeImputer(max_iter=10, random_state=497)

df2= imputer.fit_transform(df)

In [353]:
#Now I fill in the missing values in train_imp_MICE with the MICE-imputed values. I am still using KNN for the pciats values. Then I export.

train_imp_MICE[features]=train_imp_MICE[features].fillna(df2[features])
train_imp_KNN[pciats]=train_imp_KNN[pciats].fillna(train_imp_KNN[pciats])

#Now I can export to a csv.

train_imp_MICE.to_csv('train_imp_MICE.csv', index=False)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices