In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [3]:
# Load the raw training data that the rest of the notebook works from.

train = pd.read_csv('train.csv')

In [4]:
# Outlier screening is restricted to quantitative variables, so start by
# collecting the names of the float-typed columns.

float_columns = train.select_dtypes(include='float').columns
print("Float columns:", float_columns)
print(len(float_columns))

Float columns: Index(['CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
       'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
       'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
       'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
       'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
       'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
       'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
       'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
       'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
  

In [5]:
# Collect the row labels of extreme outliers. A value counts as an extreme
# outlier when its z-score within its own column is more extreme than +/-10.

index_set = []
threshold = 10
for column in float_columns:
    # Keep the one-column frame (2-D) so zscore standardises down the rows;
    # nan_policy='omit' leaves missing values out of the mean and std.
    z = np.abs(stats.zscore(train[[column]], nan_policy='omit'))
    # NaN z-scores compare False, so missing values are never flagged.
    is_extreme = np.asarray(z > threshold).all(axis=1)
    # Record index labels rather than row positions: DataFrame.drop removes
    # rows by label, and the two only coincide for a default RangeIndex.
    index_set.extend(train.index[is_extreme].tolist())

# De-duplicate rows flagged in several columns; sort so the result does not
# depend on set iteration order.
index_set = sorted(set(index_set))
index_set

[3511, 3205, 3147, 2065, 3767, 1470]

In [6]:
# Build a new dataframe that leaves out the rows flagged as extreme outliers,
# then report how many rows remain.

train_no_outliers = train.drop(index=index_set)
train_no_outliers.shape[0]

3954

In [8]:
# Collect the row labels of cases that contain an extreme outlier (|z| > 10)
# or a negative value in a float column.
# In particular, BIA-BIA_Fat, which measures body fat percentage, has a bunch of negative values.
# NOTE(review): the negative check is applied to every float column -- confirm
# that none of them can legitimately hold negative values.
index_set = []
threshold = 10  # same cut-off for every column, so set once outside the loop
for column in float_columns:
    # Keep the one-column frame (2-D) so zscore standardises down the rows;
    # nan_policy='omit' leaves missing values out of the mean and std.
    z = np.abs(stats.zscore(train[[column]], nan_policy='omit'))
    # NaN compares False in both tests, so missing values are never flagged.
    is_extreme = np.asarray(z > threshold).all(axis=1)
    is_negative = (train[column] < 0).to_numpy()
    # Record index labels rather than row positions: DataFrame.drop removes
    # rows by label, and the two only coincide for a default RangeIndex.
    index_set.extend(train.index[is_extreme | is_negative].tolist())

# De-duplicate rows flagged in several columns; sort so the result does not
# depend on set iteration order.
index_set_neg_largez = sorted(set(index_set))
print(index_set_neg_largez)
print(len(index_set_neg_largez))

[1280, 1155, 3205, 646, 1031, 2567, 1030, 1287, 3086, 655, 2065, 1170, 3091, 3729, 2710, 1944, 1434, 156, 1700, 1707, 2478, 2299, 3630, 2738, 691, 3767, 55, 3511, 2490, 3771, 1470, 3903, 3774, 3143, 72, 840, 3655, 3147, 3658, 3792, 2770, 3451, 3925, 2009, 3289, 1245, 609, 2146, 2659, 3809, 2021, 2661, 3581, 232, 3051, 238, 113, 498, 3700, 1141, 1526, 2551, 1656, 3958, 1658, 635, 1660, 1917]
68


In [9]:
# Build a new dataframe in which every case (entire row) containing an extreme
# outlier or a nonsensical negative value has been removed.
train_no_outliers_no_negatives = train.drop(index=index_set_neg_largez)
train_no_outliers_no_negatives.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3892 entries, 0 to 3959
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3892 non-null   object 
 1   Basic_Demos-Enroll_Season               3892 non-null   object 
 2   Basic_Demos-Age                         3892 non-null   int64  
 3   Basic_Demos-Sex                         3892 non-null   int64  
 4   CGAS-Season                             2496 non-null   object 
 5   CGAS-CGAS_Score                         2362 non-null   float64
 6   Physical-Season                         3242 non-null   object 
 7   Physical-BMI                            2956 non-null   float64
 8   Physical-Height                         2961 non-null   float64
 9   Physical-Weight                         3010 non-null   float64
 10  Physical-Waist_Circumference            892 non-null    float64
 

In [10]:
# Replace extreme outliers (|z| > 10) and nonsensical negative values with NaN,
# cell by cell, instead of dropping the whole case.
# NOTE(review): the negative check is applied to every float column -- confirm
# that none of them can legitimately hold negative values.

# Work on a copy: plain assignment would only alias `train`, so the raw data
# would be overwritten as well.
train_outliers_neg_to_NaN = train.copy()
threshold = 10
for column in float_columns:
    # Masks are computed from the untouched `train`; keep the one-column frame
    # (2-D) so zscore standardises down the rows, ignoring missing values.
    z = np.abs(stats.zscore(train[[column]], nan_policy='omit'))
    is_extreme = np.asarray(z > threshold).all(axis=1)
    is_negative = (train[column] < 0).to_numpy()
    # Blank out exactly the flagged rows of this column. The boolean mask is
    # aligned by position, so this does not depend on the index labels.
    train_outliers_neg_to_NaN.loc[is_extreme | is_negative, column] = np.nan

train_outliers_neg_to_NaN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3960 non-null   object 
 1   Basic_Demos-Enroll_Season               3960 non-null   object 
 2   Basic_Demos-Age                         3960 non-null   int64  
 3   Basic_Demos-Sex                         3960 non-null   int64  
 4   CGAS-Season                             2555 non-null   object 
 5   CGAS-CGAS_Score                         2420 non-null   float64
 6   Physical-Season                         3310 non-null   object 
 7   Physical-BMI                            3022 non-null   float64
 8   Physical-Height                         3027 non-null   float64
 9   Physical-Weight                         3076 non-null   float64
 10  Physical-Waist_Circumference            898 non-null    floa