In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Load the competition training set into `train`.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of train.csv before running the notebook elsewhere.
train_csv_path = r"C:\Users\basde\Downloads\playground-series-s3e24\train.csv"
train = pd.read_csv(train_csv_path)

In [3]:
# Banner marking the start of the z-score based outlier analysis.
zscore_banner = "--- Outliers using Z-Score ---"
print(zscore_banner)

--- Outliers using Z-Score ---


In [5]:
# Columns eligible for the z-score test: every column with a numeric dtype.
numeric_cols = train.select_dtypes(include=np.number).columns
# Absolute z-score of every numeric cell, standardised per column
# (scipy's default axis=0).
# nan_policy="omit" stops a single missing value from turning the whole
# column's scores into NaN; since `NaN > threshold` is False, that would
# silently hide every outlier in the column. When the data has no missing
# values the result is identical to the default policy.
z_scores = np.abs(stats.zscore(train[numeric_cols], nan_policy="omit"))

In [8]:
# Cut-off applied to the absolute z-scores: cells whose |z| is strictly
# greater than this value are treated as outliers.
threshold = 3

In [10]:
# Boolean frame with the same row index as `train` and one column per numeric
# column: True where the absolute z-score is strictly above `threshold`.
exceeds_threshold = z_scores > threshold
is_outlier_z = pd.DataFrame(
    data=exceeds_threshold,
    columns=numeric_cols,
    index=train.index,
)

In [12]:
# Keep every row of `train` in which at least one numeric column was flagged.
row_has_outlier_z = is_outlier_z.any(axis="columns")
outlier_rows_z = train.loc[row_has_outlier_z]

In [13]:
# Heading and the flagged rows, emitted with a single print call
# (sep="\n" puts the frame on the line after the heading).
print("\nRows containing outliers (Z-Score):", outlier_rows_z, sep="\n")


Rows containing outliers (Z-Score):
            id  age  height(cm)  weight(kg)  waist(cm)  eyesight(left)  \
1            1   70         165          65       89.0             0.6   
5            5   50         170          55       51.0             1.2   
7            7   55         155          60       84.5             0.7   
10          10   40         155          50       68.0             0.8   
16          16   40         160          65       78.0             1.2   
...        ...  ...         ...         ...        ...             ...   
159182  159182   40         160          55       75.0             1.2   
159196  159196   65         150          60       80.0             0.5   
159209  159209   40         160          70       82.9             1.5   
159212  159212   50         170          65       88.0             1.5   
159230  159230   35         175         100      104.0             1.2   

        eyesight(right)  hearing(left)  hearing(right)  systolic  ...  HDL

In [16]:
# List each flagged cell as a (row label, column name) -> value entry.
# The messages interpolate `threshold` so they stay correct if it is changed.
print(f"\nSpecific outlier values (Z-Score > {threshold}):")
# .where() replaces every non-flagged cell with NaN. The explicit .dropna()
# removes those cells instead of relying on stack()'s legacy NaN-dropping
# default, which the new stack implementation (the default from pandas 3.0)
# no longer performs; without it the Series would contain every cell and
# `.empty` could never be True.
outlier_values_z = train[numeric_cols].where(is_outlier_z).stack().dropna()
if outlier_values_z.empty:
    print(f"No outliers found with Z-score > {threshold}.")
else:
    print(outlier_values_z)


Specific outlier values (Z-Score > 3):
1       hearing(left)            2.0
        hearing(right)           2.0
        fasting blood sugar    147.0
5       waist(cm)               51.0
        triglyceride           343.0
                               ...  
159209  HDL                     99.0
159212  Gtp                    130.0
159230  fasting blood sugar    151.0
        AST                     69.0
        ALT                    102.0
Length: 26288, dtype: float64


In [17]:
# Banner marking the start of the IQR based outlier analysis.
iqr_banner = "\n\n--- Outliers using IQR ---"
print(iqr_banner)



--- Outliers using IQR ---


In [18]:
# Per-column first and third quartiles of the numeric columns, and the
# interquartile range (Q3 - Q1) between them.
numeric_cols = train.select_dtypes(include=np.number).columns
Q1, Q3 = (train[numeric_cols].quantile(q) for q in (0.25, 0.75))
IQR = Q3.sub(Q1)

In [19]:
# Per-column fences placed 1.5 * IQR below Q1 and above Q3.
# NOTE(review): for a column whose IQR is 0, both fences equal the quartile
# value, so any other value in that column lies outside them.
fence_margin = 1.5 * IQR
lower_bound = Q1.sub(fence_margin)
upper_bound = Q3.add(fence_margin)

In [21]:
# Boolean frame marking every numeric cell that lies strictly below its
# column's lower fence or strictly above its upper fence.
numeric_values = train[numeric_cols]
below_lower = numeric_values < lower_bound
above_upper = numeric_values > upper_bound
is_outlier_iqr = below_lower | above_upper

In [22]:
# Select the rows of the original frame in which any numeric column was
# flagged by the IQR rule.
row_has_outlier_iqr = is_outlier_iqr.any(axis="columns")
outlier_rows_iqr = train.loc[row_has_outlier_iqr]


In [23]:
# Heading and the flagged rows, emitted with a single print call
# (sep="\n" puts the frame on the line after the heading).
print("\nRows containing outliers (IQR):", outlier_rows_iqr, sep="\n")


Rows containing outliers (IQR):
            id  age  height(cm)  weight(kg)  waist(cm)  eyesight(left)  \
0            0   55         165          60       81.0             0.5   
1            1   70         165          65       89.0             0.6   
3            3   35         180          95      105.0             1.5   
5            5   50         170          55       51.0             1.2   
7            7   55         155          60       84.5             0.7   
...        ...  ...         ...         ...        ...             ...   
159245  159245   35         175          80       87.2             0.7   
159247  159247   20         170          55       71.0             1.2   
159248  159248   30         170         100      100.0             1.0   
159250  159250   25         180          90       88.0             1.5   
159251  159251   40         155          45       69.0             1.5   

        eyesight(right)  hearing(left)  hearing(right)  systolic  ...  HDL  \


In [25]:
# List each IQR-flagged cell as a (row label, column name) -> value entry.
print("\nSpecific outlier values (IQR):")
# .where() replaces every non-flagged cell with NaN. The explicit .dropna()
# removes those cells instead of relying on stack()'s legacy NaN-dropping
# default, which the new stack implementation (the default from pandas 3.0)
# no longer performs; without it the Series would contain every cell and
# `.empty` could never be True.
outlier_values_iqr = train[numeric_cols].where(is_outlier_iqr).stack().dropna()
if outlier_values_iqr.empty:
    print("No outliers found using the IQR method.")
else:
    print(outlier_values_iqr)


Specific outlier values (IQR):
0       triglyceride           300.0
1       hearing(left)            2.0
        hearing(right)           2.0
        fasting blood sugar    147.0
        dental caries            1.0
                               ...  
159248  weight(kg)             100.0
        serum creatinine         1.3
159250  dental caries            1.0
159251  eyesight(right)          2.0
        fasting blood sugar     64.0
Length: 102578, dtype: float64
