> # Import Modules

In [1]:
import sys
import os
import pandas as pd
import numpy as np

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from noventis.data_cleaner import NoventisOutlierHandler
print('Successfully imported modules!')

Successfully imported modules!


> # Prepare for Data Frame

In [2]:
base_data = np.random.normal(loc=100, scale=20, size=500)
outliers_A = np.array([5, 10, 250, 300, 320])
outliers_B = np.array([-50, 150, 160, 200, 210]) 

df = pd.DataFrame({
    'Feature_A': np.concatenate([base_data, outliers_A]),
    'Feature_B': np.concatenate([np.random.normal(50, 10, 500), outliers_B])
})

In [3]:
df

Unnamed: 0,Feature_A,Feature_B
0,81.514191,55.308657
1,119.620876,51.702204
2,71.050803,41.341307
3,102.766202,57.284365
4,114.243365,42.656732
...,...,...
500,5.000000,-50.000000
501,10.000000,150.000000
502,250.000000,160.000000
503,300.000000,200.000000


> # Automatic Handling

In [4]:
df_auto = df.copy()

handler = NoventisOutlierHandler(verbose=True)

df_cleaned = handler.fit_transform(df_auto)

print(f"Original shape: {df_auto.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")


Method                    | AUTO
Total Rows                | 505
Rows to Remove            | 0
Removal Percentage        | 0.00%

📊 Per-Column Details:
  • Feature_A: WINSORIZE (52 outliers)
  • Feature_B: WINSORIZE (52 outliers)
Original shape: (505, 2)
Cleaned shape: (505, 2)


> # Global Method (Winsorizing)

In [5]:
df_winsor = df.copy()

handler_winsorize = NoventisOutlierHandler(default_method='winsorize', quantile_range=(0.01, 0.99), verbose=True)

df_winsorized = handler_winsorize.fit_transform(df_winsor)

print(f"Original shape: {df_winsor.shape}")
print(f"Winsorized shape: {df_winsorized.shape}")
print("\nMin/Max values before:\n", df_winsor.agg(['min', 'max']))
print("\nMin/Max values after:\n", df_winsorized.agg(['min', 'max']))


Method                    | WINSORIZE
Total Rows                | 505
Rows to Remove            | 0
Removal Percentage        | 0.00%

📊 Per-Column Details:
  • Feature_A: WINSORIZE (12 outliers)
  • Feature_B: WINSORIZE (12 outliers)
Original shape: (505, 2)
Winsorized shape: (505, 2)

Min/Max values before:
      Feature_A  Feature_B
min        5.0      -50.0
max      320.0      210.0

Min/Max values after:
       Feature_A  Feature_B
min   53.811836  24.563952
max  150.464837  75.610304


> # Per-Column Custom Strategy

In [6]:
df_cust = df.copy()

method_map = {
    'Feature_A': 'iqr_trim',    
    'Feature_B': 'winsorize'    
}

handler_custom = NoventisOutlierHandler(feature_method_map=method_map, verbose=True)

df_custom = handler_custom.fit_transform(df_cust)

print(f"Original shape: {df_cust.shape}")
print(f"Custom handled shape: {df_custom.shape}")


Method                    | AUTO
Total Rows                | 505
Rows to Remove            | 6
Removal Percentage        | 1.19%

📊 Per-Column Details:
  • Feature_A: IQR_TRIM (6 outliers)
  • Feature_B: WINSORIZE (52 outliers)
Original shape: (505, 2)
Custom handled shape: (499, 2)
