> # Import Modules

In [31]:
import sys
import os
import pandas as pd
import numpy as np

# Resolve the parent of the current working directory to an absolute path and
# register it on sys.path (once) so the `noventis` import below can resolve.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# NoventisScaler is the scaler class demonstrated in the rest of this notebook
# (presumably found via the parent-directory entry added to sys.path above).
from noventis.data_cleaner import NoventisScaler
# Reaching this line means the imports above did not raise.
print('Successfully imported modules!')

Successfully imported modules!


> # Prepare the DataFrame

In [32]:
# Fixed seed: the synthetic data -- and therefore every scaler decision and
# summary statistic derived from it -- is identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Number of rows in every column of the demo frame.
N_ROWS = 500
# Extreme values appended to the end of `data_with_outliers`.
INJECTED_OUTLIERS = np.array([-50, 50, -60, 60])

df = pd.DataFrame({
    # Draws from a single normal distribution (mean 100, std 15).
    'normal_data': rng.normal(loc=100, scale=15, size=N_ROWS),
    # Squared gamma draws: non-negative values with a long right tail.
    'skewed_data': rng.gamma(shape=1, scale=50, size=N_ROWS) ** 2,
    # Narrow normal noise followed by the injected extreme values.
    'data_with_outliers': np.concatenate([
        rng.normal(loc=0, scale=5, size=N_ROWS - len(INJECTED_OUTLIERS)),
        INJECTED_OUTLIERS,
    ]),
    # Two normal clusters (around 20 and around 80), one after the other.
    'bimodal_data': np.concatenate([
        rng.normal(loc=20, scale=5, size=N_ROWS // 2),
        rng.normal(loc=80, scale=7, size=N_ROWS - N_ROWS // 2),
    ]),
})

In [33]:
# Preview the synthetic frame; a bare last expression uses the notebook's rich display.
df

Unnamed: 0,normal_data,skewed_data,data_with_outliers,bimodal_data
0,106.136403,12295.008344,1.164478,21.029621
1,138.874150,1010.331835,-0.898943,22.491160
2,108.104553,127.879147,3.610717,18.508406
3,81.306715,153.833868,0.916086,8.197191
4,83.366994,17643.995412,-4.621082,38.755673
...,...,...,...,...
495,103.632890,322.909539,3.294457,77.394998
496,101.462155,1087.173702,-50.000000,76.146647
497,109.805239,6512.673748,50.000000,69.770786
498,97.898418,9109.688947,-60.000000,93.561887


> # Automatic Scaling

In [34]:
# Scale a copy so the synthetic source frame is kept as-is for later sections.
df_auto = df.copy()

# method='auto' selects a scaling method per column; verbose=True prints the
# per-column decision report while fitting.
scaler = NoventisScaler(verbose=True, method='auto')
df_scaled = scaler.fit_transform(df_auto)

# Mapping of column name -> method that was selected for it.
print("\nScaler chosen for each column:", scaler.fitted_methods_, sep="\n")


   - STANDARD: 2 columns
   - POWER: 1 columns
   - ROBUST: 1 columns
  Column: normal_data
     - Method: STANDARD
     - Reason: p-value 0.416 > 0.05
     - Skewness: 0.15 | Outlier Ratio: 0.40%
  Column: skewed_data
     - Method: POWER
     - Reason: High skewness (5.05)
     - Skewness: 5.05 | Outlier Ratio: 9.40%
  Column: data_with_outliers
     - Method: ROBUST
     - Reason: Outliers (ratio: 2.0%)
     - Skewness: 0.10 | Outlier Ratio: 2.00%
  Column: bimodal_data
     - Method: STANDARD
     - Reason: Default fallback
     - Skewness: 0.03 | Outlier Ratio: 0.00%

Scaler chosen for each column:
{'normal_data': 'standard', 'skewed_data': 'power', 'data_with_outliers': 'robust', 'bimodal_data': 'standard'}


In [35]:
# Print the last five rows before and after scaling; the tail contains the
# extreme values that were appended to `data_with_outliers`.
for heading, frame in (
    ('Data Before Scaling', df_auto),
    ('\nData After Scaling', df_scaled),
):
    print(heading)
    print(frame.tail(5))

Data Before Scaling
     normal_data  skewed_data  data_with_outliers  bimodal_data
495   103.632890   322.909539            3.294457     77.394998
496   101.462155  1087.173702          -50.000000     76.146647
497   109.805239  6512.673748           50.000000     69.770786
498    97.898418  9109.688947          -60.000000     93.561887
499   110.682554  2753.085462           60.000000     73.864637

Data After Scaling
     normal_data  skewed_data  data_with_outliers  bimodal_data
495     0.238852    -0.637540            0.447785      0.904592
496     0.098806    -0.108387           -7.812204      0.863706
497     0.637065     0.865359            7.686574      0.654880
498    -0.131111     1.078046           -9.362082      1.434100
499     0.693665     0.365149            9.236451      0.788964


> # Force a Specific Method

In [36]:
# Scale a fresh copy of the synthetic frame.
df_robust = df.copy()

# A fixed method name is passed instead of 'auto', so the same method is
# requested for every column.
scaler_robust = NoventisScaler(method='robust')
df_robust_scaled = scaler_robust.fit_transform(df_robust)

# Summary statistics of the scaled frame.
print(
    "\nDescription of data after forcing RobustScaler on all columns:",
    df_robust_scaled.describe(),
    sep="\n",
)


Description of data after forcing RobustScaler on all columns:
        normal_data  skewed_data  data_with_outliers  bimodal_data
count  5.000000e+02   500.000000          500.000000    500.000000
mean   3.037532e-02     0.269611           -0.009807     -0.001366
std    7.382474e-01     0.819049            1.096530      0.519357
min   -1.850250e+00    -0.131238           -9.362082     -0.742895
25%   -4.746994e-01    -0.107603           -0.503521     -0.501000
50%   -3.380542e-16     0.000000            0.000000      0.000000
75%    5.253006e-01     0.303742            0.496479      0.499000
max    2.935764e+00     8.391401            9.236451      0.788843


In [37]:
# Print the last five rows before and after robust scaling; the tail contains
# the extreme values that were appended to `data_with_outliers`.
for heading, frame in (
    ('Data Before Scaling', df_robust),
    ('\nData After Scaling', df_robust_scaled),
):
    print(heading)
    print(frame.tail(5))

Data Before Scaling
     normal_data  skewed_data  data_with_outliers  bimodal_data
495   103.632890   322.909539            3.294457     77.394998
496   101.462155  1087.173702          -50.000000     76.146647
497   109.805239  6512.673748           50.000000     69.770786
498    97.898418  9109.688947          -60.000000     93.561887
499   110.682554  2753.085462           60.000000     73.864637

Data After Scaling
     normal_data  skewed_data  data_with_outliers  bimodal_data
495     0.206531    -0.103696            0.447785      0.467970
496     0.103246    -0.038509           -7.812204      0.446757
497     0.500216     0.424250            7.686574      0.338410
498    -0.066320     0.645759           -9.362082      0.742699
499     0.541959     0.103582            9.236451      0.407978


> # Advanced Usage with custom_params

In [38]:
# Scale a fresh copy of the synthetic frame.
df_custom = df.copy()

# Extra options handed to the scaler through `custom_params`
# (presumably forwarded to the underlying power transformer -- verify against
# NoventisScaler). With standardize=False the summary printed below is not
# centred on zero with unit variance.
custom_config = dict(standardize=False)
scaler_custom = NoventisScaler(method='power', custom_params=custom_config)
df_custom_scaled = scaler_custom.fit_transform(df_custom)

# Summary statistics of the transformed frame.
print(
    "\nDescription of data after custom PowerTransformer:",
    df_custom_scaled.describe(),
    sep="\n",
)



Description of data after custom PowerTransformer:
       normal_data  skewed_data  data_with_outliers  bimodal_data
count   500.000000   500.000000          500.000000    500.000000
mean     32.261870    12.813272            0.398567      6.474992
std       3.584992     5.855527            7.079134      1.986187
min      22.531380    -4.602548          -58.313792      2.345421
25%      29.859712     8.731175           -2.822653      4.666646
50%      32.203578    13.170830            0.405947      6.910247
75%      34.711430    17.006268            3.639607      8.390221
max      45.335279    30.018033           61.738515      9.046763


In [39]:
# Print the last five rows before and after the custom power transform; the
# tail contains the extreme values that were appended to `data_with_outliers`.
for heading, frame in (
    ('Data Before Scaling', df_custom),
    ('\nData After Scaling', df_custom_scaled),
):
    print(heading)
    print(frame.tail(5))

Data Before Scaling
     normal_data  skewed_data  data_with_outliers  bimodal_data
495   103.632890   322.909539            3.294457     77.394998
496   101.462155  1087.173702          -50.000000     76.146647
497   109.805239  6512.673748           50.000000     69.770786
498    97.898418  9109.688947          -60.000000     93.561887
499   110.682554  2753.085462           60.000000     73.864637

Data After Scaling
     normal_data  skewed_data  data_with_outliers  bimodal_data
495    33.199679     9.083873            3.321188      8.314007
496    32.703230    12.179243          -48.668500      8.261146
497    34.593549    17.875336           51.370761      7.980849
498    31.880791    19.119485          -58.313792      8.948242
499    34.789616    14.949269           61.738515      8.162860
