> # Import Modules

In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Register the parent directory on the import path exactly once, presumably so
# the `noventis` package imported below resolves when run from this folder.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

from noventis.data_cleaner import NoventisImputer
print('Successfully imported modules!')

Successfully imported modules!


> # Prepare the DataFrame

In [2]:
# Toy dataset: two numeric columns, one categorical column and one more numeric
# column, each with np.nan placed where a value is deliberately missing
# (five gaps in total).
sample_columns = {
    'Age':        [22, 38, 26, 35, np.nan, 28, 50, np.nan],
    'Salary':     [72000, 48000, 54000, 61000, 75000, np.nan, 83000, 45000],
    'City':       ['London', 'Paris', 'New York', np.nan, 'Tokyo', 'London', 'Paris', 'New York'],
    'Experience': [1, 10, 3, 8, 5, 4, 20, np.nan],
}
df = pd.DataFrame(data=sample_columns)

In [3]:
# Show the raw frame so the missing entries are visible before any imputation.
df

Unnamed: 0,Age,Salary,City,Experience
0,22.0,72000.0,London,1.0
1,38.0,48000.0,Paris,10.0
2,26.0,54000.0,New York,3.0
3,35.0,61000.0,,8.0
4,,75000.0,Tokyo,5.0
5,28.0,,London,4.0
6,50.0,83000.0,Paris,20.0
7,,45000.0,New York,


> # Automatic Imputation (Default)

In [4]:
# Impute on a copy so the original `df` stays available to the other examples.
df_auto = df.copy()
# No `method` argument is passed; the report printed by this cell labels the
# run "AUTO". verbose=True appears to be what enables that summary report —
# TODO confirm against the NoventisImputer documentation.
imputer = NoventisImputer(verbose=True)
df_imputed = imputer.fit_transform(df_auto)
print(df_imputed)


Method                    | AUTO
Total Values Imputed      | 5
Completion Score          | 100.00%
    Age   Salary      City  Experience
0  22.0  72000.0    London         1.0
1  38.0  48000.0     Paris        10.0
2  26.0  54000.0  New York         3.0
3  35.0  61000.0    London         8.0
4  33.0  75000.0     Tokyo         5.0
5  28.0  62571.0    London         4.0
6  50.0  83000.0     Paris        20.0
7  33.0  45000.0  New York         7.0


> # Using a Global Method (KNN)

In [5]:
# Start from a fresh copy of the raw frame so this example is independent.
df_knn = df.copy()
# One method name ('knn') is given for the whole frame. n_neighbors=3 is
# presumably the number of neighbours used for each estimate — TODO confirm
# against the NoventisImputer documentation.
imputer_knn = NoventisImputer(method='knn', n_neighbors=3, verbose=True)
df_knn_imputed = imputer_knn.fit_transform(df_knn)
print(df_knn_imputed)


Method                    | knn
Total Values Imputed      | 5
Completion Score          | 100.00%
         Age   Salary      City  Experience
0  22.000000  72000.0    London         1.0
1  38.000000  48000.0     Paris        10.0
2  26.000000  54000.0  New York         3.0
3  35.000000  61000.0    London         8.0
4  33.333333  75000.0     Tokyo         5.0
5  28.000000  67000.0    London         4.0
6  50.000000  83000.0     Paris        20.0
7  33.000000  45000.0  New York         7.0


> # Per-Column Custom Strategy

In [6]:
# Start from a fresh copy of the raw frame so this example is independent.
df_custom = df.copy()

# Per-column mapping: column name -> name of the strategy to apply to it.
custom_methods = {
    'Age': 'median',
    'Salary': 'mean',
    'City': 'mode',
    'Experience': 'constant'
}
# fill_value=0 is presumably the value used by the 'constant' strategy; in the
# recorded output the missing Experience entry comes back as 0.0 — TODO confirm.
imputer_custom = NoventisImputer(method=custom_methods, fill_value=0, verbose=True)
df_custom_imputed = imputer_custom.fit_transform(df_custom)
print(df_custom_imputed)


Method                    | CUSTOM MAP
Total Values Imputed      | 5
Completion Score          | 100.00%
    Age   Salary      City  Experience
0  22.0  72000.0    London         1.0
1  38.0  48000.0     Paris        10.0
2  26.0  54000.0  New York         3.0
3  35.0  61000.0    London         8.0
4  31.5  75000.0     Tokyo         5.0
5  28.0  62571.0    London         4.0
6  50.0  83000.0     Paris        20.0
7  31.5  45000.0  New York         0.0
