## KNN Imputer

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Create a sample dataset with missing values
data = np.array([[1, 2, np.nan, 4],
                 [5, np.nan, 7, 8],
                 [9, 10, 11, 12],
                 [13, 14, 15, np.nan]])

# Initialize the KNNImputer with the desired number of neighbors (k)
k = 2  # You can adjust the value of k
imputer = KNNImputer(n_neighbors=k)

# Perform the imputation
imputed_data = imputer.fit_transform(data)

# The imputed_data array now contains the imputed values
print("Imputed Data:\n", imputed_data)

Imputed Data:
 [[ 1.  2.  9.  4.]
 [ 5.  6.  7.  8.]
 [ 9. 10. 11. 12.]
 [13. 14. 15. 10.]]


In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Create a sample dataset with missing values for categorical features
data = pd.DataFrame({'Color': ['Red', 'Green', np.nan, 'Blue', 'Green'],
                     'Size': ['Small', 'Medium', 'Large', 'Medium', np.nan],
                     'Weight': [10, 20, 30, np.nan, 50]})

# Identify columns with categorical data
categorical_cols = ['Color', 'Size']

# Label encode categorical columns
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Initialize the KNNImputer with the desired number of neighbors (k)
k = 4  # You can adjust the value of k
imputer = KNNImputer(n_neighbors=k)

# Perform the imputation
imputed_data = imputer.fit_transform(data)

# Convert back to original categorical values
imputed_data = pd.DataFrame(imputed_data, columns=data.columns)
for col in categorical_cols:
    imputed_data[col] = label_encoders[col].inverse_transform(np.round(imputed_data[col]).astype(int))

# The imputed_data DataFrame now contains the imputed values with categorical features
print("Imputed Data:\n", imputed_data)

Imputed Data:
    Color    Size  Weight
0    Red   Small    10.0
1  Green  Medium    20.0
2    NaN   Large    30.0
3   Blue  Medium    27.5
4  Green     NaN    50.0


In [21]:
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Create a sample DataFrame with 10 features and 15 data points (including missing values)
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, 3.0, 4.0, np.nan, 6.0, np.nan, 8.0, 9.0, 10.0, 11.0, 12.0, np.nan, 14.0, 15.0],
    'Numeric2': [10.0, np.nan, 30.0, 40.0, np.nan, 60.0, 70.0, 80.0, np.nan, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0],
    'Numeric3': [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, np.nan, 15.0],
    'Numeric4': [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, np.nan],
    'Numeric5': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    'Category1': ['A', 'B', np.nan, 'B', 'A', 'B', 'A', 'B', 'A', 'B', np.nan, 'B', 'A', 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan, 'Y', 'X', 'Y', 'X', 'Y', 'X', np.nan, 'X', 'Y', 'X'],
    'Category3': ['High', 'Low', 'Medium', 'Medium', np.nan, 'Low', np.nan, 'High', 'Low', 'Medium', 'Medium', np.nan, 'High', 'Medium', 'High'],
    'Category4': ['M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M'],
    'Category5': ['Red', 'Green', np.nan, 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue', 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue']
})

# Separate features into numerical and categorical
numerical_features = ['Numeric1', 'Numeric2', 'Numeric3', 'Numeric4', 'Numeric5']
categorical_features = ['Category1', 'Category2', 'Category3', 'Category4', 'Category5']

# Create a pipeline for numerical features (K-NN imputation)
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=2))  # You can adjust the number of neighbors as needed
])

# Create a custom imputer for categorical features (Decision Tree imputation)
class CustomCategoricalImputer:
    def __init__(self):
        self.tree_imputers = {}
    
    def fit(self, X, y=None):
        for col in X.columns:
            mask = X[col].notnull()
            clf = DecisionTreeClassifier()
            clf.fit(X.loc[mask, col].values.reshape(-1, 1), X.loc[mask, col])
            self.tree_imputers[col] = clf
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in X.columns:
            mask = X[col].isnull()
            if mask.any():
                X_copy.loc[mask, col] = self.tree_imputers[col].predict(X.loc[mask, col].values.reshape(-1, 1))
        return X_copy

categorical_transformer = Pipeline(steps=[
    ('imputer', CustomCategoricalImputer())
])

# Use ColumnTransformer to apply the transformers to the respective features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a final pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
imputed_data = pipeline.fit_transform(data)

# Convert the result back to a DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=data.columns)

# The 'imputed_df' DataFrame now contains imputed values for both numerical and categorical features
print("Imputed Data:\n", imputed_df)

ValueError: could not convert string to float: 'A'

In [22]:
# Create a sample DataFrame with 10 features and 15 data points (including missing values)
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, 3.0, 4.0, np.nan, 6.0, np.nan, 8.0, 9.0, 10.0, 11.0, 12.0, np.nan, 14.0, 15.0],
    'Numeric2': [10.0, np.nan, 30.0, 40.0, np.nan, 60.0, 70.0, 80.0, np.nan, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0],
    'Numeric3': [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, np.nan, 15.0],
    'Numeric4': [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, np.nan],
    'Numeric5': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    'Category1': ['A', 'B', np.nan, 'B', 'A', 'B', 'A', 'B', 'A', 'B', np.nan, 'B', 'A', 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan, 'Y', 'X', 'Y', 'X', 'Y', 'X', np.nan, 'X', 'Y', 'X'],
    'Category3': ['High', 'Low', 'Medium', 'Medium', np.nan, 'Low', np.nan, 'High', 'Low', 'Medium', 'Medium', np.nan, 'High', 'Medium', 'High'],
    'Category4': ['M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M'],
    'Category5': ['Red', 'Green', np.nan, 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue', 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue']
})

# Separate features into numerical and categorical
numerical_features = ['Numeric1', 'Numeric2', 'Numeric3', 'Numeric4', 'Numeric5']
categorical_features = ['Category1', 'Category2', 'Category3', 'Category4', 'Category5']

# Create a pipeline for numerical features (K-NN imputation)
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=2))  # You can adjust the number of neighbors as needed
])

# Create a custom imputer for categorical features (Decision Tree imputation)
class CustomCategoricalImputer:
    def __init__(self):
        self.tree_imputers = {}
    
    def fit(self, X, y=None):
        for col in X.columns:
            mask = X[col].notnull()
            clf = DecisionTreeClassifier()
            clf.fit(X.loc[mask, col].values.reshape(-1, 1), X.loc[mask, col])
            self.tree_imputers[col] = clf
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in X.columns:
            mask = X[col].isnull()
            if mask.any():
                X_copy.loc[mask, col] = self.tree_imputers[col].predict(X.loc[mask, col].values.reshape(-1, 1))
        return X_copy

categorical_transformer = Pipeline(steps=[
    ('imputer', CustomCategoricalImputer())
])

# Use ColumnTransformer to apply the transformers to the respective features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a final pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
imputed_data = pipeline.fit_transform(data)

# Convert the result back to a DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=data.columns)

# The 'imputed_df' DataFrame now contains imputed values for both numerical and categorical features
print("Imputed Data:\n", imputed_df)

ValueError: could not convert string to float: 'A'

In [26]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# Create a sample DataFrame with 10 features and 15 data points (including missing values)
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, 3.0, 4.0, np.nan, 6.0, np.nan, 8.0, 9.0, 10.0, 11.0, 12.0, np.nan, 14.0, 15.0],
    'Numeric2': [10.0, np.nan, 30.0, 40.0, np.nan, 60.0, 70.0, 80.0, np.nan, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0],
    'Numeric3': [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, np.nan, 15.0],
    'Numeric4': [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, np.nan],
    'Numeric5': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    'Category1': ['A', 'B', np.nan, 'B', 'A', 'B', 'A', 'B', 'A', 'B', np.nan, 'B', 'A', 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan, 'Y', 'X', 'Y', 'X', 'Y', 'X', np.nan, 'X', 'Y', 'X'],
    'Category3': ['High', 'Low', 'Medium', 'Medium', np.nan, 'Low', np.nan, 'High', 'Low', 'Medium', 'Medium', np.nan, 'High', 'Medium', 'High'],
    'Category4': ['M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M'],
    'Category5': ['Red', 'Green', np.nan, 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue', 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue']
})

# Separate features into numerical and categorical
numerical_features = ['Numeric1', 'Numeric2', 'Numeric3', 'Numeric4', 'Numeric5']
categorical_features = ['Category1', 'Category2', 'Category3', 'Category4', 'Category5']

# Preprocess categorical features
data[categorical_features] = data[categorical_features].fillna("Missing")  # Fill missing categorical values with a placeholder

# Encode categorical values
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Apply K-NN imputation to numerical features
imputer = KNNImputer(n_neighbors=2)  # You can adjust the number of neighbors as needed
imputed_data = imputer.fit_transform(data[numerical_features])

# Convert the result back to a DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=numerical_features)

# Inverse transform the encoded categorical values
print(imputed_df)
for col in categorical_features:
    imputed_df[col] = label_encoders[col].inverse_transform(imputed_df[col].astype(int))

# The 'imputed_df' DataFrame now contains imputed values for both numerical and categorical features
print("Imputed Data:\n", imputed_df)

    Numeric1  Numeric2  Numeric3  Numeric4  Numeric5
0        1.0      10.0       1.0      10.0       0.1
1        2.0      45.0       2.0      20.0       0.2
2        3.0      30.0       3.0      30.0       0.3
3        4.0      40.0       4.0      40.0       0.4
4        5.0      50.0       5.0      50.0       0.5
5        6.0      60.0       7.0      70.0       0.6
6        7.0      70.0       7.0      70.0       0.7
7        8.0      80.0       8.0      80.0       0.8
8        9.0     105.0       9.0      90.0       0.9
9       10.0     100.0      10.0     100.0       1.0
10      11.0     110.0      11.0     110.0       1.1
11      12.0     120.0      12.0     120.0       1.2
12      13.0     130.0      13.0     130.0       1.3
13      14.0     140.0      14.0     140.0       1.4
14      15.0     150.0      15.0     115.0       1.5


KeyError: 'Category1'

In [29]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# Create a sample DataFrame with 10 features and 15 data points (including missing values)
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, 3.0, 4.0, np.nan, 6.0, np.nan, 8.0, 9.0, 10.0, 11.0, 12.0, np.nan, 14.0, 15.0],
    'Numeric2': [10.0, np.nan, 30.0, 40.0, np.nan, 60.0, 70.0, 80.0, np.nan, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0],
    'Numeric3': [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, np.nan, 15.0],
    'Numeric4': [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, np.nan],
    'Numeric5': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    'Category1': ['A', 'B', np.nan, 'B', 'A', 'B', 'A', 'B', 'A', 'B', np.nan, 'B', 'A', 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan, 'Y', 'X', 'Y', 'X', 'Y', 'X', np.nan, 'X', 'Y', 'X'],
    'Category3': ['High', 'Low', 'Medium', 'Medium', np.nan, 'Low', np.nan, 'High', 'Low', 'Medium', 'Medium', np.nan, 'High', 'Medium', 'High'],
    'Category4': ['M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M'],
    'Category5': ['Red', 'Green', np.nan, 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue', 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue']
})

# Separate features into numerical and categorical
numerical_features = ['Numeric1', 'Numeric2', 'Numeric3', 'Numeric4', 'Numeric5']
categorical_features = ['Category1', 'Category2', 'Category3', 'Category4', 'Category5']

# Preprocess categorical features and replace 'Missing' values with a placeholder
data[categorical_features] = data[categorical_features].fillna("Placeholder")

# Encode categorical values
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Apply K-NN imputation to numerical features
imputer = KNNImputer(n_neighbors=2)  # You can adjust the number of neighbors as needed
imputed_numerical_data = imputer.fit_transform(data[numerical_features])

# Convert the result back to a DataFrame
imputed_numerical_df = pd.DataFrame(imputed_numerical_data, columns=numerical_features)

# Concatenate imputed numerical features with the original categorical features
imputed_df = pd.concat([imputed_numerical_df, data[categorical_features]], axis=1)

# Inverse transform the encoded categorical values
for col in categorical_features:
    imputed_df[col] = label_encoders[col].inverse_transform(imputed_df[col].astype(int))

# The 'imputed_df' DataFrame now contains imputed values for both numerical and categorical features
imputed_df

Unnamed: 0,Numeric1,Numeric2,Numeric3,Numeric4,Numeric5,Category1,Category2,Category3,Category4,Category5
0,1.0,10.0,1.0,10.0,0.1,A,X,High,M,Red
1,2.0,45.0,2.0,20.0,0.2,B,Y,Low,N,Green
2,3.0,30.0,3.0,30.0,0.3,Placeholder,X,Medium,M,Placeholder
3,4.0,40.0,4.0,40.0,0.4,B,Y,Medium,N,Red
4,5.0,50.0,5.0,50.0,0.5,A,Placeholder,Placeholder,M,Green
5,6.0,60.0,7.0,70.0,0.6,B,Y,Low,N,Blue
6,7.0,70.0,7.0,70.0,0.7,A,X,Placeholder,M,Placeholder
7,8.0,80.0,8.0,80.0,0.8,B,Y,High,N,Green
8,9.0,105.0,9.0,90.0,0.9,A,X,Low,M,Blue
9,10.0,100.0,10.0,100.0,1.0,B,Y,Medium,N,Red


In [33]:
from sklearn.experimental import enable_iterative_imputer  # Enable IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Create a sample DataFrame with 10 features and 15 data points (including missing values)
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, 3.0, 4.0, np.nan, 6.0, np.nan, 8.0, 9.0, 10.0, 11.0, 12.0, np.nan, 14.0, 15.0],
    'Numeric2': [10.0, np.nan, 30.0, 40.0, np.nan, 60.0, 70.0, 80.0, np.nan, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0],
    'Numeric3': [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, np.nan, 15.0],
    'Numeric4': [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, np.nan],
    'Numeric5': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    'Category1': ['A', 'B', np.nan, 'B', 'A', 'B', 'A', 'B', 'A', 'B', np.nan, 'B', 'A', 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan, 'Y', 'X', 'Y', 'X', 'Y', 'X', np.nan, 'X', 'Y', 'X'],
    'Category3': ['High', 'Low', 'Medium', 'Medium', np.nan, 'Low', np.nan, 'High', 'Low', 'Medium', 'Medium', np.nan, 'High', 'Medium', 'High'],
    'Category4': ['M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M', 'N', 'M'],
    'Category5': ['Red', 'Green', np.nan, 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue', 'Red', 'Green', 'Blue', np.nan, 'Green', 'Blue']
})

# Separate features into numerical and categorical
numerical_features = ['Numeric1', 'Numeric2', 'Numeric3', 'Numeric4', 'Numeric5']

# Use IterativeImputer with RandomForestRegressor for numeric features
imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=0)

# Fit and transform the data
imputed_data = imputer.fit_transform(data[numerical_features])

# Convert the result back to a DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=numerical_features)

# Combine imputed numeric features with original categorical features
imputed_df = pd.concat([imputed_df, data.drop(columns=numerical_features)], axis=1)

# The 'imputed_df' DataFrame now contains imputed values for numeric features using Random Forest Imputer
imputed_df



Unnamed: 0,Numeric1,Numeric2,Numeric3,Numeric4,Numeric5,Category1,Category2,Category3,Category4,Category5
0,1.0,10.0,1.0,10.0,0.1,A,X,High,M,Red
1,2.0,17.9,2.0,20.0,0.2,B,Y,Low,N,Green
2,3.0,30.0,3.0,30.0,0.3,,X,Medium,M,
3,4.0,40.0,4.0,40.0,0.4,B,Y,Medium,N,Red
4,4.46,44.2,5.0,50.0,0.5,A,,,M,Green
5,6.0,60.0,6.31,64.7,0.6,B,Y,Low,N,Blue
6,6.65,70.0,7.0,70.0,0.661,A,X,,M,
7,8.0,80.0,8.0,80.0,0.8,B,Y,High,N,Green
8,9.0,85.7,9.0,90.0,0.9,A,X,Low,M,Blue
9,10.0,100.0,10.0,100.0,1.0,B,Y,Medium,N,Red


In [38]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

class CategoricalImputer(BaseEstimator):
    def __init__(self, numeric_strategy='most_frequent', categorical_strategy='decision_tree', random_state=None):
        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.random_state = random_state
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        if len(numeric_features) > 0:
            self.numeric_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=self.random_state),
                                                    random_state=self.random_state)
            self.numeric_imputer.fit(X[numeric_features])

        if len(categorical_features) > 0:
            if self.categorical_strategy == 'most_frequent':
                self.categorical_imputer = SimpleImputer(strategy='most_frequent')
                X[categorical_features] = X[categorical_features].fillna(X[categorical_features].mode().iloc[0])
            elif self.categorical_strategy == 'decision_tree':
                self.label_encoders = {}
                X[categorical_features] = self._fit_decision_tree_imputer(X, categorical_features)
            else:
                raise ValueError("Invalid categorical imputation strategy. Supported strategies: 'most_frequent', 'decision_tree'.")

        self.fitted = True
        return self

    def transform(self, X):
        if not self.fitted:
            raise NotFittedError("CategoricalImputer is not fitted. Call fit() before transforming data.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        # Impute numeric features
        if self.numeric_imputer and len(numeric_features) > 0:
            X[numeric_features] = self.numeric_imputer.transform(X[numeric_features])

        return X

    def _fit_decision_tree_imputer(self, X, categorical_features):
        for col in categorical_features:
            mask = X[col].notnull()
            clf = DecisionTreeClassifier(random_state=self.random_state)
            clf.fit(X.loc[mask, [col]].values, X.loc[mask, col])
            self.label_encoders[col] = LabelEncoder()
            self.label_encoders[col].fit(X.loc[mask, col])
            X.loc[~mask, col] = self.label_encoders[col].inverse_transform(clf.predict(X.loc[~mask, [col]].values))
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Example usage:
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, np.nan, 4.0],
    'Category1': ['A', 'B', np.nan, 'B'],
    'Category2': ['X', 'Y', 'X', np.nan]
})

# Instantiate the CategoricalImputer
imputer = CategoricalImputer(numeric_strategy='mean', categorical_strategy='decision_tree', random_state=42)

# Use the imputer to fill missing values
imputed_data = imputer.fit_transform(data)

# Display the imputed DataFrame
print(imputed_data)

ValueError: could not convert string to float: 'A'

## MissForest 

In [3]:
# !pip install missingpy



In [8]:
from missingpy import MissForest

# Create a sample dataset with missing values for numeric and categorical features
data = pd.DataFrame({'Feature1': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, np.nan, 15.0, 16.0, 17.0, np.nan, 19.0, 20.0],
                     'Feature2': [10.0, np.nan, 30.0, 40.0, np.nan, 60.0, 70.0, 80.0, np.nan, 100.0, 110.0, 120.0, 130.0, 140.0, np.nan, 160.0, 170.0, 180.0, 190.0, 200.0],
                     'Feature3': [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, np.nan, 16.0, 17.0, 18.0, 19.0, 20.0],
                     'Feature4': [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0, 170.0, 180.0, 190.0, 200.0],
                     'Feature5': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
                     'Feature6': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y'],
                     'Feature7': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium', 'Medium', 'Low', 'High', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium']})

# Identify columns with missing values
missing_cols = data.columns[data.isnull().any()].tolist()

# Convert categorical columns to string type
for col in missing_cols:
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)

# Initialize the MissForest imputer
imputer = MissForest()

# Perform imputation
imputed_data = imputer.fit_transform(data)

# The 'imputed_data' DataFrame now contains imputed values for both numeric and categorical features
print("Imputed Data:\n", imputed_data)

ModuleNotFoundError: No module named 'missingpy'

In [40]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

class CategoricalImputer(BaseEstimator):
    def __init__(self, numeric_strategy='most_frequent', categorical_strategy='decision_tree', random_state=None):
        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.random_state = random_state
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        if len(numeric_features) > 0:
            self.numeric_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=self.random_state),
                                                    random_state=self.random_state)
            self.numeric_imputer.fit(X[numeric_features])

        if len(categorical_features) > 0:
            if self.categorical_strategy == 'most_frequent':
                self.categorical_imputer = SimpleImputer(strategy='most_frequent')
                X[categorical_features] = X[categorical_features].fillna(X[categorical_features].mode().iloc[0])
            elif self.categorical_strategy == 'decision_tree':
                self.label_encoders = {}
                X[categorical_features] = self._fit_decision_tree_imputer(X, categorical_features)
            else:
                raise ValueError("Invalid categorical imputation strategy. Supported strategies: 'most_frequent', 'decision_tree'.")

        self.fitted = True
        return self

    def transform(self, X):
        if not self.fitted:
            raise NotFittedError("CategoricalImputer is not fitted. Call fit() before transforming data.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        # Impute numeric features
        if self.numeric_imputer and len(numeric_features) > 0:
            X[numeric_features] = self.numeric_imputer.transform(X[numeric_features])

        return X

    def _fit_decision_tree_imputer(self, X, categorical_features):
        for col in categorical_features:
            mask = X[col].notnull()
            clf = DecisionTreeClassifier(random_state=self.random_state)
            clf.fit(X.loc[mask, [col]].values, X.loc[mask, col])
            self.label_encoders[col] = LabelEncoder()
            self.label_encoders[col].fit(X.loc[mask, col])
            X.loc[~mask, col] = self.label_encoders[col].inverse_transform(clf.predict(X.loc[~mask, [col]].values))
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Create a sample DataFrame with missing values
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, np.nan, 4.0, 5.0],
    'Category1': ['A', 'B', np.nan, 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan]
})

# Instantiate the CategoricalImputer
imputer = CategoricalImputer(numeric_strategy='mean', categorical_strategy='decision_tree', random_state=42)

# Use the imputer to fill missing values
imputed_data = imputer.fit_transform(data)

# Display the imputed DataFrame
print(imputed_data)


ValueError: could not convert string to float: 'A'

In [53]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

class CategoricalImputer(BaseEstimator):
    def __init__(self, numeric_strategy='most_frequent', categorical_strategy='decision_tree', random_state=None):
        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.random_state = random_state
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns
        print('here1')
        print(numeric_features)
        print(categorical_features)
        if len(numeric_features) > 0:
            self.numeric_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=self.random_state),
                                                    random_state=self.random_state)
            self.numeric_imputer.fit(X[numeric_features])

        if len(categorical_features) > 0:
            print('here3')
            if self.categorical_strategy == 'most_frequent':
                self.categorical_imputer = SimpleImputer(strategy='most_frequent')
                X[categorical_features] = X[categorical_features].fillna(X[categorical_features].mode().iloc[0])
            elif self.categorical_strategy == 'decision_tree':
                print('here4')
                self.label_encoders = {}
                print('here5')
                X[categorical_features] = self._fit_decision_tree_imputer(X, categorical_features)
            else:
                raise ValueError("Invalid categorical imputation strategy. Supported strategies: 'most_frequent', 'decision_tree'.")
        print('here2')
        self.fitted = True
        return self

    def transform(self, X):
        if not self.fitted:
            raise NotFittedError("CategoricalImputer is not fitted. Call fit() before transforming data.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        # Impute numeric features
        if self.numeric_imputer and len(numeric_features) > 0:
            X[numeric_features] = self.numeric_imputer.transform(X[numeric_features])

        return X

    def _fit_decision_tree_imputer(self, X, categorical_features):
        for col in categorical_features:
            print('here6', col)
            mask = X[col].notnull()
            print(mask)
            clf = DecisionTreeClassifier(random_state=self.random_state)
            print('here7')
            clf.fit(X.loc[mask, [col]].values, X.loc[mask, col])
            print('here8')
            self.label_encoders[col] = LabelEncoder()
            print('here9')
            self.label_encoders[col].fit(X.loc[mask, col])
            print('here10')
            X.loc[~mask, col] = self.label_encoders[col].inverse_transform(clf.predict(X.loc[~mask, [col]].values))
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Create a sample DataFrame with missing values
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, np.nan, 4.0, 5.0],
    'Category1': ['A', 'B', np.nan, 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan]
})

# Instantiate the CategoricalImputer
imputer = CategoricalImputer(numeric_strategy='mean', categorical_strategy='decision_tree', random_state=42)

# Use the imputer to fill missing values
imputed_data = imputer.fit_transform(data)

# Display the imputed DataFrame
print(imputed_data)


here1
Index(['Numeric1'], dtype='object')
Index(['Category1', 'Category2'], dtype='object')
here3
here4
here5
here6 Category1
0     True
1     True
2    False
3     True
4     True
Name: Category1, dtype: bool
here7


ValueError: could not convert string to float: 'A'

In [54]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

class CategoricalImputer(BaseEstimator):
    def __init__(self, numeric_strategy='most_frequent', categorical_strategy='decision_tree', random_state=None):
        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.random_state = random_state
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        if len(numeric_features) > 0:
            self.numeric_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=self.random_state),
                                                    random_state=self.random_state)
            self.numeric_imputer.fit(X[numeric_features])

        if len(categorical_features) > 0:
            if self.categorical_strategy == 'most_frequent':
                self.categorical_imputer = SimpleImputer(strategy='most_frequent')
                X[categorical_features] = X[categorical_features].fillna(X[categorical_features].mode().iloc[0])
            elif self.categorical_strategy == 'decision_tree':
                self.label_encoders = {}
                X[categorical_features] = self._fit_decision_tree_imputer(X, categorical_features)
            else:
                raise ValueError("Invalid categorical imputation strategy. Supported strategies: 'most_frequent', 'decision_tree'.")

        self.fitted = True
        return self

    def transform(self, X):
        if not self.fitted:
            raise NotFittedError("CategoricalImputer is not fitted. Call fit() before transforming data.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        # Impute numeric features
        if self.numeric_imputer and len(numeric_features) > 0:
            X[numeric_features] = self.numeric_imputer.transform(X[numeric_features])

        return X

    def _fit_decision_tree_imputer(self, X, categorical_features):
        for col in categorical_features:
            clf = DecisionTreeClassifier(random_state=self.random_state)
            clf.fit(X.loc[:, [col]].values, X.loc[:, col])
            self.label_encoders[col] = LabelEncoder()
            self.label_encoders[col].fit(X.loc[:, col])
            missing_mask = X[col].isnull()
            X.loc[missing_mask, col] = self.label_encoders[col].inverse_transform(clf.predict(X.loc[missing_mask, [col]].values))
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Create a sample DataFrame with missing values
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, np.nan, 4.0, 5.0],
    'Category1': ['A', 'B', np.nan, 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan]
})

# Instantiate the CategoricalImputer
imputer = CategoricalImputer(numeric_strategy='mean', categorical_strategy='decision_tree', random_state=42)

# Use the imputer to fill missing values
imputed_data = imputer.fit_transform(data)

# Display the imputed DataFrame
print(imputed_data)


ValueError: could not convert string to float: 'A'

In [55]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

class CategoricalImputer(BaseEstimator):
    def __init__(self, numeric_strategy='most_frequent', categorical_strategy='most_frequent', random_state=None):
        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.random_state = random_state
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        if len(numeric_features) > 0:
            self.numeric_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=self.random_state),
                                                    random_state=self.random_state)
            self.numeric_imputer.fit(X[numeric_features])

        if len(categorical_features) > 0:
            self.categorical_imputer = SimpleImputer(strategy=self.categorical_strategy)
            X[categorical_features] = self.categorical_imputer.fit_transform(X[categorical_features])

        self.fitted = True
        return self

    def transform(self, X):
        if not self.fitted:
            raise NotFittedError("CategoricalImputer is not fitted. Call fit() before transforming data.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns

        # Impute numeric features
        if self.numeric_imputer and len(numeric_features) > 0:
            X[numeric_features] = self.numeric_imputer.transform(X[numeric_features])

        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Create a sample DataFrame with missing values
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, np.nan, 4.0, 5.0],
    'Category1': ['A', 'B', np.nan, 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan]
})

# Instantiate the CategoricalImputer with different categorical strategy
imputer = CategoricalImputer(numeric_strategy='mean', categorical_strategy='most_frequent', random_state=42)

# Use the imputer to fill missing values
imputed_data = imputer.fit_transform(data)

# Display the imputed DataFrame
print(imputed_data)


   Numeric1 Category1 Category2
0       1.0         A         X
1       2.0         B         Y
2       3.0         A         X
3       4.0         B         Y
4       5.0         A         X


In [56]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

class CategoricalImputer(BaseEstimator):
    def __init__(self, numeric_strategy='mean', categorical_strategy='most_frequent', random_state=None):
        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.random_state = random_state
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(exclude=[np.number]).columns

        if len(numeric_features) > 0:
            if self.numeric_strategy == 'mean':
                self.numeric_imputer = SimpleImputer(strategy='mean')
            elif self.numeric_strategy == 'median':
                self.numeric_imputer = SimpleImputer(strategy='median')
            else:
                raise ValueError("Invalid numeric imputation strategy. Supported strategies: 'mean', 'median'.")

            X[numeric_features] = self.numeric_imputer.fit_transform(X[numeric_features])

        if len(categorical_features) > 0:
            if self.categorical_strategy == 'most_frequent':
                self.categorical_imputer = SimpleImputer(strategy='most_frequent')
            elif self.categorical_strategy == 'decision_tree':
                self.label_encoders = {}
                X[categorical_features] = self._fit_decision_tree_imputer(X, categorical_features)
            else:
                raise ValueError("Invalid categorical imputation strategy. Supported strategies: 'most_frequent', 'decision_tree'.")

        self.fitted = True
        return self

    def transform(self, X):
        if not self.fitted:
            raise NotFittedError("CategoricalImputer is not fitted. Call fit() before transforming data.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")

        # Separate numeric and categorical features
        numeric_features = X.select_dtypes(include=[np.number]).columns

        # Impute numeric features
        if self.numeric_imputer and len(numeric_features) > 0:
            X[numeric_features] = self.numeric_imputer.transform(X[numeric_features])

        return X

    def _fit_decision_tree_imputer(self, X, categorical_features):
        for col in categorical_features:
            clf = DecisionTreeClassifier(random_state=self.random_state)
            clf.fit(X.loc[:, [col]].values, X.loc[:, col])
            self.label_encoders[col] = LabelEncoder()
            self.label_encoders[col].fit(X.loc[:, col])
            missing_mask = X[col].isnull()
            X.loc[missing_mask, col] = self.label_encoders[col].inverse_transform(clf.predict(X.loc[missing_mask, [col]].values))
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

# Create a sample DataFrame with missing values
data = pd.DataFrame({
    'Numeric1': [1.0, 2.0, np.nan, 4.0, 5.0],
    'Category1': ['A', 'B', np.nan, 'B', 'A'],
    'Category2': ['X', 'Y', 'X', 'Y', np.nan]
})

# Instantiate the CategoricalImputer with different numeric and categorical strategies
imputer = CategoricalImputer(numeric_strategy='median', categorical_strategy='most_frequent', random_state=42)

# Use the imputer to fill missing values
imputed_data = imputer.fit_transform(data)

# Display the imputed DataFrame
print(imputed_data)


   Numeric1 Category1 Category2
0       1.0         A         X
1       2.0         B         Y
2       3.0       NaN         X
3       4.0         B         Y
4       5.0         A       NaN


In [57]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

class CustomRandomForestImputer:
    def __init__(self, n_estimators=10, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        
    def fit(self, X):
        self.imputer = IterativeImputer(
            estimator=RandomForestRegressor(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                random_state=self.random_state
            )
        )
        self.imputer.fit(X)
        
    def transform(self, X):
        return self.imputer.transform(X)
    
    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

# Example usage:
if __name__ == "__main__":
    # Create a sample DataFrame with missing values
    data = {
        'Feature1': [1, 2, 3, np.nan, 5, 6, 7, 8, 9, 10],
        'Feature2': [11, 12, 13, 14, np.nan, 16, 17, 18, 19, 20],
        'Feature3': [21, 22, 23, 24, 25, 26, 27, np.nan, 29, 30]
    }
    df = pd.DataFrame(data)
    
    # Create the custom imputer and impute missing values
    custom_imputer = CustomRandomForestImputer()
    imputed_data = custom_imputer.fit_transform(df)
    
    # Display the imputed DataFrame
    imputed_df = pd.DataFrame(imputed_data, columns=df.columns)
    print("Imputed DataFrame:")
    print(imputed_df)

Imputed DataFrame:
   Feature1  Feature2  Feature3
0       1.0      11.0      21.0
1       2.0      12.0      22.0
2       3.0      13.0      23.0
3       4.3      14.0      24.0
4       5.0      14.5      25.0
5       6.0      16.0      26.0
6       7.0      17.0      27.0
7       8.0      18.0      27.3
8       9.0      19.0      29.0
9      10.0      20.0      30.0




In [58]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

class CustomRandomForestImputer:
    def __init__(self, n_estimators=10, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        
    def fit(self, X):
        self.imputer = IterativeImputer(
            estimator=RandomForestRegressor(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                random_state=self.random_state
            )
        )
        self.imputer.fit(X)
        
    def transform(self, X):
        return self.imputer.transform(X)
    
    def fit_transform(self, df, numeric_columns):
        numeric_df = df[numeric_columns]
        self.fit(numeric_df)
        imputed_data = self.transform(numeric_df)
        
        # Create a copy of the original DataFrame and replace missing values with imputed values
        imputed_df = df.copy()
        imputed_df[numeric_columns] = imputed_data
        return imputed_df

# Example usage:
if __name__ == "__main__":
    # Create a sample DataFrame with missing values
    data = {
        'Feature1': [1, 2, 3, np.nan, 5, 6, 7, 8, 9, 10],
        'Feature2': [11, 12, 13, 14, np.nan, 16, 17, 18, 19, 20],
        'Feature3': [21, 22, 23, 24, 25, 26, 27, np.nan, 29, 30]
    }
    df = pd.DataFrame(data)
    
    # Define the list of numeric columns
    numeric_columns = ['Feature1', 'Feature2', 'Feature3']
    
    # Create the custom imputer and impute missing values
    custom_imputer = CustomRandomForestImputer()
    imputed_df = custom_imputer.fit_transform(df, numeric_columns)
    
    # Display the imputed DataFrame
    print("Imputed DataFrame:")
    print(imputed_df)

Imputed DataFrame:
   Feature1  Feature2  Feature3
0       1.0      11.0      21.0
1       2.0      12.0      22.0
2       3.0      13.0      23.0
3       3.7      14.0      24.0
4       5.0      14.0      25.0
5       6.0      16.0      26.0
6       7.0      17.0      27.0
7       8.0      18.0      27.6
8       9.0      19.0      29.0
9      10.0      20.0      30.0




In [60]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class CustomCosineSimilarityImputer:
    def fit(self, X):
        self.X = X
    
    def transform(self, X):
        missing_rows = X[X.isnull().any(axis=1)]
        for col in missing_rows.columns:
            missing_rows[col] = self.impute_column(missing_rows[col])
        return missing_rows
    
    def impute_column(self, col):
        known_rows = self.X.dropna(subset=[col])
        if known_rows.empty:
            return col  # If no known values, return as is
        
        # Convert categorical data to binary vectors (one-hot encoding)
        known_vectors = pd.get_dummies(known_rows, columns=[col])
        missing_vector = pd.get_dummies(known_rows.copy(), columns=[col])
        
        # Calculate cosine similarity
        similarities = cosine_similarity(missing_vector.values, known_vectors.values)
        weighted_sum = np.dot(similarities, known_rows[col].values)
        similarity_sum = similarities.sum(axis=1)
        
        # Avoid division by zero
        similarity_sum[similarity_sum == 0] = 1
        
        # Calculate the imputed value using weighted average
        imputed_value = weighted_sum / similarity_sum
        
        return imputed_value

# Example usage:
if __name__ == "__main__":
    # Create a sample DataFrame with categorical features and missing values
    data = {
        'Category1': ['A', 'B', 'A', 'C', 'C', 'B', 'A', 'C'],
        'Category2': ['X', 'Y', 'X', 'Z', np.nan, 'Y', 'X', 'Z'],
        'Value': [1, 2, 3, 4, 5, 6, 7, 8]
    }
    df = pd.DataFrame(data)
    
    # Create the custom imputer and impute missing categorical values
    custom_imputer = CustomCosineSimilarityImputer()
    custom_imputer.fit(df)  # Fit on the entire dataset
    imputed_df = custom_imputer.transform(df)
    
    # Display the imputed DataFrame
    print("Imputed DataFrame:")
    print(imputed_df)



KeyError: ['C']

In [61]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class CustomCosineSimilarityImputer:
    def fit(self, X):
        self.X = X
    
    def transform(self, X):
        missing_rows = X[X.isnull().any(axis=1)]
        for col in missing_rows.columns:
            missing_rows[col] = self.impute_column(missing_rows[col], col)
        return missing_rows
    
    def impute_column(self, col, col_name):
        known_rows = self.X.dropna(subset=[col_name])
        if known_rows.empty:
            return col  # If no known values, return as is
        
        # Convert categorical data to binary vectors (one-hot encoding)
        known_vectors = pd.get_dummies(known_rows, columns=[col_name])
        missing_vector = pd.get_dummies(self.X.copy(), columns=[col_name])
        
        # Calculate cosine similarity
        similarities = cosine_similarity(missing_vector.values, known_vectors.values)
        
        # Calculate weighted sum and similarity sum separately for each missing row
        weighted_sums = np.dot(similarities, known_rows[col_name].values)
        similarity_sums = similarities.sum(axis=1)
        
        # Avoid division by zero
        similarity_sums[similarity_sums == 0] = 1
        
        # Calculate the imputed value for each missing row using weighted average
        imputed_values = np.divide(weighted_sums, similarity_sums)
        
        return imputed_values

# Example usage:
if __name__ == "__main__":
    # Create a sample DataFrame with categorical features and missing values
    data = {
        'Category1': ['A', 'B', 'A', 'C', 'C', 'B', 'A', 'C'],
        'Category2': ['X', 'Y', 'X', 'Z', np.nan, 'Y', 'X', 'Z'],
        'Value': [1, 2, 3, 4, 5, 6, 7, 8]
    }
    df = pd.DataFrame(data)
    
    # Create the custom imputer and impute missing categorical values
    custom_imputer = CustomCosineSimilarityImputer()
    custom_imputer.fit(df)  # Fit on the entire dataset
    imputed_df = custom_imputer.transform(df)
    
    # Display the imputed DataFrame
    print("Imputed DataFrame:")
    print(imputed_df)


ValueError: could not convert string to float: 'X'

In [62]:
import numpy as np
import pandas as pd

class CustomCosineSimilarityImputer:
    def fit(self, X):
        self.X = X
    
    def transform(self, X):
        missing_rows = X[X.isnull().any(axis=1)]
        for col in missing_rows.columns:
            if col in self.X.select_dtypes(include='object').columns:
                missing_rows[col] = self.impute_categorical_column(missing_rows[col], col)
            else:
                missing_rows[col] = self.impute_numeric_column(missing_rows[col], col)
        return missing_rows
    
    def impute_categorical_column(self, col, col_name):
        mode_value = self.X[col_name].mode().values[0]
        col.fillna(mode_value, inplace=True)
        return col
    
    def impute_numeric_column(self, col, col_name):
        known_rows = self.X.dropna(subset=[col_name])
        if known_rows.empty:
            return col  # If no known values, return as is
        
        # Convert numeric data to a numeric array
        known_values = known_rows[col_name].values
        
        # Calculate cosine similarity
        similarities = np.dot(self.X[col_name].values[:, np.newaxis], known_values[np.newaxis, :])
        similarity_sums = similarities.sum(axis=1)
        
        # Avoid division by zero
        similarity_sums[similarity_sums == 0] = 1
        
        # Calculate the imputed value using weighted average
        imputed_values = np.dot(similarities, known_values) / similarity_sums
        
        return imputed_values

# Example usage:
if __name__ == "__main__":
    # Create a sample DataFrame with categorical and numeric features and missing values
    data = {
        'Category1': ['A', 'B', 'A', 'C', 'C', 'B', 'A', 'C'],
        'Category2': ['X', 'Y', 'X', 'Z', np.nan, 'Y', 'X', 'Z'],
        'Value': [1, 2, 3, 4, 5, 6, 7, 8]
    }
    df = pd.DataFrame(data)
    
    # Create the custom imputer and impute missing values
    custom_imputer = CustomCosineSimilarityImputer()
    custom_imputer.fit(df)  # Fit on the entire dataset
    imputed_df = custom_imputer.transform(df)
    
    # Display the imputed DataFrame
    print("Imputed DataFrame:")
    print(imputed_df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows[col] = self.impute_categorical_column(missing_rows[col], col)


ValueError: Length of values (8) does not match length of index (1)

In [63]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class CustomCosineSimilarityImputer:
    def fit(self, X):
        self.X = X
    
    def transform(self, X):
        missing_rows = X[X.isnull().any(axis=1)]
        for col in missing_rows.columns:
            missing_rows[col] = self.impute_categorical_column(missing_rows[col], col)
        return missing_rows
    
    def impute_categorical_column(self, col, col_name):
        # Encode the categorical data as binary vectors (one-hot encoding)
        known_rows = self.X.dropna(subset=[col_name])
        if known_rows.empty:
            return col  # If no known values, return as is
        
        known_encoded = pd.get_dummies(known_rows[col_name])
        missing_encoded = pd.get_dummies(self.X.copy(), columns=[col_name])
        
        # Calculate cosine similarity
        similarities = cosine_similarity(missing_encoded.values, known_encoded.values)
        
        # Calculate weighted sum and similarity sum separately for each missing row
        weighted_sums = np.dot(similarities, known_rows[col_name].values)
        similarity_sums = similarities.sum(axis=1)
        
        # Avoid division by zero
        similarity_sums[similarity_sums == 0] = 1
        
        # Calculate the imputed value for each missing row using weighted average
        imputed_values = np.divide(weighted_sums, similarity_sums)
        
        return imputed_values

# Example usage:
if __name__ == "__main__":
    # Create a sample DataFrame with categorical features and missing values
    data = {
        'Category1': ['A', 'B', 'A', 'C', 'C', 'B', 'A', 'C'],
        'Category2': ['X', 'Y', 'X', 'Z', np.nan, 'Y', 'X', 'Z'],
        'Value': [1, 2, 3, 4, 5, 6, 7, 8]
    }
    df = pd.DataFrame(data)
    
    # Create the custom imputer and impute missing categorical values
    custom_imputer = CustomCosineSimilarityImputer()
    custom_imputer.fit(df)  # Fit on the entire dataset
    imputed_df = custom_imputer.transform(df)
    
    # Display the imputed DataFrame
    print("Imputed DataFrame:")
    print(imputed_df)


ValueError: could not convert string to float: 'X'

## Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer

# Create a sample dataset with missing values for categorical and numeric features
data = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Red', 'Green', 'Blue', 'Red', 'Green', 'Blue', np.nan],
                     'Size': ['Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Medium'],
                     'Weight': [10, 20, 30, 15, np.nan, 25, 20, 10, 30, 40],
                     'Height': [5.0, 6.0, 5.5, 5.2, 6.2, 5.8, 5.4, 6.5, np.nan, 5.9],
                     'Price': [100, 200, np.nan, 150, 180, 220, 120, 250, 190, 210]})

# Separate numeric and categorical columns
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = data.select_dtypes(exclude=[np.number]).columns.tolist()

# Impute missing values for numeric features using SimpleImputer (e.g., mean imputation)
numeric_imputer = SimpleImputer(strategy='mean')
data[numeric_cols] = numeric_imputer.fit_transform(data[numeric_cols])

# Impute missing values for categorical features using the mode (most frequent value)
categorical_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

# The 'data' DataFrame now contains imputed values for both numeric and categorical features
print("Imputed Data:\n", data)