# Custom Data Cleaner Python Library Generation

Step 1: Set Up a New Google Colab Notebook

Step 2: Write the Code for the Toolkit in Google Colab

1. Add Library:

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder

2. Implement the DataCleaner Class: Create a code cell in Colab and add the code for your toolkit:

In [11]:
class DataCleaner:
    def __init__(self, dataframe):
        """
        Initialize with a pandas DataFrame.
        """
        self.df = dataframe
        self.log = []  # To store transformation logs

    def handle_missing(self, method="mean", columns=None):
      """
      Handle missing values in the DataFrame.

      Parameters:
          method (str): The method to handle missing values ("mean", "median", "mode", "drop").
          columns (list): List of columns to process. If None, all columns are processed.

      Returns:
          pd.DataFrame: DataFrame with missing values handled.
      """
      if columns is None:
          columns = self.df.columns

      for col in columns:
          if self.df[col].dtype in [np.float64, np.int64]:  # Numeric columns
              if method == "mean":
                  self.df[col].fillna(self.df[col].mean(), inplace=True)
              elif method == "median":
                  self.df[col].fillna(self.df[col].median(), inplace=True)
              elif method == "mode":
                  self.df[col].fillna(self.df[col].mode()[0], inplace=True)
              elif method == "drop":
                  self.df.dropna(subset=[col], inplace=True)
              else:
                  raise ValueError("Unsupported method for numeric columns.")
          elif self.df[col].dtype == "object":  # Categorical columns
              if method == "mode":
                  self.df[col].fillna(self.df[col].mode()[0], inplace=True)
              elif method == "drop":
                  self.df.dropna(subset=[col], inplace=True)
              else:
                  # Automatically default to "mode" for categorical columns if "mean" or "median" is passed.
                  self.df[col].fillna(self.df[col].mode()[0], inplace=True)
                  self.log.append(f"Auto-applied 'mode' for column {col} (categorical).")
          else:
              raise ValueError(f"Column {col} has unsupported data type for {method} method.")

          self.log.append(f"Handled missing values in {col} using {method} method.")
      return self.df


    def detect_outliers(self, method="zscore", threshold=3):
        """
        Detect outliers in the DataFrame.
        """
        outliers = {}
        if method == "zscore":
            for col in self.df.select_dtypes(include=[np.number]).columns:
                z_scores = (self.df[col] - self.df[col].mean()) / self.df[col].std()
                outliers[col] = self.df[col][np.abs(z_scores) > threshold].index.tolist()
        elif method == "iqr":
            for col in self.df.select_dtypes(include=[np.number]).columns:
                Q1 = self.df[col].quantile(0.25)
                Q3 = self.df[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers[col] = self.df[col][(self.df[col] < (Q1 - 1.5 * IQR)) | (self.df[col] > (Q3 + 1.5 * IQR))].index.tolist()
        else:
            raise ValueError("Unsupported method for detecting outliers.")
        self.log.append(f"Detected outliers using {method} method.")
        return outliers

    def scale_features(self, method="standard", columns=None):
        """
        Scale features in the DataFrame.
        """
        if columns is None:
            columns = self.df.select_dtypes(include=[np.number]).columns

        scaler = None
        if method == "standard":
            scaler = StandardScaler()
        elif method == "minmax":
            scaler = MinMaxScaler()
        elif method == "robust":
            scaler = RobustScaler()
        else:
            raise ValueError("Unsupported scaling method.")

        self.df[columns] = scaler.fit_transform(self.df[columns])
        self.log.append(f"Scaled features using {method} method.")
        return self.df

    def encode_categorical(self, method="onehot", columns=None):
        """
        Encode categorical features in the DataFrame.
        """
        if columns is None:
            columns = self.df.select_dtypes(include=["object", "category"]).columns

        if method == "onehot":
            self.df = pd.get_dummies(self.df, columns=columns, drop_first=True)
        elif method == "label":
            for col in columns:
                le = LabelEncoder()
                self.df[col] = le.fit_transform(self.df[col])
        else:
            raise ValueError("Unsupported encoding method.")
        self.log.append(f"Encoded categorical features using {method} method.")
        return self.df

    def get_logs(self):
        """
        Retrieve transformation logs.
        """
        return self.log

Step 3: Demonstrate Usage with Examples

1. Add an Example DataFrame: Create a small sample DataFrame in a new cell:

In [12]:
# Sample DataFrame
data = {
    "Age": [25, None, 35, 45, 30],
    "Income": [50000, 60000, None, 80000, 75000],
    "Gender": ["Male", "Female", None, "Male", "Female"]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

Original DataFrame:
    Age   Income  Gender
0  25.0  50000.0    Male
1   NaN  60000.0  Female
2  35.0      NaN    None
3  45.0  80000.0    Male
4  30.0  75000.0  Female


2. Demonstrate the DataCleaner Functions:



*   Handling Missing Values:



In [13]:
cleaner = DataCleaner(df)
df_cleaned = cleaner.handle_missing(method="mean")
print("\nDataFrame after handling missing values:")
print(df_cleaned)


DataFrame after handling missing values:
     Age   Income  Gender
0  25.00  50000.0    Male
1  33.75  60000.0  Female
2  35.00  66250.0  Female
3  45.00  80000.0    Male
4  30.00  75000.0  Female


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mode()[0], inplace=True)


*   Outlier Detection:

In [14]:
outliers = cleaner.detect_outliers(method="zscore")
print("\nOutliers detected:")
print(outliers)


Outliers detected:
{'Age': [], 'Income': []}


*   Feature Scaling:

In [15]:
df_scaled = cleaner.scale_features(method="standard")
print("\nDataFrame after scaling features:")
print(df_scaled)


DataFrame after scaling features:
        Age    Income  Gender
0 -1.322876 -1.523624    Male
1  0.000000 -0.586009  Female
2  0.188982  0.000000  Female
3  1.700840  1.289220    Male
4 -0.566947  0.820413  Female


*   Categorical Encoding:

In [16]:
df_encoded = cleaner.encode_categorical(method="onehot")
print("\nDataFrame after encoding categorical features:")
print(df_encoded)


DataFrame after encoding categorical features:
        Age    Income  Gender_Male
0 -1.322876 -1.523624         True
1  0.000000 -0.586009        False
2  0.188982  0.000000        False
3  1.700840  1.289220         True
4 -0.566947  0.820413        False



*   Retrieve Logs:

In [17]:
logs = cleaner.get_logs()
print("\nTransformation Logs:")
print(logs)


Transformation Logs:
['Handled missing values in Age using mean method.', 'Handled missing values in Income using mean method.', "Auto-applied 'mode' for column Gender (categorical).", 'Handled missing values in Gender using mean method.', 'Detected outliers using zscore method.', 'Scaled features using standard method.', 'Encoded categorical features using onehot method.']
