In [2]:
def validate_dataframe(df):
    """
    Validates the dataframe by checking for the necessary columns and correct data types.

    Args:
        df (pd.DataFrame): The dataframe to validate.
    
    Returns:
        bool: True if the dataframe is valid, raises ValueError otherwise.
    """
    required_columns = ['Age', 'Gender', 'Income']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns: {', '.join(required_columns)}")
    
    if not np.issubdtype(df['Age'].dtype, np.number) or not np.issubdtype(df['Income'].dtype, np.number):
        raise ValueError("Columns 'Age' and 'Income' should be numeric.")
    
    if not np.issubdtype(df['Gender'].dtype, object):  # Change np.object to object
        raise ValueError("Column 'Gender' should be categorical.")

    return True


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Step 1: Load a sample dataset
def load_data():
    """
    Load a sample dataset for preprocessing. The dataset contains both numerical and categorical columns.

    Returns:
        pd.DataFrame: The loaded dataset.
    """
    # Example dataset: Replace this with your actual data loading step
    data = {
        'Age': [25, np.nan, 30, 35, np.nan],
        'Gender': ['M', 'F', 'M', 'F', 'M'],
        'Income': [50000, 60000, 55000, np.nan, 70000]
    }
    df = pd.DataFrame(data)
    return df

# Step 2: Check input dataframe
def validate_dataframe(df):
    """
    Validates the dataframe by checking for the necessary columns and correct data types.

    Args:
        df (pd.DataFrame): The dataframe to validate.
    
    Returns:
        bool: True if the dataframe is valid, raises ValueError otherwise.
    """
    required_columns = ['Age', 'Gender', 'Income']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns: {', '.join(required_columns)}")
    
    if not np.issubdtype(df['Age'].dtype, np.number) or not np.issubdtype(df['Income'].dtype, np.number):
        raise ValueError("Columns 'Age' and 'Income' should be numeric.")
    
    if not np.issubdtype(df['Gender'].dtype, np.object):
        raise ValueError("Column 'Gender' should be categorical.")

    return True

# Step 3: Imputation Function
def impute_data(df):
    """
    Fills missing values in the dataframe using mean imputation for numerical columns.

    Args:
        df (pd.DataFrame): The dataframe with missing values.

    Returns:
        pd.DataFrame: The dataframe with missing values imputed.
    """
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    imputer = SimpleImputer(strategy='mean')
    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])
    return df

# Step 4: Encoding Categorical Data
def encode_categorical(df):
    """
    Encodes categorical columns using OneHotEncoder.

    Args:
        df (pd.DataFrame): The dataframe with categorical columns.

    Returns:
        pd.DataFrame: The dataframe with encoded categorical columns.
    """
    # Check if 'Gender' column exists
    if 'Gender' not in df.columns:
        raise ValueError("Column 'Gender' is missing in the dataframe.")
    
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
    gender_encoded = one_hot_encoder.fit_transform(df[['Gender']])
    gender_encoded_df = pd.DataFrame(gender_encoded, columns=one_hot_encoder.get_feature_names_out(['Gender']))
    df = df.join(gender_encoded_df).drop('Gender', axis=1)
    return df

# Step 5: Feature Scaling (Standardization)
def scale_features(df):
    """
    Scales the numerical features of the dataframe using StandardScaler.

    Args:
        df (pd.DataFrame): The dataframe with numerical columns to scale.

    Returns:
        pd.DataFrame: The dataframe with scaled features.
    """
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Step 6: Build a Preprocessing Pipeline
def build_pipeline():
    """
    Builds a preprocessing pipeline to impute missing values, encode categorical data, 
    and scale numerical features.

    Returns:
        sklearn.pipeline.Pipeline: The complete preprocessing pipeline.
    """
    # Column transformer to apply different transformations on different columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), ['Age', 'Income']),
            ('cat', OneHotEncoder(sparse=False, drop='first'), ['Gender'])
        ]
    )
    
    # Create a pipeline with preprocessing steps
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler())
    ])
    
    return pipeline

# Step 7: Apply the Preprocessing Pipeline
def apply_pipeline(df):
    """
    Applies the preprocessing pipeline to the given dataframe.

    Args:
        df (pd.DataFrame): The dataframe to preprocess.

    Returns:
        pd.DataFrame: The preprocessed dataframe.
    """
    # Validate the dataframe
    validate_dataframe(df)
    
    # Split the data into features and target (if applicable)
    X = df.drop('Income', axis=1)  # Assuming 'Income' is the target, adjust as needed
    y = df['Income']
    
    # Build and apply the pipeline
    pipeline = build_pipeline()
    X_transformed = pipeline.fit_transform(X)
    
    # Return the transformed data
    transformed_df = pd.DataFrame(X_transformed, columns=['Age', 'Gender_M'])
    transformed_df['Income'] = y
    return transformed_df

# Example usage
if __name__ == "__main__":
    # Load the data
    df = load_data()
    
    # Apply preprocessing pipeline
    preprocessed_data = apply_pipeline(df)
    print(preprocessed_data)

  if not np.issubdtype(df['Gender'].dtype, np.object):


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations