In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
# Step 1: Extract
def extract_data(filepath):
    """Read the CSV file at ``filepath`` into a pandas DataFrame.

    Args:
        filepath: Location of the CSV file; passed straight to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    print("Extracting data...")
    raw_frame = pd.read_csv(filepath)
    return raw_frame
# Step 2: Transform
def transform_data(df, preprocessor_path='preprocessor.joblib'):
    """Impute, scale and one-hot encode ``df``, then persist the preprocessor.

    Columns of dtype ``int64``/``float64`` are mean-imputed and standardized.
    Columns of dtype ``object`` are imputed with their most frequent value
    and one-hot encoded. Columns of any other dtype belong to neither group,
    so the ``ColumnTransformer`` leaves them out of the result; a warning
    names them so that the loss is not silent.

    Args:
        df: Input DataFrame.
        preprocessor_path: Destination for the fitted preprocessor, written
            with ``joblib.dump``. Pass ``None`` to skip saving.

    Returns:
        The transformed feature matrix returned by
        ``ColumnTransformer.fit_transform``. It may be a sparse matrix
        because of the one-hot encoding step.
    """
    import warnings

    print("Transforming data...")
    # Identify numerical and categorical columns
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # Surface columns that neither transformer handles; with the default
    # remainder policy they do not appear in the output at all.
    handled = set(numeric_cols) | set(categorical_cols)
    skipped = [col for col in df.columns if col not in handled]
    if skipped:
        warnings.warn(
            f"Columns with unsupported dtypes are dropped from the output: {skipped}",
            stacklevel=2,
        )

    # Define transformers
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen at fit time encode as all zeros instead of raising.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    # Apply transformations
    processed_data = preprocessor.fit_transform(df)
    # Save the fitted preprocessor so the same transformation can be reused
    if preprocessor_path is not None:
        joblib.dump(preprocessor, preprocessor_path)
    return processed_data
# Step 3: Load
def load_data(processed_data, output_path='processed_data.csv'):
    """Write the processed feature matrix to a CSV file.

    Args:
        processed_data: Matrix-like data accepted by ``pd.DataFrame``. If the
            object exposes a ``toarray`` method (as sparse matrices do), it
            is converted to a dense array first.
        output_path: Destination CSV path.

    The file is written without the index column; column headers are the
    DataFrame's default labels.
    """
    print("Loading data...")
    # Sparse results must be densified before building the DataFrame.
    dense = (
        processed_data.toarray()
        if hasattr(processed_data, "toarray")
        else processed_data
    )
    frame = pd.DataFrame(dense)
    frame.to_csv(output_path, index=False)
    print(f"Processed data saved to: {output_path}")
# Main ETL Pipeline
def main(filepath='data.csv', output_path='processed_data.csv'):
    """Run the ETL pipeline: extract a CSV, transform it, write the result.

    Args:
        filepath: CSV file to read. Defaults to ``'data.csv'``; pass your own
            data file here instead of editing the function.
        output_path: CSV file the processed data is written to.
    """
    df = extract_data(filepath)
    processed_data = transform_data(df)
    load_data(processed_data, output_path)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

Extracting data...
Transforming data...
Loading data...
Processed data saved to: processed_data.csv
