In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Step 1: Extract Data (Load CSV)
def extract_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame.

    ``file_path`` is handed straight to ``pandas.read_csv``; no parsing
    options are overridden.
    """
    return pd.read_csv(file_path)

# Step 2: Handle Missing Values
def handle_missing_values(df):
    """Fill missing values in numeric columns with the column mean.

    Only columns selected by ``select_dtypes(include=['number'])`` are
    touched; other columns are left as they are. The frame is modified in
    place and the same object is returned.

    Numeric columns that contain no observed value at all are skipped: there
    is no mean to fill them with, and the imputer would discard them from its
    output, so the result could not be assigned back column-for-column.
    When nothing is left to impute (no numeric columns, or only all-missing
    ones) the frame is returned untouched.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    imputable_cols = [col for col in numeric_cols if df[col].notna().any()]
    if not imputable_cols:
        # Nothing to fit on; avoid handing the imputer a zero-column frame.
        return df

    imputer = SimpleImputer(strategy="mean")  # Fill missing numerical values with mean
    df[imputable_cols] = imputer.fit_transform(df[imputable_cols])
    return df

# Step 3: Encode Categorical Data
def encode_categorical(df):
    """One-hot encode every ``object``-dtype column of ``df``.

    The original object columns are dropped and replaced by the encoder's
    indicator columns, named via ``get_feature_names_out``. The returned
    frame always has a fresh 0..n-1 row index so the remaining columns and
    the encoded columns line up row by row; ``df`` itself is not modified.

    If ``df`` has no object columns there is nothing to encode, so no encoder
    is fitted and the frame is returned with only its index reset.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns
    if categorical_cols.empty:
        # Skip fitting an encoder on a zero-column selection.
        return df.reset_index(drop=True)

    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_data = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

    # Drop original categorical columns and concatenate encoded data.
    # encoded_df carries a default RangeIndex, hence the reset on the left side.
    remaining = df.drop(columns=categorical_cols).reset_index(drop=True)
    return pd.concat([remaining, encoded_df], axis=1)

# Step 4: Scale Numerical Data
def scale_numerical_data(df):
    """Standardise every numeric column of ``df`` with ``StandardScaler``.

    All columns selected by ``select_dtypes(include=['number'])`` are scaled,
    which includes any indicator or label columns present in the frame, so
    pass only the columns that should be standardised. The frame is modified
    in place and the same object is returned.

    When the numeric selection is empty (no numeric columns, or no rows) the
    frame is returned untouched, because the scaler cannot be fitted on input
    without features or samples.
    """
    numerical_cols = df.select_dtypes(include=['number']).columns
    if df[numerical_cols].empty:
        return df

    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Step 5: Split Data into Training and Testing
def split_data(df, target_column, test_size=0.2):
    """Split ``df`` into train/test features and labels.

    Every column except ``target_column`` is treated as a feature; the
    ``target_column`` series is the label. The split uses a fixed
    ``random_state`` of 42, so repeated calls on the same data give the same
    partition.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=[target_column])
    labels = df[target_column]
    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=42,
    )
    # train_test_split yields the four pieces in exactly this order.
    return tuple(parts)

# Step 6: Load Data (Save Processed Data)
def load_data(X_train, X_test, y_train, y_test):
    """Write the four splits to CSV files in the current working directory.

    Each object is saved without its row index under a fixed file name
    (``X_train.csv``, ``X_test.csv``, ``y_train.csv``, ``y_test.csv``), and a
    confirmation line is printed once all four have been written.
    """
    outputs = {
        "X_train.csv": X_train,
        "X_test.csv": X_test,
        "y_train.csv": y_train,
        "y_test.csv": y_test,
    }
    for filename, data in outputs.items():
        data.to_csv(filename, index=False)
    print("Data successfully saved!")

# Pipeline Execution
if __name__ == "__main__":
    file_path = "heart.csv"  # Update this with your dataset path
    target_column = "target"  # Replace with actual target column name

    df = extract_data(file_path)

    # A row without a label cannot be used for supervised training or
    # evaluation, so drop it rather than inventing a label for it.
    df = df.dropna(subset=[target_column])

    # Keep the label out of feature preprocessing. The preprocessing steps act
    # on every numeric/object column they are given, so leaving the target in
    # would mean-impute it, standardise it into arbitrary floats, or (for a
    # text label) one-hot encode it away before split_data looks it up.
    target = df[target_column].reset_index(drop=True)
    features = df.drop(columns=[target_column])

    features = handle_missing_values(features)
    features = encode_categorical(features)
    features = scale_numerical_data(features)

    # Both sides carry a 0..n-1 index here, so the rows line up again.
    df = pd.concat([features.reset_index(drop=True), target], axis=1)

    # NOTE(review): the imputer and scaler are still fitted on all rows before
    # the split, so test-set statistics influence the training features.
    X_train, X_test, y_train, y_test = split_data(df, target_column)
    load_data(X_train, X_test, y_train, y_test)


Data successfully saved!
