In [None]:
#Step 1: Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import os


In [None]:
# Example dataset: the Titanic passenger table provided through seaborn.
# seaborn retrieves it from its online example-data repository, so the first
# run needs network access.

# Columns carried through the rest of the notebook: five features plus the
# 'survived' target.
selected_columns = ['age', 'fare', 'sex', 'embarked', 'class', 'survived']

df = (
    sns.load_dataset('titanic')
    .dropna(subset=selected_columns)  # discard rows missing any of the selected values
    .loc[:, selected_columns]         # keep only the selected columns, in this order
)
df.head()


Unnamed: 0,age,fare,sex,embarked,class,survived
0,22.0,7.25,male,S,Third,0
1,38.0,71.2833,female,C,First,1
2,26.0,7.925,female,S,Third,1
3,35.0,53.1,female,S,First,1
4,35.0,8.05,male,S,Third,0


In [None]:
# Step 2: Load Data (Extract)
# To use your own data instead of the Titanic example, uncomment the lines below
# and replace 'your_dataset.csv' with the path to your dataset.
#file_path = 'your_dataset.csv'
#df = pd.read_csv(file_path)
#df.head()

In [None]:



# Step 3: Preprocessing Setup

# Name of the column to predict; replace with your actual target column name.
target_column = 'survived'

# Separate the predictor columns (X) from the target values (y).
X = df.drop(columns=[target_column])
y = df[target_column]

# Route columns to a pipeline by dtype.
# 'number' matches every numeric width (int32, float32, uint8, ...), not just
# int64/float64, so narrower numeric columns are no longer left out. Timedelta
# is excluded explicitly so durations are never mean-imputed as plain numbers.
numeric_cols = X.select_dtypes(include='number', exclude='timedelta').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# The ColumnTransformer below only receives the two column groups above and
# drops everything else by default, so report any column that matched neither
# group instead of losing it silently.
unrouted_cols = [
    col for col in X.columns
    if col not in numeric_cols and col not in categorical_cols
]
if unrouted_cols:
    print(f"Note: columns not used by the preprocessor: {unrouted_cols}")

# Numeric features: fill gaps with the column mean, then standardize.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each pipeline to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

In [None]:
# Step 4: Transformation and Feature Selection

# Number of top-scoring features SelectKBest should keep.
# Any falsy value (None or 0) skips the selection step entirely.
num_features = 10

# Preprocessing always runs; univariate feature selection is appended only
# when a feature count was requested.
steps = [('preprocessor', preprocessor)]
if num_features:
    steps.append(
        ('feature_selection', SelectKBest(score_func=f_classif, k=num_features))
    )
pipeline = Pipeline(steps)

# Fit every step on the feature table (y is passed through for the scoring
# function) and keep the transformed features.
X_processed = pipeline.fit_transform(X, y)


In [None]:
# Step 5: Save Transformed Data (Load)

output_dir = 'processed_data'
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

# Depending on the data, the preprocessing step can hand back a SciPy sparse
# matrix, which does not convert into a one-column-per-feature DataFrame.
# Densify it first; a plain ndarray passes through unchanged.
X_dense = X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed

# Label the columns with the names reported by the fitted pipeline so that
# features.csv has meaningful headers instead of 0..n-1. Fall back to the
# positional headers when the installed scikit-learn (or one of the steps)
# does not provide get_feature_names_out.
try:
    feature_names = pipeline.get_feature_names_out()
except AttributeError:
    feature_names = None

X_df = pd.DataFrame(X_dense, columns=feature_names)
y_df = y.to_frame(name=target_column)

# index=False: rows of the two files correspond by position, not by the
# original DataFrame index.
X_df.to_csv(os.path.join(output_dir, 'features.csv'), index=False)
y_df.to_csv(os.path.join(output_dir, 'labels.csv'), index=False)

print("✅ ETL process complete. Files saved in 'processed_data/'")


✅ ETL process complete. Files saved in 'processed_data/'
