# 02 — Feature Engineering

Goal:
- Load dataset
- Fix types
- Handle missing values
- Encode categorical variables
- Split into train/test
- Save X_train, X_test, y_train, y_test


In [1]:
import os
import pandas as pd
import numpy as np

DATA_PATH = "../artifacts/data/eda_with_target.csv"
assert os.path.exists(DATA_PATH), f"Missing: {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()


(7043, 22)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,_target
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1


In [2]:
TARGET_COL = "_target"
assert TARGET_COL in df.columns
y = df[TARGET_COL].astype(int)
X = df.drop(columns=[TARGET_COL])


In [3]:
def is_id_like(series: pd.Series) -> bool:
    if series.dtype != "object":
        return False
    nunique = series.nunique(dropna=True)
    return nunique > 0 and nunique / max(len(series), 1) > 0.9

id_cols = [c for c in X.columns if is_id_like(X[c])]
id_cols


['customerID', 'TotalCharges']

In [4]:

X = X.drop(columns=id_cols, errors="ignore")
print("Dropped ID-like columns:", id_cols)
print("X shape:", X.shape)


Dropped ID-like columns: ['customerID', 'TotalCharges']
X shape: (7043, 19)


In [5]:

for c in X.columns:
    if X[c].dtype == "object":
        converted = pd.to_numeric(X[c].astype(str).str.replace(",", "").str.strip(), errors="coerce")
        if converted.notna().mean() > 0.7:
            X[c] = converted

X.dtypes.value_counts()


object     16
int64       2
float64     1
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop"
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (5634, 19) Test: (1409, 19)


In [7]:
import joblib

os.makedirs("../artifacts/data", exist_ok=True)
joblib.dump((X_train, X_test, y_train, y_test, preprocess), "../artifacts/data/splits_and_preprocess.joblib")
print("Saved: ../artifacts/data/splits_and_preprocess.joblib")


Saved: ../artifacts/data/splits_and_preprocess.joblib
