# Clean Data Pipeline

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load data from parameter
def load_raw_data(filepath):
    """Read a CSV file into a DataFrame.

    Args:
        filepath: Location of the CSV input; handed unchanged to
            ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv`` with its
        default options.
    """
    raw_frame = pd.read_csv(filepath)
    return raw_frame

In [None]:
# Data preprocessing
df = load_raw_data('data.csv')
# Drop every row that has a missing value in any column, then keep only rows
# with a positive age.
df_cleaned = df.dropna()
# The trailing .copy() makes df_cleaned an independent frame, so the column
# assignments in the feature-engineering cell write to it directly instead of
# triggering pandas' SettingWithCopyWarning on a filtered selection.
df_cleaned = df_cleaned[df_cleaned['age'] > 0].copy()

In [None]:
# Feature engineering
# Quadratic age term.
df_cleaned['age_squared'] = df_cleaned['age'] ** 2
# log1p(x) evaluates log(1 + x) directly and stays accurate when x is near zero.
# NOTE(review): an income of -1 yields -inf and anything below -1 yields NaN;
# confirm that income is non-negative in the input data.
df_cleaned['income_log'] = np.log1p(df_cleaned['income'])

In [None]:
# Prepare features
# Model inputs are the raw age plus the two derived columns; the label is the
# 'target' column.
feature_cols = [
    'age',
    'age_squared',
    'income_log',
]
X, y = df_cleaned[feature_cols], df_cleaned['target']

In [None]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [None]:
# Train model
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Evaluate
# Score the fitted model on the training split first, then on the test split.
train_score, test_score = (
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
print(f'Train: {train_score:.3f}, Test: {test_score:.3f}')