In [21]:
# train.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import sys

# ------------- USER SETTINGS -------------
CSV_PATH = r"C:\Users\Adan\Downloads\motor_second.csv"
TARGET_COLUMN = "harga"
MODEL_OUTPUT = "model.pkl"
RANDOM_STATE = 42
# -----------------------------------------

def load_data(path):
    df = pd.read_csv(path)

    # Normalisasi nama kolom
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(" ", "")
        .str.lower()
    )

    print("Cleaned Columns:", df.columns.tolist())
    return df

def auto_detect_columns(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [c for c in df.columns if c not in numeric_cols]
    return numeric_cols, categorical_cols

def build_pipeline(numeric_cols, categorical_cols):
    numeric_transformer = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols),
        ]
    )

    model = RandomForestRegressor(
        n_estimators=100,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    return pipeline

def main():
    print("Loading data:", CSV_PATH)
    df = load_data(CSV_PATH)

    if TARGET_COLUMN not in df.columns:
        print(f"ERROR: target column '{TARGET_COLUMN}' tidak ditemukan!")
        print("Kolom tersedia:", df.columns.tolist())
        sys.exit(1)

    # Drop rows yang targetnya kosong
    df = df.dropna(subset=[TARGET_COLUMN]).copy()

    X = df.drop(columns=[TARGET_COLUMN])
    y = df[TARGET_COLUMN].astype(float)

    numeric_cols, categorical_cols = auto_detect_columns(X)

    print("\nDetected numeric columns:", numeric_cols)
    print("Detected categorical columns:", categorical_cols)

    pipeline = build_pipeline(numeric_cols, categorical_cols)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    print("\nTraining model...")
    pipeline.fit(X_train, y_train)

    print("\nEvaluating...")
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R2: {r2:.3f}")

    print("\nRunning 5-fold Cross Validation (MAE)...")
    cv_scores = -1 * cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )
    print(f"CV MAE: mean={cv_scores.mean():.2f}, std={cv_scores.std():.2f}")

    # Residual stats
    train_pred = pipeline.predict(X_train)
    residuals = y_train - train_pred

    metadata = {
        "target": TARGET_COLUMN,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        "residual_mean": float(residuals.mean()),
        "residual_std": float(residuals.std()),
        "r2": float(r2)
    }

    joblib.dump(
        {
            "pipeline": pipeline,
            "metadata": metadata
        },
        MODEL_OUTPUT
    )

    print(f"\nModel saved successfully to {MODEL_OUTPUT}")

if __name__ == "__main__":
    main()


Loading data: C:\Users\Adan\Downloads\motor_second.csv
Cleaned Columns: ['model', 'tahun', 'harga', 'transmisi', 'odometer', 'jenis', 'pajak', 'konsumsibbm', 'mesin']

Detected numeric columns: ['tahun', 'odometer', 'pajak', 'konsumsibbm', 'mesin']
Detected categorical columns: ['model', 'transmisi', 'jenis']

Training model...

Evaluating...
MAE: 844.06
RMSE: 1368807.70
R2: 0.903

Running 5-fold Cross Validation (MAE)...
CV MAE: mean=1241.38, std=251.35

Model saved successfully to model.pkl


In [5]:
import os

getData=os.getcwd()
getData

'C:\\Users\\Adan\\benerProject\\Harga-Motor-Al'