In [None]:
import pickle

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)

from .utils._logger import logger
from .utils._validation import config_args

In [None]:
# Load the dataset
# Reads the CSV at `config_args.data_path` into `df`, the frame every later
# cell works on.
from pandas import DataFrame

try:
    df: DataFrame = pd.read_csv(config_args.data_path)
except FileNotFoundError:
    logger.error("Error: CSV file not found.  Please check the file path.")
    # Re-raise instead of continuing: without `df` every following cell would
    # fail with a confusing NameError far away from the real cause.
    raise
else:
    logger.info("Dataset loaded successfully.")

In [None]:
# Data Exploration
# Promote the "car_ID" column to the row index, then record the resulting
# (rows, columns) shape in the log.
df = df.set_index(keys="car_ID")

logger.info(f"Initial DataFrame shape: {df.shape}")

In [None]:
# Handle Missing Values
# Reports whether the frame contains any nulls and, if it does, actually
# performs the imputation that the message announces.
missing_counts = df.isnull().sum()
if missing_counts.sum() > 0:
    print("Missing values found. Imputation will be performed.")
    logger.warning("Missing values found. Imputation will be performed.")

    # "price" is the prediction target; a row without it carries no label, so
    # it is removed rather than given an invented value.
    if "price" in df.columns:
        df = df.dropna(subset=["price"])

    # Numeric features get the column median, everything else the most
    # frequent value. Columns with no usable fill value are left untouched.
    fill_values = {}
    for column_name in missing_counts[missing_counts > 0].index:
        if column_name == "price":
            continue
        column = df[column_name]
        if pd.api.types.is_numeric_dtype(column):
            candidate = column.median()
        else:
            most_common = column.mode(dropna=True)
            candidate = most_common.iloc[0] if not most_common.empty else None
        if pd.notna(candidate):
            fill_values[column_name] = candidate

    # One non-mutating fillna keeps the cell safe to re-run.
    df = df.fillna(value=fill_values)
else:
    print("No missing values found.")
    logger.info("No missing values found.")

In [None]:
# Data Inspection
# Column dtypes / non-null counts first, then the descriptive statistics with
# one row per column (transposed) so wide frames stay readable.
df.info()
summary_by_column = df.describe().transpose()
print(summary_by_column)

In [None]:
# Feature Engineering
# Split "CarName" once at the first space: the part before it becomes the
# brand, the rest (with inner spaces removed) becomes the model.
# Names without any space yield a missing model value.
name_parts = df["CarName"].str.split(" ", n=1, expand=True)
df["brand"] = name_parts[0]
df["model"] = name_parts[1].str.replace(" ", "")

# The raw name is fully represented by the two derived columns.
df = df.drop(columns="CarName")

In [None]:
# Declaring columns
# Partition every column except the target "price" into numeric and
# non-numeric (treated as categorical) feature lists.
feature_cols = [col for col in df.columns if col != "price"]
numeric_cols = [
    col for col in feature_cols if pd.api.types.is_numeric_dtype(df[col])
]
categorical_cols = [col for col in feature_cols if col not in numeric_cols]

In [None]:
# Outlier removal
# Keep only rows whose numeric features all lie within Z_SCORE_THRESHOLD
# standard deviations of their column mean.
Z_SCORE_THRESHOLD = 3

z_scores = np.asarray(stats.zscore(df[numeric_cols]), dtype=float)
# A z-score is NaN when it cannot be computed (e.g. a constant column has zero
# variance). NaN compares False against the threshold, which would silently
# drop every row, so undefined scores are treated as "not an outlier".
is_inlier = (np.abs(z_scores) < Z_SCORE_THRESHOLD) | np.isnan(z_scores)
df = df.loc[is_inlier.all(axis=1)]

In [None]:
# One-Hot Encoding
# One-hot encodes the categorical feature columns and passes the remaining
# feature columns through unchanged, producing `df_processed`.
logger.info("Applying One-Hot Encoding...")
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor = ColumnTransformer(
    [("onehot", encoder, categorical_cols)], remainder="passthrough"
)

# The target must not go through the transformer: with remainder="passthrough"
# it would come out as a feature column and leak the label into the model.
feature_frame = df.drop(columns=["price"])
df_processed = pd.DataFrame(
    preprocessor.fit_transform(feature_frame),
    columns=preprocessor.get_feature_names_out(),
    # Keep the original row labels so the processed features stay aligned
    # with `df` (and therefore with df["price"]).
    index=feature_frame.index,
)

In [None]:
# Re-derive the numeric column list against the processed frame: every column
# whose name does not start with the "onehot" transformer prefix.
numeric_cols: list[str] = list(
    filter(lambda name: not name.startswith("onehot"), df_processed.columns)
)

In [None]:
# Scaling Numeric Features
# Standardise the non-one-hot columns in place (fit, then transform, on the
# same data).
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# influence the scaling statistics - confirm this is intended.
scaler = StandardScaler()
scaler.fit(df_processed[numeric_cols])
df_processed[numeric_cols] = scaler.transform(df_processed[numeric_cols])

In [None]:
# Split Data
# Builds the feature matrix X and target y, then holds out 20% of the rows
# for evaluation (fixed seed for reproducibility).

# The encoder's passthrough remainder may carry the target through as
# "remainder__price"; drop any such column so the model never sees the label.
leaked_target_cols = [
    col for col in df_processed.columns if col in ("price", "remainder__price")
]
X = df_processed.drop(columns=leaked_target_cols)
y = df["price"]

# train_test_split pairs rows by position, so both inputs must be equally long.
assert len(X) == len(y), "feature matrix and target have different row counts"

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Model Training
# Fit a random forest regressor (seeded for reproducibility) on the training
# split; the fitted estimator is the cell's displayed value.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Model Evaluation
# Score the held-out predictions with R^2, MAE and MSE and print each metric
# to four decimal places.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

for metric_label, metric_value in (
    ("R2 Score", r2),
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
):
    print(f"{metric_label}: {metric_value:.4f}")

In [None]:
# Save the Model
# Serialise the fitted model to `config_args.save_model_name` with pickle.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open() used before left the handle dangling.
with open(config_args.save_model_name, "wb") as model_file:
    pickle.dump(model, model_file)
logger.info(f"Model saved to {config_args.save_model_name}")