"""Model helpers for the ``diamonds`` package.

Provides factories for the supported regression estimators and for the
preprocessing transformer, plus thin wrappers to fit a model, report its
regression metrics, and produce labelled predictions.
"""

from typing import TYPE_CHECKING

import pandas as pd

# scikit-learn is imported inside the functions that use it, so importing
# this module (and calling ``train_model`` / ``predict``) only needs pandas.
# The names below are required for static type checking only.
if TYPE_CHECKING:
    from sklearn.base import BaseEstimator
    from sklearn.compose import ColumnTransformer


def create_model(model_name: str) -> "BaseEstimator":
    """
    Create an unfitted regression model from its short name.

    Parameters
    ----------
    model_name : str
        One of ``"ridge"``, ``"random_forest"``, ``"knn"`` or ``"linear"``.

    Returns
    -------
    BaseEstimator
        The model ready to be fitted

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.neighbors import KNeighborsRegressor

    # Map each name to a factory so that only the requested estimator is
    # instantiated, and every call hands back a fresh object.
    factories = {
        "ridge": lambda: Ridge(alpha=1.0),
        "random_forest": lambda: RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            random_state=42,
        ),
        "knn": lambda: KNeighborsRegressor(n_neighbors=5),
        "linear": lambda: LinearRegression(fit_intercept=True),
    }

    if model_name not in factories:
        raise ValueError(f"Unknown model: {model_name}")

    return factories[model_name]()


def create_preproc(
    num_cols: list[str], cat_cols: list[str]
) -> "ColumnTransformer":
    """
    Create a preprocessing pipeline.

    Parameters
    ----------
    num_cols : list[str]
        Columns routed through the numeric branch: KNN imputation
        (5 neighbours) followed by standard scaling.
    cat_cols : list[str]
        Columns routed through the categorical branch: most-frequent
        imputation followed by one-hot encoding; categories unseen during
        fitting are ignored at transform time.

    Returns
    -------
    ColumnTransformer
        The unfitted transformer combining both branches.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import KNNImputer, SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    # Numeric branch
    num_pipeline = Pipeline([
        ("imputer", KNNImputer(n_neighbors=5)),
        ("scaler", StandardScaler()),
    ])

    # Categorical branch
    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Combine both branches, each applied to its own column subset
    return ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ])


def train_model(model, X_train, y_train):
    """
    Fit ``model`` on the training data.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y)``.
    X_train
        Training features, passed to ``model.fit`` unchanged.
    y_train
        Training targets, passed to ``model.fit`` unchanged.

    Returns
    -------
    The same ``model`` object, after ``fit`` has been called on it.
    """
    model.fit(X_train, y_train)
    return model


def evaluate_model(model, X_test, y_test) -> dict[str, float]:
    """
    Compute, print, and return regression metrics on a test set.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X_test
        Test features, passed to ``model.predict`` unchanged.
    y_test
        True target values compared against the predictions.

    Returns
    -------
    dict[str, float]
        The metrics keyed by ``"mae"``, ``"mse"``, ``"r2"`` and ``"mape"``.
    """
    from sklearn.metrics import (
        mean_absolute_error,
        mean_absolute_percentage_error,
        mean_squared_error,
        r2_score,
    )

    y_pred = model.predict(X_test)

    metrics = {
        "mae": mean_absolute_error(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
    }

    # Metrics are still printed, one per line with four decimals.
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    # Return the values as well, so the declared return type is honoured
    # and callers can log or compare them.
    return metrics


def predict(model, X: pd.DataFrame) -> pd.Series:
    """
    Make predictions using the trained model.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X : pd.DataFrame
        The features to predict on; its index is reused for the output.

    Returns
    -------
    pd.Series
        The predicted values
    """
    y_pred = model.predict(X)

    # Reuse X's index so each prediction stays aligned with its input row.
    return pd.Series(y_pred, index=X.index, name="prediction")