Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 70 additions & 8 deletions src/diamonds/model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def create_model(model_name: str) -> BaseEstimator:
    """
    Create an unfitted regression model from its short name.

    Parameters
    ----------
    model_name : str
        One of ``"ridge"``, ``"random_forest"``, ``"knn"`` or ``"linear"``.

    Returns
    -------
    BaseEstimator
        The model ready to be fitted

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Each entry is a zero-argument factory, so only the requested estimator
    # is instantiated and an unknown name is rejected before anything is built.
    factories = {
        "ridge": lambda: Ridge(alpha=1.0),
        "random_forest": lambda: RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            random_state=42,
        ),
        "knn": lambda: KNeighborsRegressor(n_neighbors=5),
        "linear": lambda: LinearRegression(fit_intercept=True),
    }

    if model_name not in factories:
        raise ValueError(f"Unknown model: {model_name}")

    return factories[model_name]()

def create_preproc(num_cols: list[str], cat_cols: list[str]) -> ColumnTransformer:
    """
    Create a preprocessing pipeline.

    Numeric columns are imputed with ``KNNImputer`` (5 neighbours) and then
    standardised. Categorical columns are imputed with their most frequent
    value and one-hot encoded, with ``handle_unknown="ignore"`` so categories
    not seen during fit do not raise at transform time.

    Parameters
    ----------
    num_cols : list[str]
        Names of the numeric columns.
    cat_cols : list[str]
        Names of the categorical columns.

    Returns
    -------
    ColumnTransformer
        The unfitted transformer combining both sub-pipelines. No
        ``remainder`` is set, so the ``ColumnTransformer`` default applies to
        columns listed in neither argument.
    """
    # Numeric branch: fill gaps from nearest neighbours, then scale.
    num_pipeline = Pipeline([
        ("imputer", KNNImputer(n_neighbors=5)),
        ("scaler", StandardScaler()),
    ])

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Route each column group to its own branch.
    preprocessing = ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ])

    return preprocessing

def train_model(model, X_train, y_train):
    """
    Fit the model on the training data.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y)``.
    X_train
        Training features, forwarded unchanged to ``model.fit``.
    y_train
        Training targets, forwarded unchanged to ``model.fit``.

    Returns
    -------
    The same ``model`` object that was passed in, after ``fit`` has been
    called on it.
    """
    model.fit(X_train, y_train)
    # Return the object we were given, not fit()'s return value, so the
    # result does not depend on fit() returning self.
    return model

def evaluate_model(model, X_test, y_test) -> dict[str, float]:
    """
    Evaluate a fitted model on held-out data.

    Predicts on ``X_test``, computes MAE, MSE, R2 and MAPE against
    ``y_test``, prints each metric on its own line with four decimals, and
    returns them.

    Parameters
    ----------
    model
        Fitted object exposing ``predict(X)``.
    X_test
        Features to predict on.
    y_test
        True target values for ``X_test``.

    Returns
    -------
    dict[str, float]
        Metric values keyed by ``"mae"``, ``"mse"``, ``"r2"`` and ``"mape"``.
        MAPE is reported exactly as scikit-learn returns it (a ratio, not
        multiplied by 100).
    """
    y_pred = model.predict(X_test)

    metrics = {
        "mae": mean_absolute_error(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
    }

    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    # The signature promises a dict; without this return callers receive None.
    return metrics

def predict(model, X: pd.DataFrame) -> pd.Series:
    """
    Make predictions using the trained model.

    Parameters
    ----------
    model
        Fitted object exposing ``predict(X)``.
    X : pd.DataFrame
        Features to predict on. Its index is reused for the result.

    Returns
    -------
    pd.Series
        The predicted values, named ``"prediction"`` and indexed like ``X``.
    """
    y_pred = model.predict(X)

    # Re-attach X's index so predictions stay aligned with the input rows.
    return pd.Series(y_pred, index=X.index, name="prediction")