# Using a model


In [0]:
# Make `jeromeaymon_lakehouse` / `ml_sandbox` the current catalog and schema
# of this Spark session.
# NOTE(review): the model names used further down are not catalog-qualified;
# presumably they rely on this selection — confirm.
spark.sql("USE CATALOG jeromeaymon_lakehouse")
spark.sql("USE SCHEMA ml_sandbox")

In [0]:
from pyspark.sql.functions import col
import pandas as pd

# Read the test CSV from the volume: the first row is the header and Spark
# infers the column types.
data_path = "/Volumes/jeromeaymon_lakehouse/ml_sandbox/data/test.csv"
test_df = spark.read.options(header=True, inferSchema=True).csv(data_path)

# Fix the column types: PassengerId as string, the VIP and CryoSleep flags as int.
test_df = (
    test_df
    .withColumn("PassengerId", col("PassengerId").cast("string"))
    .withColumn("VIP", col("VIP").cast("int"))
    .withColumn("CryoSleep", col("CryoSleep").cast("int"))
)

display(test_df)

In [0]:
import mlflow
import mlflow.sklearn

# Point MLflow at the Unity Catalog model registry and the Databricks tracking
# server. On Databricks the tracking URI may not need to be set manually; it is
# set explicitly here anyway.
mlflow.set_registry_uri("databricks-uc")
mlflow.set_tracking_uri("databricks")

# URI of version 1 of the registered model "decision_tree_model".
model_uri = "models:/decision_tree_model/1"

# Fetch the model with the scikit-learn flavour and show it as the cell output.
loaded_model = mlflow.sklearn.load_model(model_uri=model_uri)
loaded_model

In [0]:
# Convert the Spark test DataFrame to a pandas DataFrame.
test = test_df.toPandas()

# Perform inference via model.predict()
# NOTE(review): every column of the test set, including PassengerId, is passed
# to the model here — confirm this matches the features the registered model
# was trained on.
predictions = loaded_model.predict(test)
predictions

## Optimiser un modèle avancé


In [0]:
import xgboost as xgb
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Name under which the trained pipeline is registered in the model registry.
MODEL_NAME = "xgboost_model"

# Load the training dataset (header row, inferred types) and cast the
# identifier to string and the flag / target columns to int in one pass.
train_path = "/Volumes/jeromeaymon_lakehouse/ml_sandbox/data/train.csv"
train_df = (
    spark.read.options(header=True, inferSchema=True).csv(train_path)
    .withColumn("PassengerId", col("PassengerId").cast("string"))
    .withColumn("VIP", col("VIP").cast("int"))
    .withColumn("CryoSleep", col("CryoSleep").cast("int"))
    .withColumn("Transported", col("Transported").cast("int"))
)

# Spark -> pandas
pdf = train_df.toPandas()

# Target is "Transported"; features are every other column except the identifier.
y = pdf["Transported"]
X = pdf.drop(columns=["Transported", "PassengerId"])


# ------------------------------------------------------------------
# 2. Split the feature columns into categorical / numeric
# ------------------------------------------------------------------
# Object-dtype columns are treated as categorical, all remaining columns as numeric.
categorical_cols = X.select_dtypes(include="object").columns.tolist()
numeric_cols = X.select_dtypes(exclude="object").columns.tolist()

# ------------------------------------------------------------------
# 3. Preprocessing + XGBoost model
# ------------------------------------------------------------------
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time); numeric columns are passed through unchanged.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ("num", "passthrough", numeric_cols),
])

# Gradient-boosted tree classifier with a fixed seed.
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Chain preprocessing and the classifier so both are fitted and saved together.
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", xgb_model),
])

# ------------------------------------------------------------------
# 4. Train / test split
# ------------------------------------------------------------------
# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------------------------
# 5. Training + MLflow logging
# ------------------------------------------------------------------
with mlflow.start_run(run_name="xgboost_optimized"):
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    preds = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, preds)

    # Infer the signature from matching inputs and outputs: `preds` was computed
    # from X_test, so X_test (not X_train) is the corresponding model input.
    signature = infer_signature(X_test, preds)

    mlflow.log_metric("accuracy", accuracy)
    # Log the fitted pipeline and register it under MODEL_NAME.
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="model",
        signature=signature,
        registered_model_name=MODEL_NAME
    )

# Show the hold-out accuracy as the cell output.
accuracy

In [0]:
import mlflow.sklearn 
# URI of the registered XGBoost pipeline. The previous value pointed at
# "decision_tree_model", so the predictions labelled as XGBoost further down
# were actually produced by the decision tree.
# NOTE: the version is pinned to 1; bump it to use a model registered by a
# later training run.
model_uri = "models:/xgboost_model/1"

# Load the pipeline with the scikit-learn flavour and show it as the cell output.
xgb_model = mlflow.sklearn.load_model(model_uri)
xgb_model

In [0]:


# Convert the Spark test set to a pandas DataFrame.
test_pdf = test_df.toPandas()

# Drop the identifier column before handing the frame to the model.
X_test = test_pdf.drop(columns="PassengerId")

# Run inference and show the predictions as the cell output.
predictionsXgb = xgb_model.predict(X_test)

predictionsXgb
