# Developing Model

## Installing Dependencies and Configuring CLI

In [0]:
!pip install databricks-cli mlflow

In [0]:
# Read the Delta table at the mounted path into a Spark DataFrame.
df = spark.read.load("/mnt/datamount/delta_table", format="delta")

In [0]:
with open('tokenfile', 'w') as f:
    f.write(dbutils.secrets.get(scope="tokens", key="dbtoken"))
!databricks configure --host https://adb-6724577987585661.1.azuredatabricks.net/ --token-file tokenfile
!rm -rf tokenfile

## MLflow ModelOps to train and register model

In [0]:
# Import necessary libraries
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# Convert the Spark DataFrame to pandas so scikit-learn can consume it.
data = df.toPandas()

# Input columns and the label column.
features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
target = "Survived"

# Feature matrix and label vector, followed by a seeded 80/20 hold-out split.
# NOTE(review): assumes the feature columns contain no missing values; confirm upstream.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Experiment under which the runs started below are recorded.
mlflow.set_experiment("/Users/bhaveshkak26122000@gmail.com/my_experiment")

# Candidate hyperparameters: three values for each of four parameters.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Run the hyperparameter search, evaluate the winner on the held-out split, then
# log and register it. Everything happens inside one MLflow run so that metrics,
# parameters, artifacts and the registered version share a single run ID.
with mlflow.start_run() as active_run:
    # Base estimator; the fixed seed keeps results reproducible across re-runs.
    model = RandomForestClassifier(random_state=42)

    # Exhaustive search over param_grid with 3-fold cross-validation, scored by
    # accuracy and parallelised over all available cores.
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Estimator built with the best-scoring parameter combination.
    best_model = grid_search.best_estimator_

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # The confusion matrix is stored as a plain-text artifact of the run.
    cm = confusion_matrix(y_test, y_pred)
    mlflow.log_text(str(cm), "confusion_matrix.txt")

    # MSE is computed on predicted class labels versus true labels.
    mse = mean_squared_error(y_test, y_pred)
    mlflow.log_metric("mse", mse)

    # Log the fitted model under the "model" artifact path of this run.
    mlflow.sklearn.log_model(best_model, "model")

    # Log the best hyperparameters and the feature columns for reference.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_param("features", features)

    # URI of the model artifact logged above, used as the registration source.
    model_uri = f"runs:/{active_run.info.run_id}/model"
    registered_model_name = "titanic_model"

    # Tags and description attached to the new model version.
    model_tags = {
        "Features": ", ".join(features),
        "Label": target,
    }
    model_description = "Description of the registered model"

    # Register the logged model as a new version of registered_model_name.
    registered_model = mlflow.register_model(model_uri, registered_model_name, tags=model_tags)

    # The description was previously defined but never sent to the registry;
    # attach it to the version that was just created.
    mlflow.tracking.MlflowClient().update_model_version(
        name=registered_model.name,
        version=registered_model.version,
        description=model_description,
    )

    # Print the registered model information
    print(f"Registered model: {registered_model.name} (Version {registered_model.version})")


In [0]:
# Specify the experiment name and model name
experiment_name = "/Users/bhaveshkak26122000@gmail.com/my_experiment"
model_name = "titanic_model"

# search_runs expects experiment IDs, not names/paths, so resolve the ID first.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f"MLflow experiment not found: {experiment_name}")

# All runs recorded under the experiment (pandas DataFrame, one row per run).
# No tag filter is applied: nothing visible in this notebook sets a "ModelName"
# tag on runs, so filtering on it would match nothing. The link between a run
# and the model is taken from the model registry below instead.
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
experiment_run_ids = set(runs["run_id"]) if "run_id" in runs.columns else set()

# Print the available model versions and their run IDs. Version numbers come
# from the registry; the last segment of a run's artifact URI is not a version.
client = mlflow.tracking.MlflowClient()
for model_version in client.search_model_versions(f"name='{model_name}'"):
    if model_version.run_id in experiment_run_ids:
        print(f"Run ID: {model_version.run_id}, Model Version: {model_version.version}")