In [None]:
import os

import requests
import snowflake
import snowflake.snowpark.functions
from snowflake.ml.modeling.ensemble import GradientBoostingClassifier
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import confusion_matrix, accuracy_score, f1_score, recall_score,precision_score
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import LabelEncoder, StandardScaler
from snowflake.snowpark.session import Session,col

In [1]:
from snowflake.ml.modeling.metrics import confusion_matrix, accuracy_score, f1_score, recall_score,precision_score

In [None]:
import json

# The experiment parameters are read from the EXPERIMENT_DETAILS environment
# variable, which must hold a JSON document. Read it once and fail with a clear
# message when it is missing, instead of letting json.loads(None) raise an
# opaque TypeError.
raw_experiment_details = os.getenv("EXPERIMENT_DETAILS")
if not raw_experiment_details:
    raise RuntimeError(
        "The EXPERIMENT_DETAILS environment variable is not set; "
        "it must contain the experiment parameters as a JSON document."
    )
print(f"Starting Experiment Execution with the following params:\n{raw_experiment_details}\n")
exp_details = json.loads(raw_experiment_details)
# Bare last expression so the notebook renders the parsed parameters.
exp_details

In [None]:
def get_conn_details_from_ds_name(dataset_name, project_id, timeout=30):
    """
    Fetch all connections registered for a project from the connection manager API.

    The base URL is read from the CONNECTION_MANAGER_BASE_URL environment
    variable and falls back to the in-cluster project-manager service.

    :param dataset_name: accepted for interface compatibility; it is NOT used to
        filter the result, the full list for the project is returned
    :param project_id: project identifier sent as the ``projectId`` query parameter
    :param timeout: seconds to wait for the API before giving up
    :return: connection_details - the decoded JSON body of the response
    :raises ValueError: if ``project_id`` is empty or None
    :raises requests.HTTPError: if the API answers with an error status
    """
    # Validate first: without this a missing id is sent as the literal "None".
    if not project_id:
        raise ValueError("project_id is required to look up connection details")

    connection_manager = os.getenv("CONNECTION_MANAGER_BASE_URL", "http://fdc-project-manager:80/project-manager")
    # Resulting request: <base>/connections/api/ConnectionManager/v1/allConnections?projectId=<project_id>
    url = f"{connection_manager}/connections/api/ConnectionManager/v1/allConnections"
    # NOTE(review): verify=False disables TLS certificate validation. It is kept
    # because existing deployments may rely on it; confirm whether it can be removed.
    response = requests.get(
        url,
        params={"projectId": project_id},  # let requests URL-encode the value
        verify=False,
        timeout=timeout,
    )
    # Surface HTTP errors here rather than as a KeyError in the caller.
    response.raise_for_status()
    return response.json()

# Look up the connections registered for the current project.
connection_details = get_conn_details_from_ds_name("HR_DATA", os.getenv("PROJECT_ID"))
if not connection_details:
    raise RuntimeError("No connections were returned for this project; cannot open a Snowflake session.")

# NOTE(review): the first connection in the list is always used, regardless of
# the dataset name passed above - confirm this is the intended connection.
_conn_info = connection_details[0]["connectionDetails"]
# NOTE(review): the keys "wareHouse" and "cloudPlatform" are kept exactly as
# before; the Snowflake connector documents the warehouse option as
# "warehouse" - confirm these keys are actually honoured by the session builder.
connection_parameters = {
        "user": _conn_info["dbUserName"],
        "password": _conn_info["dbPassword"],
        "account": _conn_info["accountName"],
        "database": _conn_info["defaultDb"],
        "role": _conn_info["role"],
        "cloudPlatform": _conn_info["cloudPlatform"],
        "schema": _conn_info["defaultSchema"],
        "wareHouse": _conn_info["wareHouse"],
        "region": _conn_info["region"] + "." + _conn_info["cloudPlatform"]
}
# Never echo the password: notebook outputs are stored and shared with the file.
print({**connection_parameters, "password": "********"})

new_session = Session.builder.configs(connection_parameters).create()

In [None]:
# Tag the session's queries with the experiment description.
new_session.query_tag = exp_details.get("description", "sample_description")
dataset_name = exp_details.get("dataset")
df = new_session.table(dataset_name)
# 75/25 train/test split. A fixed seed is passed so repeated runs of the
# notebook request the same pseudo-random partition instead of a new one each time.
SPLIT_SEED = 42
input_data_frame, test_df = df.random_split([0.75, 0.25], seed=SPLIT_SEED)

In [None]:
def apply_label_encoding(input_data_frame, encoders=None):
    """
    Label-encode every column whose dtype string contains 'string'.

    Each such column is replaced in place (same column name) by its encoded
    values; other columns are left untouched.

    :param input_data_frame: frame exposing ``dtypes`` as (column name, dtype
        string) pairs
    :param encoders: optional dict mapping column name -> already fitted encoder.
        When given, a column found in the dict is transformed with that encoder
        instead of fitting a new one, and any encoder fitted here is stored back
        into the dict. Pass the same dict for the training and the test frame so
        both share one category mapping. When omitted, a fresh encoder is fitted
        for every string column on every call.
    :return: the frame with all string columns encoded
    """
    # Snapshot the columns to encode up front: the frame is rebound in the loop.
    string_columns = [name for name, dtype in input_data_frame.dtypes if 'string' in dtype]
    for column in string_columns:
        encoder = encoders.get(column) if encoders is not None else None
        if encoder is None:
            encoder = LabelEncoder(input_cols=column, output_cols=column, drop_input_cols=True).fit(input_data_frame)
            if encoders is not None:
                encoders[column] = encoder
        input_data_frame = encoder.transform(input_data_frame)
    return input_data_frame

In [None]:
# Placeholder list; nothing in this cell fills it.
label_encoder_column = list()
# Name of the label column, as supplied in the experiment parameters.
target_col = exp_details.get("target_column")
# Replace the string columns of the training frame with their encoded values.
input_data_frame = apply_label_encoding(input_data_frame)
# Every column except the label is a model feature.
feature_cols = input_data_frame.columns
feature_cols.remove(target_col)
OUTPUT_COLS = [target_col + '_PREDICTION']
# Preview the encoded training frame.
input_data_frame.show()

In [None]:
# Write predictions to a dedicated column (OUTPUT_COLS) so they can be compared
# with the true labels below.
pipeline = GradientBoostingClassifier(input_cols=feature_cols, label_cols=target_col, output_cols=OUTPUT_COLS)

pipeline.fit(input_data_frame)
# NOTE(review): this fits new encoders on the test frame, so its category ->
# integer mapping is not guaranteed to match the one used for training.
test_df = apply_label_encoding(test_df)
scored_df = pipeline.predict(test_df)

# Metrics must compare the true label with the PREDICTION column. Comparing
# target_col with itself (as before) always yields perfect scores.
prediction_col = OUTPUT_COLS[0]
# cf_matrix = confusion_matrix(df=scored_df, y_true_col_name=target_col, y_pred_col_name=prediction_col)
# cf_matrix
accurary = accuracy_score(df=scored_df, y_true_col_names=target_col, y_pred_col_names=prediction_col)
precision_score11 = precision_score(df=scored_df, y_true_col_names=target_col, y_pred_col_names=prediction_col)
recall =  recall_score(df=scored_df, y_true_col_names=target_col, y_pred_col_names=prediction_col)
f1= f1_score(df=scored_df, y_true_col_names=target_col, y_pred_col_names=prediction_col)
# "score" mirrors accuracy; both keys are kept for downstream consumers.
metrics_json = {'accuracy_score': accurary, "f1_score":f1, "recall_score": recall, "score": accurary, "precision_score": precision_score11}

In [None]:
from snowflake.ml.registry import Registry

# Register the fitted model in the database/schema of the active connection.
reg = Registry(session=new_session, database_name=connection_parameters["database"], schema_name=connection_parameters["schema"])
# NOTE(review): version_name is fixed to "v1" - presumably a second run for the
# same model name will collide with the existing version; confirm.
mv = reg.log_model(pipeline,
                   model_name=exp_details.get("name", "sample_experiment"),
                   version_name="v1",
                   comment=exp_details.get("description", "sample_description"),
                   conda_dependencies=['scikit-learn==1.3.0'],
                   metrics=metrics_json,
                   # Sample data must be actual rows of the feature columns,
                   # not a list of column-name strings.
                   sample_input_data=input_data_frame.select(feature_cols),
                   python_version="3.9")

In [None]:
def score_and_dump_func(file_path):
    """
    Serialize the default scoring function to disk with cloudpickle.

    :param file_path: destination path; the file is opened in binary write mode
    """

    def score_func(model, request):
        """
        Placeholder scoring hook; neither argument is used.

        :param model: unused
        :param request: unused
        :returns: the literal string "Success"
        """
        # Enter your custom score function here
        return "Success"

    with open(file_path, "wb") as pickle_target:
        cloudpickle.dump(score_func, pickle_target)

In [None]:
import mlflow.sklearn
from mlflow.models import infer_signature
import cloudpickle
import json


mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL", "http://mlflow-server"))
mlflow.set_experiment(exp_details.get("name", "sample_experiment"))
# 'mlflow.note.content' is the tag MLflow shows as the run description.
tags = {'mlflow.note.content': exp_details.get("description", "sample_description")}

# Convert the fitted estimator once and reuse it for params, naming and logging.
sklearn_model = pipeline.to_sklearn()
params = sklearn_model.get_params()
# The class name is stable even when the estimator has non-default parameters,
# unlike stripping "()" from str(model).
algorithm_name = type(sklearn_model).__name__

# Pull the training frame once; it feeds both the dataset log and the signature.
train_pdf = input_data_frame.to_pandas()

# Explicit run: applies the description tag and closes the run when the block ends.
with mlflow.start_run(tags=tags):
    mlflow.log_metrics(metrics_json)
    mlflow.log_params({algorithm_name + "_" + str(k): v for k, v in params.items()})

    dataset = mlflow.data.from_pandas(train_pdf, source="")
    mlflow.log_input(dataset, context="input")
    # Set custom tags
    mlflow.set_tags({
        "template_id": os.getenv("template_id", "sample_template_id"),
        "notebook_name": os.getenv("notebook_name", "sample_notebook_name"),
        "algorithm": algorithm_name,
        "algo_details": exp_details.get("algo_details")
    })
    # The model consumes features only, so the label column is excluded from
    # the input schema. The output schema is still taken from the label column
    # of the scored frame, as before.
    signature = infer_signature(train_pdf.drop(columns=[target_col]), scored_df.to_pandas()[target_col])
    # Storing score function for the model
    score_and_dump_func("/tmp/scoring_func")
    mlflow.log_artifact("/tmp/scoring_func")
    # Register the model
    mlflow.sklearn.log_model(
        sklearn_model, "model",
        registered_model_name=exp_details.get("name", "sample_experiment"), signature=signature
    )