In [1]:
import os
exp_data = '{"name": "moremetrices01", "algo_details": {"snowflake.ml.modeling.ensemble.GradientBoostingClassifier": null}, "id": "367", "dataset": "AIRLINE_DEP_DELAY_10K", "target_column": "DEP_DEL15"}'

In [2]:
import logging, sys, os
from snowflake.snowpark.session import Session

In [3]:
CONNECTION_PARAMETERS = {
    "account": "ug94937.us-east4.gcp",
    "user":"ADITYASINGH",
    "password": os.environ.get('SF_Password'),
    "role": "ADITYASINGH",
    "database": "FIRST_DB",
    "warehouse": "FOSFOR_INSIGHT_WH",
    "schema": "PUBLIC"
}


In [31]:
def create_stage(session, stage_name="demo"):
    try:
        session.sql(f"create or replace stage {stage_name}").collect()
        return f"@{stage_name}"
    except Exception as ex:
        print("Error while creating snowflake session", ex)
        raise ex

def get_session():
    """
    Method creates snowflake session object.
    :return:
    """
    try:
        return Session.builder.configs(CONNECTION_PARAMETERS).create()
    except Exception as ex:
        print("Error while creating snowflake session", ex)
        raise ex


# Stored Procedure
def train_ml_models(session: Session, exp_data: str) -> list:
    # variable for holding logs
    logs = []
    
    # function for accumulating logs
    def log_message(level: str, message: str):
        logs.append(f"{level}: {message}")
        
    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
    from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_error, \
         r2_score, accuracy_score, precision_score, roc_auc_score, f1_score, recall_score

    from snowflake.ml.modeling.xgboost import XGBRegressor
    import importlib, sys, json
    import snowpark

    log_message("INFO","Starting Experiment Recipe Execution")
    
    # Experiment details
    exp_details=json.loads(exp_data)
    
    # Read dataset, Random split
    log_message("INFO","Reading and Identifing dataset features")
    data = session.table(exp_details.get("dataset"))
    
    # Data Preprocessing: Validating and encoding the data if required and imputing null values.
    window_spec = Window.order_by(exp_details.get("target_column"))
    data_row_num = data.with_column('ROW_NUM', row_number().over(window_spec))
    columns = data_row_num.columns
    columns.remove('ROW_NUM')
    # forward fillna
    data_ff = data_row_num.select(
            [data_row_num['ROW_NUM']] + [
                last_value(col(column), ignore_nulls=True).over(Window.order_by('ROW_NUM')).alias(column)
                for column in columns
            ]
        )
    # backward fillna
    data_bf = data_row_num.select(
            [data_row_num['ROW_NUM']] + [
                first_value(col(column), ignore_nulls=True).over(Window.order_by(data_ff['ROW_NUM'].desc())).alias(column)
                for column in columns
            ]
        )
    data = data_bf.drop('ROW_NUM')
    
    # Replace special character(- with _)
    data = data.select([column.replace("-", "_") for column in data.columns])
    
#     df_train, df_test = session.table(exp_details.get("dataset")).drop('ROW').random_split(weights=[0.9, 0.1], seed=0)
    features = data.columns
#     features.remove(exp_details.get("target_column"))
    
    # get features
    data_schema = session.sql(f"DESCRIBE TABLE {exp_details.get('dataset')}").collect()
    categorical_types = ['VARCHAR','CHAR','STRING','TEXT','BOOL']
    categorical_features = []  
    for row in data_schema:
        for typ in categorical_types:
            if typ in row['type']:
                categorical_features.append(row['name'])
                break
    numerical_features = list(set(features) - set(categorical_features))
    log_message("INFO",f"numerical_features:  {numerical_features}")
    log_message("INFO",f"categorical_features_oe: {categorical_features}")
    
    #identify columns for labelencoding and onehotencoding   
    le_dict_feature = {}
    le_column_feature = []
    oh_column_feature = []
    oh_enc_feature = None
    if len(categorical_features) >= 1:
        print(f"{categorical_features} columns are non numeric in feature dataset, encoding required.")
        for column in categorical_features:
            if df_feature.select(df_feature[column]).distinct().count() >= 10:
                le_column_feature.append(column)
            else:
                oh_column_feature.append(column)

        log_message(f"Columns identified to be encoded with label encoder: {le_column_feature}\n"
              f"Columns identified to be encoded with one hot encoder: {oh_column_feature}")
    
    #pipeline steps 
    log_message("INFO","Setting up preprocessing pipeline based on dataset")
    #         'ord': OrdinalEncoder(input_cols=categorical_features, output_cols=categorical_features_oe) 
    categorical_pp = {
        'ohe': OneHotEncoder(input_cols=categorical_features, output_cols=categorical_features)
    }
    numerical_pp = {
        'scaler': MinMaxScaler(input_cols=numerical_features, output_cols=numerical_features)
    }
    steps = [(key, categorical_pp[key]) for key in categorical_pp if categorical_features!=[]] + \
    [(key, numerical_pp[key]) for key in numerical_pp if numerical_features!=[]]
#     steps = [(key, numerical_pp[key]) for key in numerical_pp if numerical_features!=[]]
    
    
    # Define a pipeline that does the preprocessing and training of 
    # dynamically import selected algorithms
    for algorithm, hyperparam in exp_details.get("algo_details").items():
        algorithm = algorithm.rsplit('.', 1)
        module = importlib.import_module(algorithm[0])
        log_message("INFO",f"Running Algorithm {algorithm[1]}")
        attr = getattr(module, algorithm[1])
        
        pipe = Pipeline(steps=steps+[("algorithm", attr(input_cols=categorical_features+numerical_features
                                              , label_cols=[exp_details.get("target_column")]
                                              , output_cols=[f'PREDICTIONS_{algorithm[1]}'.upper()]))]
               )

        # Fit the pipeline
        log_message("INFO",f"Running model pipeline {algorithm[1]}")
        model = pipe.fit(df_train)
        
#         log_message("INFO",f"final model size {model.size()} bytes")
        
        # Test the model
        log_message("INFO","Running prediction on model with test dataset")
        df_test_pred = model.predict(df_test)
        
        # metrices
        log_message("INFO","Generating Metrices")
        accuracy = accuracy_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
#         mse = mean_squared_error(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
#         mae = mean_absolute_error(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
#         r2 = r2_score(df=df_test_pred, y_true_col_name=exp_details.get("target_column"), y_pred_col_name=f'PREDICTIONS_{algorithm[1]}'.upper())
        f1_score = f1_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        recall_score = recall_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        precision_score = precision_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        roc_auc_score = roc_auc_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_score_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        print("Execution Completed")
        print(f'{algorithm[1]} MSE: {accuracy}')
#         print(f'{algorithm[1]} MSE: {mse}')
#         print(f'{algorithm[1]} MAE: {mae}')
#         print(f'{algorithm[1]} R2: {r2}')
        print(f'{algorithm[1]} R2: {precision_score}')
        print(f'{algorithm[1]} R2: {roc_auc_score}')
        

        # LOG MODEL INTO SNOWFLAKE REGISTRY
        from snowflake.ml.registry.registry import Registry
        reg = Registry(session=session)
        # Log the model
        log_message("INFO","Started: Registering model on snowflake")
        try:
            mv = reg.log_model(model=model,
                               model_name=exp_details.get("name", "sample_experiment")+"_"+algorithm[1],
                               comment="test",
                               version_name="run1",
                               python_version="3.9.19",
                               conda_dependencies=["scikit-learn==1.3.2"],
                               metrics=[{"model_metrics": {"roc_auc_score": roc_auc_score, "precision_score": precision_score, "f1_score": f1_score, "recall_score": recall_score, "accuracy_score": accuracy}, "project_id": "0001", "type": "EXP"}])
            log_message("INFO","Registeration of model completed!!!")
        except Exception as ex:
            key = 'Processing aborted due to error 370001' 
            if key in str(ex):
                log_message("INFO","Registeration of model completed!!!")
                pass
            else:
                log_message("ERROR","Exception Occured while registering model")
                return str(ex).split('?')
    return [{"Execution Logs:": "\n".join(logs),
             "EXP_NAME":exp_details.get("name", "sample_experiment"),
             "Version":"Run1",
             "matrices":{"model_metrics": {"roc_auc_score": roc_auc_score, "precision_score": precision_score, "f1_score": f1_score, "recall_score": recall_score, "accuracy_score": accuracy}, "project_id": "0001", "type": "EXP"},
             "Alogirthm_Type":"Regression",
             "Alogithms": list(exp_details.get("algo_details").keys()),
             "RUN_STATUS":"SUCCESS",
             "registry_exp_name":""}]

In [32]:
# Initilization
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
print("Creating Snowflake Session object...")
session = get_session()
stage = create_stage(session)
print("Session has been created !")

print("Creating stored procedure...")
session.sproc.register(func=train_ml_models,
                       name="train_ml_models",
                       packages=["snowflake-snowpark-python", "snowflake-ml-python"],
                       isPermanant=False,
                       stage_location=stage,
                       replace=True)
print("Stored procedure has been created successfully!")

print("Executing Procedure")
procedure_response = session.call("train_ml_models", exp_data)
# procedure_response = train_ml_models(session, exp_data)
print("Stored Procedure Executed Successfully !")
print(procedure_response)

#Log in mlflow
print("Logging in mlflow completed !")

Creating Snowflake Session object...
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.10.1, Python Version: 3.9.18, Platform: Linux-6.1.58+-x86_64-with-glibc2.34
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
INFO:snowflake.snowpark.session:Snowpark Session information: 
"version" : 1.17.0,
"python.version" : 3.9.18,
"python.connector.version" : 3.10.1,
"python.connector.session.id" : 96125691243170,
"os.name" : Linux

INFO:snowflake.connector.cursor:Number of results in first chunk: 1
Session has been created !
Creating stored procedure...
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 2
INFO:snowflake.connector.cursor:Number of results in first c

SnowparkSQLException: (1304): 100357 (P0000): None: Python Interpreter Error:
Traceback (most recent call last):
  File "/home/udf/1466804725/udf_py_1107268314.zip/udf_py_1107268314.py", line 147, in compute
    return func(session,arg1)
  File "<ipython-input-31-468971ff5c79>", line 103, in train_ml_models
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/_internal/telemetry.py", line 352, in wrap
    return update_stmt_params_if_snowpark_df(func(*args, **kwargs), statement_params)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/modeling/pipeline/pipeline.py", line 443, in fit
    estimator[1].fit(transformed_dataset)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/_internal/telemetry.py", line 352, in wrap
    return update_stmt_params_if_snowpark_df(func(*args, **kwargs), statement_params)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/modeling/framework/base.py", line 435, in fit
    return self._fit(dataset)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py", line 440, in _fit
    self._sklearn_object = model_trainer.train()
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py", line 634, in train
    raise e
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py", line 618, in train
    sproc_export_file_name: str = fit_wrapper_sproc(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/stored_procedure.py", line 131, in __call__
    return session._call(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/session.py", line 2995, in _call
    return df.collect(statement_params=statement_params)[0][0]
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/telemetry.py", line 144, in wrap
    result = func(*args, **kwargs)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/dataframe.py", line 597, in collect
    return self._internal_collect_with_tag_no_telemetry(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/dataframe.py", line 645, in _internal_collect_with_tag_no_telemetry
    return self._session._conn.execute(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 510, in execute
    result_set, result_meta = self.get_result_set(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py", line 191, in wrap
    raise ne.with_traceback(tb) from None
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py", line 122, in wrap
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 612, in get_result_set
    result = self.run_query(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 123, in wrap
    raise ex
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 117, in wrap
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 417, in run_query
    raise ex
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 402, in run_query
    results_cursor = self.execute_and_notify_query_listener(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/snowpark/_internal/server_connection.py", line 354, in execute_and_notify_query_listener
    results_cursor = self._cursor.execute(query, **kwargs)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/connector/cursor.py", line 1016, in execute
    Error.errorhandler_wrapper(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/connector/errors.py", line 232, in errorhandler_wrapper
    handed_over = Error.hand_to_other_handler(
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/connector/errors.py", line 287, in hand_to_other_handler
    cursor.errorhandler(connection, cursor, error_class, error_value)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/connector/errors.py", line 165, in default_errorhandler
    raise error_class(
snowflake.snowpark.exceptions.SnowparkSQLException: (1304): 01b4f002-0000-6872-0000-576d06e25eb6: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/home/udf/1466804729/udf_py_1563073981.zip/udf_py_1563073981.py", line 76, in compute
    return func(session,arg1,arg2,arg3,arg4,arg5,arg6,arg7)
  File "/usr/lib/python_udf/70b058ddce3e885a76d6b823ebf8c17508499e722327db558f6f0c9d79db0210/lib/python3.9/site-packages/snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py", line 233, in fit_wrapper_function
    estimator.fit(**args)
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 416, in fit
    X, y = self._validate_data(
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/sklearn/base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1147, in check_X_y
    X = check_array(
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/sklearn/utils/validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/usr/lib/python_udf/16d39e7e207c9e3b20567e6b1997a64cbb9decff613241125135d7237d1733ae/lib/python3.9/site-packages/pandas/core/generic.py", line 2150, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Delta Air Lines Inc.'
 in function SNOWPARK_TEMP_PROCEDURE_OJJVABPD4N with handler udf_py_1563073981.compute
 in function TRAIN_ML_MODELS with handler udf_py_1107268314.compute