In [8]:
import os
exp_data = '{"name": "py_func_exp4", "algo_details": {"snowflake.ml.modeling.naive_bayes.MultinomialNB": null}, "id": "367", "dataset": "AIRLINE_DEP_DELAY_100K", "target_column": "DEP_DEL15"}'
# exp_data = '{"name": "vaibhav_exp2", "algo_details": {"snowflake.ml.modeling.neighbors.KNeighborsClassifier": null, "snowflake.ml.modeling.naive_bayes.MultinomialNB": null, "snowflake.ml.modeling.ensemble.GradientBoostingClassifier": null}, "id": "367", "dataset": "AIRLINE_DEP_DELAY_100K", "target_column": "DEP_DEL15"}'

In [9]:
import logging, sys, os
from snowflake.snowpark.session import Session

In [10]:
CONNECTION_PARAMETERS = {
    "account": "ug94937.us-east4.gcp",
    "user":"ADITYASINGH",
    "password": os.environ.get('SF_Password'),
    "role": "ADITYASINGH",
    "database": "FIRST_DB",
    "warehouse": "FOSFOR_INSIGHT_WH",
    "schema": "PUBLIC"
}


In [11]:
def create_stage(session, stage_name="demo"):
    try:
        session.sql(f"create or replace stage {stage_name}").collect()
        return f"@{stage_name}"
    except Exception as ex:
        print("Error while creating snowflake session", ex)
        raise ex

def get_session():
    """
    Method creates snowflake session object.
    :return:
    """
    try:
        return Session.builder.configs(CONNECTION_PARAMETERS).create()
    except Exception as ex:
        print("Error while creating snowflake session", ex)
        raise ex


# Stored Procedure
def train_ml_models(session: Session, exp_data: str) -> list:
    # variable for holding logs
    logs = []
    
    # function for accumulating logs
    def log_message(level: str, message: str):
        logs.append(f"{level}: {message}")
      
    #imports
    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
    from snowflake.ml.modeling.metrics import r2_score, accuracy_score, precision_score, roc_auc_score, f1_score, recall_score
    from snowflake.snowpark.functions import col, is_null, regexp_replace, when, lit
    from snowflake.snowpark.types import StringType
    from snowflake.snowpark.exceptions import SnowparkSQLException
    import importlib, sys, json
        
    
    log_message("INFO","Starting Experiment Recipe Execution")
    
    # Experiment details
    exp_details=json.loads(exp_data)
    
    # Read dataset, Random split
    log_message("INFO","Reading and Identifing dataset features")
    data = session.table(exp_details.get("dataset"))
    
    # Replace special character(- with _)
    schema_fields = data.schema.fields
    data = data.select([
        regexp_replace(regexp_replace(col(field.name),"-", "_"), "\\.","").alias(field.name) if isinstance(field.datatype, StringType) else col(field.name)
        for field in schema_fields])
    
    #fillna
    fill_values = {field.name: "Unknown" if isinstance(field.datatype, StringType) else 0 for field in schema_fields}
    data = data.fillna(fill_values)
    
    
    # get features
    schema_fields = data.schema.fields
    features = data.columns
    features.remove(exp_details.get("target_column"))
    data_schema = session.sql(f"DESCRIBE TABLE {exp_details.get('dataset')}").collect()
    categorical_types = ['VARCHAR','CHAR','STRING','TEXT','BOOL']
    categorical_features = []  
    for row in data_schema:
        for typ in categorical_types:
            if typ in row['type']:
                categorical_features.append(row['name'])
                break
    numerical_features = list(set(features) - set(categorical_features))
    log_message("INFO",f"numerical_features:  {numerical_features}")
    log_message("INFO",f"categorical_features: {categorical_features}")
    
    
    #identify columns for labelencoding and onehotencoding   
    le_column_feature = []
    oh_column_feature = []
    if len(categorical_features) >= 1:
        print(f"{categorical_features} columns are non numeric in feature dataset, encoding required.")
        for column in categorical_features:
            if data.select(data[column]).distinct().count() >= 10:
                le_column_feature.append(column)
            elif column == exp_details.get("target_column"):
                le_column_feature.append(column)
            else:
                oh_column_feature.append(column)
        log_message("INFO",f"Columns identified to be encoded with label encoder: {le_column_feature}")
        log_message("INFO",f"Columns identified to be encoded with one hot encoder: {oh_column_feature}")
        
    
    #pipeline steps 
    log_message("INFO","Setting up preprocessing pipeline based on dataset")
    categorical_pp = {f'le_{column}':LabelEncoder(input_cols=column, output_cols=column) for column in le_column_feature}
    if len(oh_column_feature)>0:
        categorical_pp['oh_enc'] = OneHotEncoder(input_cols=oh_column_feature, output_cols=oh_column_feature, handle_unknown='ignore')
    numerical_pp = {
        'scaler': MinMaxScaler(input_cols=numerical_features, output_cols=numerical_features)
    }
    steps = [(key, categorical_pp[key]) for key in categorical_pp if categorical_pp[key]!=[]] + \
    [(key, numerical_pp[key]) for key in numerical_pp if numerical_features!=[]]
        
        
    # Run preprocessing pipeline steps 
    log_message("INFO","Running data preprocessing pipeline")
    print("Running data preprocessing pipeline")
    print(f"Selected preprocesing steps: \n{steps}") 
    pp_pipeline = Pipeline(steps=steps)
    data = pp_pipeline.fit(data).transform(data)
    print(data.show())
    
    # Split train and test data
    df_train, df_test = data.random_split(weights=[0.8, 0.2], seed=0)
    input_cols = categorical_features+numerical_features
    if exp_details.get("target_column") in categorical_features:
        input_cols.remove(exp_details.get("target_column"))
        
 
    # dynamically import selected algorithms
    for algorithm, hyperparam in exp_details.get("algo_details").items():
        algorithm = algorithm.rsplit('.', 1)
        module = importlib.import_module(algorithm[0])
        log_message("INFO",f"----Running Algorithm {algorithm[1]}----")
        print(f"----Running Algorithm {algorithm[1]}----")
        attr = getattr(module, algorithm[1])
        
        pipe = Pipeline(steps=[("algorithm", attr(input_cols=input_cols
                                              , label_cols=[exp_details.get("target_column")]
                                              , output_cols=[f'PREDICTIONS_{algorithm[1]}'.upper()]))]
               )

        # Fit the pipeline
        log_message("INFO",f"Running model pipeline {algorithm[1]}")
        print(f"Running model pipeline {algorithm[1]}")
        model = pipe.fit(df_train)
 
        # Test the model
        log_message("INFO","Running prediction on model with test dataset")
        print("Running prediction on model with test dataset")
        df_test_pred = model.predict(df_test)
 
        # metrices
        log_message("INFO","Generating Metrices")
        print("Generating Metrices")
        accuracy = accuracy_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        f1_sc = f1_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        recall_sc = recall_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        precision_sc = precision_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        roc_auc_sc = roc_auc_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_score_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
#         except SnowparkSQLException as se:
#             print(se.message)
#             accuracy=f1_sc=recall_sc=precision_sc=roc_auc_sc = 0.0
        log_message("INFO","Metrices generation completed!!!")
        print("Metrices generation completed!!!")
        

        # LOG MODEL INTO SNOWFLAKE REGISTRY
        from snowflake.ml.registry.registry import Registry
        reg = Registry(session=session)
        
        # Log the model
        log_message("INFO","Started: Registering model on snowflake")
        print("Started: Registering model on snowflake")
        try:
            mv = reg.log_model(model=model,
                               model_name=exp_details.get("name", "sample_experiment")+"_"+algorithm[1],
                               comment="test",
                               version_name="run1",
                               python_version="3.9.19",
                               conda_dependencies=["xgboost","scikit-learn==1.2.2"],
                               metrics=[{"model_metrics": {"roc_auc_score": roc_auc_sc, "precision_score": precision_sc, "f1_score": f1_sc, "recall_score": recall_sc, "accuracy_score": accuracy}, "project_id": "0001", "type": "EXP"}])
            log_message("INFO","Registeration of model completed!!!")
        except Exception as ex:
            key = 'Processing aborted due to error 370001' 
            if key in str(ex):
                log_message("INFO","Registeration of model completed!!!")
                pass
            else:
                log_message("ERROR","Exception Occured while registering model")
                return str(ex).split('?')
            
        print("trying to set tag")
        m = reg.get_model(exp_details.get("name", "sample_experiment")+"_"+algorithm[1])
        m.set_tag("accuracy", accuracy)  
    return [{"Execution Logs:": "\n".join(logs),
             "EXP_NAME":exp_details.get("name", "sample_experiment"),
             "Version":"Run1",
             "matrices":{"model_metrics": {"roc_auc_score": roc_auc_sc, "precision_score": precision_sc, "f1_score": f1_sc, "recall_score": recall_sc, "accuracy_score": accuracy}, "project_id": "0001", "type": "EXP"},
             "Alogirthm_Type":"Regression",
             "Alogithms": list(exp_details.get("algo_details").keys()),
             "RUN_STATUS":"SUCCESS",
             "registry_exp_name":""}]

In [12]:
%%time
# Initilization
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
print("Creating Snowflake Session object...")
session = get_session()
stage = create_stage(session)
print("Session has been created !")

print("Creating stored procedure...")
session.sproc.register(func=train_ml_models,
                       name="train_ml_models",
                       packages=["snowflake-snowpark-python", "snowflake-ml-python"],
                       isPermanant=False,
                       stage_location=stage,
                       replace=True)
print("Stored procedure has been created successfully!")

print("Executing Procedure")
# session.query_tag='my_session_for_sproc'
# procedure_response = session.call("train_ml_models", exp_data)
# q_id = session.sql("select * from table(information_schema.query_history()) where query_tag='my_session_for_sproc'").to_pandas()
procedure_response = train_ml_models(session, exp_data)
print("Stored Procedure Executed Successfully !")
print(procedure_response)

#Log in mlflow
print("Logging in mlflow completed !")

Creating Snowflake Session object...
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.10.1, Python Version: 3.9.18, Platform: Linux-5.10.218-208.862.amzn2.x86_64-x86_64-with-glibc2.34
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
INFO:snowflake.snowpark.session:Snowpark Session information: 
"version" : 1.18.0,
"python.version" : 3.9.18,
"python.connector.version" : 3.10.1,
"python.connector.session.id" : 96125692416466,
"os.name" : Linux

INFO:snowflake.connector.cursor:Number of results in first chunk: 1
Session has been created !
Creating stored procedure...
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 2
INFO:snowflake.connector.cursor:Number

INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 349
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of resu

INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
----Running Algorithm MultinomialNB----
Running model pipeline MultinomialNB
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 5
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number 

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
Running prediction on model with test dataset
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 3
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
IN

  return next(self.gen)


INFO:snowflake.ml.registry._manager.model_manager:Start creating MODEL object for you in the Snowflake.
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 3
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 1
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowfla

SnowparkSQLException: (1300) (1304): 01b5404a-0000-6c91-0000-576d070ef036: 002003 (02000): 01b5404a-0000-6c91-0000-576d070ef036: SQL compilation error:
Tag 'FIRST_DB.PUBLIC.ACCURACY' does not exist or not authorized.

In [26]:
q_id

Unnamed: 0,QUERY_ID,QUERY_TEXT,DATABASE_NAME,SCHEMA_NAME,QUERY_TYPE,SESSION_ID,USER_NAME,ROLE_NAME,WAREHOUSE_NAME,WAREHOUSE_SIZE,...,TRANSACTION_ID,QUERY_ACCELERATION_BYTES_SCANNED,QUERY_ACCELERATION_PARTITIONS_SCANNED,QUERY_ACCELERATION_UPPER_LIMIT_SCALE_FACTOR,BYTES_WRITTEN_TO_RESULT,ROWS_WRITTEN_TO_RESULT,ROWS_INSERTED,QUERY_RETRY_TIME,QUERY_RETRY_CAUSE,FAULT_HANDLING_TIME
0,01b53f31-0000-6c40-0000-576d070e605e,select * from table(information_schema.query_h...,FIRST_DB,PUBLIC,UNKNOWN,96125692378674,ADITYASINGH,ADITYASINGH,FOSFOR_INSIGHT_WH,,...,0,0,0,0,0,0,0,0,,0
1,01b53f2f-0000-6c44-0000-576d070e35da,"CALL train_ml_models('{""name"": ""py_func_exp3"",...",FIRST_DB,PUBLIC,CALL,96125692378674,ADITYASINGH,ADITYASINGH,FOSFOR_INSIGHT_WH,X-Small,...,0,0,0,0,375,1,0,0,,0
2,01b53f2f-0000-6c40-0000-576d070e6052,describe procedure TRAIN_ML_MODELS(STRING),FIRST_DB,PUBLIC,DESCRIBE,96125692378674,ADITYASINGH,ADITYASINGH,FOSFOR_INSIGHT_WH,,...,0,0,0,0,4246,12,0,0,,0
3,01b53f2d-0000-6c56-0000-576d070e5602,"select query_id, query_text from table(informa...",FIRST_DB,PUBLIC,SELECT,96125692378642,ADITYASINGH,ADITYASINGH,FOSFOR_INSIGHT_WH,X-Small,...,0,0,0,0,718,3,0,0,,0
4,01b53f2c-0000-6c8d-0000-576d070e44c2,"CALL train_ml_models('{""name"": ""py_func_exp3"",...",FIRST_DB,PUBLIC,CALL,96125692378642,ADITYASINGH,ADITYASINGH,FOSFOR_INSIGHT_WH,X-Small,...,0,0,0,0,375,1,0,0,,0
5,01b53f2c-0000-6c40-0000-576d070e6016,describe procedure TRAIN_ML_MODELS(STRING),FIRST_DB,PUBLIC,DESCRIBE,96125692378642,ADITYASINGH,ADITYASINGH,FOSFOR_INSIGHT_WH,,...,0,0,0,0,4246,12,0,0,,0


In [24]:
q_id.columns

Index(['QUERY_ID', 'QUERY_TEXT', 'DATABASE_NAME', 'SCHEMA_NAME', 'QUERY_TYPE',
       'SESSION_ID', 'USER_NAME', 'ROLE_NAME', 'WAREHOUSE_NAME',
       'WAREHOUSE_SIZE', 'WAREHOUSE_TYPE', 'CLUSTER_NUMBER', 'QUERY_TAG',
       'EXECUTION_STATUS', 'ERROR_CODE', 'ERROR_MESSAGE', 'START_TIME',
       'END_TIME', 'TOTAL_ELAPSED_TIME', 'BYTES_SCANNED', 'ROWS_PRODUCED',
       'COMPILATION_TIME', 'EXECUTION_TIME', 'QUEUED_PROVISIONING_TIME',
       'QUEUED_REPAIR_TIME', 'QUEUED_OVERLOAD_TIME',
       'TRANSACTION_BLOCKED_TIME', 'OUTBOUND_DATA_TRANSFER_CLOUD',
       'OUTBOUND_DATA_TRANSFER_REGION', 'OUTBOUND_DATA_TRANSFER_BYTES',
       'INBOUND_DATA_TRANSFER_CLOUD', 'INBOUND_DATA_TRANSFER_REGION',
       'INBOUND_DATA_TRANSFER_BYTES', 'CREDITS_USED_CLOUD_SERVICES',
       'LIST_EXTERNAL_FILE_TIME', 'RELEASE_VERSION',
       'EXTERNAL_FUNCTION_TOTAL_INVOCATIONS',
       'EXTERNAL_FUNCTION_TOTAL_SENT_ROWS',
       'EXTERNAL_FUNCTION_TOTAL_RECEIVED_ROWS',
       'EXTERNAL_FUNCTION_TOTAL_SENT_BYT

In [11]:
! pip list | grep -i sci

scikit-learn               1.3.2      
scipy                      1.13.1     
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
