In [1]:
import os
exp_data = '{"name": "moremetrices01", "algo_details": {"snowflake.ml.modeling.ensemble.GradientBoostingClassifier": null}, "id": "367", "dataset": "AIRLINE_DEP_DELAY_10K", "target_column": "DEP_DEL15"}'

In [2]:
import logging, sys, os
from snowflake.snowpark.session import Session

In [3]:
CONNECTION_PARAMETERS = {
    "account": "ug94937.us-east4.gcp",
    "user":"ADITYASINGH",
    "password": os.environ.get('SF_Password'),
    "role": "ADITYASINGH",
    "database": "FIRST_DB",
    "warehouse": "FOSFOR_INSIGHT_WH",
    "schema": "PUBLIC"
}

In [4]:
def create_stage(session, stage_name="demo"):
    try:
        session.sql(f"create or replace stage {stage_name}").collect()
        return f"@{stage_name}"
    except Exception as ex:
        print("Error while creating snowflake session", ex)
        raise ex

def get_session():
    """
    Method creates snowflake session object.
    :return:
    """
    try:
        return Session.builder.configs(CONNECTION_PARAMETERS).create()
    except Exception as ex:
        print("Error while creating snowflake session", ex)
        raise ex

In [7]:
def train_ml_models(session: Session, exp_data: str) -> list:
    # variable for holding logs
    logs = []
    
    # function for accumulating logs
    def log_message(level: str, message: str):
        logs.append(f"{level}: {message}")
        
    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
    from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_error, \
         r2_score, accuracy_score, precision_score, roc_auc_score, f1_score, recall_score
    from snowflake.snowpark.functions import col, last_value, row_number
    from snowflake.snowpark.window import Window
    import importlib, sys, json
    from snowflake import snowpark

    log_message("INFO","Starting Experiment Recipe Execution")
    
    def encoding(df, target_column):
        df_target = df[[target_column]]
        le_target = None
        # Target column validation and encoding
        if df.dtypes[target_column].name in ['object', 'bool']:
            print(f"target_column is of {df.dtypes[target_column].name} datatype, encoding required.")
            le_target = LabelEncoder()
            df_target[target_column] = pd.DataFrame(le_target.fit_transform(df_target[target_column].astype(str)))
            print(f"Target column label encoded {df_target[target_column]}, object: {le_target}")
    
        # Feature column validation and encoding
        df_feature = df.drop(target_column, axis=1)
        non_numeric_cols = df_feature.select_dtypes(include=['object', 'bool']).columns.tolist()
        le_dict_feature = {}
        le_column_feature = []
        oh_column_feature = []
        oh_enc_feature = None
        if len(non_numeric_cols) >= 1:
            print(f"{non_numeric_cols} columns are non numeric in feature dataset, encoding required.")
            for col in non_numeric_cols:
                if df_feature[col].nunique() >= 10:
                    le_column_feature.append(col)
                else:
                    oh_column_feature.append(col)
    
            print(f"Columns identified to be encoded with label encoder: {le_column_feature}\n"
                  f"Columns identified to be encoded with one hot encoder: {oh_column_feature}")
    
            # columns to be label encoded
            if len(le_column_feature) == 0:
                df_feature = df_feature
            else:
                for col in le_column_feature:
                    le_dict_feature[col] = LabelEncoder()
                    df_feature[col] = le_dict_feature[col].fit_transform(df_feature[col].astype(str))
                    print(f"{col} column label encoded {df_feature[col]}, object: {le_dict_feature[col]}")
    
            # columns to be one hot encoded
            if len(oh_column_feature) == 0:
                df_feature = df_feature
            else:
                unique_combinations = pd.get_dummies(df_feature[oh_column_feature])
                unique_combinations_list = unique_combinations.columns.tolist()
                oh_enc_feature = OneHotEncoder()
                oh_encoded_array = oh_enc_feature.fit_transform(df_feature[oh_column_feature]).toarray() if len(
                    oh_column_feature) > 1 else oh_enc_feature.fit_transform(df_feature[oh_column_feature]).toarray()
                df_oh_enc = pd.DataFrame(oh_encoded_array, columns=unique_combinations_list)
                df_feature = df_feature.drop(columns=oh_column_feature)
                df_feature = df_feature.join(df_oh_enc)
                print(f"new one hot encoded df: {oh_encoded_array}\n"
                      f"one hot encoder object: {oh_enc_feature}\n")
            print(f"final feature df created: {df_feature}")
        return df_target, le_target, df_feature, le_dict_feature, oh_enc_feature, le_column_feature, oh_column_feature
           
    
    # Experiment details
    exp_details=json.loads(exp_data)
    
    # Read dataset, Random split
    log_message("INFO","Reading and Identifing dataset features")
    data = session.table(exp_details.get("dataset"))
    
    # Data Preprocessing: Validating and encoding the data if required and imputing null values.
    window_spec = Window.order_by()
    data_row_num = data.with_column('row_num', row_number().over(window_spec))
    columns = data_row_num.columns
    column.remove('row_num')
    data = data_row_num.select(
            [data_row_num['row_num']] + [
                last_value(col(column), ignore_nulls=True).over(Window.order_by('row_num')).alias(column)
                for column in columns
            ]
        )
    data = data.drop('row_num')
#     data = data.select([col for col in data.columns], 
#                       *[snowpark.functions.coalesce(col, snowpark.functions.lag(col)) for col in data.columns])
#     data = data.select([col for col in data.columns], 
#                       *[snowpark.functions.coalesce(col, snowpark.functions.lead(col)) for col in data.columns])
    df_target, le_target, df_feature, le_dict_feature, oh_enc_feature, le_column_feature, oh_column_feature = encoding(
        data, exp_details.get("target_column"))
    
    
    df_train, df_test = session.table(exp_details.get("dataset")).drop('ROW').random_split(weights=[0.9, 0.1], seed=0)
    features = df_train.columns
    features.remove(exp_details.get("target_column"))
    
    # get features
    data_schema = session.sql(f"DESCRIBE TABLE {exp_details.get('dataset')}").collect()
    categorical_types = ['VARCHAR','CHAR','STRING','TEXT','BOOL']
    categorical_features = []
    
    for row in data_schema:
        for typ in categorical_types:
            if typ in row['type'] and row['name']!=exp_details.get("target_column"):
                categorical_features.append(row['name'])
                break
    numerical_features = list(set(features) - set(categorical_features))
    log_message("INFO",f"numerical_features:  {numerical_features}")
    log_message("INFO",f"categorical_features_oe: {categorical_features}")
    
    
    #pipeline steps 
    log_message("INFO","Setting up preprocessing pipeline based on dataset")
    categorical_pp = {
        'ohe': OneHotEncoder(input_cols=categorical_features, output_cols=categorical_features)
    }
    numerical_pp = {
        'scaler': MinMaxScaler(input_cols=numerical_features, output_cols=numerical_features)
    }
    steps = [(key, categorical_pp[key]) for key in categorical_pp if categorical_features!=[]] + \
    [(key, numerical_pp[key]) for key in numerical_pp if numerical_features!=[]]
    
    
    # Define a pipeline that does the preprocessing and training of 
    # dynamically import selected algorithms
    for algorithm, hyperparam in exp_details.get("algo_details").items():
        algorithm = algorithm.rsplit('.', 1)
        module = importlib.import_module(algorithm[0])
        log_message("INFO",f"Running Algorithm {algorithm[1]}")
        attr = getattr(module, algorithm[1])
        
        pipe = Pipeline(steps=steps+[("algorithm", attr(input_cols=categorical_features+numerical_features
                                              , label_cols=[exp_details.get("target_column")]
                                              , output_cols=[f'PREDICTIONS_{algorithm[1]}'.upper()]))]
               )

        # Fit the pipeline
        log_message("INFO",f"Running model pipeline {algorithm[1]}")
        model = pipe.fit(df_train)
        
#         log_message("INFO",f"final model size {model.size()} bytes")
        
        # Test the model
        log_message("INFO","Running prediction on model with test dataset")
        df_test_pred = model.predict(df_test)
        
        # metrices
        log_message("INFO","Generating Metrices")
        accuracy = accuracy_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        f1_score = f1_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        recall_score = recall_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        precision_score = precision_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        roc_auc_score = roc_auc_score(df=df_test_pred, y_true_col_names=exp_details.get("target_column"), y_score_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        print("Execution Completed")
        print(f'{algorithm[1]} MSE: {accuracy}')

        print(f'{algorithm[1]} R2: {precision_score}')
        print(f'{algorithm[1]} R2: {roc_auc_score}')
        

        # LOG MODEL INTO SNOWFLAKE REGISTRY
        from snowflake.ml.registry.registry import Registry
        reg = Registry(session=session)
        # Log the model
        log_message("INFO","Started: Registering model on snowflake")
        try:
            mv = reg.log_model(model=model,
                               model_name=exp_details.get("name", "sample_experiment")+"_"+algorithm[1],
                               comment="test",
                               version_name="run1",
                               python_version="3.9.19",
                               conda_dependencies=["scikit-learn==1.3.2"],
                               metrics=[{"model_metrics": {"roc_auc_score": roc_auc_score, "precision_score": precision_score, "f1_score": f1_score, "recall_score": recall_score, "accuracy_score": accuracy}, "project_id": "0001", "type": "EXP"}])
            log_message("INFO","Registeration of model completed!!!")
        except Exception as ex:
            key = 'Processing aborted due to error 370001' 
            if key in str(ex):
                log_message("INFO","Registeration of model completed!!!")
                pass
            else:
                log_message("ERROR","Exception Occured while registering model")
                return str(ex).split('?')
    return [{"Execution Logs:": "\n".join(logs),
             "EXP_NAME":exp_details.get("name", "sample_experiment"),
             "Version":"Run1",
             "matrices":{"model_metrics": {"roc_auc_score": roc_auc_score, "precision_score": precision_score, "f1_score": f1_score, "recall_score": recall_score, "accuracy_score": accuracy}, "project_id": "0001", "type": "EXP"},
             "Alogirthm_Type":"Regression",
             "Alogithms": list(exp_details.get("algo_details").keys()),
             "RUN_STATUS":"SUCCESS",
             "registry_exp_name":""}]

In [8]:
# Initilization
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
print("Creating Snowflake Session object...")
session = get_session()
stage = create_stage(session)
print("Session has been created !")

print("Creating stored procedure...")
session.sproc.register(func=train_ml_models,
                       name="train_ml_models",
                       packages=["snowflake-snowpark-python", "snowflake-ml-python"],
                       isPermanant=False,
                       stage_location=stage,
                       replace=True)
print("Stored procedure has been created successfully!")

print("Executing Procedure")
# procedure_response = session.call("train_ml_models", exp_data)
procedure_response = train_ml_models(session, exp_data)
print("Stored Procedure Executed Successfully !")
print(procedure_response)

#Log in mlflow
print("Logging in mlflow completed !")

Creating Snowflake Session object...
INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.10.1, Python Version: 3.9.18, Platform: Linux-6.1.58+-x86_64-with-glibc2.34
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
INFO:snowflake.snowpark.session:Snowpark Session information: 
"version" : 1.17.0,
"python.version" : 3.9.18,
"python.connector.version" : 3.10.1,
"python.connector.session.id" : 96125691302234,
"os.name" : Linux

INFO:snowflake.connector.cursor:Number of results in first chunk: 1
Session has been created !
Creating stored procedure...
INFO:snowflake.connector.cursor:Number of results in first chunk: 0
INFO:snowflake.connector.cursor:Number of results in first chunk: 2
INFO:snowflake.connector.cursor:Number of results in first c

TypeError: The input of select() must be Column, column name, TableFunctionCall, or a list of them