In [None]:
# Imports
from snowflake.snowpark.session import Session
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder
from snowflake.ml.modeling.metrics import r2_score, accuracy_score, precision_score, roc_auc_score, f1_score, recall_score
from snowflake.snowpark.functions import col, is_null, regexp_replace, when, lit
from snowflake.snowpark.types import StringType
from snowflake.snowpark.exceptions import SnowparkSQLException
import importlib, sys, json, os, logging


In [None]:
# Snowflake connection settings handed to Session.builder.configs().
# The password is not stored in the file: it is read from the SF_Password
# environment variable and is None when that variable is not set.
CONNECTION_PARAMETERS = dict(
    account="ug94937.us-east4.gcp",
    user="ADITYASINGH",
    password=os.environ.get("SF_Password"),
    role="ADITYASINGH",
    database="FIRST_DB",
    warehouse="FOSFOR_INSIGHT_WH",
    schema="PUBLIC",
)

In [None]:
def create_stage(session, stage_name="demo"):
    """
    Method creates (or replaces) a snowflake stage.
    :param:
    session: object exposing sql(query).collect(), e.g. a snowpark session
    stage_name: name of the stage to create, defaults to "demo"
    :returns:
    stage reference string: the stage name prefixed with "@"
    :raises:
    re-raises any exception raised while executing the statement
    """
    try:
        # NOTE: stage_name is interpolated directly into the SQL text, so it
        # must be a trusted identifier, never raw user input.
        session.sql(f"create or replace stage {stage_name}").collect()
    except Exception as ex:
        # Report the stage failure (the old message wrongly said "session"),
        # then re-raise with the original traceback intact.
        print("Error while creating snowflake stage", ex)
        raise
    return f"@{stage_name}"

        
def get_session():
    """
    Build a snowflake session from CONNECTION_PARAMETERS.
    :returns:
    the session object produced by Session.builder
    :raises:
    re-raises any exception hit while building the session, after printing it
    """
    try:
        configured_builder = Session.builder.configs(CONNECTION_PARAMETERS)
        new_session = configured_builder.create()
    except Exception as err:
        print("Error while creating snowflake session", err)
        raise err
    return new_session
        
        
def apply_data_cleansing(df):
    """
    Replace null values in a snowpark dataframe with per-column defaults.
    :param:
    df: input dataframe
    :returns:
    result of df.fillna with "Unknown" for StringType columns and 0 for
    every other column
    """
    replacements = {}
    for column in df.schema.fields:
        # String columns get a textual placeholder, everything else gets 0.
        if isinstance(column.datatype, StringType):
            replacements[column.name] = "Unknown"
        else:
            replacements[column.name] = 0
    return df.fillna(replacements)


def get_feature_columns(df):
    """
    Identifies the numerical and categorical features in dataset.
    Identifies features for label encoding and onehot encoding.

    Besides ``df`` this function reads names it does not define itself:
    ``exp_details`` (keys "target_column" and "dataset"), ``session``,
    ``data`` and ``log_message``. They are presumably module-level objects
    set up elsewhere in the notebook - TODO confirm.
    :param:
    df: input dataframe
    :returns:
    categorical_features: list of non-numerical feature columns
    numerical_features: list of numerical feature columns
    le_column_feature: list of feature label encoder columns
    oh_column_feature: list of feature one hot encoder columns
    NOTE(review): no return statement is visible in this part of the
    function - confirm that the four lists are actually returned.
    """
    # NOTE(review): schema_fields is not used in the visible part of the function.
    schema_fields = df.schema.fields
    features = df.columns
    # Drop the target column so only feature columns remain
    # (assumes df.columns is a list that contains it - TODO confirm).
    features.remove(exp_details.get("target_column"))
    # Column types come from DESCRIBE TABLE on the configured dataset,
    # not from the schema of df.
    df_schema = session.sql(f"DESCRIBE TABLE {exp_details.get('dataset')}").collect()
    # A column counts as categorical when its described type contains any of
    # these substrings.
    categorical_types = ['VARCHAR','CHAR','STRING','TEXT','BOOL']
    categorical_features = []  
    # NOTE(review): every described column is checked here, with no filter
    # against `features`, so the target column is not excluded at this step.
    for row in df_schema:
        for typ in categorical_types:
            if typ in row['type']:
                categorical_features.append(row['name'])
                break
    # Set difference: the order of numerical_features is not guaranteed.
    numerical_features = list(set(features) - set(categorical_features))
    print("INFO",f"numerical_features:  {numerical_features}")
    print("INFO",f"categorical_features: {categorical_features}")
    
    
    # Split categorical columns by distinct-value count: 10 or more distinct
    # values go to the label encoder list, fewer go to the one hot encoder list.
    le_column_feature = []
    oh_column_feature = []
    if len(categorical_features) >= 1:
        print(f"{categorical_features} columns are non numeric in feature dataset, encoding required.")
        for column in categorical_features:
            # NOTE(review): `data` is not defined in this function; `df` looks
            # like the intended dataframe - confirm.
            if data.select(data[column]).distinct().count() >= 10:
                le_column_feature.append(column)
            else:
                oh_column_feature.append(column)
        log_message("INFO",f"Columns identified to be encoded with label encoder: {le_column_feature}")
        log_message("INFO",f"Columns identified to be encoded with one hot encoder: {oh_column_feature}")
    
    