In [46]:
def run_exp(sf_pass, algos, dataset, target):
    """Train and evaluate several Snowflake ML estimators on one table.

    Opens a Snowpark session, splits ``dataset`` 90/10 into train/test with a
    fixed seed, ordinal-encodes the string/boolean feature columns, min-max
    scales the remaining feature columns, then fits every estimator named in
    ``algos`` on the training split and prints its MSE, MAE and R2 on the test
    split.

    Args:
        sf_pass: Password used for the Snowflake connection.
        algos: Iterable of fully qualified estimator class paths, e.g.
            ``'snowflake.ml.modeling.naive_bayes.GaussianNB'``.
        dataset: Name of the table to read. It is interpolated into a
            ``DESCRIBE TABLE`` statement, so pass trusted identifiers only.
        target: Name of the label column.

    Returns:
        A Snowpark DataFrame holding the preprocessed test split plus one
        ``PREDICTIONS_<ClassName>`` column per entry in ``algos``.

    Raises:
        ValueError: If ``target`` is not a column of the table, if the table
            has no feature columns, or if an entry of ``algos`` is not a
            dotted ``module.ClassName`` path.

    Note:
        The returned DataFrame is evaluated lazily against the session opened
        here, so the session is deliberately left open on success. It is
        closed only when the run fails.
    """
    import importlib
    from snowflake.snowpark import Session
    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder
    from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_error, r2_score

    connection_parameters = {
        "account": "ug94937.us-east4.gcp",
        "user": "ADITYASINGH",
        "password": sf_pass,
        "role": "ADITYASINGH",  # optional
        "warehouse": "FOSFOR_INSIGHT_WH",  # optional
        "database": "FIRST_DB",  # optional
        "schema": "PUBLIC",  # optional
    }

    session = Session.builder.configs(connection_parameters).create()
    session.sql_simplifier_enabled = True

    try:
        # Read dataset
        df_train, df_test = session.table(dataset).drop('ROW').random_split(weights=[0.9, 0.1], seed=0)
        columns = df_train.columns
        if target not in columns:
            raise ValueError(f"target column {target!r} not found in {dataset}")
        features = [name for name in columns if name != target]

        # Classify feature columns by their declared SQL type. DESCRIBE TABLE also
        # lists columns that were dropped above, so keep only real features.
        data_schema = session.sql(f"DESCRIBE TABLE {dataset}").collect()
        categorical_types = ['VARCHAR', 'CHAR', 'STRING', 'TEXT', 'BOOL']
        categorical_features = [
            row['name'] for row in data_schema
            if row['name'] in features
            and any(typ in row['type'] for typ in categorical_types)
        ]
        # List comprehensions (not set arithmetic) keep the column order stable
        # from run to run.
        numerical_features = [name for name in features if name not in categorical_features]
        categorical_features_oe = [name + '_OE' for name in categorical_features]

        # The preprocessing does not depend on the estimator, so fit it once and
        # reuse the transformed splits for every algorithm.
        steps = []
        if categorical_features:
            steps.append(("ord", OrdinalEncoder(input_cols=categorical_features,
                                                output_cols=categorical_features_oe)))
        if numerical_features:
            steps.append(("scaler", MinMaxScaler(input_cols=numerical_features,
                                                 output_cols=numerical_features)))
        if not steps:
            raise ValueError(f"{dataset} has no feature columns besides {target!r}")
        preprocessing = Pipeline(steps=steps)
        preprocessing.fit(df_train)
        train_prepared = preprocessing.transform(df_train)

        # Running test frame: each estimator appends its own prediction column, so
        # the frame returned at the end carries the predictions of all algorithms.
        df_all_pred = preprocessing.transform(df_test)
        model_inputs = categorical_features_oe + numerical_features

        for algorithm in algos:
            module_path, _, class_name = algorithm.rpartition('.')
            if not module_path:
                raise ValueError(f"expected 'module.ClassName', got {algorithm!r}")
            estimator_cls = getattr(importlib.import_module(module_path), class_name)
            prediction_col = f'PREDICTIONS_{class_name}'

            model = estimator_cls(input_cols=model_inputs,
                                  label_cols=[target],
                                  output_cols=[prediction_col])
            model.fit(train_prepared)
            df_all_pred = model.predict(df_all_pred)

            # metrics
            mse = mean_squared_error(df=df_all_pred, y_true_col_names=target, y_pred_col_names=prediction_col)
            mae = mean_absolute_error(df=df_all_pred, y_true_col_names=target, y_pred_col_names=prediction_col)
            r2 = r2_score(df=df_all_pred, y_true_col_name=target, y_pred_col_name=prediction_col)
            print(f'{class_name} MSE: {mse}')
            print(f'{class_name} MAE: {mae}')
            print(f'{class_name} R2: {r2}')
    except Exception:
        # Do not leak the connection when the experiment fails part-way.
        session.close()
        raise
    return df_all_pred

In [47]:
# Run the experiment on the EMPLOYEE table with LEAVEORNOT as the label.
# NOTE(review): `os` is not imported at notebook level in any cell visible
# above this one — confirm this cell survives Restart & Run All.
test = run_exp(
    sf_pass=os.environ.get('SF_Password'),
    algos=[
        'snowflake.ml.modeling.naive_bayes.GaussianNB',
        'snowflake.ml.modeling.neighbors.KNeighborsClassifier',
    ],
    dataset='EMPLOYEE',
    target='LEAVEORNOT',
)

The version of package 'snowflake-snowpark-python' in the local environment is 1.17.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.0'. Your UDF might not work when the package version is different between the server and your local environment.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.0'. Your UDF might not work when the package version is different between the server and your local environment.


MSE: 0.1727640136965599
MAE: 0.31845926309526407
R2: 0.26641396693757746


In [1]:
import os, importlib
from snowflake.snowpark import Session
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder
from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_error, r2_score
from snowflake.snowpark.types import StructType, StructField, IntegerType, StringType
#    from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.snowpark import Session, FileOperation
# Snowflake connection settings. The password is read from the SF_Password
# environment variable (os.environ.get yields None when it is not set).
connection_parameters = dict(
    account="ug94937.us-east4.gcp",
    user="ADITYASINGH",
    password=os.environ.get('SF_Password'),
    role="ADITYASINGH",  # optional
    warehouse="FOSFOR_INSIGHT_WH",  # optional
    database="FIRST_DB",  # optional
    schema="PUBLIC",  # optional
)

# Open the Snowpark session used by the cells below and set its
# sql_simplifier_enabled flag.
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

In [2]:
# Schema with a single integer-typed PREDICTIONS field.
predictions_field = StructField("PREDICTIONS", IntegerType())
schema = StructType([predictions_field])

In [4]:
# Seeded 90/10 train/test split of EMPLOYEE with the ROW column removed.
df_train, df_test = (
    session.table("EMPLOYEE")
    .drop('ROW')
    .random_split(weights=[0.9, 0.1], seed=0)
)

In [7]:
# Keep only the CITY column of the training split.
city_column = df_train.col('CITY')
df_all_pred2 = df_train.select(city_column)

In [8]:
# NOTE(review): df_all_pred is not defined in any cell visible above this one,
# so this cell depends on earlier kernel state — confirm it survives
# Restart & Run All. Re-running the cell also joins again onto its own result.
# NOTE(review): join() is called without a join condition — confirm that this
# unconditioned join of the two frames is what is intended.
df_all_pred = df_all_pred.join(df_all_pred2)

In [9]:
# Print a preview of the joined frame.
df_all_pred.show()

----------------------------
|"LEAVEORNOT"  |"CITY"     |
----------------------------
|0             |Bangalore  |
|0             |Bangalore  |
|0             |New Delhi  |
|0             |Pune       |
|0             |Bangalore  |
|0             |New Delhi  |
|0             |New Delhi  |
|0             |Bangalore  |
|0             |Pune       |
|0             |New Delhi  |
----------------------------



In [43]:
# Dotted "module.ClassName" path of an estimator, in the same format as the
# entries passed in run_exp's `algos` list.
algo = 'snowflake.ml.modeling.ensemble.AdaBoostRegressor'

'AdaBoostRegressor'