In [26]:
def run_exp(sf_pass, algos, dataset, target):    
    import os, importlib
    from snowflake.snowpark import Session
    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder
    from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_error, r2_score
    from snowflake.snowpark.types import StructType, StructField, IntegerType, StringType
#     from snowflake.ml.modeling.xgboost import XGBClassifier
    from snowflake.ml.modeling.compose import ColumnTransformer
    from snowflake.snowpark import Session, FileOperation

    connection_parameters = {
        "account": "ug94937.us-east4.gcp",
        "user": "ADITYASINGH",
        "password": sf_pass,
        "role": "ADITYASINGH",  # optional
        "warehouse": "FOSFOR_INSIGHT_WH",  # optional
        "database": "FIRST_DB",  # optional
        "schema": "PUBLIC",  # optional
    } 
    
    session = Session.builder.configs(connection_parameters).create()
    session.sql_simplifier_enabled = True
    
    # Read dataset
    df_train, df_test = session.table(dataset).drop('ROW').random_split(weights=[0.9, 0.1], seed=0)
    print(df_train.show())
    features = df_train.columns
    features.remove(target)
    
    # generating feature names
    data_schema = session.sql(f"DESCRIBE TABLE {dataset}").collect()
    categorical_types = ['VARCHAR','CHAR','STRING','TEXT','BOOL']
    categorical_features = []
    for row in data_schema:
        for typ in categorical_types:
            if typ in row['type'] and row['name']!=target:
                categorical_features.append(row['name'])
                break
    numerical_features = list(set(features) - set(categorical_features))
    categorical_features_oe = list(map(lambda a: a+'_OE', categorical_features))
    print("numerical_features: ", numerical_features)
    print("categorical_features_oe: ", categorical_features_oe)
    
#     #Numerical pipeline
#     numeric_transform = Pipeline(steps=[
#         ("scaler", MinMaxScaler(output_cols=numerical_features))
#     ]
#     )
    
#     #Categorical pipeline
#     categoric_transform = Pipeline(steps=[
#         ("ord", OrdinalEncoder(output_cols=categorical_features_oe))
#     ]
#     )
    
#     #preprocessor
#     preprocessor = ColumnTransformer(
#         output_cols=categorical_features_oe+numerical_features+[target],
#         transformers=[
#             ('num', numeric_transform, numerical_features),
#             ('cat', categoric_transform, categorical_features)
#         ],
#         remainder='passthrough'
#     )
    # Define a pipeline that does the preprocessing and training of 
    # dynamically generate list of selected algorithms for imports
    
    #pipeline steps #elif key=='num' and numerical_features!=[]
    categorical_pp = {
        'ord': OrdinalEncoder(input_cols=categorical_features, output_cols=categorical_features_oe) 
    }
    
    numerical_pp = {
        'scaler': MinMaxScaler(input_cols=numerical_features, output_cols=numerical_features)
    }
    steps = []
    steps.append([(key, categorical_pp[key]) for key in categorical_pp if categorical_features!=[]])
    steps.append([(key, numerical_pp[key]) for key in numerical_pp if numerical_features!=[]])
    print(steps)
    
    df_all_pred = None
    for algorithm in algos:
        algorithm = algorithm.rsplit('.', 1)
        module = importlib.import_module(algorithm[0])
        print(algorithm[1])
        attr = getattr(module, algorithm[1])
#         pipe = Pipeline(steps=[
#                 ("ord", OrdinalEncoder(input_cols=categorical_features, output_cols=categorical_features_oe)),
#                 ("scaler", MinMaxScaler(input_cols=numerical_features, output_cols=numerical_features)),
#                 ("algorithm", attr(input_cols=categorical_features_oe+numerical_features
#                                               , label_cols=[target]
#                                               , output_cols=[f'PREDICTIONS_{algorithm[1]}'.upper()]))
#                  ]
#                )
        pipe = Pipeline(steps=steps+[("algorithm", attr(input_cols=categorical_features_oe+numerical_features
                                              , label_cols=[target]
                                              , output_cols=[f'PREDICTIONS_{algorithm[1]}'.upper()]))]
               )
#         pipe = Pipeline(steps=[
#             ('preprocessor', preprocessor),
#             ('algorithm', attr(input_cols=categorical_features+numerical_features
#                                               , label_cols=[target]
#                                               , output_cols=[f'PREDICTIONS_{algorithm[1]}'.upper()]))
#         ])
        # Fit the pipeline
        xgb_model = pipe.fit(df_train)
         
        # Test the model
        df_test_pred = xgb_model.predict(df_test)
        
        #combining predictions
        if df_all_pred is None:
            df_all_pred = df_test_pred.select(df_test_pred[f'PREDICTIONS_{algorithm[1]}'.upper()])
        else:
            df_all_pred = df_all_pred.join(df_test_pred.select(df_test_pred[f'PREDICTIONS_{algorithm[1]}'.upper()]))
            
        # metrices
        mse = mean_squared_error(df=df_test_pred, y_true_col_names=target, y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        mae = mean_absolute_error(df=df_test_pred, y_true_col_names=target, y_pred_col_names=f'PREDICTIONS_{algorithm[1]}'.upper())
        r2 = r2_score(df=df_test_pred, y_true_col_name=target, y_pred_col_name=f'PREDICTIONS_{algorithm[1]}'.upper())
        print(f'{algorithm[1]} MSE: {mse}')
        print(f'{algorithm[1]} MAE: {mae}')
        print(f'{algorithm[1]} R2: {r2}')
        
    return df_all_pred

In [28]:
test = run_exp(os.environ.get('SF_Password'),
               ['snowflake.ml.modeling.naive_bayes.GaussianNB',
                'snowflake.ml.modeling.neighbors.KNeighborsClassifier',
               ],
               'EMPLOYEE', 
               'LEAVEORNOT')

-------------------------------------------------------------------------------------------------------------------------------------------
|"EDUCATION"  |"JOININGYEAR"  |"CITY"     |"PAYMENTTIER"  |"AGE"  |"GENDER"  |"EVERBENCHED"  |"EXPERIENCEINCURRENTDOMAIN"  |"LEAVEORNOT"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|Bachelors    |2017           |Bangalore  |3              |34     |Male      |No             |0                            |0             |
|Bachelors    |2013           |Pune       |1              |28     |Female    |No             |3                            |1             |
|Bachelors    |2014           |New Delhi  |3              |38     |Female    |No             |2                            |0             |
|Masters      |2016           |Bangalore  |3              |27     |Male      |No             |5                            |1             |
|Masters      |2017 

ValueError: not enough values to unpack (expected 2, got 1)

In [13]:
test.show()

-----------------------------------------------------------------
|"PREDICTIONS_GAUSSIANNB"  |"PREDICTIONS_KNEIGHBORSCLASSIFIER"  |
-----------------------------------------------------------------
|1                         |0                                   |
|0                         |0                                   |
|0                         |0                                   |
|1                         |0                                   |
|0                         |0                                   |
|0                         |0                                   |
|1                         |0                                   |
|0                         |0                                   |
|0                         |0                                   |
|0                         |0                                   |
-----------------------------------------------------------------



In [14]:
test = run_exp(os.environ.get('SF_Password'),
               ['snowflake.ml.modeling.linear_model.SGDRegressor',
                'snowflake.ml.modeling.svm.LinearSVR',
               ],
               'ALCOHOL_QUALITY', 
               'QUALITY')

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"FIXED_ACIDITY"  |"VOLATILE_ACIDITY"  |"CITRIC_ACID"  |"RESIDUAL_SUGAR"  |"CHLORIDES"  |"FREE_SULFUR_DIOXIDE"  |"TOTAL_SULFUR_DIOXIDE"  |"DENSITY"  |"PH"  |"SULPHATES"  |"ALCOHOL"  |"QUALITY"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|7.4              |0.7                 |0.0            |1.9               |0.076        |11.0                   |34.0                    |0.9978     |3.51  |0.56         |9.4        |5          |
|7.8              |0.88                |0.0            |2.6               |0.098        |25.0                   |67.0                    |0.9968     |3.2   |0.68         |9.8        |5          |
|7.8              |0

The version of package 'snowflake-snowpark-python' in the local environment is 1.17.0, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.0'. Your UDF might not work when the package version is different between the server and your local environment.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.0'. Your UDF might not work when the package version is different between the server and your local environment.


SGDRegressor MSE: 0.5911036852513518
SGDRegressor MAE: 0.5638203185808013
SGDRegressor R2: 0.2573450961761776
LinearSVR


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn==1.3.0'. Your UDF might not work when the package version is different between the server and your local environment.


LinearSVR MSE: 0.48555950380657903
LinearSVR MAE: 0.497915045290663
LinearSVR R2: 0.38994941903148417


In [15]:
test.show()

--------------------------------------------------------
|"PREDICTIONS_SGDREGRESSOR"  |"PREDICTIONS_LINEARSVR"  |
--------------------------------------------------------
|5.367994358476709           |5.269499192414458        |
|5.367994358476709           |5.091083657079792        |
|5.367994358476709           |5.302380489762069        |
|5.367994358476709           |4.964908195867796        |
|5.367994358476709           |4.859939892167627        |
|5.367994358476709           |5.843309701689729        |
|5.367994358476709           |6.6160259500357          |
|5.367994358476709           |5.336366639516772        |
|5.367994358476709           |5.142199548781186        |
|5.367994358476709           |5.646059298483008        |
--------------------------------------------------------

