In [None]:
!pip install snowflake['ml']

In [None]:
!pip uninstall cloudpickle -y 
!pip install cloudpickle==2.2.1

In [None]:
# !pip uninstall numpy -y
# !pip install numpy==1.5.1

In [None]:
# !pip uninstall snowflake-snowpark-python -y
# !pip install snowflake-snowpark-python==1.5.1

In [19]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.types import StructType, StructField, DoubleType, StringType
import snowflake.snowpark.functions as F

In [20]:
# Settings handed to Session.builder.configs(); the password is read from the
# SF_Password environment variable instead of being written in the notebook.
connection_parameters = dict(
    account="ug94937.us-east4.gcp",
    user="ADITYASINGH",
    password=os.environ.get('SF_Password'),
    role="ADITYASINGH",  # optional
    warehouse="FOSFOR_INSIGHT_WH",  # optional
    # authenticator="externalbrowser",  # optional
    database="FIRST_DB",  # optional
    schema="PUBLIC",  # optional
)

In [21]:
# Make a Snowpark Connection

################################################################################################################
#  You can also use the SnowSQL Client to configure your connection params:
#  https://docs.snowflake.com/en/user-guide/snowsql-install-config.html
#
#  >>> from snowflake.ml.utils import connection_params
#  >>> session = Session.builder.configs(connection_params.SnowflakeLoginOptions()
#  >>> ).create()   
#
#  NOTE: If you have named connection params then specify the connection name
#  Example:
#  
#  >>> session = Session.builder.configs(
#  >>> connection_params.SnowflakeLoginOptions(connection_name='connections.snowml')
#  >>> ).create()
#
#################################################################################################################

# Create the Snowflake Session object from the connection_parameters dict
# defined in the previous cell.
# Alternative (needs `import json` and an edited connection.json file):
# connection_parameters = json.load(open('connection.json'))
session = Session.builder.configs(connection_parameters).create()
# Turn on the session's SQL simplifier option.
session.sql_simplifier_enabled = True

# Single-row query result: column 0 is the current user, column 1 the Snowflake version.
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
# Snowpark client version; its first three components are printed below.
snowpark_version = VERSION

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))


Connection Established with the following parameters:
User                        : ADITYASINGH
Role                        : "ADITYASINGH"
Database                    : "FIRST_DB"
Schema                      : "PUBLIC"
Warehouse                   : "FOSFOR_INSIGHT_WH"
Snowflake version           : 8.20.10
Snowpark for Python version : 1.17.0


In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBRegressor

In [27]:
# Load the sample dataset into a pandas DataFrame.
# NOTE(review): absolute path - assumes the CSV is available at /data in the
# environment running this notebook; confirm before running elsewhere.
data = pd.read_csv('/data/mlflow_sample_data.csv')

In [28]:
def encoding(df, target_column, label_threshold=10):
    """
    Split a dataframe into target and feature frames and encode the
    non-numeric columns.

    The target column is label encoded when its dtype is object or bool.
    Feature columns of dtype object or bool are label encoded when they hold
    at least ``label_threshold`` distinct values and one hot encoded otherwise.
    All returned frames keep the row index of ``df``; ``df`` itself is not modified.

    :param:
    df: input dataframe
    target_column: target column
    label_threshold: minimum number of distinct values at which a non-numeric
        feature column is label encoded instead of one hot encoded (default 10)
    :returns:
    df_target: target dataframe
    le_target: target label encoder object (None when no encoding was needed)
    df_feature: feature dataframe
    le_dict_feature: dict of feature label encoder objects, keyed by column name
    oh_enc_feature: feature one hot encoder object (None when not used)
    le_column_feature: list of feature label encoder columns
    oh_column_feature: list of feature one hot encoder columns
    :raises:
    KeyError: if target_column is not a column of df
    """
    # Work on an explicit copy so the encoded values are written to an
    # independent frame (avoids pandas' SettingWithCopyWarning).
    df_target = df[[target_column]].copy()
    le_target = None
    # Target column validation and encoding
    if df.dtypes[target_column].name in ['object', 'bool']:
        print(f"target_column is of {df.dtypes[target_column].name} datatype, encoding required.")
        le_target = LabelEncoder()
        # Assign the encoded array directly: wrapping it in a DataFrame would make
        # pandas align on index labels and misplace values for a non 0..n-1 index.
        df_target[target_column] = le_target.fit_transform(df_target[target_column].astype(str))
        print(f"Target column label encoded {df_target[target_column]}, object: {le_target}")

    # Feature column validation and encoding
    df_feature = df.drop(target_column, axis=1)
    non_numeric_cols = df_feature.select_dtypes(include=['object', 'bool']).columns.tolist()
    le_dict_feature = {}
    le_column_feature = []
    oh_column_feature = []
    oh_enc_feature = None
    if non_numeric_cols:
        print(f"{non_numeric_cols} columns are non numeric in feature dataset, encoding required.")
        # High-cardinality columns are label encoded to avoid a column explosion.
        for col in non_numeric_cols:
            if df_feature[col].nunique() >= label_threshold:
                le_column_feature.append(col)
            else:
                oh_column_feature.append(col)

        print(f"Columns identified to be encoded with label encoder: {le_column_feature}\n"
              f"Columns identified to be encoded with one hot encoder: {oh_column_feature}")

        # columns to be label encoded
        for col in le_column_feature:
            le_dict_feature[col] = LabelEncoder()
            df_feature[col] = le_dict_feature[col].fit_transform(df_feature[col].astype(str))
            print(f"{col} column label encoded {df_feature[col]}, object: {le_dict_feature[col]}")

        # columns to be one hot encoded
        if oh_column_feature:
            oh_enc_feature = OneHotEncoder()
            oh_encoded_array = oh_enc_feature.fit_transform(df_feature[oh_column_feature]).toarray()
            # Take the column names from the fitted encoder so they always match the
            # encoded array (pd.get_dummies can yield a different number of columns,
            # e.g. for bool columns or missing values).
            oh_feature_names = oh_enc_feature.get_feature_names_out(oh_column_feature).tolist()
            # Reuse the feature index so rows stay aligned when the frames are combined.
            df_oh_enc = pd.DataFrame(oh_encoded_array, columns=oh_feature_names, index=df_feature.index)
            df_feature = pd.concat([df_feature.drop(columns=oh_column_feature), df_oh_enc], axis=1)
            print(f"new one hot encoded df: {oh_encoded_array}\n"
                  f"one hot encoder object: {oh_enc_feature}\n")
        print(f"final feature df created: {df_feature}")
    return df_target, le_target, df_feature, le_dict_feature, oh_enc_feature, le_column_feature, oh_column_feature

In [29]:
# Encode the loaded data using 'quality' as the target column and keep every
# returned frame and encoder object for later use.
(
    df_target,
    le_target,
    df_feature,
    le_dict_feature,
    oh_enc_feature,
    le_column_feature,
    oh_column_feature,
) = encoding(data, 'quality')

In [33]:
# Instantiate the Snowflake ML XGBRegressor with its default arguments.
# NOTE(review): no input_cols / label_cols / output_cols are passed here, unlike
# the commented-out examples later in this notebook - presumably they must be
# set before fitting; confirm.
model = XGBRegressor()

In [None]:
# import numpy as np
# import pandas as pd
# import random
# import string

# from sklearn.datasets import make_regression
# from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder
# from snowflake.ml.modeling.pipeline import Pipeline
# from snowflake.snowpark import Session

# # Create a session with your preferred method
# # session =

# NUMERICAL_COLS = ["X1", "X2", "X3"]
# CATEGORICAL_COLS = ["C1", "C2", "C3"]
# FEATURE_COLS = NUMERICAL_COLS + CATEGORICAL_COLS
# CATEGORICAL_OUTPUT_COLS = ["C1_OUT", "C2_OUT", "C3_OUT"]
# FEATURE_OUTPUT_COLS = ["X1_FEAT_OUT", "X2_FEAT_OUT", "X3_FEAT_OUT", "C1_FEAT_OUT", "C2_FEAT_OUT", "C3_FEAT_OUT"]

# # Create a dataset with numerical and categorical features
# X, _ = make_regression(
#     n_samples=1000,
#     n_features=3,
#     noise=0.1,
#     random_state=0,
# )
# X = pd.DataFrame(X, columns=NUMERICAL_COLS)

# def generate_random_string(length):
#     return "".join(random.choices(string.ascii_uppercase, k=length))

# categorical_feature_length = 2
# categorical_features = {}
# for c in CATEGORICAL_COLS:
#     categorical_column = [generate_random_string(categorical_feature_length) for _ in range(X.shape[0])]
#     categorical_features[c] = categorical_column

# X = X.assign(**categorical_features)

# features_df = session.create_dataframe(X)

# # Fit a pipeline with OrdinalEncoder and MinMaxScaler on Snowflake
# pipeline = Pipeline(
#     steps=[
#         (
#             "OE",
#             OrdinalEncoder(
#                 input_cols=CATEGORICAL_COLS,
#                 output_cols=CATEGORICAL_OUTPUT_COLS,
#             )
#         ),
#         (
#             "MMS",
#             MinMaxScaler(
#                 input_cols=NUMERICAL_COLS + CATEGORICAL_OUTPUT_COLS,
#                 output_cols=FEATURE_OUTPUT_COLS,
#             )
#         ),
#     ]
# )

# pipeline.fit(features_df)

# # Use the pipeline to transform a dataset.
# result = pipeline.transform(features_df)

In [None]:
# from snowflake.ml.modeling.xgboost import XGBRegressor

In [None]:
# xgboost_model = XGBRegressor(
#     input_cols=FEATURE_COLS,
#     label_cols=CATEGORICAL_OUTPUT_COLS,
#     output_cols=FEATURE_OUTPUT_COLS
# )

# xgboost_model.fit(features_df)

# # Use the model to make predictions.
# predictions = xgboost_model.predict(features_df)
# predictions[FEATURE_OUTPUT_COLS].show()

In [None]:
# import pandas as pd
# from sklearn.datasets import make_classification

# from snowflake.ml.modeling.xgboost import XGBRegressor
# from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
# from snowflake.snowpark import Session


In [None]:
# FEATURE_COLS = ["X1", "X2", "X3", "X4", "X5", "X6"]
# LABEL_COLS = ["Y"]
# OUTPUT_COLS = ["PREDICTIONS"]

# # Set up data.
# X, y = make_classification(
#     n_samples=40000,
#     n_features=6,
#     n_informative=4,
#     n_redundant=1,
#     random_state=0,
#     shuffle=True,
# )

# X = pd.DataFrame(X, columns=FEATURE_COLS)
# y = pd.DataFrame(y, columns=LABEL_COLS)

In [None]:
# import pandas as pd
# from sklearn.datasets import make_classification

# from snowflake.ml.modeling.xgboost import XGBRegressor
# from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
# from snowflake.snowpark import Session

# # Create a session with your preferred method
# # session =

# FEATURE_COLS = ["X1", "X2", "X3", "X4", "X5", "X6"]
# LABEL_COLS = ["Y"]
# OUTPUT_COLS = ["PREDICTIONS"]

# # Set up data.
# X, y = make_classification(
#     n_samples=40000,
#     n_features=6,
#     n_informative=4,
#     n_redundant=1,
#     random_state=0,
#     shuffle=True,
# )

# X = pd.DataFrame(X, columns=FEATURE_COLS)
# y = pd.DataFrame(y, columns=LABEL_COLS)

# features_pandas = pd.concat([X, y], axis=1)
# features_df = session.create_dataframe(features_pandas)

# # Train an XGBoost model on snowflake.
# xgboost_model = XGBRegressor(
#     input_cols=FEATURE_COLS,
#     label_cols=LABEL_COLS,
#     output_cols=OUTPUT_COLS
# )

# xgboost_model.fit(features_df)

# # Use the model to make predictions.
# predictions = xgboost_model.predict(features_df)
# predictions[OUTPUT_COLS].show()

In [None]:
# Show the installed packages whose `pip list` line contains "clou"
# (e.g. cloudpickle) to check which version is currently installed.
!pip list | grep clou