In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf, sum, col, array_construct, month, year, call_udf, lit
from snowflake.snowpark.types import Variant
from snowflake.snowpark.version import VERSION

# Snowpark ML
from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import PolynomialFeatures, StandardScaler
from snowflake.ml.modeling.linear_model import LinearRegression
from snowflake.ml.modeling.model_selection import GridSearchCV

# Misc
import json
import logging
# Raise the threshold of the Snowpark session logger to ERROR.
_SNOWPARK_SESSION_LOGGER = "snowflake.snowpark.session"
logger = logging.getLogger(_SNOWPARK_SESSION_LOGGER)
logger.setLevel(logging.ERROR)

In [2]:
# Read the Snowflake connection settings from a local JSON file. The `with`
# block closes the file handle as soon as parsing finishes instead of leaving
# that to garbage collection.
# NOTE(review): this is a machine-specific absolute path — consider making it
# configurable before sharing the notebook.
with open("C:\\Users\\argupta\\Snowflake\\Snowpark\\auth.json") as auth_file:
    connection_parameters = json.load(auth_file)

# Create the Snowpark session from those settings and enable the SQL simplifier.
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# Capture the connected user / server version and the Snowpark client version.
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

In [3]:
# Make demo_db.public the session's current database and schema.
session.use_database("demo_db")
session.use_schema("public")

In [4]:
# Build the modeling dataset in one chained expression: read the monthly
# spend/revenue table, discard rows with missing values, then drop the
# YEAR and MONTH columns, which are not needed for modeling.
df_spend_revenue_per_month = (
    session.table('spend_and_revenue_per_month')
    .dropna()
    .drop(['YEAR', 'MONTH'])
)

# Persist the features as a Snowflake table called MARKETING_BUDGET_FEATURES
# (overwriting any existing table of that name), then preview the rows.
df_spend_revenue_per_month.write.mode('overwrite').save_as_table('MARKETING_BUDGET_FEATURES')
df_spend_revenue_per_month.show()

---------------------------------------------------------------------
|"SEARCH_ENGINE"  |"SOCIAL_MEDIA"  |"VIDEO"  |"EMAIL"  |"REVENUE"   |
---------------------------------------------------------------------
|516431           |517618          |516729   |517208   |3264300.11  |
|506497           |504679          |501098   |501947   |3208482.33  |
|522780           |521395          |522762   |518405   |3311966.98  |
|519959           |520537          |520685   |521584   |3311752.81  |
|507211           |507404          |511364   |507363   |3208563.06  |
|505715           |505221          |505292   |503748   |3185894.64  |
|522151           |518635          |520583   |521167   |3316455.44  |
|467736           |474679          |469856   |469784   |2995042.21  |
|518044           |523408          |523688   |519430   |3310662.6   |
|521339           |521528          |519625   |521698   |3314107.1   |
---------------------------------------------------------------------



In [11]:
# Model training: polynomial + standardized features feeding a linear
# regression, wrapped in a cross-validated grid search.

CROSS_VALIDATION_FOLDS = 10
POLYNOMIAL_FEATURE_DEGREE = 2

# 80/20 train/test split of the saved feature table, with a fixed seed.
features_table = session.table("MARKETING_BUDGET_FEATURES")
train_df, test_df = features_table.random_split(weights=[0.8, 0.2], seed=0)

# Numeric preprocessing: PolynomialFeatures followed by StandardScaler.
numeric_features = ['SEARCH_ENGINE', 'SOCIAL_MEDIA', 'VIDEO', 'EMAIL']
numeric_transformer = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=POLYNOMIAL_FEATURE_DEGREE)),
        ('scalar', StandardScaler()),
    ]
)

# Route the numeric columns through that preprocessing via a column transformer.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Full pipeline: preprocessing followed by the regression estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LinearRegression()),
    ]
)
parameters = {}  # empty grid: no hyperparameters are varied

# Grid search over the pipeline using CROSS_VALIDATION_FOLDS folds.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=CROSS_VALIDATION_FOLDS,
    label_cols=["REVENUE"],
    output_cols=["PREDICTED_REVENUE"],
    verbose=2,
)

# Fit on the training split, then score both splits.
model.fit(train_df)
train_r2_score = model.score(train_df)
test_r2_score = model.score(test_df)

# Report the scores for the train and test datasets.
print(f"R2 score on Train : {train_r2_score}")
print(f"R2 score on test : {test_r2_score}")

R2 score on Train : 0.9579410972127744
R2 score on test : 0.8402022470039145


In [10]:
# Display the current list of numeric feature column names.
numeric_features

['SEARCH_ENGINE', 'SOCIAL_MEDIA', 'VIDEO', 'EMAIL']

In [12]:
# Save trained Model to Snowflake Stage

import os
import tempfile
from joblib import dump

# Extract SKLearn object
sk_model = model.to_sklearn()

# Use the platform's temp directory rather than a hard-coded '/tmp', which is
# not guaranteed to exist on every OS (e.g. Windows).
model_output_dir = tempfile.gettempdir()
# Forward slashes keep the path usable inside the quoted file URI of the PUT
# command on Windows; on POSIX systems this replace is a no-op.
model_file = os.path.join(model_output_dir, 'model.joblib').replace(os.sep, '/')

# Serialize the model locally, then upload it to the @camp_models stage,
# replacing any previously uploaded copy.
dump(sk_model, model_file)
session.file.put(model_file, "@camp_models", overwrite=True)

[PutResult(source='model.joblib', target='model.joblib.gz', source_size=5348, target_size=2448, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]

In [13]:
# Create Scalar UDF for inference
"""
Now to deploy this model for inference, let's create and register a Snowpark Python UDF and add the trained model as a dependency. Once registered, getting new predictions is as simple as calling the function by passing in data.
"""

# Reset the session's registered imports and packages before declaring this UDF's own.
session.clear_imports()
session.clear_packages()

# UDF dependencies: the staged model artifact plus the server-side Python
# packages (from the Snowflake Anaconda channel) used to load and run it.
session.add_import("@camp_models/model.joblib.gz")
session.add_packages("pandas", "joblib", "scikit-learn==1.1.1")

@udf(name='predict_roi', session=session, replace=True, is_permanent=True, stage_location='@camp_udfs')
def predict_roi(budget_allocations: list) -> float:
    """Score one budget allocation with the staged model.

    Args:
        budget_allocations: Four spend amounts ordered as SEARCH_ENGINE,
            SOCIAL_MEDIA, VIDEO, EMAIL.

    Returns:
        The absolute value of the model's prediction for that allocation,
        converted to a plain Python float.

    Raises:
        KeyError: If ``sys._xoptions`` has no "snowflake_import_directory" entry.
    """
    import os
    import sys
    import pandas as pd
    from joblib import load
    # Not referenced directly below; presumably imported so the estimator's
    # package is available when the model is unpickled — TODO confirm.
    import sklearn

    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]

    # os.path.join works whether or not import_dir ends with a path separator,
    # unlike plain string concatenation.
    model_file = os.path.join(import_dir, 'model.joblib.gz')
    model = load(model_file)

    features = ['SEARCH_ENGINE', 'SOCIAL_MEDIA', 'VIDEO', 'EMAIL']
    # Single-row frame with named columns, matching the training feature names.
    df = pd.DataFrame([budget_allocations], columns=features)
    roi = abs(model.predict(df)[0])
    # predict() yields a numpy scalar; return the declared plain float.
    return float(roi)

In [14]:
# Call Scalar UDF for inference on new data

# Feature columns, in the order they are packed into the UDF's array argument.
budget_columns = ['SEARCH_ENGINE', 'SOCIAL_MEDIA', 'VIDEO', 'EMAIL']
new_allocations = [
    [250000, 250000, 200000, 450000],
    [500000, 500000, 500000, 500000],
    [8500, 9500, 2000, 500],
]
test_df_2 = session.create_dataframe(new_allocations, schema=budget_columns)

# Pack the four spend columns into one array and pass it to the scalar UDF.
allocation_array = array_construct(*(col(name) for name in budget_columns))
predicted_roi = call_udf("predict_roi", allocation_array).as_("PREDICTED_ROI")
test_df_2.select(*budget_columns, predicted_roi).show()

-----------------------------------------------------------------------------
|"SEARCH_ENGINE"  |"SOCIAL_MEDIA"  |"VIDEO"  |"EMAIL"  |"PREDICTED_ROI"     |
-----------------------------------------------------------------------------
|500000           |500000          |500000   |500000   |3182207.8960703956  |
|250000           |250000          |200000   |450000   |25414662.867387168  |
|8500             |9500            |2000     |500      |2283241.0106569445  |
-----------------------------------------------------------------------------



In [16]:
# Create Vectorized User-Defined Function (UDF) using Batch API for inference
"""
Here we will leverage the Python UDF Batch API to create a vectorized UDF which takes a Pandas Dataframe as input. This means that each call to the UDF receives a set/batch of rows compared to a Scalar UDF which gets one row as input.

First we will create a helper function load_model() that uses cachetools to make sure we only load the model once followed by batch_predict_roi() function that does the inference.
"""

# Reset the session's registered imports and packages before declaring this UDF's own.
session.clear_imports()
session.clear_packages()

import cachetools
from snowflake.snowpark.types import PandasSeries, PandasDataFrame

session.add_import('@camp_models/model.joblib.gz')
# Pin scikit-learn to the same version the scalar predict_roi UDF declares:
# both UDFs unpickle the same model.joblib.gz, so they should deserialize it
# with the same library version rather than whatever an unpinned spec resolves to.
session.add_packages('pandas','joblib','scikit-learn==1.1.1','cachetools')

@cachetools.cached(cache={})
def load_model(filename):
    """Load a joblib-serialized model from the UDF import directory.

    The function is wrapped with ``cachetools.cached``, so repeated calls with
    the same ``filename`` reuse the cached result instead of reading the file
    again.

    Args:
        filename: Name of the model file inside the import directory.

    Returns:
        The object deserialized by ``joblib.load``.

    Raises:
        KeyError: If ``sys._xoptions`` has no "snowflake_import_directory" entry.
        RuntimeError: If the import directory entry is empty.
    """
    import joblib
    import sys
    import os

    IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
    import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]

    # Fail loudly instead of returning (and caching) None, which would only
    # surface later as an AttributeError on `.predict`.
    if not import_dir:
        raise RuntimeError(
            f"Cannot load {filename!r}: the '{IMPORT_DIRECTORY_NAME}' option is empty"
        )

    with open(os.path.join(import_dir, filename), 'rb') as file:
        return joblib.load(file)

@udf(name='batch_predict_roi', session=session, replace=True, is_permanent=True, stage_location='@camp_udfs')
def batch_predict_roi(budget_allocation_df: PandasDataFrame[int, int, int, int]) -> PandasSeries[float]:
    """Score a batch of budget allocations with the model from load_model().

    Args:
        budget_allocation_df: Frame with four integer columns; they are renamed
            in place to SEARCH_ENGINE, SOCIAL_MEDIA, VIDEO, EMAIL (in that order).

    Returns:
        The absolute values of the model's predictions, one per input row.
    """
    import sklearn

    # Give the positional input columns the feature names used for prediction.
    feature_names = ['SEARCH_ENGINE', 'SOCIAL_MEDIA', 'VIDEO', 'EMAIL']
    budget_allocation_df.columns = feature_names

    predictions = load_model('model.joblib.gz').predict(budget_allocation_df)
    return abs(predictions)

In [17]:
# Call Vectorized User-Defined Function (UDF) using Batch API for inference on new data points

# Each feature column is passed to the vectorized UDF as its own argument.
batch_feature_columns = ['SEARCH_ENGINE', 'SOCIAL_MEDIA', 'VIDEO', 'EMAIL']
batch_predicted_roi = call_udf(
    "batch_predict_roi",
    *(col(name) for name in batch_feature_columns),
).as_("PREDICTED_ROI")
test_df_2.select(*batch_feature_columns, batch_predicted_roi).show()

-----------------------------------------------------------------------------
|"SEARCH_ENGINE"  |"SOCIAL_MEDIA"  |"VIDEO"  |"EMAIL"  |"PREDICTED_ROI"     |
-----------------------------------------------------------------------------
|250000           |250000          |200000   |450000   |25414662.867387168  |
|500000           |500000          |500000   |500000   |3182207.8960703956  |
|8500             |9500            |2000     |500      |2283241.0106569445  |
-----------------------------------------------------------------------------

