# Connect to Snowflake

In [1]:
from dotenv import load_dotenv
# Must run before the connection cell: that cell reads its settings with
# os.getenv(), so the values have to be in os.environ first. The call's return
# value is what the notebook echoes as this cell's output.
load_dotenv()     # loads keys into os.environ so the rest of your code sees them

True

In [8]:
# authenticate into Snowflake
from snowflake.snowpark import Session
import os

# Fail fast when a setting the login cannot work without is unset or empty.
# os.getenv() returns None for a missing variable, which would otherwise only
# surface later as a hard-to-read error from inside the connector.
_required_env = ("SNOWFLAKE_ACCOUNT", "SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Every value comes from the environment so no credential is hard-coded in the
# notebook. role / warehouse / database / schema are passed through as-is and
# may be None when unset.
connection_parameters = {
    "account": os.getenv('SNOWFLAKE_ACCOUNT'),
    "user": os.getenv('SNOWFLAKE_USER'),
    "password": os.getenv('SNOWFLAKE_PASSWORD'),
    "role": os.getenv('SNOWFLAKE_ROLE'),
    "warehouse": os.getenv('SNOWFLAKE_WAREHOUSE'),
    "database": os.getenv('SNOWFLAKE_DATABASE'),
    "schema": os.getenv('SNOWFLAKE_SCHEMA')
}

# `session` is the handle every later cell uses to talk to Snowflake.
session = Session.builder.configs(connection_parameters).create()

In [9]:
# sanity check: ask the live session which account it is connected to
print(f"Session Current Account: {session.get_current_account()}")

Session Current Account: "WEVIRIP-NA38028"


# Working with User Defined Functions (UDF)

In [28]:
# NOTE: the SAMPLE_EMPLOYEE_DATA table (created in the "Working with DataFrames" section) must exist before running this cell

# create a UDF
def last_name_finder(input_name:str):
    """Return the last whitespace-separated word of a full name.

    Args:
        input_name: Full name such as ``"John Doe"``. May be ``None``
            (presumably how a SQL NULL reaches the handler - confirm against
            the UDF's null-handling setting).

    Returns:
        The final word of the name, or ``None`` when there is no last name to
        extract: the input is ``None``, blank, or a single word.
    """
    if input_name is None:
        return None

    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so "  Jane   Smith " still yields two parts.
    name_parts = input_name.split()

    # A one-word (or empty) name has no separate last name; returning None
    # instead of indexing keeps one bad row from failing the whole query.
    if len(name_parts) < 2:
        return None

    # Take the final part rather than the second one so that names with
    # middle names ("Mary Ann Smith") resolve to the actual last word.
    return name_parts[-1]

from snowflake.snowpark.types import StringType, IntegerType, ArrayType

# Register last_name_finder as the permanent UDF LAST_NAME_FINDER (string in,
# string out), overwriting any earlier definition of that name. @MY_STAGE is
# the stage passed as the permanent UDF's stage location.
test = session.udf.register(
    func=last_name_finder,
    name='LAST_NAME_FINDER',
    input_types=[StringType()],
    return_type=StringType(),
    stage_location='@MY_STAGE',
    is_permanent=True,
    replace=True,
)

In [29]:
# execute the UDF in a query
# The UDF is referenced by the name it was registered under, so plain SQL can
# apply it to the NAME column of every row; .show() prints the result table.
session.sql('''SELECT 
    NAME,
    LAST_NAME_FINDER(NAME) AS LAST_NAME
    FROM SAMPLE_EMPLOYEE_DATA
''').show() 

----------------------------------
|"NAME"            |"LAST_NAME"  |
----------------------------------
|John Doe          |Doe          |
|Jane Smith        |Smith        |
|Michael Johnson   |Johnson      |
|Sarah Williams    |Williams     |
|David Brown       |Brown        |
|Emily Davis       |Davis        |
|James Miller      |Miller       |
|Emma Wilson       |Wilson       |
|Alexander Taylor  |Taylor       |
|Olivia Anderson   |Anderson     |
----------------------------------



In [31]:
# call UDF from a DataFrame function
from snowflake.snowpark.functions import col, call_udf
df = session.table('SAMPLE_EMPLOYEE_DATA')
# call_udf refers to the registered UDF by its name and applies it to the
# "name" column; the result is attached as a new "last_name" column and only
# printed - df itself is not reassigned.
df.with_column("last_name", call_udf('LAST_NAME_FINDER', col('name'))).show()

---------------------------------------------------------------------------------------------------------
|"ID"  |"NAME"            |"AGE"  |"EMAIL"                      |"CITY"       |"COUNTRY"  |"LAST_NAME"  |
---------------------------------------------------------------------------------------------------------
|1     |John Doe          |25     |johndoe@example.com          |New York     |USA        |Doe          |
|2     |Jane Smith        |30     |janesmith@example.com        |Los Angeles  |USA        |Smith        |
|3     |Michael Johnson   |35     |michaeljohnson@example.com   |London       |UK         |Johnson      |
|4     |Sarah Williams    |28     |sarahwilliams@example.com    |Leeds        |UK         |Williams     |
|5     |David Brown       |32     |davidbrown@example.com       |Tokyo        |Japan      |Brown        |
|6     |Emily Davis       |29     |emilydavis@example.com       |Sydney       |Australia  |Davis        |
|7     |James Miller      |27     |jamesmiller

# Working with User Defined Table Functions (UDTF)

In [33]:
# NOTE: the SAMPLE_EMPLOYEE_DATA table (created in the "Working with DataFrames" section) must exist before running this cell

# UDTF Handler Class
class CalculateAverage:
    """UDTF handler that emits the arithmetic mean of the values it was fed.

    ``process`` is called with one value at a time and only accumulates;
    ``end_partition`` then emits a single one-column row with the average of
    everything accumulated on this instance.
    """

    def __init__(self):
        # Non-null values collected so far.
        self._values = []

    def process(self, input_measure: int):
        """Record one input value; produces no output rows itself.

        ``None`` inputs (presumably SQL NULLs - confirm) are skipped, so they
        neither break ``sum()`` later nor count towards the divisor.
        """
        if input_measure is not None:
            self._values.append(input_measure)

    def end_partition(self):
        """Yield one ``(average,)`` tuple for the accumulated values.

        When no non-null value was collected the row is ``(None,)`` instead of
        raising ZeroDivisionError, mirroring how SQL ``AVG`` reports NULL for
        a group without values.
        """
        if not self._values:
            yield (None,)
            return
        yield (sum(self._values) / len(self._values),)
        

from snowflake.snowpark.types import StructType, StructField
from snowflake.snowpark.types import FloatType, IntegerType, StringType

# Shape of the rows the UDTF emits: a single float column named Avg_Age.
output_schema = StructType([StructField('Avg_Age', FloatType())])

# Register CalculateAverage as the permanent table function AVERAGE_AGE
# (one integer argument), overwriting any earlier definition of that name.
session.udtf.register(
    handler=CalculateAverage,
    name='AVERAGE_AGE',
    input_types=[IntegerType()],
    output_schema=output_schema,
    stage_location='@MY_STAGE',
    is_permanent=True,
    replace=True,
)

<snowflake.snowpark.udtf.UserDefinedTableFunction at 0x14338e410>

In [34]:
# execute the UDTF
# AVERAGE_AGE is invoked as a table function over the AGE column with the
# rows partitioned by COUNTRY; the query selects COUNTRY together with the
# UDTF's Avg_Age output column.
session.sql('''
    SELECT
        COUNTRY, Avg_Age
    FROM
        SAMPLE_EMPLOYEE_DATA,
        table(AVERAGE_AGE(AGE) OVER (partition by COUNTRY))
''').show()

----------------------------------
|"COUNTRY"  |"AVG_AGE"           |
----------------------------------
|UK         |31.5                |
|USA        |27.333333333333332  |
|Germany    |33.0                |
|Italy      |31.0                |
|Australia  |27.5                |
|Japan      |32.0                |
----------------------------------



# Working with Vectorised UDFs

In [38]:
# NOTE: SAMPLE_EMPLOYEE_DATA table from "Working with DataFrames" section must be created

import pandas as pd
# `col` is imported here as well: this cell uses it below and must not rely on
# another cell having imported it first.
from snowflake.snowpark.functions import pandas_udf, call_udf, col
from snowflake.snowpark.types import IntegerType, PandasSeriesType, StringType

# NOTE(review): the recorded run of this cell ended with
#   ValueError: You cannot create a non-vectorized UDF using pandas_udf().
# although both the inputs and the return type are declared as
# PandasSeriesType. One possible cause is that pandas was not yet installed
# when snowflake.snowpark was first imported in this kernel - restart the
# kernel and re-run to confirm.
@pandas_udf(
    name='column_adder',
    stage_location='@MY_STAGE',
    input_types=[PandasSeriesType(StringType()), PandasSeriesType(StringType())],
    return_type=PandasSeriesType(StringType()),
    is_permanent = True,
    replace = True)
def column_adder(column1, column2):
    """Join two columns element-wise into "<column1>,<column2>" strings.

    Both arguments are cast with ``.astype(str)`` before concatenation, so
    they are expected to be pandas Series (as declared in ``input_types``).
    """
    return column1.astype(str) + "," + column2.astype(str)

df = session.table("SAMPLE_EMPLOYEE_DATA")
# Call the registered UDF by name and attach its result as City_Country.
df2 = df.withColumn('City_Country',
                    call_udf('column_adder', col('CITY'), col('COUNTRY')))
df2.show()

ValueError: You cannot create a non-vectorized UDF using pandas_udf(). Use udf() instead.

# Working with DataFrames

In [13]:
# create a table
# CREATE OR REPLACE means re-running this cell replaces any existing
# SAMPLE_EMPLOYEE_DATA table; .collect() executes the statement and returns
# its status row.
session.sql('CREATE OR REPLACE TABLE SAMPLE_EMPLOYEE_DATA(id INT, name VARCHAR, age INT, email VARCHAR, city VARCHAR, country VARCHAR)').collect()

[Row(status='Table SAMPLE_EMPLOYEE_DATA successfully created.')]

In [14]:
# populate table with some data
# Inserts ten literal rows in (id, name, age, email, city, country) order.
# NOTE(review): this is a plain INSERT, so re-running this cell without first
# re-running the CREATE OR REPLACE cell would presumably add the rows again.
session.sql("""
    INSERT INTO SAMPLE_EMPLOYEE_DATA VALUES
    (1,'John Doe',25,'johndoe@example.com','New York','USA'),
    (2,'Jane Smith',30,'janesmith@example.com','Los Angeles','USA'),
    (3,'Michael Johnson',35,'michaeljohnson@example.com','London',
       'UK'),
    (4,'Sarah Williams',28,'sarahwilliams@example.com','Leeds',
       'UK'),
    (5,'David Brown',32,'davidbrown@example.com','Tokyo','Japan'),
    (6,'Emily Davis',29,'emilydavis@example.com','Sydney',
       'Australia'),
    (7,'James Miller',27,'jamesmiller@example.com','Dallas','USA'),
    (8,'Emma Wilson',33,'emmawilson@example.com','Berlin','Germany'),
    (9,'Alexander Taylor',31,'alexandertaylor@example.com',
       'Rome','Italy'),
    (10,'Olivia Anderson',26,'oliviaanderson@example.com',
        'Melbourne','Australia')
""").collect()

[Row(number of rows inserted=10)]

In [15]:
# return number of rows
# .collect() brings the result back as a list of Row objects - here a single
# row holding the COUNT(*) value.
session.sql('SELECT COUNT(*) FROM SAMPLE_EMPLOYEE_DATA').collect()

[Row(COUNT(*)=10)]

In [16]:
# store query results in a DataFrame
from snowflake.snowpark.functions import col
# Keep only the row(s) whose id equals 1, then print the result.
df_subset_row = session.table("SAMPLE_EMPLOYEE_DATA").filter(col("id") == 1)
df_subset_row.show()

------------------------------------------------------------------------
|"ID"  |"NAME"    |"AGE"  |"EMAIL"              |"CITY"    |"COUNTRY"  |
------------------------------------------------------------------------
|1     |John Doe  |25     |johndoe@example.com  |New York  |USA        |
------------------------------------------------------------------------



# Stored Procedures

In [25]:
# define a Stored Procedure
def subset_table(snowpark_session:Session):
    """Stored-procedure handler: return the NAME and AGE columns as text.

    Args:
        snowpark_session: Session the procedure runs with; used to read the
            SAMPLE_EMPLOYEE_DATA table.

    Returns:
        The ``str()`` of the collected row list, e.g.
        ``"[Row(NAME='John Doe', AGE=25), ...]"``.
    """
    df = snowpark_session.table('SAMPLE_EMPLOYEE_DATA').select('NAME', 'AGE')
    # The procedure is registered with return_type=StringType(), so hand back
    # an actual string instead of relying on the list of rows being coerced
    # implicitly on the way out.
    return str(df.collect())

from snowflake.snowpark.types import StringType
# Add the Snowpark library to the session's package list before registering.
session.add_packages('snowflake-snowpark-python')

# Register subset_table as the permanent, argument-less stored procedure
# SPROC_SUBSET_TABLE (declared to return a string), overwriting any earlier
# definition of that name.
session.sproc.register(
    func=subset_table,
    name='SPROC_SUBSET_TABLE',
    input_types=[],
    return_type=StringType(),
    stage_location='@MY_STAGE',
    is_permanent=True,
    replace=True,
)

<snowflake.snowpark.stored_procedure.StoredProcedure at 0x147204310>

In [26]:
# run the stored procedure
# CALL invokes the procedure by its registered name; .show() prints the single
# string it returns (the display truncates long values).
session.sql(''' CALL SPROC_SUBSET_TABLE() ''').show()

------------------------------------------------------
|"SPROC_SUBSET_TABLE"                                |
------------------------------------------------------
|[Row(NAME='John Doe', AGE=25), Row(NAME='Jane S...  |
------------------------------------------------------



# Close Snowflake Session

In [6]:
# always close a session
# Closes the session opened in the "Connect to Snowflake" section. Run this
# last: cells that use `session` afterwards will presumably fail until the
# connection cell is run again - TODO confirm.
session.close()