# Connect to Snowflake

In [1]:
from dotenv import load_dotenv
# Must run before the connection cell: the Snowflake settings are looked up with
# os.getenv(...), so they have to be present in os.environ first.
load_dotenv()     # loads keys into os.environ so the rest of your code sees them

True

In [2]:
# authenticate into Snowflake
from snowflake.snowpark import Session
import os

# Every connection setting is read from an environment variable so that no
# credential is hard-coded in the notebook.
connection_parameters = {
    "account": os.getenv('SNOWFLAKE_ACCOUNT'),
    "user": os.getenv('SNOWFLAKE_USER'),
    "password": os.getenv('SNOWFLAKE_PASSWORD'),
    "role": os.getenv('SNOWFLAKE_ROLE'),
    "warehouse": os.getenv('SNOWFLAKE_WAREHOUSE'),
    "database": os.getenv('SNOWFLAKE_DATABASE'),
    "schema": os.getenv('SNOWFLAKE_SCHEMA')
}

# os.getenv returns None for an unset variable. Fail fast with a message that
# names the missing variable(s) instead of handing None to the session builder.
# Only the login settings are enforced here; the remaining options are passed
# through unchanged, exactly as before.
required_options = ("account", "user", "password")
missing_env_vars = [
    f"SNOWFLAKE_{option.upper()}"
    for option in required_options
    if not connection_parameters[option]
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Set them in the environment or in the .env file before connecting."
    )

session = Session.builder.configs(connection_parameters).create()

In [3]:
# Sanity check: ask the live session which account it is connected to.
# NOTE(review): the printed value identifies the Snowflake account; consider
# clearing this output before sharing the notebook.
current_account = session.get_current_account()
print(f"Session Current Account: {current_account}")

Session Current Account: "WEVIRIP-NA38028"


# Ingest CSV File

In [4]:
# Upload the local CSV file to the MY_STAGE stage.
purchase_history_local_path = './datasets/purchase_history.csv'
session.file.put(purchase_history_local_path, 'MY_STAGE')

[PutResult(source='purchase_history.csv', target='purchase_history.csv.gz', source_size=154442, target_size=50101, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]

In [5]:
# Describe the CSV layout: one (column name, Snowpark type) pair per column,
# listed in the same order as the columns appear in the file.
import snowflake.snowpark.types as T

purchase_history_columns = [
    ("ID", T.IntegerType()),
    ("Year_Birth", T.IntegerType()),
    ("Education", T.StringType()),
    ("Marital_Status", T.StringType()),
    ("Income", T.IntegerType()),
    ("Kidhome", T.IntegerType()),
    ("Teenhome", T.IntegerType()),
    ("Dt_Customer", T.DateType()),
    ("Recency", T.IntegerType()),
    ("MntWines", T.IntegerType()),
    ("MntFruits", T.IntegerType()),
    ("MntMeatProducts", T.IntegerType()),
    ("MntFishProducts", T.IntegerType()),
    ("MntSweetProducts", T.IntegerType()),
    ("MntGoldProds", T.IntegerType()),
    ("NumDealsPurchases", T.IntegerType()),
    ("NumWebPurchases", T.IntegerType()),
    ("NumCatalogPurchases", T.IntegerType()),
    ("NumStorePurchases", T.IntegerType()),
    ("NumWebVisitsMonth", T.IntegerType()),
]

# Turn the pairs into the StructType passed to the CSV reader.
purchase_history_schema = T.StructType(
    [T.StructField(column_name, column_type)
     for column_name, column_type in purchase_history_columns]
)

In [8]:
# Read the staged CSV with the explicit schema and copy it into the
# PURCHASE_HISTORY table. `purchase_history` holds the result returned by
# copy_into_table (the load report), not the table itself.
purchase_history = (
    session.read
    .option('FIELD_DELIMITER', ',')
    .option('SKIP_HEADER', 1)
    .option('ON_ERROR', 'CONTINUE')
    .schema(purchase_history_schema)
    .csv('@MY_STAGE/purchase_history.csv.gz')
    .copy_into_table('PURCHASE_HISTORY')
)

# ON_ERROR = CONTINUE keeps loading past rows that fail to parse, so a bad
# column type or date format would otherwise go unnoticed. Display the load
# report so skipped rows are visible right here.
# NOTE(review): the saved preview of PURCHASE_HISTORY shows no data rows;
# check this report to confirm the rows were actually loaded.
purchase_history

In [9]:
# Preview the rows that landed in the PURCHASE_HISTORY table.
purchase_history_table = session.table('PURCHASE_HISTORY')
purchase_history_table.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ID"  |"YEAR_BIRTH"  |"EDUCATION"  |"MARITAL_STATUS"  |"INCOME"  |"KIDHOME"  |"TEENHOME"  |"DT_CUSTOMER"  |"RECENCY"  |"MNTWINES"  |"MNTFRUITS"  |"MNTMEATPRODUCTS"  |"MNTFISHPRODUCTS"  |"MNTSWEETPRODUCTS"  |"MNTGOLDPRODS"  |"NUMDEALSPURCHASES"  |"NUMWEBPURCHASES"  |"NUMCATALOGPURCHASES"  |"NUMSTOREPURCHASES"  |"NUMWEBVISITSMONTH"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Ingest JSON File

In [10]:
# Upload the local JSON file to the MY_STAGE stage.
campaign_info_local_path = './datasets/campaign_info.json'
session.file.put(campaign_info_local_path, 'MY_STAGE')

[PutResult(source='campaign_info.json', target='campaign_info.json.gz', source_size=316040, target_size=8324, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]

In [11]:
# Build a DataFrame over the compressed JSON file sitting in the stage.
campaign_info_stage_path = '@My_Stage/campaign_info.json.gz'
df_from_json = session.read.json(campaign_info_stage_path)

In [12]:
# Persist the raw JSON rows (still a single variant column) as CAMPAIGN_INFO_TEMP,
# replacing the table if it is already there.
json_writer = df_from_json.write.mode('overwrite')
json_writer.save_as_table('CAMPAIGN_INFO_TEMP')

In [13]:
# Preview the raw JSON records. This displays the df_from_json DataFrame that was
# read from the staged file, not the CAMPAIGN_INFO_TEMP table; each record is
# shown as one JSON object under the "$1" column.
df_from_json.show()

------------------------
|"$1"                  |
------------------------
|{                     |
|  "AcceptedCmp1": 0,  |
|  "AcceptedCmp2": 0,  |
|  "AcceptedCmp3": 0,  |
|  "AcceptedCmp4": 0,  |
|  "AcceptedCmp5": 0,  |
|  "ID": 5524,         |
|  "Response": 1       |
|}                     |
|{                     |
|  "AcceptedCmp1": 0,  |
|  "AcceptedCmp2": 0,  |
|  "AcceptedCmp3": 0,  |
|  "AcceptedCmp4": 0,  |
|  "AcceptedCmp5": 0,  |
|  "ID": 2174,         |
|  "Response": 0       |
|}                     |
|{                     |
|  "AcceptedCmp1": 0,  |
|  "AcceptedCmp2": 0,  |
|  "AcceptedCmp3": 0,  |
|  "AcceptedCmp4": 0,  |
|  "AcceptedCmp5": 0,  |
|  "ID": 4141,         |
|  "Response": 0       |
|}                     |
|{                     |
|  "AcceptedCmp1": 0,  |
|  "AcceptedCmp2": 0,  |
|  "AcceptedCmp3": 0,  |
|  "AcceptedCmp4": 0,  |
|  "AcceptedCmp5": 0,  |
|  "ID": 6182,         |
|  "Response": 0       |
|}                     |
|{                     |


In [14]:
# flatten DataFrame and write to a table
from snowflake.snowpark.functions import col

# Keys to lift out of each JSON object held in the "$1" column; every key
# becomes a column of the same name in the flattened DataFrame.
campaign_fields = [
    'ID',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Response',
]
df_flatten = df_from_json.select(
    [col('$1')[field].as_(field) for field in campaign_fields]
)

# Overwrite on save so the cell can be re-run: without an explicit mode the
# write fails once CAMPAIGN_INFO exists. This also matches how the
# CAMPAIGN_INFO_TEMP table is written earlier in the notebook.
df_flatten.write.save_as_table('CAMPAIGN_INFO', mode='overwrite')

In [15]:
# Preview the flattened campaign data stored in CAMPAIGN_INFO.
campaign_info_table = session.table('CAMPAIGN_INFO')
campaign_info_table.show()

----------------------------------------------------------------------------------------------------------
|"ID"  |"ACCEPTEDCMP1"  |"ACCEPTEDCMP2"  |"ACCEPTEDCMP3"  |"ACCEPTEDCMP4"  |"ACCEPTEDCMP5"  |"RESPONSE"  |
----------------------------------------------------------------------------------------------------------
|5524  |0               |0               |0               |0               |0               |1           |
|2174  |0               |0               |0               |0               |0               |0           |
|4141  |0               |0               |0               |0               |0               |0           |
|6182  |0               |0               |0               |0               |0               |0           |
|5324  |0               |0               |0               |0               |0               |0           |
|7446  |0               |0               |0               |0               |0               |0           |
|965   |0               |0           

# Ingest Parquet File

In [16]:
# Upload the local Parquet file to the MY_STAGE stage.
complain_info_local_path = './datasets/complain_info.parquet'
session.file.put(complain_info_local_path, 'MY_STAGE')

[PutResult(source='complain_info.parquet', target='complain_info.parquet', source_size=6171, target_size=6171, source_compression='PARQUET', target_compression='PARQUET', status='UPLOADED', message='')]

In [17]:
# Point a DataFrame at the staged Parquet file, then copy its rows into the
# COMPLAINT_INFO table. The copy call is left as the last expression so the
# load report is displayed as the cell output.
complain_info_stage_path = '@My_Stage/complain_info.parquet'
df_raw = session.read.parquet(complain_info_stage_path)
df_raw.copy_into_table('COMPLAINT_INFO')

[Row(file='my_stage/complain_info.parquet', status='LOADED', rows_parsed=2240, rows_loaded=2240, error_limit=1, errors_seen=0, first_error=None, first_error_line=None, first_error_character=None, first_error_column_name=None)]

In [18]:
# Preview the rows loaded into the COMPLAINT_INFO table.
complaint_info_table = session.table('COMPLAINT_INFO')
complaint_info_table.show()

-----------------------------------------------------
|"ID"  |"COMPLAIN"  |"Z_COSTCONTACT"  |"Z_REVENUE"  |
-----------------------------------------------------
|5524  |0           |3                |11           |
|2174  |0           |3                |11           |
|4141  |0           |3                |11           |
|6182  |0           |3                |11           |
|5324  |0           |3                |11           |
|7446  |0           |3                |11           |
|965   |0           |3                |11           |
|6177  |0           |3                |11           |
|4855  |0           |3                |11           |
|5899  |0           |3                |11           |
-----------------------------------------------------



# Close Snowflake Session

In [None]:
# always close a session
# This is the last step of the notebook: it closes the Snowpark session that the
# earlier cells used for every upload, read and table write.
session.close()