## Load Healthcare Stroke data into Snowflake

In [23]:
import json
# Parse the Snowflake connection settings from the local credentials file.
with open('creds.json') as f:
    data = json.load(f)

# Unpack each setting into its own notebook-level name for the cells below.
# A missing key raises KeyError.
username, password, account, warehouse, database, schema, role = (
    data[key]
    for key in ('username', 'password', 'account', 'warehouse',
                'database', 'schema', 'role')
)

Snowpark Python offers a DataFrame API for querying and transforming data in Snowflake, with the work executed entirely on Snowflake virtual warehouses.

In [24]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T
from snowflake.snowpark import Window
import pandas as pd
import numpy as np

In [25]:
# Bundle the values read from creds.json into the mapping that is passed to
# the session builder.
CONNECTION_PARAMETERS = dict(
    account=account,
    user=username,
    password=password,
    schema=schema,
    database=database,
    warehouse=warehouse,
    role=role,
)

First, we establish a Snowpark session using our account credentials and connection parameters.

In [26]:
# Open the Snowpark session that every later cell uses to talk to Snowflake.
session = (
    Session.builder
    .configs(CONNECTION_PARAMETERS)
    .create()
)

### Uploading Stroke Data into Snowflake Stage

In [28]:
# Stage name and the local CSV file that will be uploaded to it.
data_stage_name = "DATA_STAGE"
data_file_path = './data/train_strokes.csv'

# (Re)create the stage; the rows returned by the statement are not needed.
_ = session.sql(f"CREATE or REPLACE STAGE {data_stage_name}").collect()

print(f'Putting {data_file_path} to stage: {data_stage_name}')

# Upload into the stage's "stroke" folder, overwriting any earlier copy.
# This call stays the final expression so the cell displays the PutResult.
session.file.put(
    local_file_name=data_file_path,
    stage_location=f'{data_stage_name}/stroke',
    source_compression='NONE',
    overwrite=True,
)

Putting ./data/train_strokes.csv to stage: DATA_STAGE


[PutResult(source='train_strokes.csv', target='train_strokes.csv.gz', source_size=2635787, target_size=540464, source_compression='NONE', target_compression='GZIP', status='UPLOADED', message='')]

In [29]:
# Ask Snowflake for the stage's contents and display the returned rows.
listresults = session.sql("list @" + data_stage_name).collect()
listresults

[Row(name='data_stage/stroke/train_strokes.csv.gz', size=540464, md5='21cd5c8d32316caacdf909b0d1d51610', last_modified='Tue, 1 Nov 2022 17:02:06 GMT')]

### Load the Stroke data into Snowflake

##### Define the schema

In [30]:
# Column layout used both to create the target table and to read the staged CSV.
# NOTE(review): BMI is declared as a string although the previewed rows look
# numeric — confirm whether this is deliberate before changing it.
load_schema = T.StructType([
    T.StructField(column_name, column_type)
    for column_name, column_type in (
        ("ID", T.IntegerType()),
        ("GENDER", T.StringType()),
        ("AGE", T.IntegerType()),
        ("HYPERTENSION", T.IntegerType()),
        ("HEART_DISEASE", T.IntegerType()),
        ("EVER_MARRIED", T.StringType()),
        ("WORK_TYPE", T.StringType()),
        ("RESIDENCE_TYPE", T.StringType()),
        ("AVG_GLUCOSE_LEVEL", T.FloatType()),
        ("BMI", T.StringType()),
        ("SMOKING_STATUS", T.StringType()),
        ("STROKE", T.IntegerType()),
    )
])

# Name of the table the stroke data is loaded into.
table_name = "STROKE"

##### Create an empty table

In [31]:

# Build a one-row dataframe of NULLs so it carries the schema, drop that row,
# and write the now-empty result over the target table.
(
    session.create_dataframe([[None] * len(load_schema.names)], schema=load_schema)
    .na.drop()
    .write
    .mode("overwrite")
    .save_as_table(table_name)
)

##### Copy the staged Stroke data into the table

In [32]:
# File-format options handed directly to the copy_into_table call below.
csv_file_format_options = {
    "FIELD_OPTIONALLY_ENCLOSED_BY": "'\"'",
    "skip_header": 1,
}

# Read the staged CSV with the explicit schema and copy it into the table.
# NOTE(review): NULL_IF is set twice on the same reader; presumably the second
# value replaces the first rather than adding to it — confirm which is intended.
loadresults = (
    session.read
    .option("SKIP_HEADER", 1)
    # "\042" is the octal escape for a double-quote character.
    .option("FIELD_OPTIONALLY_ENCLOSED_BY", "\042")
    .option("COMPRESSION", "GZIP")
    .option("NULL_IF", "\\\\N")
    .option("NULL_IF", "NULL")
    .schema(load_schema)
    .csv(f'@{data_stage_name}/stroke/')
    .copy_into_table(table_name, format_type_options=csv_file_format_options)
)

In [33]:
# Display the rows returned by copy_into_table in the previous cell.
loadresults

[Row(file='data_stage/stroke/train_strokes.csv.gz', status='LOADED', rows_parsed=43400, rows_loaded=43400, error_limit=1, errors_seen=0, first_error=None, first_error_line=None, first_error_character=None, first_error_column_name=None)]

In [34]:
# Reference the loaded table and preview 10 of its rows as a pandas DataFrame.
stroke_snowdf = session.table(table_name)
stroke_snowdf.limit(10).to_pandas()

Unnamed: 0,ID,GENDER,AGE,HYPERTENSION,HEART_DISEASE,EVER_MARRIED,WORK_TYPE,RESIDENCE_TYPE,AVG_GLUCOSE_LEVEL,BMI,SMOKING_STATUS,STROKE
0,30669,Male,3,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14,0,0,No,Never_worked,Rural,161.28,19.1,,0
5,32257,Female,47,0,0,Yes,Private,Urban,210.95,50.1,,0
6,52800,Female,52,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
7,41413,Female,75,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
8,15266,Female,32,0,0,Yes,Private,Rural,77.67,32.3,smokes,0
9,28674,Female,74,1,0,Yes,Self-employed,Urban,205.84,54.6,never smoked,0


In [35]:
# Row count of the loaded table, for comparison with rows_loaded in the copy result.
stroke_snowdf.count()

43400