In [31]:
# Snowpark for Python
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.functions import udf
import snowflake.snowpark.functions as F

# Snowpark ML
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.snowpark.types import DecimalType

import pandas as pd
import numpy as np

In [2]:
connection_parameters = {
    "account": "ug94937.us-east4.gcp",
    "user": "ADITYASINGH",
    "password": os.environ.get('SF_Password'),
    "role": "ADITYASINGH",  # optional
    "warehouse": "FOSFOR_INSIGHT_WH",  # optional
#     "authenticator": "externalbrowser", # optional
    "database": "FIRST_DB",  # optional
    "schema": "PUBLIC",  # optional
} 

In [3]:
# Make a Snowpark Connection

################################################################################################################
#  You can also use the SnowSQL Client to configure your connection params:
#  https://docs.snowflake.com/en/user-guide/snowsql-install-config.html
#
#  >>> from snowflake.ml.utils import connection_params
#  >>> session = Session.builder.configs(connection_params.SnowflakeLoginOptions()
#  >>> ).create()   
#
#  NOTE: If you have named connection params then specify the connection name
#  Example:
#  
#  >>> session = Session.builder.configs(
#  >>> connection_params.SnowflakeLoginOptions(connection_name='connections.snowml')
#  >>> ).create()
#
#################################################################################################################

# Edit the connection.json before creating the session object below
# Create Snowflake Session object
# connection_parameters = json.load(open('connection.json'))
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))


Connection Established with the following parameters:
User                        : ADITYASINGH
Role                        : "ADITYASINGH"
Database                    : "FIRST_DB"
Schema                      : "PUBLIC"
Warehouse                   : "FOSFOR_INSIGHT_WH"
Snowflake version           : 8.20.10
Snowpark for Python version : 1.17.0


In [14]:
data = pd.read_csv('/data/diamonds.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [16]:
data=data.drop("Unnamed: 0" , axis =1)
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [23]:
data.columns = map(str.upper, data.columns)
data.head()

Unnamed: 0,CARAT,CUT,COLOR,CLARITY,DEPTH,TABLE,PRICE,X,Y,Z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [24]:
#moving data to snowflake df
diamonds_df = session.create_dataframe(data)
diamonds_df.show()

------------------------------------------------------------------------------------------------
|"CARAT"  |"CUT"      |"COLOR"  |"CLARITY"  |"DEPTH"  |"TABLE"  |"PRICE"  |"X"   |"Y"   |"Z"   |
------------------------------------------------------------------------------------------------
|0.23     |Ideal      |E        |SI2        |61.5     |55.0     |326      |3.95  |3.98  |2.43  |
|0.21     |Premium    |E        |SI1        |59.8     |61.0     |326      |3.89  |3.84  |2.31  |
|0.23     |Good       |E        |VS1        |56.9     |65.0     |327      |4.05  |4.07  |2.31  |
|0.29     |Premium    |I        |VS2        |62.4     |58.0     |334      |4.2   |4.23  |2.63  |
|0.31     |Good       |J        |SI2        |63.3     |58.0     |335      |4.34  |4.35  |2.75  |
|0.24     |Very Good  |J        |VVS2       |62.8     |57.0     |336      |3.94  |3.96  |2.48  |
|0.24     |Very Good  |I        |VVS1       |62.3     |57.0     |336      |3.95  |3.98  |2.47  |
|0.26     |Very Good  |H      

In [25]:
# Categorize all the features for modeling
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
CATEGORICAL_COLUMNS_OE = ["CUT_OE", "COLOR_OE", "CLARITY_OE"] # To name the ordinal encoded columns
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "TABLE", "X", "Y", "Z"]

LABEL_COLUMNS = ['PRICE']
OUTPUT_COLUMNS = ['PREDICTED_PRICE']

In [26]:
diamonds_df.show()

------------------------------------------------------------------------------------------------
|"CARAT"  |"CUT"      |"COLOR"  |"CLARITY"  |"DEPTH"  |"TABLE"  |"PRICE"  |"X"   |"Y"   |"Z"   |
------------------------------------------------------------------------------------------------
|0.23     |Ideal      |E        |SI2        |61.5     |55.0     |326      |3.95  |3.98  |2.43  |
|0.21     |Premium    |E        |SI1        |59.8     |61.0     |326      |3.89  |3.84  |2.31  |
|0.23     |Good       |E        |VS1        |56.9     |65.0     |327      |4.05  |4.07  |2.31  |
|0.29     |Premium    |I        |VS2        |62.4     |58.0     |334      |4.2   |4.23  |2.63  |
|0.31     |Good       |J        |SI2        |63.3     |58.0     |335      |4.34  |4.35  |2.75  |
|0.24     |Very Good  |J        |VVS2       |62.8     |57.0     |336      |3.94  |3.96  |2.48  |
|0.24     |Very Good  |I        |VVS1       |62.3     |57.0     |336      |3.95  |3.98  |2.47  |
|0.26     |Very Good  |H      

In [29]:
###PREPROCESSING
# Normalize the CARAT column
snowml_mms = snowml.MinMaxScaler(input_cols=["carat"], output_cols=["CARAT_NORM"])
normalized_diamonds_df = snowml_mms.fit(diamonds_df).transform(diamonds_df)

# Reduce the number of decimals
new_col = normalized_diamonds_df.col("CARAT_NORM").cast(DecimalType(7, 6))
normalized_diamonds_df = normalized_diamonds_df.with_column("CARAT_NORM", new_col)

normalized_diamonds_df.show()

---------------------------------------------------------------------------------------------------------------
|"CARAT"  |"CUT"      |"COLOR"  |"CLARITY"  |"DEPTH"  |"TABLE"  |"PRICE"  |"X"   |"Y"   |"Z"   |"CARAT_NORM"  |
---------------------------------------------------------------------------------------------------------------
|0.23     |Ideal      |E        |SI2        |61.5     |55.0     |326      |3.95  |3.98  |2.43  |0.006237      |
|0.21     |Premium    |E        |SI1        |59.8     |61.0     |326      |3.89  |3.84  |2.31  |0.002079      |
|0.23     |Good       |E        |VS1        |56.9     |65.0     |327      |4.05  |4.07  |2.31  |0.006237      |
|0.29     |Premium    |I        |VS2        |62.4     |58.0     |334      |4.2   |4.23  |2.63  |0.018711      |
|0.31     |Good       |J        |SI2        |63.3     |58.0     |335      |4.34  |4.35  |2.75  |0.022869      |
|0.24     |Very Good  |J        |VVS2       |62.8     |57.0     |336      |3.94  |3.96  |2.48  |0.008316

In [32]:
# Encode CUT and CLARITY preserve ordinal importance
categories = {
    "CUT": np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"]),
    "CLARITY": np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
}
snowml_oe = snowml.OrdinalEncoder(input_cols=["CUT", "CLARITY"], output_cols=["CUT_OE", "CLARITY_OE"], categories=categories)
ord_encoded_diamonds_df = snowml_oe.fit(normalized_diamonds_df).transform(normalized_diamonds_df)

# Show the encoding
print(snowml_oe._state_pandas)

ord_encoded_diamonds_df.show()

ValueError: (2112) Found unknown categories during fit:
  COLUMN_NAME UNKNOWN_VALUE
0         CUT          Fair
1         CUT     Very Good
2         CUT         Ideal
3         CUT          Good
4         CUT       Premium