In [None]:
-- Setup cell: stage the public diamonds CSV and load it into SNOWPUBLIC.NOTEBOOKS.DIAMONDS2.
-- Select the schema and role that the rest of the statements run under.
USE SNOWPUBLIC.NOTEBOOKS;
USE ROLE PUBLIC;

-- CSV file format that skips the first (header) line of each file.
-- IF NOT EXISTS: an already existing format named csvformat2 is left as it is.
CREATE FILE FORMAT IF NOT EXISTS csvformat2 
    SKIP_HEADER = 1 
    TYPE = 'CSV';


-- create external stage with the csv format to stage the diamonds dataset
-- The stage URL points directly at the single diamonds.csv object.
CREATE STAGE IF NOT EXISTS diamond_assets 
    FILE_FORMAT = csvformat2 
    URL = 's3://sfquickstarts/intro-to-machine-learning-with-snowpark-ml-for-python/diamonds.csv';

-- Target table. CREATE OR REPLACE means every run of this cell recreates the table
-- from scratch before the COPY below repopulates it.
CREATE OR REPLACE TABLE SNOWPUBLIC.NOTEBOOKS.DIAMONDS2 (
	CARAT NUMBER(38,2),
	CUT VARCHAR(16777216),
	COLOR VARCHAR(16777216),
	CLARITY VARCHAR(16777216),
	DEPTH NUMBER(38,1),
	-- Quoted because TABLE is an SQL keyword.
	"TABLE" NUMBER(38,1),
	PRICE NUMBER(38,0),
	X NUMBER(38,2),
	Y NUMBER(38,2),
	Z NUMBER(38,2)
);

-- Load the staged file into the table. No column list or file format is given here,
-- so the load relies on the format attached to the stage (csvformat2).
COPY INTO DIAMONDS2
FROM @diamond_assets;



# Welcome to the Notebooks Container Runtime!

In this notebook, we will go through the basics of using Notebooks Container Runtime. We will install packages, load data, train a model, and look at logs. 

In [None]:
import warnings

# Silence warnings *before* importing Snowpark so import-time warnings are hidden too.
warnings.filterwarnings("ignore")

from snowflake.snowpark.context import get_active_session

# Handle to the Snowpark session used by every later cell.
session = get_active_session()

# Add a query tag to the session. This helps with debugging and performance monitoring.
session.query_tag = {
    "origin": "sf_sit-is",
    "name": "aiml_notebooks_container_runtime",
    "version": {"major": 1, "minor": 0},
    "attributes": {"is_quickstart": 1, "source": "notebook"},
}


The Container Runtime for Snowflake Notebooks comes with common packages pre-installed, including SnowparkML and other OSS packages.

In [None]:
# Shell escape: list the packages (and versions) installed in this runtime's Python environment.
!pip freeze

The Notebooks Container Runtime, together with External Access Integrations, gives us the flexibility to `pip install` packages from anywhere, including popular package repositories such as PyPI. You can install whatever packages you need by running `!pip install <package_name>` directly in the Notebook.

We have configured this notebook with an External Access Integration that allows PyPI URLs.

In [None]:
# Disabled for now. Uncomment the line below if `seaborn` is not already available in this runtime.
# !pip install seaborn

Just like Notebooks on the Warehouse Runtime, we can intermingle both SQL and Python cells:

Let's visualize some of our data using the `seaborn` package (if it is not available in your runtime, install it first with `!pip install seaborn` as described above):

In [None]:
# Snowpark DataFrame over the DIAMONDS2 table created by the SQL setup cell.
diamonds_df = session.table("DIAMONDS2")
# Print a preview of the table's rows.
diamonds_df.show()

In [None]:
from snowflake.ml.data.data_connector import DataConnector

# Pull the Snowpark DataFrame into a local pandas DataFrame; `df` is reused by the
# training cells further down.
data_connector = DataConnector.from_dataframe(diamonds_df)
df = data_connector.to_pandas()

# Imported only after `df` exists, so a missing seaborn install does not block the data load.
import seaborn as sns

# Distribution of diamond prices.
sns.histplot(data=df, x="PRICE")

Now, let's train a basic `XGBRegressor` machine learning model. The ML Container Runtime for Snowflake Notebooks comes with common machine learning packages pre-installed, including SnowparkML and other OSS packages.

In [None]:
import time
from xgboost import XGBRegressor

# Column groups of the diamonds table. Only the numerical columns are used as
# features below; the categorical ones are listed but not encoded or fed to the model.
CATEGORICAL_COLUMNS = ["CUT", "COLOR", "CLARITY"]
NUMERICAL_COLUMNS = ["CARAT", "DEPTH", "X", "Y", "Z"]
LABEL_COLUMNS = ['PRICE']

model = XGBRegressor(max_depth=400)

# Time the fit with perf_counter(): it is monotonic and high-resolution, so the
# measured interval cannot be skewed by system clock adjustments the way
# time.time() differences can.
t0 = time.perf_counter()
model.fit(df[NUMERICAL_COLUMNS], df[LABEL_COLUMNS])
t1 = time.perf_counter()

print(f"Fit in {t1-t0} seconds.")

In [None]:
# Train/test evaluation of the XGBoost price model:
#   1. split the pandas frame 80/20,
#   2. refit the regressor on the training part,
#   3. report RMSE / R² / MAE for both parts,
#   4. plot feature importances and a predicted-vs-actual chart.
# Relies on `df`, NUMERICAL_COLUMNS and LABEL_COLUMNS from the earlier cells.

# Import necessary libraries
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from xgboost import XGBRegressor

# Train/test split
X = df[NUMERICAL_COLUMNS]
y = df[LABEL_COLUMNS]

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model = XGBRegressor(max_depth=400)
model.fit(X_train, y_train)

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

# ---- Display metrics in a simple matplotlib table ----
metrics_df = pd.DataFrame({
    "Metric": ["RMSE", "R²", "MAE"],
    "Train": [train_rmse, train_r2, train_mae],
    "Test": [test_rmse, test_r2, test_mae]
})

fig, ax = plt.subplots(figsize=(6,2))
ax.axis('off')  # hide the axes; the figure is used only as a canvas for the table
tbl = ax.table(cellText=metrics_df.round(3).values,
               colLabels=metrics_df.columns,
               loc='center')
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
tbl.scale(1.2, 1.2)
plt.title("Model Performance Metrics", fontsize=12, pad=20)
plt.show()

# ---- Feature Importance plot (matplotlib) ----
feature_importance = pd.DataFrame({
    'Feature': NUMERICAL_COLUMNS,
    'Importance': model.feature_importances_
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(8,5))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.gca().invert_yaxis()  # most important feature at the top
plt.xlabel("Feature Importance")
plt.title("Feature Importance")
plt.show()

# ---- Scatter plot: Predicted vs Actual ----
test_results = pd.DataFrame({
    'Actual Price': y_test['PRICE'].values,
    'Predicted Price': y_pred_test
})

scatter_chart = alt.Chart(test_results).mark_circle().encode(
    x=alt.X('Actual Price', title='Actual Price ($)'),
    y=alt.Y('Predicted Price', title='Predicted Price ($)')
).properties(
    title='Predicted vs Actual Diamond Prices',
    width=600,
    height=400
)

# Dashed y = x reference line: points on it are perfect predictions.
line = alt.Chart(
    pd.DataFrame({'x': [0, test_results['Actual Price'].max()]})
).mark_line(color='red', strokeDash=[5, 5]).encode(x='x', y='x')

# A notebook only renders the value of a cell's *last* expression. The layered
# chart used to sit in the middle of the cell, where its value was discarded and
# nothing was drawn, so it is now built here and left as the final expression.
combined_chart = scatter_chart + line
combined_chart

