In [None]:
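# Install required libraries
# snowflake-ml-python: Provides Snowflake ML modeling framework including XGBoost
# snowflake-snowpark-python: Enables Snowpark DataFrame API for Python
# snowflake-connector-python: Snowflake connector for Python (also upgraded by the install command below)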

In [None]:
!pip install --upgrade snowflake-ml-python snowflake-snowpark-python snowflake-connector-python

In [9]:
# Load connection parameters from external file (create credential.py with your Snowflake params)
# Example params dict: {'account': '...', 'user': '...', 'password': '...', 'warehouse': '...', 'database': '...', 'schema': '...'}
from credential import params

In [8]:
# Import core libraries for Snowpark session, ML modeling, and data manipulation
from snowflake.ml.modeling.preprocessing import LabelEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.snowpark import Session
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col

## Generate Synthetic Neighborhood Visiting Pattern

This simulates the ice cream truck's monthly schedule:
- January: N1 on days 1, 8, 15, 22, 29 (every 7th day starting on the 1st); N2 otherwise.
- Feb-Nov: N((day - 1) % 7 + 1), cycling N1-N7 weekly (day 1 → N1, day 7 → N7, day 8 → N1, ...).
- December: Always N8.

In [11]:
# Number of days in each month of a non-leap year, January first
month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

# Maps (month, day) -> neighborhood number visited on that date
pre = {}

for month, days_in_month in enumerate(month_days, start=1):
    for day in range(1, days_in_month + 1):
        # Position inside the 7-day cycle: 0 on days 1, 8, 15, 22, 29
        weekly_slot = (day - 1) % 7
        if month == 12:
            # December: always N8
            neighborhood = 8
        elif month == 1:
            # January: N1 on the first day of each 7-day cycle, N2 otherwise
            neighborhood = 1 if weekly_slot == 0 else 2
        else:
            # February-November: cycle through N1..N7
            neighborhood = weekly_slot + 1
        pre[(month, day)] = neighborhood

In [12]:
# Display a sample of the pattern: the first ten (month, day) -> neighborhood
# entries, instead of dumping every date of the year into the cell output
dict(list(pre.items())[:10])

In [None]:
# Flatten the (month, day) -> neighborhood mapping into one record per date
df_clean_list = [
    {'MONTH': date_key[0], 'DAY': date_key[1], 'Neighborhood': neighborhood}
    for date_key, neighborhood in pre.items()
]

In [None]:
# The connection cell appears further down the notebook, so on a fresh kernel
# `session` would be undefined here. Obtain the active session first so the
# notebook runs top to bottom.
session = get_active_session()

# Build a Snowpark DataFrame from the list of per-date records
df_clean = session.create_dataframe(df_clean_list)

In [None]:
# Print a preview of the DataFrame built from the synthetic schedule
df_clean.show()

## Connect to Snowflake and Load Data

Obtain a Snowpark session, write the synthetic `df_clean` DataFrame (columns: MONTH, DAY, Neighborhood with values 1-8) to the table `tasty_bytes.raw_pos.truck_pattern`, and read it back as a Snowpark DataFrame.

In [None]:
# Reuse the session that is already active in this notebook environment
session = get_active_session()
# Disable telemetry by setting a private flag on the session's connection object.
# NOTE(review): `_conn._telemetry_enabled` is a private attribute — confirm it has
# the intended effect with the installed Snowpark version.
session._conn._telemetry_enabled = False  # must run after the session exists

In [None]:
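# Alternative when no active session is available (e.g. running outside a Snowflake notebook):
# create the Snowpark session from the connection parameters loaded from credential.py
#session = Session.builder.configs(params).create()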


In [None]:
# Persist the synthetic schedule as a Snowflake table, replacing any existing copy
pattern_writer = df_clean.write.mode("overwrite")
pattern_writer.save_as_table("tasty_bytes.raw_pos.truck_pattern")

In [None]:
# Reference the persisted schedule table as a Snowpark DataFrame
# (lazy evaluation - no data is pulled locally yet)
pattern_table_name = "tasty_bytes.raw_pos.truck_pattern"
snowpark_df = session.table(pattern_table_name)

In [None]:
# Inspect the table: preview rows, row count, summary statistics, class balance
print("First 10 rows:")
snowpark_df.show(n=10)

row_count = snowpark_df.count()
print(f"Total rows: {row_count}")

print("Data description:")
summary_df = snowpark_df.describe()
summary_df.show()

print("Neighborhood distribution:")
per_neighborhood_df = snowpark_df.group_by("Neighborhood").count()
per_neighborhood_df.show()

## Data Preprocessing: Label Encoding

Encode Neighborhood (1-8) to 0-7 for the XGBoost classifier using the Snowpark ML LabelEncoder.

In [None]:
# Only the label column is needed to learn the encoding
label_source_df = snowpark_df.select("Neighborhood")

# Encoder for the single label column; the original Neighborhood column is
# dropped from the output and replaced by NEIGHBORHOOD2
le = LabelEncoder(input_cols=['Neighborhood'], output_cols=['NEIGHBORHOOD2'], drop_input_cols=True)

# Learn the mapping, then apply it to the full dataset
fitted = le.fit(label_source_df)
snowpark_df_prepared = fitted.transform(snowpark_df)

snowpark_df_prepared.show()

## Train-Test Split and Persist

In [None]:
# 90/10 train/test split. A fixed seed makes the split (and therefore the
# reported accuracy) reproducible across runs; without it every run draws a
# different partition.
SPLIT_SEED = 42
train_snowpark_df, test_snowpark_df = snowpark_df_prepared.random_split([0.9, 0.1], seed=SPLIT_SEED)

# Persist both partitions as tables in Snowflake (overwrite if they exist)
train_snowpark_df.write.mode("overwrite").save_as_table("tasty_bytes.raw_pos.truck_pattern_train")
test_snowpark_df.write.mode("overwrite").save_as_table("tasty_bytes.raw_pos.truck_pattern_test")

print("Train/Test tables saved.")

## Train XGBoost Classifier and Evaluate

Predict Neighborhood from MONTH and DAY features.

In [None]:
# Target column (label-encoded neighborhood) and the predictor columns
LABEL_COLS = ["NEIGHBORHOOD2"]
FEATURE_COLS = ["MONTH", "DAY"]

# Configure the XGBoost classifier and fit it on the training partition
xgboost_model = XGBClassifier(
    input_cols=FEATURE_COLS,
    label_cols=LABEL_COLS,
)
xgboost_model.fit(train_snowpark_df)

# Score the fitted model on the held-out partition and report it as a percentage
accuracy = xgboost_model.score(test_snowpark_df)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

## Cleanup (Optional)

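Close the Snowpark session. The tables written above (`truck_pattern`, `truck_pattern_train`, `truck_pattern_test`) are not dropped by this notebook; remove them manually if they are no longer needed.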

In [None]:
# Close the Snowpark session now that the notebook no longer needs it
session.close()