In [0]:
!pip install requirements.txt

In [0]:
import yaml

# Parse the notebook settings from config.yaml (path is relative to the
# current working directory).
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# The three components of the feature table identifier.
catalog, schema, table = config['catalog'], config['schema'], config['table']

# Where the model artifact is logged and the name it is registered under.
artifact_path = config['artifact_path']
registered_model_name = config['registered_model_name']

# Fully qualified table name in the form <catalog>.<schema>.<table>.
full_table_name = "{}.{}.{}".format(catalog, schema, table)

In [0]:
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
import mlflow
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

fe = FeatureEngineeringClient(model_registry_uri="databricks-uc")

In [0]:
# Read the configured feature table through the Feature Engineering client.
transactions_df = fe.read_table(name=full_table_name)

In [0]:
# Features to join from the feature table, matched on `transaction_id`:
# 'Time', the columns 'V1' through 'V28', and 'Amount'.
# FeatureLookup is imported alongside FeatureEngineeringClient in the
# imports cell.
feature_lookups = [
    FeatureLookup(
      table_name=full_table_name,
      feature_names=['Time', *(f'V{i}' for i in range(1, 29)), 'Amount'],
      lookup_key='transaction_id',
    )
  ]

# Train an XGBoost classifier on the feature-table data inside a single MLflow
# run, then log and register the fitted pipeline through the Feature
# Engineering client so the feature lookups are packaged with the model.
with mlflow.start_run():

  # Base DataFrame for the training set: only the lookup key and the label.
  # The feature columns are joined in by `feature_lookups`, so they are left
  # out here to avoid the same column names arriving from both sources.
  labels_df = transactions_df.select('transaction_id', 'Class')

  training_set = fe.create_training_set(
    df=labels_df,
    feature_lookups=feature_lookups,
    label='Class',
    exclude_columns=['transaction_id']
  )

  # Materialise the joined training data as a pandas DataFrame for scikit-learn.
  training_df = training_set.load_df().toPandas()

  # Split the data into features and target
  X = training_df.drop('Class', axis=1)
  y = training_df['Class']

  # Split the data into training and testing sets (80/20, fixed seed)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Define the preprocessing steps: standardise 'V1' and 'V3', pass every
  # other column through unchanged.
  preprocessor = ColumnTransformer(
      transformers=[
          ('scaler', StandardScaler(), ['V1', 'V3'])
      ],
      remainder='passthrough'
  )

  # Create the pipeline: preprocessing followed by the classifier
  pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('classifier', xgb.XGBClassifier(random_state=42))
  ])

  # Fit the pipeline
  pipeline.fit(X_train, y_train)

  # Predictions on the held-out split; not logged here, but left in the
  # notebook namespace for inspection.
  y_pred = pipeline.predict(X_test)

  # Log the fitted pipeline (the object trained above) together with the
  # training set, and register it under the configured model name.
  fe.log_model(
    model=pipeline,
    artifact_path=artifact_path,
    flavor=mlflow.sklearn,
    training_set=training_set,
    registered_model_name=registered_model_name
  )