In [0]:
import pandas as pd
import numpy as np
from databricks.feature_engineering import FeatureEngineeringClient
import pyspark.sql.functions as F

In [0]:
#dbutils.library.restartPython()

In [0]:
#-------- Data Ingestion---------------------------
# Source table to load.
SOURCE_TABLE = "customer_shopping_data_2"

# Load the table as a Spark DataFrame.
df = spark.read.table(SOURCE_TABLE)

# Preview the leading rows, then list the column names and data types.
df.show()
df.printSchema()

In [0]:
# Convert the Spark DataFrame to pandas; the segmentation cell below works on `retail_df` with pandas APIs.
# NOTE(review): toPandas() is expected to load the whole table into driver memory — confirm the table is small enough.
retail_df = df.toPandas()
retail_df.head()

In [0]:
#-------- Segment Customers ---------------------------
# Total revenue per customer: the sum of `price` over each customer's rows.
# Named aggregation produces the `total_revenue` column directly, so no
# follow-up rename is required.
customer_revenue = retail_df.groupby('customer_id').agg(total_revenue=('price', 'sum'))

# Revenue thresholds at the 0.33 and 0.66 quantiles; `segment_customer` uses
# them to split customers into three segments.
p33 = customer_revenue['total_revenue'].quantile(0.33)
p66 = customer_revenue['total_revenue'].quantile(0.66)
# Define a function to segment customers
def segment_customer(total_revenue, low_threshold=None, high_threshold=None):
    """Classify a revenue figure as 'Low-Value', 'Medium-Value' or 'High-Value'.

    Args:
        total_revenue: Revenue value to classify.
        low_threshold: Inclusive upper bound of the 'Low-Value' segment.
            When omitted, the notebook-level ``p33`` is used.
        high_threshold: Inclusive upper bound of the 'Medium-Value' segment.
            When omitted, the notebook-level ``p66`` is used.

    Returns:
        The segment label as a string. A value that is not ``<=`` either
        threshold (this includes NaN) falls through to 'High-Value'.
    """
    # Fall back to the notebook globals so existing one-argument calls such as
    # ``.apply(segment_customer)`` keep working unchanged.
    if low_threshold is None:
        low_threshold = p33
    if high_threshold is None:
        high_threshold = p66

    if total_revenue <= low_threshold:
        return 'Low-Value'
    if total_revenue <= high_threshold:
        return 'Medium-Value'
    return 'High-Value'

# Label every customer with the segment that matches their total revenue.
segment_labels = customer_revenue['total_revenue'].apply(segment_customer)
customer_revenue['customer_segment'] = segment_labels

# Left-join the per-customer revenue and segment onto retail_df,
# so every row of retail_df is retained.
merged_df = pd.merge(left=retail_df, right=customer_revenue, how='left', on='customer_id')

print("Merged DataFrame:")
print(merged_df.head())

In [0]:
# Ordinal encoding for the segment column: labels are listed from lowest to
# highest and numbered starting at 1.
segment_mapping = {
    label: rank
    for rank, label in enumerate(('Low-Value', 'Medium-Value', 'High-Value'), start=1)
}

# Add the ordinal rank of each row's segment as a new column.
merged_df['cust_segment_ordinal'] = merged_df['customer_segment'].map(segment_mapping)
merged_df.head()

In [0]:
%pip install databricks-feature-engineering
# NOTE(review): this install sits after the first cell has already imported
# `databricks.feature_engineering`, so it cannot help that import on a fresh run.
# Databricks also documents that a %pip command which changes the environment resets
# notebook state, which would discard `merged_df` built above — TODO confirm and
# consider moving this cell to the top of the notebook.

In [0]:
# Feature Engineering client, used for the feature-table calls later in the notebook.
fe = FeatureEngineeringClient()

# Convert the segmented pandas frame back into a Spark DataFrame.
spark_df = spark.createDataFrame(merged_df)

In [0]:
%sql
-- Ensure the catalog and schema named in the feature table path exist.
-- IF NOT EXISTS leaves an existing catalog/schema untouched, so the cell can be re-run.
CREATE CATALOG IF NOT EXISTS retail_catalog;
CREATE SCHEMA IF NOT EXISTS retail_catalog.retail_schema;

In [0]:
# Fully qualified (catalog.schema.table) name of the feature table and its primary-key column.
feature_table_name = "retail_catalog.retail_schema.features"
primary_keys = ["invoice_no"]

# `create_table` is meant for first-time creation and is expected to fail when the
# table already exists (confirm against the client docs), which would break this cell
# on every run after the first. Create the table only when it is missing; otherwise
# upsert the current rows by primary key with `write_table(mode="merge")`.
# NOTE(review): three-part names in `tableExists` assume a recent Spark/Databricks runtime.
if spark.catalog.tableExists(feature_table_name):
    fe.write_table(
        name=feature_table_name,
        df=spark_df,
        mode="merge",
    )
    print(f"Feature table '{feature_table_name}' already exists; rows merged successfully.")
else:
    # Passing `df` lets `create_table` infer the schema and write the initial data.
    fe.create_table(
        name=feature_table_name,
        df=spark_df,
        primary_keys=primary_keys,
    )
    print(f"Feature table '{feature_table_name}' created and populated successfully.")

# The table can now be seen in the Databricks UI under the 'Catalog Explorer'.