In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()


DEV, TEST and PROD deployments will be held in different schemas.

In [None]:
# (Re)create the DATASET schema and make it the current schema for this session.
# `create or replace` drops any existing DATASET schema together with its contents.
session.sql('create or replace schema DATASET').collect()
session.sql('use schema DATASET').collect()


In [None]:
use schema DATASET

Here we are using the power of Cortex LLMs to create some feedback comments

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta
from snowflake.snowpark import types as T
from snowflake.snowpark import functions as F
from snowflake import snowpark

def uc01_feature_engineering_generation(db, sc, sales_tb, feedback_tb, cur_date, table_name,
                                        customers_tb='CUSTOMERS'):
    """Build one customer-behaviour feature snapshot as of ``cur_date`` and append it to a table.

    Only transactions and feedback strictly before ``cur_date`` are considered.
    For every customer with at least one such purchase the snapshot contains the
    customer attributes, the last purchase date, rolling sums/counts of
    ``total_amount`` (7 days, 1, 2 and 3 months) evaluated at the last purchase,
    moving MIN/AVG of ``SENTIMENT`` over the last 2, 3 and 4 feedback rows
    evaluated at the latest feedback, ``DAYS_SINCE_LAST_PURCHASE`` and
    ``TIMESTAMP`` (= ``cur_date``).

    Uses the notebook-level ``session``.

    Args:
        db: Database that holds the input tables and the output table.
        sc: Schema that holds the input tables and the output table.
        sales_tb: Name of the sales table (needs customer_id, transaction_date, total_amount).
        feedback_tb: Name of the scored feedback table (needs customer_id, chat_date, sentiment).
        cur_date: Snapshot date; rows on or after this date are ignored.
        table_name: Unqualified name of the table the snapshot is appended to.
        customers_tb: Name of the customers table. Defaults to ``'CUSTOMERS'``.

    Returns:
        None. The snapshot is appended to ``{db}.{sc}.{table_name}``.
    """

    table_name = f'{db}.{sc}.{table_name}'

    # Load data
    customers_tbl = '.'.join([db, sc, customers_tb])
    sales_tbl = '.'.join([db, sc, sales_tb])
    feedback_tbl = '.'.join([db, sc, feedback_tb])

    customers_df = session.table(customers_tbl)
    # NOTE(review): two independent handles on the same sales table; presumably kept
    # separate so the join between their derived frames resolves columns unambiguously.
    sales_df = session.table(sales_tbl)
    sales_df_last_tran = session.table(sales_tbl)

    feedback_sentiment_df = session.table(feedback_tbl)

    # Feature engineering only looks at transactions before cur_date
    sales_df_last_tran = sales_df_last_tran.filter(F.col("transaction_date") < F.lit(cur_date))
    sales_df = sales_df.filter(F.col("transaction_date") < F.lit(cur_date))

    # Count only feedback before cur_date
    feedback_sentiment_df = feedback_sentiment_df.filter(F.col("chat_date") < F.lit(cur_date))

    # Per-customer last purchase date and lifetime value
    sales_agg_df = (
        sales_df_last_tran.group_by("customer_id")
        .agg(
            F.max("transaction_date").alias("last_purchase_date"),
            F.sum("total_amount").alias("total_customer_value")
        )
    )

    def custom_column_naming(input_col, agg, window):
        # e.g. ("total_amount", "SUM", "-7D") -> "SUM_total_amount_past_7D"
        return f"{agg}_{input_col}_{window.replace('-', 'past_')}"

    # Rolling SUM/COUNT of total_amount per customer over the trailing windows
    sales_agg_orders_df = sales_df.analytics.time_series_agg(
            time_col="transaction_date",
            aggs={"total_amount": ["SUM", "COUNT"]},
            windows=["-7D","-1MM", "-2MM", "-3MM"],
            sliding_interval="1D",
            group_by=["CUSTOMER_ID"],
            col_formatter = custom_column_naming)

    # Keep the rolling aggregates of the row that matches each customer's last purchase
    sales_agg_last_purchase = sales_agg_df.join(
        sales_agg_orders_df,
        (sales_agg_df.last_purchase_date == sales_agg_orders_df.transaction_date) &
        (sales_agg_df.CUSTOMER_ID == sales_agg_orders_df.CUSTOMER_ID),
        "left").select(
            sales_agg_df["customer_id"].alias("CUSTOMER_ID"),
            sales_agg_df["total_customer_value"],
            sales_agg_df["last_purchase_date"],
            sales_agg_orders_df["SUM_TOTAL_AMOUNT_PAST_7D"],
            sales_agg_orders_df["SUM_TOTAL_AMOUNT_PAST_1MM"],
            sales_agg_orders_df["SUM_TOTAL_AMOUNT_PAST_2MM"],
            sales_agg_orders_df["SUM_TOTAL_AMOUNT_PAST_3MM"],
            sales_agg_orders_df["COUNT_TOTAL_AMOUNT_PAST_7D"],
            sales_agg_orders_df["COUNT_TOTAL_AMOUNT_PAST_1MM"],
            sales_agg_orders_df["COUNT_TOTAL_AMOUNT_PAST_2MM"],
            sales_agg_orders_df["COUNT_TOTAL_AMOUNT_PAST_3MM"]
        )

    # Feedback data: date of each customer's latest feedback
    latest_feedback_df = (feedback_sentiment_df.group_by("customer_id")
            .agg(F.max("chat_date").alias("chat_date")))

    # Moving MIN/AVG of sentiment over the last 2, 3 and 4 feedback rows per customer
    feedback_agg_df = feedback_sentiment_df.analytics.moving_agg(
            aggs={"SENTIMENT": ["MIN", "AVG"]},
            window_sizes=[2, 3, 4],
            order_by=["chat_date"],
            group_by=["CUSTOMER_ID"])

    # Join on customer AND date so only the aggregates computed at the latest feedback
    # are kept. Joining on customer_id alone returned one row per feedback record and
    # left the choice of sentiment values to the de-duplication step below.
    feedback_agg_latest_df = latest_feedback_df.join(
        feedback_agg_df, ["customer_id", "chat_date"], "left").select(
            latest_feedback_df["CUSTOMER_ID"].alias("CUSTOMER_ID"),
            feedback_agg_df["SENTIMENT_MIN_2"],
            feedback_agg_df["SENTIMENT_MIN_3"],
            feedback_agg_df["SENTIMENT_MIN_4"],
            feedback_agg_df["SENTIMENT_AVG_2"],
            feedback_agg_df["SENTIMENT_AVG_3"],
            feedback_agg_df["SENTIMENT_AVG_4"],
        )

    # Join tables
    features_df = (
        customers_df.join(sales_agg_last_purchase, "customer_id", "left")
        .join(feedback_agg_latest_df, "customer_id", "left")
        .select(
            customers_df["customer_id"],
            customers_df["age"],
            customers_df["gender"],
            customers_df["location"],
            customers_df["customer_segment"],
            sales_agg_last_purchase["last_purchase_date"],
            feedback_agg_latest_df["SENTIMENT_MIN_2"],
            feedback_agg_latest_df["SENTIMENT_MIN_3"],
            feedback_agg_latest_df["SENTIMENT_MIN_4"],
            feedback_agg_latest_df["SENTIMENT_AVG_2"],
            feedback_agg_latest_df["SENTIMENT_AVG_3"],
            feedback_agg_latest_df["SENTIMENT_AVG_4"],
            sales_agg_last_purchase["SUM_TOTAL_AMOUNT_PAST_7D"],
            sales_agg_last_purchase["SUM_TOTAL_AMOUNT_PAST_1MM"],
            sales_agg_last_purchase["SUM_TOTAL_AMOUNT_PAST_2MM"],
            sales_agg_last_purchase["SUM_TOTAL_AMOUNT_PAST_3MM"],
            sales_agg_last_purchase["COUNT_TOTAL_AMOUNT_PAST_7D"].alias("COUNT_ORDERS_PAST_7D"),
            sales_agg_last_purchase["COUNT_TOTAL_AMOUNT_PAST_1MM"].alias("COUNT_ORDERS_PAST_1MM"),
            sales_agg_last_purchase["COUNT_TOTAL_AMOUNT_PAST_2MM"].alias("COUNT_ORDERS_PAST_2MM"),
            sales_agg_last_purchase["COUNT_TOTAL_AMOUNT_PAST_3MM"].alias("COUNT_ORDERS_PAST_3MM"),
            F.datediff("day", sales_agg_df["last_purchase_date"], F.lit(cur_date)).alias("DAYS_SINCE_LAST_PURCHASE"),
            F.lit(cur_date).alias("TIMESTAMP")
        ).filter(sales_agg_df["last_purchase_date"].isNotNull()  # Skip customers who never purchased
        ).dropDuplicates(["customer_id", "TIMESTAMP"])  # One row per customer_id and TIMESTAMP
    )

    # Fill with 0 where there is no data (neutral feedback, zero orders and amount)
    columns_to_fill = [
        "SENTIMENT_MIN_2", "SENTIMENT_MIN_3", "SENTIMENT_MIN_4", "SENTIMENT_AVG_2",
        "SENTIMENT_AVG_3", "SENTIMENT_AVG_4",
        "SUM_TOTAL_AMOUNT_PAST_7D", "SUM_TOTAL_AMOUNT_PAST_1MM", "SUM_TOTAL_AMOUNT_PAST_2MM", "SUM_TOTAL_AMOUNT_PAST_3MM",
        "COUNT_ORDERS_PAST_7D", "COUNT_ORDERS_PAST_1MM", "COUNT_ORDERS_PAST_2MM", "COUNT_ORDERS_PAST_3MM"
    ]
    features_df = features_df.fillna(dict.fromkeys(columns_to_fill, 0))

    # Append the snapshot to the feature table (created on first write)
    features_df.write.mode("append").save_as_table(table_name)

    print (f'Created table {table_name}')


In [None]:
# Build COMMENTS_TEMP: nine (id, comment) rows whose text is generated by Cortex COMPLETE
# with the 'mistral-large2' model. Per the prompts below, ids 1-4 are complaints, 5-6 are
# neutral and 7-9 are satisfied comments; later cells pick comments by these ids.
comments_sql= """
        create or replace table comments_temp (id number, comment VARCHAR) as 
        
        select 1, snowflake.cortex.complete ('mistral-large2', 'write comment complaining about the product SkiBoots123. They were broken after 2 days of usage. You are very dissatified')
        UNION
        select 2, snowflake.cortex.complete ('mistral-large2', 'write comment complaining about a defect in a recent purchase indicating you are not satisfied and will not buy in the shop again. Do not indicate any date, product or shop ')
        UNION
        select 3, snowflake.cortex.complete ('mistral-large2', 'write comment complaining about a recent shipment where the package was broken. Do not indicate any date, product or shop ')
        UNION
        select 4, snowflake.cortex.complete ('mistral-large2', 'write comment where you complain a litte bit about support not calling you back. Do not indicate any date, product or shop ')
        UNION
        select 5, snowflake.cortex.complete ('mistral-large2', 'write a neutral coment about a recent call you had with support. Do not indicate any date, product or shop ')
        UNION
        select 6, snowflake.cortex.complete ('mistral-large2', 'write a neutral comment about a recent purchase you have done. Do not indicate any date, product or shop ')
        UNION
        select 7, snowflake.cortex.complete ('mistral-large2', 'write a comment indicating you are satisfied with a recent purchase. Do not indicate any date, product or shop ')
        UNION
        select 8, snowflake.cortex.complete ('mistral-large2', 'write a comment indicating you are satisfied with a recent purchase and you will recommend the shop. Do not indicate any date, product or shop ')
        UNION
        select 9, snowflake.cortex.complete ('mistral-large2', 'write a comment indicating you are satisfied with a recent support received by a shop assistant. Do not indicate any date, product or shop ')

        """

# Execute the statement; the returned status rows are not used afterwards in this notebook view.
comments_tb = session.sql(comments_sql).collect()

In [None]:
select * from comments_temp;

In [None]:
drop table if exists customers;
drop table if exists sales;
drop table if exists feedback_raw;
drop table if exists feedback_sentiment;

Function to generate some fake data. It creates customers, transactions and feedback that will be used later to understand customer purchase behaviour and to detect when a customer has a high probability of not buying again, so that we can take action.

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta
from snowflake.snowpark import types as T
from snowflake.snowpark import functions as F

num_customers = 5000
num_transactions = num_customers * 15
num_feedback_reports = num_customers * 2

payment_methods = ['Credit Card', 'Paypal', 'Apple Pay']
product_segments = ['Fashion', 'Electronics', 'Beauty', 'Groceries', 'Home', 'Toys', 'Books']
locations = ['Madrid', 'Barcelona', 'Paris', 'London', 'Munich', 'Rome', 'NY', 'Lisbon', 'SFO', 'Denver', 'Atlanta', 'Chicago', 'LAS']
customer_segments = ['Planitum', 'Gold', 'Silver', 'Standard']


def generate_data (first_transaction_date, period):
    """Generate the synthetic baseline tables CUSTOMERS, SALES and FEEDBACK_RAW.

    Uses the notebook-level settings ``num_customers``, ``num_transactions``,
    ``num_feedback_reports``, ``locations``, ``customer_segments`` and
    ``payment_methods``, the active ``session``, and the COMMENTS_TEMP table.
    All three output tables are overwritten.

    Args:
        first_transaction_date: Earliest possible transaction date (a ``date``).
        period: Number of days after ``first_transaction_date`` over which
            transaction dates are spread (inclusive).

    Returns:
        None.
    """

    # Customers: ids CUST-0 .. CUST-(num_customers - 1)
    customers = []
    for i in range(num_customers):
        customer_id = f"CUST-{i}"
        age = random.randint(18, 99)
        gender = 'Male' if random.choice([True, False]) else 'Female'
        location = random.choice(locations)
        signup_date = first_transaction_date - timedelta(days=random.randint(365,900))
        customer_segment = random.choice(customer_segments)

        customers.append ([customer_id, age, gender, location, signup_date, customer_segment])

    df_customers = pd.DataFrame(customers, columns= [
        "CUSTOMER_ID", "AGE", "GENDER", "LOCATION", "SIGNUP_DATE", "CUSTOMER_SEGMENT"
    ])

    customers = session.create_dataframe(df_customers).drop_duplicates()
    customers = customers.with_column("signup_date", F.col("signup_date").cast(T.DateType()))
    customers.write.mode("overwrite").save_as_table("CUSTOMERS")

    # Transactions
    transactions = []
    for i in range(num_transactions):
        transaction_id = f"TRANS-{i+1}"
        # Draw from the same id range used for the customers above (0-based).
        # The previous 1..num_customers range never picked CUST-0 and produced
        # sales for a non-existent CUST-<num_customers>.
        customer_id = f"CUST-{random.randint(0, num_customers - 1)}"
        transaction_date = first_transaction_date + timedelta(days=random.randint(0,period))
        total_amount = round(random.uniform(50, 5000), 2)
        num_items = random.randint(1, 10)
        discount_applied = bool(random.randint(0, 1))
        payment_method = random.choice(payment_methods)

        transactions.append([transaction_id, customer_id, transaction_date, total_amount, num_items, discount_applied, payment_method])

    # Creating the DataFrame for transactions
    df_transactions = pd.DataFrame(transactions, columns=[
        "TRANSACTION_ID", "CUSTOMER_ID", "TRANSACTION_DATE", "TOTAL_AMOUNT", 
        "NUM_ITEMS", "DISCOUNT_APPLIED", "PAYMENT_METHOD"
    ])

    transactions = session.create_dataframe(df_transactions).drop_duplicates()
    transactions = transactions.with_column("transaction_date", F.col("transaction_date").cast(T.DateType()))
    transactions.write.mode("overwrite").save_as_table("SALES")

    # Feedback data generation
    comments_df = session.table("comments_temp")

    # Customer ids follow num_customers (0-based, matching CUSTOMERS) instead of a
    # hard-coded 1..5000 range.
    # NOTE(review): chat_date is spread over the 365 days before CURRENT_DATE, not
    # relative to first_transaction_date - confirm this is intended.
    feedback_df = session.sql(f"""
        SELECT 
        'FB-' || TO_CHAR(SEQ8()) AS feedback_id,
        'CUST-' || TO_CHAR(UNIFORM(0, {num_customers - 1}, RANDOM())) AS customer_id,
        DATEADD(DAY, -UNIFORM(0, 365, RANDOM()), CURRENT_DATE) AS chat_date,
        UNIFORM(1, 9, RANDOM()) AS internal_id
    FROM TABLE(GENERATOR(ROWCOUNT => {num_feedback_reports}))
    """)

    # Attach the pre-generated comment text that matches each internal_id
    final_feedback_df = feedback_df.join(comments_df, feedback_df["internal_id"] == comments_df["id"]) \
                                    .select(feedback_df["feedback_id"], 
                                            feedback_df["customer_id"], 
                                            feedback_df["chat_date"], 
                                            feedback_df["internal_id"], 
                                            comments_df["comment"])

    final_feedback_df.write.mode("overwrite").save_as_table("FEEDBACK_RAW")


To simulate the data feeds, we are going to generate 3 months of data starting about 14 months ago (14 batches of 31 days before the end of the current month). This data will be used as a baseline to then simulate more sales for customers.

In [None]:
import calendar

# Length, in days, of one simulated monthly batch
num_days_per_batch = 31

# Last calendar day of the current month, used as the reference point
now = datetime.now()
last_day = calendar.monthrange(now.year, now.month)[1]
last_day_of_month = datetime(now.year, now.month, last_day).date()

# Go back 14 batches of 31 days (434 days, a little over 14 months)
first_timestamp = last_day_of_month - timedelta(days=14* num_days_per_batch)

# Generate baseline data spread over 3 batches (93 days) from that start date
generate_data(first_timestamp, num_days_per_batch * 3)  

In [None]:
select min(transaction_date), max(transaction_date) from sales;

Here we can use the power of LLMs to understand customer feedback and produce a sentiment score that we will use later to determine the probability of churn.


In [None]:
from snowflake.cortex import sentiment
import snowflake.snowpark.functions as F


# Raw feedback rows written by generate_data
feedback_raw_df = session.table("feedback_raw")

# Add a "sentiment" column by applying the Cortex sentiment function to each comment
feedback_sentiment_df = feedback_raw_df.with_columns(["sentiment"], [sentiment(F.col("comment"))])

# Persist the scored feedback, replacing any previous FEEDBACK_SENTIMENT table
feedback_sentiment_df.write.mode("overwrite").save_as_table("feedback_sentiment")

### Feature Engineering

This function will create a profile for each customer who has already made a purchase in the shop. For a given timestamp, it will analyze past transactions and feedback and, using Snowflake analytical functions, generate the features that will be used to train models and make predictions.


In [None]:
# Inspect the date range currently covered by the SALES table
sales_df = session.table("sales")

# Earliest transaction date (first column of the single result row)
first_sale_timestamp = sales_df.select(F.min(F.col("transaction_date"))).collect()[0][0]

# Latest transaction date
last_sale_timestamp = sales_df.select(F.max(F.col("transaction_date"))).collect()[0][0]

print (f'First sale:{first_sale_timestamp}')
print (f'Last sale: {last_sale_timestamp}')

### Generate monthly purchase based on customer behaviors

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta
from snowflake.snowpark import types as T
from snowflake.snowpark import functions as F
from snowflake import snowpark

def add_new_top_sales (session: snowpark.Session, churn_base_table: str, 
                       new_sales_table:str , days_window: int):
    """Simulate the next sales window from the latest feature snapshot.

    Customers in the most recent snapshot (max ``TIMESTAMP``) of
    ``churn_base_table`` are ordered by ``SENTIMENT_AVG_2`` descending. A random
    45-55% slice from the top of that list gets no new sales and receives a
    complaint comment; the remaining customers get 1-5 new transactions each
    and a neutral/satisfied comment. New feedback is scored with the Cortex
    ``sentiment`` function and appended to FEEDBACK_SENTIMENT.

    Args:
        session: Active Snowpark session.
        churn_base_table: Feature table holding the customer snapshots.
        new_sales_table: Table the generated transactions are appended to.
        days_window: Length in days of the simulated window after the last sale.

    Returns:
        The string ``"Transactions added"``.
    """

    payment_methods = ['Credit Card', 'Paypal', 'Apple Pay']

    def add_transactions (df, last_sales_timestamp, days_window):
        # Append 1-5 random transactions per customer row in df to new_sales_table.
        transactions = []

        for customer in df:
            customer_id = customer["CUSTOMER_ID"]

            for _ in range(random.randint(1,5)):  # between 1 and 5 transactions in the period
                # NOTE(review): every generated row shares this constant id - confirm
                # nothing downstream relies on TRANSACTION_ID being unique.
                transaction_id = "TRANS-N"

                # Two nested draws: first an upper bound within the window, then the
                # actual offset, which biases new purchases towards the early days.
                days = random.randint(1,days_window)
                transaction_date = last_sales_timestamp + timedelta(days=random.randint(1,days))

                total_amount = round(random.uniform(50, 5000), 2)
                num_items = random.randint(1, 5)
                discount_applied = bool(random.randint(0, 1))
                payment_method = random.choice(payment_methods)

                transactions.append([transaction_id, customer_id, transaction_date, total_amount, num_items, discount_applied, payment_method])

        if not transactions:
            # No customers to simulate: skip building and writing an empty frame.
            print ('added 0 transactions')
            return

        # Creating the DataFrame for transactions
        df_transactions = pd.DataFrame(transactions, columns=[
            "TRANSACTION_ID", "CUSTOMER_ID", "TRANSACTION_DATE", "TOTAL_AMOUNT", 
            "NUM_ITEMS", "DISCOUNT_APPLIED", "PAYMENT_METHOD"
        ])

        df_transactions["TRANSACTION_DATE"] = pd.to_datetime(df_transactions["TRANSACTION_DATE"], errors="coerce")

        transactions = session.create_dataframe(df_transactions).drop_duplicates()
        transactions = transactions.with_column("TRANSACTION_DATE", F.to_date(F.col("TRANSACTION_DATE")))
        transactions.write.mode("append").save_as_table(new_sales_table)

        num_tran = transactions.count()
        print (f'added {num_tran} transactions')

    def add_sentiment(df, happy):
        # Add one feedback row per customer in df, scored and appended to FEEDBACK_SENTIMENT.
        feedback = []

        for customer in df:
            feedback_id = 'FEEDBACK-N'
            customer_id = customer["CUSTOMER_ID"]
            chat_date = last_sales_timestamp + timedelta(days=random.randint(1,days_window))
            if happy:
                internal_id = random.randint(6, 9)  # neutral-purchase to satisfied comments
            else:
                internal_id = random.randint(1, 4)  # complaint comments

            feedback.append([feedback_id, customer_id, chat_date, internal_id])

        if not feedback:
            # Nothing to add for an empty customer list.
            return

        df_feedback = pd.DataFrame(feedback, columns=[
                "FEEDBACK_ID", "CUSTOMER_ID", "CHAT_DATE",
                "INTERNAL_ID"
        ])

        feedback = session.create_dataframe(df_feedback).drop_duplicates()
        feedback = feedback.with_column("chat_date", F.col("chat_date").cast(T.DateType()))
        feedback = feedback.with_column("customer_id", F.col("customer_id").cast(T.StringType()))

        # Stage the rows in a table, then join them with the pre-generated comments
        feedback.write.mode("overwrite").save_as_table("temp_feedback")

        feedback_df = session.table("temp_feedback")
        comments_df = session.table("comments_temp")

        temp_feedback_df = feedback_df.join(comments_df, feedback_df["internal_id"] == comments_df["id"]) \
                                        .select(feedback_df["feedback_id"], 
                                                feedback_df["customer_id"], 
                                                feedback_df["chat_date"], 
                                                feedback_df["internal_id"], 
                                                comments_df["comment"])

        feedback_sentiment_df = temp_feedback_df.with_columns(["sentiment"], [sentiment(F.col("comment"))])

        feedback_sentiment_df.write.mode("append").save_as_table('feedback_sentiment')

    # Work on the most recent snapshot only
    churn_df = session.table(churn_base_table)
    latest_timestamp = churn_df.select(F.max(F.col("TIMESTAMP"))).collect()[0][0]
    churn_df = churn_df.filter(F.col("TIMESTAMP") == F.lit(latest_timestamp))

    num_customers = churn_df.select("CUSTOMER_ID").distinct().count()

    # Keep the sort key in the projection so the ORDER BY is valid regardless of
    # how the query is flattened; rows are still read via row["CUSTOMER_ID"].
    # NOTE(review): descending order puts the highest-sentiment customers in the
    # churn slice below - confirm that direction is intended.
    all_customers_ordered_sentiment = (
        churn_df.select("CUSTOMER_ID", "SENTIMENT_AVG_2")
        .orderBy(F.col("SENTIMENT_AVG_2").desc())
        .collect()
    )

    churn_rate = random.uniform(0.45, 0.55)
    num_churn = int(num_customers * churn_rate)

    churn_customers = all_customers_ordered_sentiment[:num_churn]
    active_customers = all_customers_ordered_sentiment[num_churn:]

    # NOTE(review): the window start is read from the "sales" table, not from
    # new_sales_table - confirm when the two differ.
    last_sales_timestamp = session.table("sales").select(F.max(F.col("transaction_date"))).collect()[0][0]

    print ("Add for active customers")
    add_transactions(active_customers, last_sales_timestamp, days_window)

    print ('Adding feedback for churn customers')
    add_sentiment(churn_customers, False)
    print ('Adding feedback for happy customers')
    add_sentiment(active_customers, True)

    return "Transactions added"

#session.sproc.register(
#    func=add_new_top_sales,
#    name="add_new_top_sales_sproc",
#    replace=True,
#    is_permanent=True,
#    stage_location="@ML_STAGE",
#    packages=['snowflake-snowpark-python', 'snowflake-ml-python'],
#    return_type=T.StringType()
#)



In [None]:
# Build features for the timestamp of the last sale. This creates the first customer
# behavior profile in the training baseline table baseline_features_builing_dataset.
# With that profile, we call the function that adds more sales based on it, and repeat.

db = session.get_current_database()
sc = session.get_current_schema()
table_features = 'baseline_features_builing_dataset'

session.sql(f'use schema {sc}').collect()
# Start from an empty feature table; the loop below appends one snapshot per iteration
session.sql(f'drop table if exists {table_features}').collect()

sales_df = session.table("sales")
n_transactions = sales_df.count()
print (n_transactions)

feedback_sentiment_df = session.table("feedback_sentiment")
num_reviews = feedback_sentiment_df.count()
print (f'num reviews: {num_reviews}')

for i in range(7):

    # Customer profiles for a given timestamp (last sales date)
    sales_df = session.table("sales")

    last_sale_timestamp = sales_df.select(F.max(F.col("transaction_date"))).collect()[0][0]

    print (f'Building features for timestamp: {last_sale_timestamp}')
    uc01_feature_engineering_generation(db, sc, 'SALES', 'FEEDBACK_SENTIMENT', last_sale_timestamp, table_features)
    
    # Add 30 more days of sales to the sales table, based on the latest
    # customer profile snapshot in table_features
    print (f'adding more sales')
    add_new_top_sales (session, table_features, 'sales',30)

    # Row count after the new sales were appended
    n_transactions = sales_df.count()
    print (n_transactions)
    

In [None]:
select timestamp,  count(*) from baseline_features_builing_dataset
group by timestamp
order by timestamp desc;


In [None]:
create table baseline  clone baseline_features_builing_dataset;

In [None]:
db = session.get_current_database()
sc = session.get_current_schema()
print (f'database: {db}, schema: {sc}')

# Call the stored procedure UTILS.uc_01_label_churn_sproc with the feature table as input
# and baseline_features_builing_dataset_labeled as output name.
# NOTE(review): the procedure is defined outside this notebook; presumably 30 is the
# churn window in days and the output gains a `churned` column - confirm against its source.
session.call('UTILS.uc_01_label_churn_sproc', db, sc, 'baseline_features_builing_dataset', 
             'baseline_features_builing_dataset_labeled', 30 )


In [None]:
SELECT 
    TIMESTAMP,
    SUM(CASE WHEN churned = 0 THEN 1 ELSE 0 END) AS not_churned,
    SUM(CASE WHEN churned = 1 THEN 1 ELSE 0 END) AS churned
FROM baseline_features_builing_dataset_labeled
GROUP BY TIMESTAMP
ORDER BY TIMESTAMP;

In [None]:
df = session.table('baseline_features_builing_dataset_labeled')

# Earliest snapshot timestamp present in the labeled dataset
timestamp_to_drop = df.select(F.min(F.col("timestamp"))).collect()[0][0]

# Remove from SALES every transaction dated before that first snapshot
sql_cmd = f"""
        delete from sales where transaction_date < '{timestamp_to_drop}'
        """

session.sql(sql_cmd).collect()




In [None]:
SELECT 
    TIMESTAMP,
    SUM(CASE WHEN churned = 0 THEN 1 ELSE 0 END) AS not_churned,
    SUM(CASE WHEN churned = 1 THEN 1 ELSE 0 END) AS churned
FROM baseline_features_builing_dataset_labeled
GROUP BY TIMESTAMP
ORDER BY TIMESTAMP;

In [None]:
describe table sales;

In [None]:
describe table baseline_features_builing_dataset;

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta
from snowflake.snowpark import types as T
from snowflake.snowpark import functions as F

num_customers = 5000
num_transactions = num_customers * 15
num_feedback_reports = num_customers * 2


payment_methods = ['Credit Card', 'Paypal', 'Apple Pay']
product_segments = ['Fashion', 'Electronics', 'Beauty', 'Groceries', 'Home', 'Toys', 'Books']
# Drift, less locations
locations = ['Madrid', 'Barcelona', 'Paris']
# Drift, less segments
customer_segments = ['Standard', 'Gold']


def generate_data_skew (first_transaction_date, period):
    """Generate drifted synthetic tables NEW_CUSTOMERS, NEW_SALES and NEW_FEEDBACK_RAW2.

    Same shape as the baseline generator, but customers use the ``CUST2-`` id
    prefix, are drawn from the reduced notebook-level ``locations`` and
    ``customer_segments`` lists, and spend much less per transaction (10-500).
    Uses the notebook-level ``num_customers``, ``num_transactions``,
    ``num_feedback_reports`` and ``payment_methods``, the active ``session``,
    and the COMMENTS_TEMP table. All three output tables are overwritten.

    Args:
        first_transaction_date: Earliest possible transaction date (a ``date``).
        period: Number of days after ``first_transaction_date`` over which
            transaction dates are spread (inclusive).

    Returns:
        None.
    """

    # Customers: ids CUST2-0 .. CUST2-(num_customers - 1)
    customers = []
    for i in range(num_customers):
        customer_id = f"CUST2-{i}"
        age = random.randint(18, 99)
        gender = 'Male' if random.choice([True, False]) else 'Female'
        location = random.choice(locations)
        signup_date = first_transaction_date - timedelta(days=random.randint(365,900))
        customer_segment = random.choice(customer_segments)

        customers.append ([customer_id, age, gender, location, signup_date, customer_segment])

    df_customers = pd.DataFrame(customers, columns= [
        "CUSTOMER_ID", "AGE", "GENDER", "LOCATION", "SIGNUP_DATE", "CUSTOMER_SEGMENT"
    ])

    customers = session.create_dataframe(df_customers).drop_duplicates()
    customers = customers.with_column("signup_date", F.col("signup_date").cast(T.DateType()))
    customers.write.mode("overwrite").save_as_table("NEW_CUSTOMERS")

    # Transactions
    transactions = []
    for i in range(num_transactions):
        transaction_id = f"TRANS2-{i+1}"
        # Draw from the same 0-based id range used for the customers above; the
        # previous 1..num_customers range skipped CUST2-0 and produced sales for a
        # non-existent CUST2-<num_customers>.
        customer_id = f"CUST2-{random.randint(0, num_customers - 1)}"
        transaction_date = first_transaction_date + timedelta(days=random.randint(0,period))
        total_amount = round(random.uniform(10, 500), 2)  # DRIFT: customers spending much less money
        num_items = random.randint(1, 10)
        discount_applied = bool(random.randint(0, 1))
        payment_method = random.choice(payment_methods)

        transactions.append([transaction_id, customer_id, transaction_date, total_amount, num_items, discount_applied, payment_method])

    # Creating the DataFrame for transactions
    df_transactions = pd.DataFrame(transactions, columns=[
        "TRANSACTION_ID", "CUSTOMER_ID", "TRANSACTION_DATE", "TOTAL_AMOUNT", 
        "NUM_ITEMS", "DISCOUNT_APPLIED", "PAYMENT_METHOD"
    ])

    transactions = session.create_dataframe(df_transactions).drop_duplicates()
    transactions = transactions.with_column("transaction_date", F.col("transaction_date").cast(T.DateType()))
    transactions.write.mode("overwrite").save_as_table("NEW_SALES")

    # Feedback data generation
    comments_df = session.table("comments_temp")

    # Feedback ids use the CUST2- prefix and the 0-based num_customers range so they
    # refer to the customers created above (previously 'CUST-' with a fixed 1..5000).
    # NOTE(review): chat_date is spread over the 365 days before CURRENT_DATE, not
    # relative to first_transaction_date - confirm this is intended.
    feedback_df = session.sql(f"""
        SELECT 
        'FB-' || TO_CHAR(SEQ8()) AS feedback_id,
        'CUST2-' || TO_CHAR(UNIFORM(0, {num_customers - 1}, RANDOM())) AS customer_id,
        DATEADD(DAY, -UNIFORM(0, 365, RANDOM()), CURRENT_DATE) AS chat_date,
        UNIFORM(1, 9, RANDOM()) AS internal_id
    FROM TABLE(GENERATOR(ROWCOUNT => {num_feedback_reports}))
    """)

    # Attach the pre-generated comment text that matches each internal_id
    final_feedback_df = feedback_df.join(comments_df, feedback_df["internal_id"] == comments_df["id"]) \
                                    .select(feedback_df["feedback_id"], 
                                            feedback_df["customer_id"], 
                                            feedback_df["chat_date"], 
                                            feedback_df["internal_id"], 
                                            comments_df["comment"])

    final_feedback_df.write.mode("overwrite").save_as_table("NEW_FEEDBACK_RAW2")


In [None]:
# Length, in days, of one simulated monthly batch
num_days_per_batch = 31

sales_df = session.table("SALES")

# The drifted data starts at the latest transaction date already present in SALES
last_sale_timestamp = sales_df.select(F.max(F.col("transaction_date"))).collect()[0][0]

print (f'Starting at {last_sale_timestamp} ')
# Add drifted (skewed) data for one batch
generate_data_skew(last_sale_timestamp, num_days_per_batch * 1)  

In [None]:
from snowflake.cortex import sentiment
import snowflake.snowpark.functions as F


# Raw feedback rows written by generate_data_skew
feedback_raw_df = session.table("new_feedback_raw2")

# Add a "sentiment" column by applying the Cortex sentiment function to each comment
feedback_sentiment_df = feedback_raw_df.with_columns(["sentiment"], [sentiment(F.col("comment"))])

# Persist the scored feedback, replacing any previous NEW_FEEDBACK_SENTIMENT table
feedback_sentiment_df.write.mode("overwrite").save_as_table("new_feedback_sentiment")

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta
from snowflake.snowpark import types as T
from snowflake.snowpark import functions as F
from snowflake import snowpark

def add_new_top_sales_skew (session: snowpark.Session, churn_base_table: str, 
                       new_sales_table:str , days_window: int):
    """Simulate the next sales window for the drifted data set.

    Customers in the most recent snapshot (max ``TIMESTAMP``) of
    ``churn_base_table`` are ordered by ``SENTIMENT_AVG_2`` descending. A random
    45-55% slice from the top of that list gets no new sales and receives a
    complaint comment; the remaining customers get 1-5 new transactions each
    and a neutral/satisfied comment. The window starts at the latest
    transaction date in ``new_sales_table``. New feedback is scored with the
    Cortex ``sentiment`` function and appended to NEW_FEEDBACK_SENTIMENT.

    Args:
        session: Active Snowpark session.
        churn_base_table: Feature table holding the customer snapshots.
        new_sales_table: Table the generated transactions are read from (for the
            last sale date) and appended to.
        days_window: Length in days of the simulated window; must be at least 5
            because the first random draw uses the range 5..days_window.

    Returns:
        The string ``"Transactions added"``.
    """

    payment_methods = ['Credit Card', 'Paypal', 'Apple Pay']

    def add_transactions (df, last_sales_timestamp, days_window):
        # Append 1-5 random transactions per customer row in df to new_sales_table.
        transactions = []

        for customer in df:
            customer_id = customer["CUSTOMER_ID"]

            for _ in range(random.randint(1,5)):  # between 1 and 5 transactions in the period
                # NOTE(review): every generated row shares this constant id - confirm
                # nothing downstream relies on TRANSACTION_ID being unique.
                transaction_id = "TRANS-N"

                # Two nested draws: first an upper bound (at least 5 days), then the
                # actual offset, which biases new purchases towards the early days.
                days = random.randint(5,days_window)
                transaction_date = last_sales_timestamp + timedelta(days=random.randint(1,days))

                total_amount = round(random.uniform(50, 5000), 2)
                num_items = random.randint(1, 5)
                discount_applied = bool(random.randint(0, 1))
                payment_method = random.choice(payment_methods)

                transactions.append([transaction_id, customer_id, transaction_date, total_amount, num_items, discount_applied, payment_method])

        if not transactions:
            # No customers to simulate: skip building and writing an empty frame.
            print ('added 0 transactions')
            return

        # Creating the DataFrame for transactions
        df_transactions = pd.DataFrame(transactions, columns=[
            "TRANSACTION_ID", "CUSTOMER_ID", "TRANSACTION_DATE", "TOTAL_AMOUNT", 
            "NUM_ITEMS", "DISCOUNT_APPLIED", "PAYMENT_METHOD"
        ])

        df_transactions["TRANSACTION_DATE"] = pd.to_datetime(df_transactions["TRANSACTION_DATE"], errors="coerce")

        transactions = session.create_dataframe(df_transactions).drop_duplicates()
        transactions = transactions.with_column("TRANSACTION_DATE", F.to_date(F.col("TRANSACTION_DATE")))
        transactions.write.mode("append").save_as_table(new_sales_table)

        num_tran = transactions.count()
        print (f'added {num_tran} transactions')

    def add_sentiment(df, happy):
        # Add one feedback row per customer in df, scored and appended to NEW_FEEDBACK_SENTIMENT.
        feedback = []

        for customer in df:
            feedback_id = 'FEEDBACK-N'
            customer_id = customer["CUSTOMER_ID"]
            chat_date = last_sales_timestamp + timedelta(days=random.randint(1,days_window))
            if happy:
                internal_id = random.randint(6, 9)  # neutral-purchase to satisfied comments
            else:
                internal_id = random.randint(1, 4)  # complaint comments

            feedback.append([feedback_id, customer_id, chat_date, internal_id])

        if not feedback:
            # Nothing to add for an empty customer list.
            return

        df_feedback = pd.DataFrame(feedback, columns=[
                "FEEDBACK_ID", "CUSTOMER_ID", "CHAT_DATE",
                "INTERNAL_ID"
        ])

        feedback = session.create_dataframe(df_feedback).drop_duplicates()
        feedback = feedback.with_column("chat_date", F.col("chat_date").cast(T.DateType()))
        feedback = feedback.with_column("customer_id", F.col("customer_id").cast(T.StringType()))

        # Stage the rows in a table, then join them with the pre-generated comments
        feedback.write.mode("overwrite").save_as_table("temp_feedback")

        feedback_df = session.table("temp_feedback")
        comments_df = session.table("comments_temp")

        temp_feedback_df = feedback_df.join(comments_df, feedback_df["internal_id"] == comments_df["id"]) \
                                        .select(feedback_df["feedback_id"], 
                                                feedback_df["customer_id"], 
                                                feedback_df["chat_date"], 
                                                feedback_df["internal_id"], 
                                                comments_df["comment"])

        feedback_sentiment_df = temp_feedback_df.with_columns(["sentiment"], [sentiment(F.col("comment"))])

        feedback_sentiment_df.write.mode("append").save_as_table('new_feedback_sentiment')

    # Work on the most recent snapshot only
    churn_df = session.table(churn_base_table)
    latest_timestamp = churn_df.select(F.max(F.col("TIMESTAMP"))).collect()[0][0]
    churn_df = churn_df.filter(F.col("TIMESTAMP") == F.lit(latest_timestamp))

    num_customers = churn_df.select("CUSTOMER_ID").distinct().count()

    # Keep the sort key in the projection so the ORDER BY is valid regardless of
    # how the query is flattened; rows are still read via row["CUSTOMER_ID"].
    # NOTE(review): descending order puts the highest-sentiment customers in the
    # churn slice below - confirm that direction is intended.
    all_customers_ordered_sentiment = (
        churn_df.select("CUSTOMER_ID", "SENTIMENT_AVG_2")
        .orderBy(F.col("SENTIMENT_AVG_2").desc())
        .collect()
    )

    churn_rate = random.uniform(0.45, 0.55)
    num_churn = int(num_customers * churn_rate)

    churn_customers = all_customers_ordered_sentiment[:num_churn]
    active_customers = all_customers_ordered_sentiment[num_churn:]

    last_sales_timestamp = session.table(new_sales_table).select(F.max(F.col("transaction_date"))).collect()[0][0]

    print ("Add for active customers")
    add_transactions(active_customers, last_sales_timestamp, days_window)

    print ('Adding feedback for churn customers')
    add_sentiment(churn_customers, False)
    print ('Adding feedback for happy customers')
    add_sentiment(active_customers, True)

    return "Transactions added"

In [None]:
# Build features for the timestamp of the last sale in NEW_SALES and append them to the
# same feature table used for the baseline (baseline_features_builing_dataset).
# With that profile, we call the function that adds more sales based on it, and repeat.
# NOTE(review): the feature function joins against the CUSTOMERS table, while
# generate_data_skew writes CUST2-* ids to NEW_CUSTOMERS / NEW_SALES - confirm the
# snapshots built here contain the customers you expect.

db = session.get_current_database()
sc = session.get_current_schema()
table_features = 'baseline_features_builing_dataset'

session.sql(f'use schema {sc}').collect()

sales_df = session.table("NEW_SALES")
n_transactions = sales_df.count()
print (n_transactions)

# Count the reviews of the drifted data set here instead of printing the value left
# over from the baseline cell (stale, and for a different table).
num_new_reviews = session.table("NEW_FEEDBACK_SENTIMENT").count()
print (f'num reviews: {num_new_reviews}')

for i in range(4):

    # Customer profiles for a given timestamp (last sales date)
    sales_df = session.table("NEW_SALES")

    last_sale_timestamp = sales_df.select(F.max(F.col("transaction_date"))).collect()[0][0]

    print (f'Building features for timestamp: {last_sale_timestamp}')
    uc01_feature_engineering_generation(db, sc, 'NEW_SALES', 'NEW_FEEDBACK_SENTIMENT', last_sale_timestamp, table_features)

    # Add 30 more days of sales to the NEW_SALES table, based on the latest
    # customer profile snapshot in table_features
    print (f'adding more sales')
    add_new_top_sales_skew (session, table_features, 'NEW_SALES',30)

    # Row count after the new sales were appended
    n_transactions = sales_df.count()
    print (n_transactions)
    

In [None]:
select min(transaction_date), max(transaction_date) from NEW_SALES;

In [None]:
select min(transaction_date), max(transaction_date) from SALES;

We have simulated sales based on customer profiles. We are now going to write the customer, sales and sentiment data to CSV files that will be used later to simulate a pipeline that ingests data, trains ML models and produces inferences to detect possible customer churn.

In [None]:
CREATE OR REPLACE STAGE CSV
  DIRECTORY = (ENABLE = TRUE)
  ENCRYPTION = ( TYPE = 'SNOWFLAKE_SSE' );

In [None]:
import snowflake.snowpark as snowpark
import  snowflake.snowpark.functions as F
import datetime


def unload_table_by_month(session: snowpark.Session, stage_name: str, table: str,
                                columns: str, date_column: str):
    """Unload a table to a stage as one CSV file per calendar month.

    For every distinct (year, month) found in ``date_column`` a
    ``COPY INTO @<stage_name>/<table>_<year>_<month>.csv`` statement is run
    that selects ``columns`` from ``table`` for that month only. Months are
    exported in chronological order.

    Args:
        session: Snowpark session used to read the table and run the COPY.
        stage_name: Target stage name, without the leading ``@``.
        table: Table to export; also used as the file-name prefix.
        columns: Comma-separated column list interpolated into the SELECT.
        date_column: Column passed to ``year()`` / ``month()`` to split rows.

    Returns:
        A completion message naming the stage.

    Notes:
        * Rows whose ``date_column`` is NULL belong to no month and are
          skipped (a notice is printed) instead of producing an invalid
          ``year(...) = None`` predicate.
        * All arguments are interpolated into SQL verbatim, so they must
          come from trusted code, not user input.
        * NOTE(review): the COPY does not set OVERWRITE, so re-running
          against a stage that still holds the files presumably fails --
          confirm.
    """
    date_col = F.col(date_column)

    # Distinct (year, month) pairs present in the table.
    rows = (
        session.table(table)
        .select(F.year(date_col).alias("year"), F.month(date_col).alias("month"))
        .distinct()
        .collect()
    )

    # Drop the NULL-date pair (if any) and sort so exports run chronologically.
    months = sorted(
        (row["YEAR"], row["MONTH"])
        for row in rows
        if row["YEAR"] is not None and row["MONTH"] is not None
    )
    if len(months) < len(rows):
        print(f"Skipping rows of {table} where {date_column} is NULL")

    # One COPY INTO per month, each written as a single file.
    for y, m in months:
        output_file = f"{table}_{y}_{m}.csv"
        query = f"""
            COPY INTO @{stage_name}/{output_file}
            FROM (SELECT {columns}           
            FROM {table} WHERE year({date_column}) = {y} AND month({date_column}) = {m})
            FILE_FORMAT = (TYPE = 'CSV' FIELD_OPTIONALLY_ENCLOSED_BY='"')
            SINGLE = TRUE;
        """
        session.sql(query).collect()
        print(f"Exported {output_file} to {stage_name}")

    return f"CSV export completed to {stage_name}"

In [None]:
# Columns of SALES to export; interpolated as-is into the SELECT of each COPY.
sales_columns = """TRANSACTION_ID,
                    CUSTOMER_ID,
                    TRANSACTION_DATE,
                    DISCOUNT_APPLIED,
                    NUM_ITEMS,
                    PAYMENT_METHOD, 
                    TOTAL_AMOUNT"""

# One CSV per month of transaction_date, written to the CSV stage.
unload_table_by_month(
    session,
    stage_name='CSV',
    table='sales',
    columns=sales_columns,
    date_column='transaction_date',
)


In [None]:
# Columns of FEEDBACK_RAW to export; interpolated as-is into the SELECT of each COPY.
feedback_columns = """CHAT_DATE,
                    COMMENT,
                    CUSTOMER_ID,
                    FEEDBACK_ID,
                    INTERNAL_ID
                    """

# One CSV per month of chat_date, written to the CSV stage.
unload_table_by_month(
    session,
    stage_name='CSV',
    table='feedback_raw',
    columns=feedback_columns,
    date_column='chat_date',
)

In [None]:
# Columns of FEEDBACK_SENTIMENT to export; interpolated as-is into the SELECT of each COPY.
feedback_sentiment_columns = """
                    FEEDBACK_ID,
                    CHAT_DATE,
                    CUSTOMER_ID,
                    INTERNAL_ID,
                    COMMENT,
                    SENTIMENT
                    """

# One CSV per month of chat_date, written to the CSV stage.
unload_table_by_month(
    session,
    stage_name='CSV',
    table='feedback_sentiment',
    columns=feedback_sentiment_columns,
    date_column='chat_date',
)

In [None]:
-- Unload the customers table into a single CSV file in the CSV stage.
COPY INTO @CSV/customers.csv
FROM (
    SELECT CUSTOMER_ID, AGE, CUSTOMER_SEGMENT, GENDER, LOCATION, SIGNUP_DATE
    FROM customers
)
FILE_FORMAT = (TYPE = 'CSV' FIELD_OPTIONALLY_ENCLOSED_BY='"')
SINGLE = TRUE;

In [None]:
-- Unload the new_customers table into a single CSV file in the CSV stage.
COPY INTO @CSV/new_customers.csv
FROM (
    SELECT CUSTOMER_ID, AGE, CUSTOMER_SEGMENT, GENDER, LOCATION, SIGNUP_DATE
    FROM new_customers
)
FILE_FORMAT = (TYPE = 'CSV' FIELD_OPTIONALLY_ENCLOSED_BY='"')
SINGLE = TRUE;

In [None]:
# Columns of NEW_SALES to export; interpolated as-is into the SELECT of each COPY.
sales_columns = """TRANSACTION_ID,
                    CUSTOMER_ID,
                    TRANSACTION_DATE,
                    DISCOUNT_APPLIED,
                    NUM_ITEMS,
                    PAYMENT_METHOD, 
                    TOTAL_AMOUNT"""

# One CSV per month of transaction_date for the simulated sales.
unload_table_by_month(
    session,
    stage_name='CSV',
    table='new_sales',
    columns=sales_columns,
    date_column='transaction_date',
)

In [None]:
# Columns of the new raw-feedback table to export; interpolated as-is into each COPY.
feedback_columns = """CHAT_DATE,
                    COMMENT,
                    CUSTOMER_ID,
                    FEEDBACK_ID,
                    INTERNAL_ID
                    """

# One CSV per month of chat_date for the simulated feedback.
# NOTE(review): the table name ends in "2", unlike the other exports -- confirm
# that new_feedback_raw2 is the intended source table.
unload_table_by_month(
    session,
    stage_name='CSV',
    table='new_feedback_raw2',
    columns=feedback_columns,
    date_column='chat_date',
)

In [None]:
-- Show the files currently held in the CSV stage.
LIST @CSV;

In [None]:
-- Delete the December 2024 NEW_SALES export from the stage.
-- NOTE(review): presumably dropped because the simulation leaves that month incomplete -- confirm.
REMOVE @csv/new_sales_2024_12.csv