In [None]:
# Re-import necessary packages after execution environment reset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import joblib

# Display options for notebook output: up to 500 columns, 10 rows, 1000 characters wide.
# (An earlier `display.max_columns = None` setting was immediately overridden by the
# 500 below, so only the effective value is kept.)
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Re-open the DuckDB database file that the queries in this notebook run against
import duckdb

conn = duckdb.connect(database="./data/reviews.db")

In [5]:
# Re-execute the query to get model data.
# The subquery prepends the constant product_id 'P87985432' to every row of
# reviews_clean, so every review is joined to the products_raw row(s) with that id.
# Rows with a NULL rating, ingredients or price_usd are dropped, and a NULL
# helpfulness is replaced by 0.0.
# NOTE(review): the hard-coded id presumes reviews_clean only holds reviews for
# that one product and has no product_id column of its own — confirm.
query = """
SELECT
    r.rating,
    r.isrecommended,
    r.TotalPositiveFeedbackCount,
    coalesce(r.helpfulness,0.0) as helpfulness,
    r.product_id,
    p.price_usd,
    p.ingredients
FROM (select 'P87985432' as product_id,* from reviews_clean) r
JOIN products_raw p ON r.product_id = p.product_id
WHERE r.rating IS NOT NULL AND p.ingredients IS NOT NULL AND p.price_usd IS NOT NULL
"""
# Run the query on the open DuckDB connection and materialise the result as a DataFrame.
df_model = conn.execute(query).df()

# Create target: a review counts as purchase intent when it recommends the
# product AND gives a rating of 4 or higher.
# NOTE(review): `rating` is also listed in feature_cols below, so the target is
# partly a direct function of an input feature (label leakage) — confirm intended.
df_model["purchase_intent"] = (df_model["isrecommended"]) & (df_model["rating"] >= 4)

# Ingredient flags
ingredient_keywords = ["paraben", "fragrance", "talc", "phthalate", "formaldehyde", "mineral oil"]
# Derive each flag column name exactly once so the columns created here and the
# names placed in feature_cols cannot drift apart.
ingredient_flag_cols = {ingr: f'has_{ingr.replace(" ", "_")}' for ingr in ingredient_keywords}
for ingr, col in ingredient_flag_cols.items():
    # Keywords are plain substrings, so match them literally (case-insensitive);
    # rows with a missing ingredient list are flagged False.
    df_model[col] = df_model["ingredients"].str.contains(ingr, case=False, na=False, regex=False)

# Model input features: numeric review/product fields followed by the ingredient flags
feature_cols = ["rating", "TotalPositiveFeedbackCount", "helpfulness", "price_usd"] + list(
    ingredient_flag_cols.values()
)

In [27]:
# Locations of the persisted model and feature-list artifacts.
# Nothing is written in this cell; predict_purchase_intent reads both paths via joblib.load.

model_path = "./data/purchase_intent_model.pkl"
features_path = "./data/purchase_intent_features.pkl"


# Define scoring function
def predict_purchase_intent(review_input, model=None, feature_list=None):
    """Score one review's feature values with the purchase-intent classifier.

    Args:
        review_input: Mapping of feature name -> value for a single review.
            Features absent from the mapping are scored as 0. The mapping
            itself is left unmodified.
        model: Optional fitted classifier exposing ``predict`` and
            ``predict_proba``. When omitted, it is loaded from ``model_path``.
        feature_list: Optional ordered feature names the model expects. When
            omitted, it is loaded from ``features_path``.

    Returns:
        dict with ``predicted_label`` (bool, from ``predict``) and
        ``probability`` (float, the second column of ``predict_proba`` for
        this row — presumably the positive class; confirm against
        ``model.classes_``).
    """
    # joblib.load unpickles arbitrary objects: only point these paths at trusted files.
    loaded_model: LogisticRegression = joblib.load(model_path) if model is None else model
    if feature_list is None:
        feature_list = joblib.load(features_path)
    ordered_features = list(feature_list)

    # Build the model row in a fresh dict so the caller's input is never mutated;
    # any feature the caller did not supply defaults to 0.
    row = {col: (review_input[col] if col in review_input else 0) for col in ordered_features}

    # Select columns explicitly so they arrive in the order the model expects.
    input_df = pd.DataFrame([row])[ordered_features]
    probability = float(loaded_model.predict_proba(input_df)[0][1])
    prediction = loaded_model.predict(input_df)[0]

    return {"predicted_label": bool(prediction), "probability": probability}


# Use real examples
# Cap the sample size so a frame with fewer than 10 rows does not make sample() raise.
n_examples = min(10, len(df_model))
examples = df_model[feature_cols + ["product_id"]].sample(n_examples, random_state=1)

# Score each sampled review, keeping its product id and rating next to the
# prediction so the resulting table is readable on its own.
example_predictions = []
for row in examples.to_dict("records"):
    input_data = {col: row[col] for col in feature_cols}
    result = predict_purchase_intent(input_data)

    example_predictions.append(
        {
            "product_id": row["product_id"],
            "rating": row["rating"],
            "prediction": result["predicted_label"],
            "probability": result["probability"],
        }
    )

# Display predictions
pd.DataFrame(example_predictions)

Unnamed: 0,product_id,rating,prediction,probability
0,P87985432,5,True,0.999975
1,P87985432,5,True,0.999975
2,P87985432,4,True,0.960205
3,P87985432,5,True,0.999975
4,P87985432,2,False,9e-06
5,P87985432,5,True,0.999975
6,P87985432,5,True,0.999975
7,P87985432,5,True,0.999975
8,P87985432,2,False,9e-06
9,P87985432,5,True,0.999975
