
# Corporate Pricing ML Demo — Annotated (Colab-Ready)

This notebook:
1. **Generates** synthetic corporate pricing data with a hidden **nonlinear** pattern.
2. **Trains** a Random Forest to learn that pattern.
3. **Visualizes** the discovered structure with Matplotlib, and **annotates** the "sweet spot."

The hidden signal features a **ring-shaped sweet spot** in the `(discount_pct, competitor_index)` plane, a nonlinear **seasonality × discount** interaction, and a localized **marketing × competitor** bump.


In [None]:

# (Colab) Optional installs. Usually preinstalled.
# %pip install -q scikit-learn matplotlib pandas numpy


In [None]:

# --- 1) Data Generation ---
import numpy as np
import pandas as pd

# Seeded generator: every draw below comes from this single stream, so the
# order of the rng.* calls determines the resulting dataset.
rng = np.random.default_rng(42)
n = 6000

# --- Observable features ---
product_id = rng.integers(1000, 1100, size=n)  # IDs 1000..1099 (100 SKUs)
region = rng.choice(
    ["NA", "EU", "APAC", "LATAM", "MEA"],
    size=n,
    p=[0.32, 0.28, 0.2, 0.12, 0.08],
)
month = rng.integers(1, 13, size=n)  # 1..12 (upper bound is exclusive)
season_pos = 2*np.pi*(month-1)/12.0  # month mapped to an angle in [0, 2*pi)
base_price = np.clip(rng.normal(100, 15, size=n), 40, 180)
discount_pct = np.clip(rng.normal(0.15, 0.10, size=n), 0.0, 0.5)  # 0-50%
competitor_index = np.clip(rng.normal(1.00, 0.10, size=n), 0.75, 1.25)  # 1.0 ~ parity
marketing_spend = np.clip(rng.lognormal(mean=2.5, sigma=0.6, size=n), 5, 150)
product_age_months = np.maximum(rng.integers(1, 60, size=n), 1)
stock_days = np.clip(rng.normal(25, 7, size=n), 5, 60)

# --- Inputs to the latent signal ---
# Marketing spend is min-max scaled to [0, 1]; discount and competitor index
# are used on their native scales under shorter aliases.
mkt_lo = marketing_spend.min()
mkt_hi = marketing_spend.max()
mkt_norm = (marketing_spend - mkt_lo)/(mkt_hi - mkt_lo)
disc = discount_pct
comp = competitor_index

# --- Hidden nonlinear signal ---
# (1) Ring-shaped "sweet spot": a Gaussian in the distance from the point
#     (0.22, 1.05), peaking at distance 0.10 with width 0.035.
r = np.sqrt((disc-0.22)**2 + (comp-1.05)**2)
ring_two_var = 2*(0.035**2)
ring = np.exp(-((r-0.10)**2)/ring_two_var)

# (2) Seasonality x discount interaction: the discount shifts the phase of a
#     three-cycles-per-year sinusoid.
seasonal = 0.9*np.sin(3*season_pos + 6*disc)

# (3) Localized marketing x competitor bump centred at (0.6, 1.02).
bump_two_var = 2*0.06**2
bump_dist_sq = (mkt_norm-0.6)**2 + (comp-1.02)**2
bump = 1.4*np.exp(-(bump_dist_sq/bump_two_var))

# --- Mild categorical region effect ---
region_map = {"NA":0.10, "EU":0.05, "APAC":0.08, "LATAM":0.12, "MEA":0.06}
region_eff = np.array([region_map[code] for code in region])

# --- Assemble the target: base demand + price elasticity + latent + noise ---
base_demand = 50 + 6*region_eff + 8*np.sin(season_pos)
net_price = base_price*(1-discount_pct)
price_elasticity = -0.25*(net_price - 90)
latent_signal = 18*ring + 10*seasonal + 14*bump
noise = rng.normal(0, 4.0, size=n)

# Demand index (the regression target)
demand_index = (base_demand + price_elasticity + latent_signal + noise)

# Realized sales: demand plus extra noise, floored at zero. Stored in the
# dataset only; it is not part of the hidden signal.
sales_noise = rng.normal(0, 3.0, size=n)
sales_qty = np.clip(demand_index + sales_noise, 0, None)

# --- Collect into a frame (column order is the on-disk CSV order) ---
columns = {
    "product_id": product_id,
    "region": region,
    "month": month,
    "base_price": base_price,
    "discount_pct": discount_pct,
    "competitor_index": competitor_index,
    "marketing_spend": marketing_spend,
    "product_age_months": product_age_months,
    "stock_days": stock_days,
    "demand_index": demand_index,
    "sales_qty": sales_qty,
}
df = pd.DataFrame(columns)

df.to_csv("pricing_data.csv", index=False)
print("Saved pricing_data.csv with", len(df), "rows.")
df.head()


In [None]:

# --- 2) Model Training & Pattern Discovery ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset written by the data-generation step.
df = pd.read_csv("pricing_data.csv")

target = "demand_index"
features = ["region","month","base_price","discount_pct","competitor_index",
            "marketing_spend","product_age_months","stock_days"]

X = df[features]
y = df[target]

# Preprocess: one-hot encode region, pass the numeric columns through unchanged.
cat_features = ["region"]
num_features = [c for c in features if c not in cat_features]

pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", "passthrough", num_features)
    ]
)

rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=0
)

pipe = Pipeline([("pre", pre), ("rf", rf)])

# Hold out 25% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

print(f"R^2 on test: {r2_score(y_test, pred):.3f}")

# Feature importance, aggregated back to the original features.
# The forest sees one column per one-hot category, so a categorical feature's
# importance is spread over several encoded columns. Summing those columns
# gives one comparable score per raw feature.
ohe = pipe.named_steps["pre"].named_transformers_["cat"]
cat_names = list(ohe.get_feature_names_out(cat_features))
final_feature_names = cat_names + num_features

# For every encoded column, record the raw feature it came from. The encoded
# layout follows the ColumnTransformer order: categorical block, then numerics.
source_features = [
    feat
    for feat, categories in zip(cat_features, ohe.categories_)
    for _ in categories
] + num_features

importances = pipe.named_steps["rf"].feature_importances_
assert len(source_features) == len(final_feature_names) == len(importances), \
    "encoded column count does not match the forest's feature count"

fi = (
    pd.DataFrame({"feature": source_features, "importance": importances})
    .groupby("feature", as_index=False, sort=False)["importance"]
    .sum()
    .sort_values("importance", ascending=False)
)
print("\nTop feature importances:")
print(fi.head(10))

# Create a prediction grid over (discount_pct, competitor_index) holding others at medians/mode.
# The 1st-99th percentile range keeps the grid inside the well-populated region.
disc_grid = np.linspace(df["discount_pct"].quantile(0.01),
                        df["discount_pct"].quantile(0.99), 100)
comp_grid = np.linspace(df["competitor_index"].quantile(0.01),
                        df["competitor_index"].quantile(0.99), 100)
# D and C have shape (len(comp_grid), len(disc_grid)): rows vary competitor
# index, columns vary discount.
D, C = np.meshgrid(disc_grid, comp_grid)

# Baseline row with medians/mode for every feature not on the grid axes.
baseline = {
    "region": df["region"].mode()[0],
    "month": int(df["month"].median()),
    "base_price": df["base_price"].median(),
    "marketing_spend": df["marketing_spend"].median(),
    "product_age_months": df["product_age_months"].median(),
    "stock_days": df["stock_days"].median()
}

grid_df = pd.DataFrame({
    "region": np.repeat(baseline["region"], D.size),
    "month": np.repeat(baseline["month"], D.size),
    "base_price": np.repeat(baseline["base_price"], D.size),
    "discount_pct": D.ravel(),
    "competitor_index": C.ravel(),
    "marketing_spend": np.repeat(baseline["marketing_spend"], D.size),
    "product_age_months": np.repeat(baseline["product_age_months"], D.size),
    "stock_days": np.repeat(baseline["stock_days"], D.size)
})

# Predict on the flattened grid, then restore the 2-D grid shape.
grid_pred = pipe.predict(grid_df).reshape(D.shape)

# Save grid predictions for the plotting step
np.savez("rf_surface.npz",
         disc_grid=disc_grid, comp_grid=comp_grid, grid_pred=grid_pred)

fi.to_csv("feature_importances.csv", index=False)
print("\nSaved rf_surface.npz and feature_importances.csv")


In [None]:

# --- 3) Visualization (Matplotlib) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the saved prediction surface. The context manager closes the .npz file
# handle once the arrays have been read into memory.
with np.load("rf_surface.npz") as data:
    disc_grid = data["disc_grid"]
    comp_grid = data["comp_grid"]
    Z = data["grid_pred"]

df = pd.read_csv("pricing_data.csv")

# Take a small random sample for overlay (capped so a smaller dataset still works)
sample = df.sample(min(800, len(df)), random_state=1)

fig, ax = plt.subplots(figsize=(8,6))

# imshow's `extent` describes the OUTER edges of the image, whereas the grid
# vectors hold cell centres. Pad by half a grid step on each side so every
# cell is drawn centred on its grid point and lines up with the scatter.
half_dx = 0.5*(disc_grid.max() - disc_grid.min())/max(len(disc_grid) - 1, 1)
half_dy = 0.5*(comp_grid.max() - comp_grid.min())/max(len(comp_grid) - 1, 1)

# Heatmap of RF predictions (nonlinear structure)
im = ax.imshow(
    Z, origin="lower",
    extent=[disc_grid.min() - half_dx, disc_grid.max() + half_dx,
            comp_grid.min() - half_dy, comp_grid.max() + half_dy],
    aspect="auto"
)
cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Predicted demand index (RF)")

# Overlay sampled rows, coloured by their demand_index. The scatter has its
# own colour scale and colourbar, independent of the heatmap's.
sc = ax.scatter(sample["discount_pct"], sample["competitor_index"],
                c=sample["demand_index"], s=12, alpha=0.7, edgecolors="none")
cbar2 = plt.colorbar(sc, ax=ax, fraction=0.046, pad=0.04)
cbar2.set_label("Actual demand index (sample)")

ax.set_xlabel("Discount percentage")
ax.set_ylabel("Competitor price index")
ax.set_title("Machine Learning Uncovers Nonlinear Pricing Pattern")

plt.tight_layout()
plt.savefig("rf_discovered_pattern.png", dpi=160)
plt.show()

print("Saved rf_discovered_pattern.png")


In [None]:

# --- 4) Annotated Sweet Spot ---
# Two approaches:
# (A) Draw the *known* hidden ring (theoretical center & radius used in data gen).
# (B) Auto-detect a high-demand ridge via a high-percentile contour of model predictions.

import numpy as np
import matplotlib.pyplot as plt

# Load the saved prediction surface. The context manager closes the .npz file
# handle once the arrays have been read into memory.
with np.load("rf_surface.npz") as data:
    disc_grid = data["disc_grid"]
    comp_grid = data["comp_grid"]
    Z = data["grid_pred"]

fig, ax = plt.subplots(figsize=(8,6))

# imshow's `extent` describes the OUTER edges of the image, whereas the grid
# vectors hold cell centres. Pad by half a grid step on each side so the
# heatmap lines up with the contour and circle drawn in data coordinates.
half_dx = 0.5*(disc_grid.max() - disc_grid.min())/max(len(disc_grid) - 1, 1)
half_dy = 0.5*(comp_grid.max() - comp_grid.min())/max(len(comp_grid) - 1, 1)

# Base heatmap
im = ax.imshow(
    Z, origin="lower",
    extent=[disc_grid.min() - half_dx, disc_grid.max() + half_dx,
            comp_grid.min() - half_dy, comp_grid.max() + half_dy],
    aspect="auto"
)
plt.colorbar(im, ax=ax, label="Predicted demand index (RF)")

# (A) Known hidden ring (from data-generation parameters).
# These values must be kept in sync with the ring used when generating the data.
center = (0.22, 1.05)
radius = 0.10

# Draw dashed circle (style only; no explicit colors to keep it generic).
# With aspect="auto" the circle is exact in data units but may look elliptical.
circle = plt.Circle(center, radius, fill=False, linewidth=2.0, linestyle="--")
ax.add_patch(circle)

# Label: the arrow points at the rightmost point of the ring.
ax.annotate(
    "Sweet spot region",
    xy=(center[0] + radius, center[1]),
    xytext=(center[0] + 0.15, center[1] + 0.10),
    arrowprops=dict(shrink=0.05, width=1.5, headwidth=8),
    fontsize=11, fontweight="bold",
    bbox=dict(boxstyle="round,pad=0.3", alpha=0.4)
)

# (B) Auto-detected ridge via percentile contour
level = np.percentile(Z, 95)  # high-demand boundary
# Z has one column per disc_grid entry and one row per comp_grid entry, so
# the saved grid vectors are used directly as the contour coordinates.
CS = ax.contour(
    disc_grid, comp_grid,
    Z, levels=[level], linewidths=2.0, linestyles=":"
)
# A string `fmt` is treated as a %-style format applied to the level value.
# Map each level to its text label explicitly instead of passing a literal
# with no format specifier.
ridge_labels = {lvl: "High-demand ridge" for lvl in CS.levels}
ax.clabel(CS, inline=True, fmt=ridge_labels, fontsize=9)

ax.set_xlabel("Discount percentage")
ax.set_ylabel("Competitor price index")
ax.set_title("Annotated Sweet Spot in Pricing Space")

plt.tight_layout()
plt.savefig("rf_discovered_pattern_annotated.png", dpi=160)
plt.show()

print("Saved rf_discovered_pattern_annotated.png")



## Notes
- The dashed circle marks the **true** sweet spot ring used to generate the data.
- The dotted contour marks an **automatically detected** high-demand ridge: the 95th-percentile level of the model's predicted surface, computed with all other features held at their median/mode.
- Try changing the percentile (e.g., 90/97.5) or swapping in other models (e.g., Gradient Boosting) to compare.
