In [1]:
# 2.1 Basic setup
import pandas as pd
import numpy as np
import datetime as dt
import random

# NLP and modeling
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 2.2 Reproducibility: one seed shared by both global random number generators
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Display tweak (optional): show up to 100 columns before pandas truncates wide frames
pd.options.display.max_columns = 100

In [2]:
# 3.1 Markets and sales channels to simulate (based on user preferences)
regions = [
    "USA",
    "UK",
    "Canada",
    "Other",
]
channels = [
    "Online",
    "Large chains",
    "Small stores",
    "Mixed",  # could be folded into the other channels
]

# 3.2 Customer groups the analysis cares about
demographics = [
    "Families",
    "Seniors",
    "Young adults",
    "Teens",
]

# 3.3 Ten-year window: January 2016 through December 2025
start_date = dt.date(year=2016, month=1, day=1)
end_date = dt.date(year=2025, month=12, day=31)

# 3.4 One timestamp per month, anchored on the first day of the month ("MS" = month start)
dates = pd.date_range(start_date, end_date, freq="MS")

In [3]:
def simulate_sales_data(dates, regions, channels, base_sales=100_000, channel_margins=None):
    """Simulate sales and profit-margin records for every date/region/channel combination.

    For each date, sales are ``base_sales`` multiplied by:

    * a seasonal factor (1.3 in June-August, otherwise 1.0),
    * a trend factor of ``1 + 0.03 * (date.year - dates[0].year)``,
    * Gaussian noise with mean 1.0 and standard deviation 0.15.

    The profit margin is the channel's base margin plus Gaussian noise
    (standard deviation 0.02), clamped to the range [0.05, 0.40].

    Random draws use NumPy's global generator (``np.random.normal``), two per
    record (sales noise first, then margin noise), so seeding ``np.random``
    beforehand makes the output reproducible.

    Parameters
    ----------
    dates : indexable sequence of objects exposing ``.year`` and ``.month``
        Periods to simulate; the first element defines the trend baseline year.
    regions : iterable of str
        Region labels.
    channels : iterable of str
        Channel labels; each must have an entry in the margin table.
    base_sales : float, default 100_000
        Sales level before seasonality, trend and noise are applied.
    channel_margins : mapping of str to float, optional
        Base profit margin per channel. Defaults to the built-in table for
        "Online", "Large chains", "Small stores" and "Mixed".

    Returns
    -------
    pandas.DataFrame
        One row per (date, region, channel) with columns ``date``, ``region``,
        ``channel``, ``sales_usd`` and ``profit_margin``. The columns are present
        even when there are no rows.

    Raises
    ------
    KeyError
        If any channel has no base margin configured.
    """
    default_margins = {
        "Online": 0.25,
        "Large chains": 0.20,
        "Small stores": 0.22,
        "Mixed": 0.21,
    }
    margins = default_margins if channel_margins is None else channel_margins

    # Materialise once so one-shot iterables survive being looped over for every date.
    regions = list(regions)
    channels = list(channels)

    # Fail fast with a readable message instead of a bare KeyError deep inside the loops.
    unknown = [channel for channel in channels if channel not in margins]
    if unknown:
        raise KeyError(f"No base margin configured for channel(s): {unknown}")

    columns = ["date", "region", "channel", "sales_usd", "profit_margin"]
    first_year = dates[0].year if len(dates) else None

    records = []
    for date in dates:
        # Seasonality and trend depend only on the date, so compute them once per date.
        seasonal_factor = 1.3 if date.month in (6, 7, 8) else 1.0
        trend_factor = 1 + 0.03 * (date.year - first_year)  # ~3% per year
        expected_sales = base_sales * seasonal_factor * trend_factor

        for region in regions:
            for channel in channels:
                # Keep this draw order (sales noise, then margin noise): seeded
                # runs must keep producing the same numbers as before.
                noise = np.random.normal(loc=1.0, scale=0.15)
                margin_noise = np.random.normal(0, 0.02)

                profit_margin = min(max(margins[channel] + margin_noise, 0.05), 0.40)  # clamp

                records.append({
                    "date": date,
                    "region": region,
                    "channel": channel,
                    "sales_usd": expected_sales * noise,
                    "profit_margin": profit_margin,
                })

    # Passing `columns` keeps the schema stable even when no records were produced.
    return pd.DataFrame(records, columns=columns)

# Generate the synthetic sales table for the configured months, regions and
# channels, then preview its first rows.
sales_df = simulate_sales_data(dates=dates, regions=regions, channels=channels)
sales_df.head()

Unnamed: 0,date,region,channel,sales_usd,profit_margin
0,2016-01-01,USA,Online,107450.712295,0.247235
1,2016-01-01,USA,Large chains,109715.328072,0.230461
2,2016-01-01,USA,Small stores,96487.699379,0.215317
3,2016-01-01,USA,Mixed,123688.192233,0.225349
4,2016-01-01,UK,Online,92957.884211,0.260851


In [4]:
# 5.1 Sample customer comments, keyed by the demographic they are written to sound like
demo_text_snippets = {
    "Families": [
        "Great juice for family dinners.",
        "Kids love the mango flavor, perfect for parties.",
        "Affordable and tasty for our household.",
    ],
    "Seniors": [
        "Easy to drink and not too sweet.",
        "Enjoyed with lunch, good price for pensioners.",
        "Refreshing and light, suits older taste buds.",
    ],
    "Young adults": [
        "Perfect for my gym bag after workout.",
        "Love the bold flavor during weekend hangouts.",
        "Trendy drink, great for social events.",
    ],
    "Teens": [
        "Awesome taste for after school.",
        "Vibrant flavor, everyone in class talks about it.",
        "Cool looking can, fits my backpack.",
    ],
}


# 5.2 Draw one comment at random, optionally restricted to a single demographic
def generate_text_for_record(expected_demo=None):
    """Return one randomly chosen snippet from ``demo_text_snippets``.

    If ``expected_demo`` is truthy and is a key of ``demo_text_snippets``, the
    choice is limited to that demographic's snippets. Otherwise (no argument,
    or a name that is not a key) the choice is made over all snippets of all
    demographics, in dictionary order.

    The draw uses the ``random`` module's global generator.
    """
    if expected_demo and expected_demo in demo_text_snippets:
        return random.choice(demo_text_snippets[expected_demo])

    # No usable demographic given: pool every snippet together
    pool = []
    for snippets in demo_text_snippets.values():
        pool.extend(snippets)
    return random.choice(pool)

# 5.3 Attach one text snippet per record for demonstration
#   In reality, you might have multiple texts per period or aggregated comments.
#   generate_text_for_record() is called with no argument and the lambda ignores
#   `row`, so each snippet is drawn from the full pool of all demographics and
#   does not depend on the record's date, region, channel, sales or margin.
#   This adds (or overwrites) the 'customer_text' column on sales_df in place.
sales_df['customer_text'] = sales_df.apply(
    lambda row: generate_text_for_record(), axis=1
)

# Preview five random rows. No random_state is passed here.
# NOTE(review): presumably this falls back to NumPy's global RNG, so the rows
# shown depend on how many draws preceded this cell — confirm if the preview
# needs to be stable.
sales_df.sample(5)

Unnamed: 0,date,region,channel,sales_usd,profit_margin,customer_text
1268,2022-08-01,UK,Online,175853.542712,0.268369,"Trendy drink, great for social events."
1084,2021-08-01,Other,Online,144252.10466,0.259327,Great juice for family dinners.
1017,2021-04-01,Canada,Large chains,133288.620904,0.255193,Awesome taste for after school.
944,2020-12-01,USA,Online,104039.498969,0.249711,"Cool looking can, fits my backpack."
1884,2025-10-01,Other,Online,139662.587299,0.238942,Awesome taste for after school.


In [5]:
# 6.1 Basic keyword-based demographic indicator columns
# One case-sensitive pattern per demographic, applied to lower-cased text.
# Matching is by substring (no word boundaries), so e.g. "trend" also matches "trendy".
_DEMO_KEYWORD_PATTERNS = {
    "Families": re.compile(r"family|kids|household"),
    "Seniors": re.compile(r"pensioners|older|senior|light"),
    "Young adults": re.compile(r"gym|trend|social|weekend"),
    "Teens": re.compile(r"school|class|backpack|cool"),
}


def keyword_to_demo_flags(text):
    """Map a piece of customer text to 0/1 keyword flags, one per demographic.

    Parameters
    ----------
    text : str or missing value
        Free-text comment. Anything that is not a ``str`` (``None``, NaN, ...)
        is treated as containing no keywords rather than raising.

    Returns
    -------
    dict
        Keys "Families", "Seniors", "Young adults", "Teens" (in that order),
        each mapped to 1 if any of that demographic's keywords occurs in the
        lower-cased text, else 0. A text can set several flags, or none.
    """
    # Real text columns often contain None/NaN; calling .lower() on those would raise.
    if not isinstance(text, str):
        return {demo: 0 for demo in _DEMO_KEYWORD_PATTERNS}

    lowered = text.lower()
    return {
        demo: int(pattern.search(lowered) is not None)
        for demo, pattern in _DEMO_KEYWORD_PATTERNS.items()
    }

# Expand the per-text flag dicts into one indicator column per demographic.
# Building the frame directly from the list of dicts avoids the slow per-row
# `.apply(pd.Series)` expansion; the original row index is kept for alignment.
demo_flags_df = pd.DataFrame(
    sales_df['customer_text'].map(keyword_to_demo_flags).tolist(),
    index=sales_df.index,
)

# Drop flag columns left over from an earlier run of this cell first, so that
# re-executing it replaces them instead of appending duplicate column names.
sales_df = pd.concat(
    [sales_df.drop(columns=demo_flags_df.columns, errors="ignore"), demo_flags_df],
    axis=1,
)
sales_df.head()

Unnamed: 0,date,region,channel,sales_usd,profit_margin,customer_text,Families,Seniors,Young adults,Teens
0,2016-01-01,USA,Online,107450.712295,0.247235,"Vibrant flavor, everyone in class talks about it.",0,0,0,1
1,2016-01-01,USA,Large chains,109715.328072,0.230461,"Kids love the mango flavor, perfect for parties.",1,0,0,0
2,2016-01-01,USA,Small stores,96487.699379,0.215317,Great juice for family dinners.,1,0,0,0
3,2016-01-01,USA,Mixed,123688.192233,0.225349,"Cool looking can, fits my backpack.",0,0,0,1
4,2016-01-01,UK,Online,92957.884211,0.260851,"Enjoyed with lunch, good price for pensioners.",0,1,0,0


In [14]:
import pandas as pd
import csv
import warnings

# Load the raw sales export leniently: quote characters are treated as plain
# text and malformed rows are skipped, but every skipped row is now reported
# instead of being dropped silently.
df = pd.read_csv(
    "Rubicondrinksales.csv",
    sep=",",                 # or change to ';' if needed
    engine="python",
    quoting=csv.QUOTE_NONE,  # do not give quote characters any special meaning
    on_bad_lines="warn"      # skip corrupted rows, but emit a warning for each
)

# A single parsed column usually means the delimiter is wrong or the file does
# not contain tabular data at all; surface that instead of carrying on quietly.
if df.shape[1] <= 1:
    warnings.warn(
        "Rubicondrinksales.csv was parsed into "
        f"{df.shape[1]} column(s) {list(df.columns)}; "
        "check the delimiter and that the file really contains CSV data."
    )

print(df.head())
print(df.columns)


                      import csv
0                # Data to write
1                       data = [
2                              ]
3               # Write CSV file
4      writer = csv.writer(file)
Index(['import csv'], dtype='object')


In [17]:
# -----------------------------------------------------------------
# Self-contained Random Forest regression demo on placeholder data
# -----------------------------------------------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Seed shared by the data generation, the split and the forest
RANDOM_SEED = 42

# Placeholder dataset (replace with the real one): 100 rows of uniform random
# numbers per column, drawn in the order feature1, feature2, feature3, target.
np.random.seed(RANDOM_SEED)
df = pd.DataFrame(
    {
        column: np.random.rand(100)
        for column in ("feature1", "feature2", "feature3", "target")
    }
)

# Target is the "target" column; predictors are all remaining columns
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Build and train the forest: 200 trees, depth capped at 10 (n_jobs=-1 requests parallel fitting)
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=RANDOM_SEED,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = rf.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)

Mean Squared Error: 0.09507802509027001
R² Score: 0.07906159240808563


In [18]:
# 9.1 Score the test rows with the fitted forest
y_pred = rf.predict(X_test)

# 9.2 Test-set mean squared error and R^2, printed with fixed precision
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Test MSE: {mse:.6f}")
print(f"Test R^2: {r2:.4f}")

Test MSE: 0.095078
Test R^2: 0.0791


In [19]:
# 10.1 Pair each input column of X with the forest's importance score, highest first
importances = rf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': X.columns})
    .assign(importance=importances)
    .sort_values(by='importance', ascending=False)
)

# Show at most the 20 highest-ranked features
feature_importance_df.head(20)

Unnamed: 0,feature,importance
0,feature1,0.384636
1,feature2,0.350734
2,feature3,0.26463


In [20]:
# 11.1 Example: baseline record — the first test row (kept as a one-row frame)
# and the model's prediction for it
baseline = X_test.iloc[0:1].copy()
baseline_pred = rf.predict(baseline)[0]

# 11.2 Modify a demographic flag or channel indicator
# Suppose add a Teens flag: set it to 1 on a copy of the baseline and compare predictions.
if 'Teens' in baseline.columns:
    baseline_mod = baseline.copy()
    baseline_mod['Teens'] = 1
    pred_mod = rf.predict(baseline_mod)[0]
    print("Baseline prediction:", baseline_pred)
    print("Modified prediction with Teens=1:", pred_mod)
else:
    # Without this branch the cell produced no output at all when the model was
    # trained on a feature set that has no 'Teens' column.
    print(
        "Scenario skipped: 'Teens' is not among the model features "
        f"{list(baseline.columns)}."
    )

In [22]:
import joblib

# 12.1 Persist the fitted forest to disk
joblib.dump(rf, "rf_profit_margin_model.pkl")

# 12.2 Persist the feature-importance table (row index omitted)
feature_importance_df.to_csv("feature_importance.csv", index=False)

# 12.3 Rebuild the modelling table — the feature columns plus the target stored
# under the name "profit_margin" — and write it out without the row index.
# NOTE(review): X and y appear to come from the placeholder random dataset
# (y is df["target"]), so the "profit_margin" label and the file names may not
# describe real profit-margin data — confirm before sharing these artefacts.
model_df = X.assign(profit_margin=y)
model_df.to_csv("rubicon_model_data.csv", index=False)