In [1]:
# Cell 1: basic imports
# Text processing (standard library)
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# ML & NLP
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Cell: build a synthetic sales & margin dataset for demonstration purposes
np.random.seed(42)  # seed the global NumPy RNG so every draw below is repeatable

# Number of synthetic rows
n_samples = 1000

# Markets covered: UK, USA, Australia, Canada
regions = ['UK', 'USA', 'Australia', 'Canada']

# Demographic dimensions: gender, age group, income level, urban/rural
gender = ['Male', 'Female', 'Other']
age_groups = ['18-24', '25-34', '35-44', '45-54', '55+']
income_levels = ['Low', 'Medium', 'High']
urban_rural = ['Urban', 'Suburban', 'Rural']

# Baseline profit margin (as a fraction) for each region
region_margin_base = {'UK': 0.18, 'USA': 0.15, 'Australia': 0.17, 'Canada': 0.16}

# Columns are drawn in dict order and the margin noise is drawn last.
# Keep that order: every call advances the same seeded RNG stream.
data = pd.DataFrame({
    'region': np.random.choice(regions, size=n_samples, p=[0.4, 0.25, 0.2, 0.15]),
    'gender': np.random.choice(gender, size=n_samples),
    'age_group': np.random.choice(age_groups, size=n_samples),
    'income_level': np.random.choice(income_levels, size=n_samples),
    'urban_rural': np.random.choice(urban_rural, size=n_samples),
    # Simulated monthly sales value (gamma-distributed, strictly positive)
    'monthly_sales_value': np.random.gamma(shape=2.0, scale=5000, size=n_samples),
}).assign(
    # Regional baseline plus Gaussian noise, clipped so the margin stays within 0..1
    profit_margin=lambda frame: (
        frame['region'].map(region_margin_base)
        + np.random.normal(0, 0.02, n_samples)
    ).clip(lower=0, upper=1)
)

data.head()

Unnamed: 0,region,gender,age_group,income_level,urban_rural,monthly_sales_value,profit_margin
0,UK,Other,45-54,High,Urban,10592.966313,0.201963
1,Canada,Female,55+,High,Urban,6974.78115,0.152404
2,Australia,Other,45-54,High,Suburban,16483.015305,0.175693
3,USA,Other,25-34,Low,Urban,10559.961377,0.170219
4,UK,Male,45-54,High,Rural,13517.439917,0.168483


In [3]:
# Placeholder review texts; in real use, swap in actual scraped or collected comments
comments = [
    "Love the fruity taste, perfect for summer!",
    "Too sweet and expensive for daily drinking.",
    "Great alternative when going alcohol-free.",
    "Not widely available in my area.",
    "Enjoyed it at a BBQ, refreshing and fun.",
    "Packaging is better this year, looks nicer.",
]

# Attach one randomly chosen placeholder comment to every row
# (the draw is independent of all other columns)
data['comment'] = np.random.choice(comments, n_samples)
data.head()

Unnamed: 0,region,gender,age_group,income_level,urban_rural,monthly_sales_value,profit_margin,comment
0,UK,Other,45-54,High,Urban,10592.966313,0.201963,"Enjoyed it at a BBQ, refreshing and fun."
1,Canada,Female,55+,High,Urban,6974.78115,0.152404,"Packaging is better this year, looks nicer."
2,Australia,Other,45-54,High,Suburban,16483.015305,0.175693,"Packaging is better this year, looks nicer."
3,USA,Other,25-34,Low,Urban,10559.961377,0.170219,"Packaging is better this year, looks nicer."
4,UK,Male,45-54,High,Rural,13517.439917,0.168483,Not widely available in my area.


In [4]:
def clean_text(text):
    """Normalise a free-text comment before vectorisation.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace removed.

    Punctuation is replaced by a space rather than deleted so that
    hyphenated or punctuation-joined words stay separate tokens
    ("alcohol-free" -> "alcohol free", not "alcoholfree").

    Parameters
    ----------
    text : str or any
        The raw comment. Non-string values (e.g. ``None`` or a float NaN
        from a missing cell) are treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; ``''`` for missing / non-string input.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None / NaN; calling .lower() on them would raise
        return ''
    lowered = text.lower()
    # Replace (rather than delete) punctuation so word boundaries are preserved
    spaced = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    # Collapse the extra whitespace introduced by the substitution above
    return re.sub(r'\s+', ' ', spaced).strip()

# Element-wise normalisation of every raw comment into a new column
data['comment_clean'] = data['comment'].map(clean_text)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Categorical feature columns (one-hot encoded below)
cat_cols = ['region', 'gender', 'age_group', 'income_level', 'urban_rural']
# Numeric feature columns: none for now. monthly_sales_value is deliberately
# left out of the feature set; adjust as needed. (This list is not yet wired
# into the transformer below.)
num_cols = list()

# Pre-processing: TF-IDF on the cleaned comment text, one-hot on the categoricals.
# 'comment_clean' is given as a bare string (not a list) so the vectorizer is
# handed a 1-D column of documents; every column not listed here is dropped.
preprocessor = ColumnTransformer(
    [
        ('tfidf', TfidfVectorizer(max_features=100, stop_words='english'), 'comment_clean'),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    remainder='drop',
)

In [6]:
# Features X (cleaned comment text + categoricals) and continuous target y
X = data[['comment_clean'] + cat_cols]
y = data['profit_margin']

# Train-test split: hold out 20% of the rows, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline combining preprocessing and model.
# clone() gives this pipeline its own unfitted copy of the transformer. A
# Pipeline stores the step objects it is given and fits them in place, and the
# same `preprocessor` object is also a step of the classifier pipeline, so
# without the copy, fitting that pipeline would refit the transformer inside
# this already-trained regressor.
rf_pipeline = Pipeline([
    ('pre', clone(preprocessor)),
    ('rf', RandomForestRegressor(n_estimators=200, random_state=42))
])

# Train
rf_pipeline.fit(X_train, y_train)

# Predict on the held-out rows and report mean squared error
y_pred = rf_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.5f}")

Test MSE: 0.00065


In [7]:
# Binary target: 1 for rows whose margin is at or above the 75th percentile, else 0
threshold = data['profit_margin'].quantile(0.75)
data['margin_high'] = data['profit_margin'].ge(threshold).astype(int)

# Features: cleaned comment text plus the categorical columns.
# X, y and the split variables are re-bound here (the regression cell uses the same names).
X = data[['comment_clean', *cat_cols]]
y = data['margin_high']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Classifier pipeline: preprocessing followed by a 200-tree random forest
rf_clf_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
])

rf_clf_pipeline.fit(X_train, y_train)
y_pred_class = rf_clf_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print(classification_report(y_test, y_pred_class))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       151
           1       0.34      0.29      0.31        49

    accuracy                           0.69       200
   macro avg       0.56      0.55      0.56       200
weighted avg       0.67      0.69      0.68       200

