In [None]:
"""
marketing_mix_model_project.py

This script implements a complete marketing mix modeling (MMM) workflow on a
synthetic dataset. It covers data loading, exploratory analysis, adstock and
saturation transformations, linear regression modeling, ROI calculation, and a
simple scenario simulation.
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm


In [None]:
def load_data(filepath: str) -> pd.DataFrame:
    """Load the marketing dataset from a CSV file.

    The ``date`` column is parsed into datetimes while reading.

    Args:
        filepath: Path to the CSV file.

    Returns:
        A pandas DataFrame containing the data.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist or points to a
            directory rather than a file.
    """
    # A directory passes a plain existence check but cannot be read as CSV;
    # reject it here so the caller gets the same clear error as for a
    # missing path instead of a low-level OS error from the reader.
    if not os.path.exists(filepath) or os.path.isdir(filepath):
        raise FileNotFoundError(f"Dataset not found at {filepath}")
    return pd.read_csv(filepath, parse_dates=["date"])


def summary_statistics(df: pd.DataFrame) -> None:
    """Print descriptive statistics followed by the correlation matrix.

    Each table is printed under its own banner line.  The correlation matrix
    is computed over numeric columns only.

    Args:
        df: DataFrame containing the marketing data.
    """
    # Each table is built lazily, right after its banner has been printed.
    sections = (
        ("\n===== Summary Statistics =====\n", lambda: df.describe()),
        ("\n===== Correlation Matrix =====\n", lambda: df.corr(numeric_only=True)),
    )
    for banner, build_table in sections:
        print(banner)
        print(build_table())


In [None]:
def plot_time_series(df: pd.DataFrame, output_dir: str) -> None:
    """Save one line chart per column, plotted against the ``date`` column.

    Every column except ``date``, ``promotion`` and ``holiday`` is drawn and
    written to ``<output_dir>/<column>_timeseries.png``.  The output directory
    is created if it does not exist.

    Args:
        df: DataFrame with marketing data.
        output_dir: Directory where the plots will be saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    skipped = {"date", "promotion", "holiday"}
    for name in df.columns:
        if name in skipped:
            continue
        destination = os.path.join(output_dir, f"{name}_timeseries.png")
        plt.figure(figsize=(10, 3))
        plt.plot(df["date"], df[name])
        plt.title(f"{name} over time")
        plt.xlabel("Date")
        plt.ylabel(name)
        plt.tight_layout()
        plt.savefig(destination)
        # Close the figure so open figures do not accumulate across columns.
        plt.close()


def adstock(series: pd.Series, decay: float = 0.5) -> np.ndarray:
    """Apply a geometric adstock (carryover) transformation to a series.

    Each output value is the current input plus ``decay`` times the previous
    output, with the running level starting at zero:

        out[t] = series[t] + decay * out[t - 1]

    Args:
        series: Input spend series.
        decay: Decay factor between 0 and 1 (e.g., 0.5 means half of the
               previous effect carries into the current period).

    Returns:
        A numpy array containing the adstocked series, one value per input.
    """
    def _running_levels():
        level = 0.0
        for spend in series:
            level = spend + level * decay
            yield level

    return np.array(list(_running_levels()))


def transform_media_variables(df: pd.DataFrame, media_cols: list, decay: float = 0.5) -> pd.DataFrame:
    """Add adstocked and saturated versions of each media column.

    For every column ``c`` in ``media_cols`` two columns are appended to a
    copy of ``df``: ``c_adstock`` (the adstocked spend) and ``c_sat``
    (``log1p`` of the adstocked spend).  The input frame is not modified.

    Args:
        df: The original DataFrame.
        media_cols: List of columns representing media spend.
        decay: Adstock decay factor.

    Returns:
        DataFrame with additional columns for adstock and saturated variables.
    """
    enriched = df.copy()
    for channel in media_cols:
        carried_name, saturated_name = f"{channel}_adstock", f"{channel}_sat"
        enriched[carried_name] = adstock(enriched[channel], decay=decay)
        # log1p compresses large values, modelling diminishing returns.
        enriched[saturated_name] = np.log1p(enriched[carried_name])
    return enriched

In [None]:
def build_linear_model(df: pd.DataFrame, feature_cols: list, target_col: str) -> sm.regression.linear_model.RegressionResultsWrapper:
    """Fit an ordinary least squares regression with an intercept.

    Args:
        df: DataFrame containing the data.
        feature_cols: List of feature column names to include in the model.
        target_col: Name of the response variable.

    Returns:
        The fitted statsmodels OLS results object.
    """
    # add_constant supplies the intercept column for the design matrix.
    design_matrix = sm.add_constant(df[feature_cols])
    response = df[target_col]
    return sm.OLS(response, design_matrix).fit()


def calculate_roi(model: sm.regression.linear_model.RegressionResultsWrapper, df: pd.DataFrame, media_cols: list) -> dict:
    """Estimate ROI for each media channel.

    For a channel ``c`` the ROI is the model coefficient on ``c_sat`` times
    the sum of the ``c_sat`` column, divided by the sum of the raw ``c``
    spend column.  A channel whose coefficient is absent from the model is
    treated as having a coefficient of 0.0; a channel whose total spend is
    not positive gets ``nan``.

    Args:
        model: Fitted regression model.
        df: DataFrame containing both the raw and ``_sat`` columns.
        media_cols: List of media spend columns.

    Returns:
        Dictionary mapping channel names to ROI values.
    """
    def _channel_roi(channel):
        saturated_name = f"{channel}_sat"
        beta = model.params.get(saturated_name, 0.0)
        saturated_total = df[saturated_name].sum()
        spend_total = df[channel].sum()
        if not spend_total > 0:
            # No spend to divide by, so the ratio is undefined.
            return np.nan
        return beta * saturated_total / spend_total

    return {channel: _channel_roi(channel) for channel in media_cols}

In [None]:
def scenario_simulation(model: sm.regression.linear_model.RegressionResultsWrapper, base_df: pd.DataFrame, media_cols: list, scenario_spend: dict, decay: float = 0.5) -> float:
    """Predict sales for one period under a proposed spend scenario.

    The function starts from a copy of the last row of ``base_df``.  For each
    channel in ``scenario_spend`` it appends the proposed spend to that
    channel's history, recomputes the adstock for the appended period, and
    applies the ``log1p`` saturation.  Every other feature (non-media
    variables and channels absent from ``scenario_spend``) keeps the value
    from the last observed row.  The resulting single feature row is passed
    to the model.

    Args:
        model: Fitted regression model.
        base_df: DataFrame containing the original data and every feature
                 column the model was fitted on.
        media_cols: List of media spend columns.  Not used by the
                    computation; kept so existing callers keep working.
        scenario_spend: Dictionary mapping media columns to new spend levels.
        decay: Adstock decay factor.  It should match the decay used when the
               model's features were built; defaults to 0.5.

    Returns:
        Predicted total sales (not the increment over a baseline) for the
        scenario row.
    """
    # Copy the last observed row and overwrite the media-derived features.
    last_row = base_df.iloc[-1].copy()
    for col, new_value in scenario_spend.items():
        last_row[col] = new_value
        # Append the proposed spend to the channel's history so the carryover
        # from earlier periods feeds into the new period's adstock.
        full_series = pd.concat([base_df[col], pd.Series([new_value])], ignore_index=True)
        adstock_value = adstock(full_series, decay=decay)[-1]
        last_row[f"{col}_adstock"] = adstock_value
        last_row[f"{col}_sat"] = np.log1p(adstock_value)

    param_names = list(model.params.index)
    feature_cols = [c for c in param_names if c != 'const']
    # A row pulled out of a mixed-dtype frame is an object Series; cast to
    # float so the model receives a numeric design row.
    X_new = last_row[feature_cols].to_frame().T.astype(float)
    if 'const' in param_names:
        # A single row looks constant in every column, so force the intercept
        # column to be added rather than relying on detection.
        X_new = sm.add_constant(X_new, has_constant='add')
    # Align columns with the coefficient order instead of relying on position.
    X_new = X_new[param_names]
    predicted_sales = float(model.predict(X_new).iloc[0])
    return predicted_sales



In [None]:
def main(
    dataset_path: str = '/content/MMM/synthetic_marketing_data.csv',
    output_dir: str = '/content/plots',
) -> None:
    """Run the full MMM workflow on the synthetic dataset.

    Steps: load the data, print summary statistics, save time-series plots,
    build adstock/saturation features, fit the OLS model, print per-channel
    ROI, and print a one-period scenario prediction.

    Args:
        dataset_path: Path to the CSV dataset.
        output_dir: Directory where the time-series plots are written.
    """
    df = load_data(dataset_path)

    # Basic exploration
    summary_statistics(df)
    plot_time_series(df, output_dir=output_dir)

    # Identify media spend columns
    media_cols = ['tv_spend', 'radio_spend', 'social_spend', 'search_spend']

    # Transform media variables
    df_transformed = transform_media_variables(df, media_cols, decay=0.5)

    # Define features for modeling
    feature_cols = [f"{col}_sat" for col in media_cols] + ['price', 'promotion', 'holiday']

    # Fit linear model
    model = build_linear_model(df_transformed, feature_cols, target_col='sales')
    print("\n===== Model Summary =====\n")
    print(model.summary())

    # Calculate ROI
    roi = calculate_roi(model, df_transformed, media_cols)
    print("\n===== Estimated ROI by channel (synthetic example) =====\n")
    for channel, value in roi.items():
        print(f"{channel}: {value:.2f} incremental sales per $1 spent")

    # Scenario simulation example, expressed relative to last week's spend
    scenario = {
        'tv_spend': df['tv_spend'].iloc[-1] * 1.2,          # increase TV spend by 20%
        'radio_spend': df['radio_spend'].iloc[-1],          # keep radio the same
        'social_spend': df['social_spend'].iloc[-1] * 0.8,  # reduce social by 20%
        'search_spend': df['search_spend'].iloc[-1],        # keep search the same
    }
    predicted_sales = scenario_simulation(model, df_transformed, media_cols, scenario)
    # NOTE: the baseline is the observed sales figure while the scenario value
    # is a model prediction, so the reported change also contains the model's
    # residual for that week.
    baseline_sales = df['sales'].iloc[-1]
    print("\n===== Scenario Simulation =====\n")
    print("Baseline last-week sales:", baseline_sales)
    print("Predicted sales under scenario:", round(predicted_sales, 2))
    print("Change vs. baseline:", round(predicted_sales - baseline_sales, 2))

In [None]:
# Run the workflow only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()


===== Summary Statistics =====

                      date       tv_spend   radio_spend   social_spend  \
count                  104     104.000000    104.000000     104.000000   
mean   2023-12-28 12:00:00   99576.388601  50864.935389   67426.208526   
min    2023-01-02 00:00:00   51612.920670  20161.283874   20554.540841   
25%    2023-07-01 06:00:00   81602.941711  37460.072827   42260.049896   
50%    2023-12-28 12:00:00  100612.950668  53168.822603   63659.051984   
75%    2024-06-25 18:00:00  118654.775361  65126.869358   97991.863958   
max    2024-12-23 00:00:00  149535.848203  79765.179642  119303.326109   
std                    NaN   24807.997149  17233.083453   31102.775929   

       search_spend       price   promotion     holiday          sales  
count    104.000000  104.000000  104.000000  104.000000     104.000000  
mean   29949.564910   11.927601    0.076923    0.038462  254187.003233  
min    10141.288439   10.778462    0.000000    0.000000  170601.051283  
25%    2