In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker

In [2]:
# Campaign whose daily metrics and transactions are simulated in this notebook
CAMPAIGN_ID = 301

# A single seed feeds every random source so that a fresh run is reproducible
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Faker instance used for customer ids; Faker is seeded with the same value
fake = Faker()
Faker.seed(SEED)


In [3]:
# Load the dataset: read the workbook once and pull out the three sheets used
# below, instead of re-opening and re-parsing the same file for every sheet.

path = "campaign_performance.xlsx"

# With a list of sheet names, read_excel returns a dict keyed by sheet name
sheets = pd.read_excel(path, sheet_name=["campaigns", "campaign_metrics", "transactions"])

df_campaigns = sheets["campaigns"]
df_metrics = sheets["campaign_metrics"]
df_transactions = sheets["transactions"]

In [4]:
# Shopping behaviour per generation and gender.
# Every product category maps to [selection weight, spending deviation]:
# the weight drives how often the category is drawn, and the deviation d lets
# the transaction amount vary uniformly between (1 - d) and (1 + d) times the
# base amount. Example: [0.3, 0.1] -> weight 0.3, amounts within +/-10%.
preferences = {
    "Generation Z": {
        "Age Range": (18, 27),
        "Shopping Preference": {
            "Male": {
                "Groceries": [0.3, 0.1],
                "Health and Beauty": [0.15, 0.05],
                "Electronics": [0.55, 0.3],
            },
            "Female": {
                "Groceries": [0.3, 0.1],
                "Health and Beauty": [0.45, 0.15],
                "Electronics": [0.25, 0.2],
            },
        },
    },
    "Millennials": {
        "Age Range": (28, 40),
        "Shopping Preference": {
            "Male": {
                "Groceries": [0.35, 0.1],
                "Health and Beauty": [0.15, 0.1],
                "Electronics": [0.5, 0.25],
            },
            "Female": {
                "Groceries": [0.4, 0.15],
                "Health and Beauty": [0.4, 0.2],
                "Electronics": [0.2, 0.2],
            },
        },
    },
    "Boomers": {
        "Age Range": (41, 65),
        "Shopping Preference": {
            "Male": {
                "Groceries": [0.5, 0.2],
                "Health and Beauty": [0.4, 0.1],
                "Electronics": [0.1, 0.15],
            },
            "Female": {
                "Groceries": [0.5, 0.25],
                "Health and Beauty": [0.4, 0.15],
                "Electronics": [0.1, 0.1],
            },
        },
    },
}

# Age buckets as (first age, last age, relative weight). The weights do not
# have to add up to 100; they are normalized before ages are sampled.
age_distribution = [
    (18, 19, 7.36),
    (20, 24, 8.68),
    (25, 29, 8.92),
    (30, 34, 6.97),
    (35, 39, 6.91),
    (40, 44, 5.15),
    (45, 49, 4.16),
    (50, 54, 2.99),
    (55, 59, 2.42),
    (60, 65, 1.83),
]


In [5]:
class CustomerGenerator:
    """Generate synthetic customers with an id, a gender and an age.

    Ages are drawn from weighted age buckets; gender is picked uniformly from
    "Male" / "Female"; the id comes from a Faker-like object's ``uuid4()``.
    """

    def __init__(self, age_distribution, faker=None):
        """
        Args:
            age_distribution: iterable of ``(start_age, end_age, weight)``
                tuples. Weights are normalized, so they need not sum to 100.
            faker: optional object exposing ``uuid4()``. When omitted, the
                notebook-level ``fake`` instance is used.

        Raises:
            ValueError: if the distribution has no positive total weight.
        """
        self.age_distribution = self._normalize_age_distribution(age_distribution)
        self.fake = faker if faker is not None else fake

    def _normalize_age_distribution(self, age_distribution):
        """Normalize the age distribution probabilities to sum to 100%.

        Raises:
            ValueError: if the distribution is empty or its weights do not
                add up to a positive number (nothing could be sampled).
        """
        buckets = list(age_distribution)
        total_prob = sum(prob for _, _, prob in buckets)
        if total_prob <= 0:
            raise ValueError("age_distribution must contain at least one bucket with a positive weight.")
        return [(start, end, (prob / total_prob) * 100) for start, end, prob in buckets]

    def _generate_age(self):
        """Generate a random age based on the normalized distribution.

        Returns:
            int: an age clamped to the range 18..65.

        Raises:
            ValueError: if there are no age buckets to sample from.
        """
        if not self.age_distribution:
            raise ValueError("No age buckets available to sample an age from.")

        rand_value = random.uniform(0, 100)
        cumulative_probability = 0

        for start_age, end_age, probability in self.age_distribution:
            cumulative_probability += probability
            if rand_value <= cumulative_probability:
                break
        # If float rounding left the cumulative sum marginally below the drawn
        # value, the loop ends without a match; the last bucket is then used
        # instead of silently returning None.

        age = random.randint(start_age, end_age)
        return max(18, min(age, 65))

    def generate_customer(self):
        """Generate a new customer with random attributes.

        Returns:
            dict: with keys ``customer_id``, ``gender`` and ``age``.
        """
        return {
            "customer_id": self.fake.uuid4(),
            "gender": random.choice(["Male", "Female"]),
            "age": self._generate_age()
        }


class TransactionGenerator:
    """Build transaction records whose category and amount follow the
    generation- and gender-specific shopping preferences."""

    def __init__(self, preferences):
        # Mapping: generation -> {"Age Range": (lo, hi), "Shopping Preference": {...}}
        self.preferences = preferences

    def _get_generation(self, age):
        """Return the name of the generation whose age range contains ``age``.

        Raises:
            ValueError: when no configured generation covers the age.
        """
        for label, profile in self.preferences.items():
            bounds = profile["Age Range"]
            if not (bounds[0] <= age <= bounds[1]):
                continue
            return label
        raise ValueError(f"Age {age} does not fit into any defined generation in preferences.")

    def _get_product_category(self, generation, gender):
        """Draw one product category, weighted by the first value of each
        category entry for the given generation and gender."""
        options = self.preferences[generation]["Shopping Preference"][gender]
        categories = list(options)
        weights = [entry[0] for entry in options.values()]
        (picked,) = random.choices(categories, weights=weights, k=1)
        return picked

    def _calculate_adjusted_amount(self, base_amount, generation, gender, product_category):
        """Scale ``base_amount`` by a uniform random factor within
        ``1 +/- deviation`` for the category and round the result."""
        category_entry = self.preferences[generation]["Shopping Preference"][gender][product_category]
        deviation = category_entry[1]
        multiplier = random.uniform(1 - deviation, 1 + deviation)
        return round(base_amount * multiplier)

    def generate_transaction(self, customer, base_amount, campaign_id, date, transaction_id):
        """Create one transaction record for ``customer``.

        Returns:
            dict: campaign/transaction identifiers, the adjusted amount, the
            date, and the customer's id, gender, age and chosen category.
        """
        gender = customer["gender"]
        age = customer["age"]

        generation = self._get_generation(age)
        category = self._get_product_category(generation, gender)
        amount = self._calculate_adjusted_amount(base_amount, generation, gender, category)

        record = {}
        record["campaign_id"] = campaign_id
        record["transaction_id"] = transaction_id
        record["amount"] = amount
        record["transaction_date"] = date
        record["customer_id"] = customer["customer_id"]
        record["gender"] = gender
        record["age"] = age
        record["product_category"] = category
        return record


class CampaignDataset:
    """Simulate daily campaign metrics and synthetic transactions.

    Holds the campaign, metrics and transaction tables and derives from them
    (1) a day-by-day split of a campaign's total impressions and clicks and
    (2) synthetic transactions proportional to the simulated daily clicks.
    """

    # Relative traffic weight per weekday (Monday = 0, Sunday = 6)
    _DOW_WEIGHTS = {
        0: 0.9,   # Monday
        1: 0.95,  # Tuesday
        2: 1.0,   # Wednesday
        3: 1.0,   # Thursday
        4: 1.1,   # Friday
        5: 1.3,   # Saturday
        6: 1.2,   # Sunday
    }

    def __init__(self, df_campaigns: pd.DataFrame, df_metrics: pd.DataFrame, df_transactions: pd.DataFrame):
        self.campaigns = df_campaigns
        self.metrics = df_metrics
        self.transactions = df_transactions
        self.daily_metrics = None  # To store simulated daily metrics after processing

    def _merge_campaigns_and_metrics(self):
        """Merge campaign data with metrics data based on campaign_id.

        Raises:
            ValueError: if no campaign has a matching metrics row.
        """
        merged = pd.merge(self.campaigns, self.metrics, on="campaign_id", how="inner")
        if merged.empty:
            raise ValueError("No matching campaign data found in metrics.")
        return merged

    def _get_daily_weight(self, date):
        """Return the weight multiplier for ``date`` based on its day of week."""
        return self._DOW_WEIGHTS[date.weekday()]

    @staticmethod
    def _rescale_to_total(values, total):
        """Scale ``values`` so they sum to roughly ``total`` (truncated to int).

        When every value is zero there is nothing to scale; the values are
        returned unchanged and the caller's deficit step makes up the total.
        """
        current = sum(values)
        if current == 0:
            return list(values)
        factor = total / current
        return [int(value * factor) for value in values]

    def simulate_daily_metrics(self, campaign_id, variation_factor=0.02):
        """Simulate daily metrics for a campaign.

        The campaign totals are split over the campaign days according to the
        weekday weights, perturbed with Gaussian noise and finally corrected
        so that the daily values add up exactly to the campaign totals.

        Args:
            campaign_id: campaign to simulate.
            variation_factor: standard deviation of the Gaussian noise. It is
                applied relative to the daily impressions (0.02 = 2%) and as
                an absolute offset to the click-through rate.

        Returns:
            pd.DataFrame: columns ``campaign_id``, ``date``,
            ``daily_impressions`` and ``daily_clicks``; also stored in
            ``self.daily_metrics``.

        Raises:
            ValueError: if the campaign is unknown or its end date precedes
                its start date.
        """
        merged_data = self._merge_campaigns_and_metrics()
        matches = merged_data[merged_data["campaign_id"] == campaign_id]
        if matches.empty:
            raise ValueError(f"No campaign data found for campaign_id {campaign_id}")
        campaign = matches.iloc[0]

        start_date, end_date = campaign["start_date"], campaign["end_date"]
        total_days = (end_date - start_date).days + 1
        if total_days < 1:
            raise ValueError(f"Campaign {campaign_id} ends before it starts.")

        total_impressions = campaign["Impressions"]
        total_clicks = campaign["Clicks"]
        # A campaign without impressions has no meaningful CTR; use 0 rather
        # than dividing by zero.
        base_ctr = total_clicks / total_impressions if total_impressions else 0.0

        dates = [start_date + pd.Timedelta(days=i) for i in range(total_days)]
        daily_weights = [self._get_daily_weight(day) for day in dates]
        total_weight = sum(daily_weights)

        # Normalize weights and calculate base daily impressions
        daily_weights = [w / total_weight for w in daily_weights]
        base_daily_impressions = [int(total_impressions * w) for w in daily_weights]

        # Add random variation while preserving the weekday pattern
        daily_impressions = [
            max(0, int(imp + (imp * random.gauss(0, variation_factor))))
            for imp in base_daily_impressions
        ]

        # Clicks follow the impressions with a slightly varying CTR. The noise
        # is additive, so the CTR is floored at 0 to avoid negative clicks.
        daily_clicks = [
            int(impression * max(0.0, base_ctr + random.gauss(0, variation_factor)))
            for impression in daily_impressions
        ]

        # Scale clicks and impressions to sum up to the total campaign metrics
        daily_clicks = self._rescale_to_total(daily_clicks, total_clicks)
        daily_impressions = self._rescale_to_total(daily_impressions, total_impressions)

        # Integer truncation leaves a small deficit; add it to the day with
        # the most impressions, where it distorts the CTR the least.
        peak_day = daily_impressions.index(max(daily_impressions))
        daily_impressions[peak_day] += total_impressions - sum(daily_impressions)
        daily_clicks[peak_day] += total_clicks - sum(daily_clicks)

        daily_data = pd.DataFrame({
            "campaign_id": campaign_id,
            "date": dates,
            "daily_impressions": daily_impressions,
            "daily_clicks": daily_clicks,
        })

        self.daily_metrics = daily_data
        return daily_data

    def generate_synthetic_transactions(self, campaign_id: int, preferences: dict, age_distribution: list,
                                        click_conversion: float, base_amount_range: tuple = (75, 500)):
        """Generate synthetic transactions for a campaign based on daily metrics.

        Args:
            campaign_id: campaign to generate transactions for.
            preferences: shopping preferences passed to TransactionGenerator.
            age_distribution: age buckets passed to CustomerGenerator.
            click_conversion: fraction of each day's clicks that becomes a
                transaction (the product is truncated to an integer).
            base_amount_range: inclusive ``(low, high)`` bounds for the random
                base amount of each transaction.

        Returns:
            pd.DataFrame: the existing transactions of ``campaign_id``
            followed by the newly generated ones.

        Raises:
            ValueError: if daily metrics are missing for the campaign.
        """
        if self.daily_metrics is None or self.daily_metrics.empty:
            raise ValueError("Daily metrics not generated. Please run simulate_daily_metrics first.")

        daily_data = self.daily_metrics[self.daily_metrics["campaign_id"] == campaign_id]
        if daily_data.empty:
            raise ValueError(f"No daily metrics found for campaign_id {campaign_id}")

        customer_gen = CustomerGenerator(age_distribution)
        transaction_gen = TransactionGenerator(preferences)
        min_amount, max_amount = base_amount_range
        new_rows = []
        # Continue numbering after the highest id of ANY campaign so that new
        # ids never collide with existing ones.
        next_transaction_id = int(self.transactions["transaction_id"].max() + 1) if not self.transactions.empty else 1

        for date, clicks in zip(daily_data["date"], daily_data["daily_clicks"]):
            num_transactions = int(clicks * click_conversion)
            for _ in range(num_transactions):
                customer = customer_gen.generate_customer()
                new_transaction = transaction_gen.generate_transaction(
                    customer=customer,
                    base_amount=random.randint(min_amount, max_amount),  # Randomize base amount
                    campaign_id=campaign_id,
                    date=date,
                    transaction_id=next_transaction_id + len(new_rows),
                )
                new_rows.append(new_transaction)

        # Filter by the requested campaign, not by a notebook-level constant
        existing = self.transactions[self.transactions["campaign_id"] == campaign_id]
        if not new_rows:
            return existing.reset_index(drop=True)
        return pd.concat([existing, pd.DataFrame(new_rows)], ignore_index=True)


In [6]:
campaign_dataset = CampaignDataset(
    df_campaigns=df_campaigns,
    df_metrics=df_metrics,
    df_transactions=df_transactions,
)

# Daily metrics have to exist before transactions can be derived from them
campaign_dataset.simulate_daily_metrics(CAMPAIGN_ID)

extrapolated_transaction = campaign_dataset.generate_synthetic_transactions(
    campaign_id=CAMPAIGN_ID,
    preferences=preferences,
    age_distribution=age_distribution,
    click_conversion=0.07,  # fraction of daily clicks turned into transactions
)

In [7]:
daily_metrics = campaign_dataset.daily_metrics

# Click-through rate per simulated day
daily_metrics["CTR"] = daily_metrics["daily_clicks"] / daily_metrics["daily_impressions"]

# Number of transactions per campaign and day
grouped_transactions = (
    extrapolated_transaction
    .groupby(["campaign_id", "transaction_date"])["transaction_id"]
    .count()
    .to_frame("unique_transactions")
)

# Left join keeps every simulated day, even one without transactions
pd.merge(
    daily_metrics,
    grouped_transactions,
    how="left",
    left_on=["campaign_id", "date"],
    right_on=["campaign_id", "transaction_date"],
)

Unnamed: 0,campaign_id,date,daily_impressions,daily_clicks,CTR,unique_transactions
0,301,2024-01-01,2839,223,0.078549,15
1,301,2024-01-02,2995,310,0.103506,21
2,301,2024-01-03,3156,393,0.124525,27
3,301,2024-01-04,3208,319,0.099439,22
4,301,2024-01-05,3472,335,0.096486,24
5,301,2024-01-06,3989,436,0.109301,31
6,301,2024-01-07,3822,267,0.069859,18
7,301,2024-01-08,2832,262,0.092514,18
8,301,2024-01-09,2992,324,0.108289,22
9,301,2024-01-10,3171,367,0.115736,25
