#### Generate Creator (Influencer) Profiles

In [3]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

# Initialize Faker and seed every random source this notebook draws from so
# that Restart Kernel -> Run All regenerates the same dataset.
# Faker keeps its own RNG, separate from `random` and `np.random`; without
# Faker.seed() the names, dates and UUID-based ids differ on every run.
SEED = 42
fake = Faker()
Faker.seed(SEED)  # class-level call: seeds the RNG shared by Faker instances
random.seed(SEED)
np.random.seed(SEED)  # pandas' DataFrame.sample() falls back to this global state

# Define our JD-specific constants
MARKETS = ['UK', 'France', 'Germany', 'UAE', 'USA']
CATEGORIES = ['Skincare', 'Makeup', 'Haircare', 'Personal Care']
REWARD_TYPES = ['Sample Product', 'Full Size Set', 'Exclusive Kit']

print("Libraries imported and business constants defined.")

Libraries imported and business constants defined.


In [4]:
# Build the creator dimension table: one synthetic creator per row.
num_creators = 500

creators_data = {
    # Zero-padded sequential key, e.g. CR-0001 ... CR-0500
    'creator_id': [f"CR-{i:04d}" for i in range(1, num_creators + 1)],
    'name': [fake.name() for _ in range(num_creators)],
    'market': [random.choice(MARKETS) for _ in range(num_creators)],
    'category': [random.choice(CATEGORIES) for _ in range(num_creators)],
    # Uniform integer in [1200, 9500] (both ends inclusive)
    'follower_count': [random.randint(1200, 9500) for _ in range(num_creators)],
    # Relative window: the resulting dates shift with the day the notebook is run
    'join_date': [fake.date_between(start_date='-2y', end_date='-1y') for _ in range(num_creators)]
}

df_creators = pd.DataFrame(creators_data)

# creator_id is the join key for the posts table, so it must be unique.
assert df_creators['creator_id'].is_unique, "duplicate creator_id generated"

# Create the output folder if it is missing so the cell also works on a
# fresh checkout instead of failing inside to_csv().
creators_path = Path("data/raw/dim_creators.csv")
creators_path.parent.mkdir(parents=True, exist_ok=True)
df_creators.to_csv(creators_path, index=False)
print(f"Generated {num_creators} creator profiles. Saved to data/raw/dim_creators.csv")
df_creators.head()

Generated 500 creator profiles. Saved to data/raw/dim_creators.csv


Unnamed: 0,creator_id,name,market,category,follower_count,join_date
0,CR-0001,Robert Anthony,UK,Makeup,7448,2024-06-11
1,CR-0002,Dawn Wood,UK,Skincare,7081,2024-10-05
2,CR-0003,Michelle Berry,Germany,Haircare,6047,2024-06-03
3,CR-0004,Mark Cole,France,Haircare,6037,2024-09-03
4,CR-0005,Steven Munoz,France,Skincare,1559,2024-10-02


#### Generate Campaign Performance Data

In [5]:
# Build the campaign-post fact table: each row is one post by a randomly
# chosen creator, with engagement numbers derived from that creator's
# follower count, market and category.
num_posts = 2500
posts_data = []

# post_id is the table's key. It is only 6 hex characters (16**6 ~ 16.8M
# values), so across 2,500 draws there is roughly a 17% chance of at least
# one duplicate. Track what has been issued and redraw on collision.
seen_post_ids = set()

for _ in range(num_posts):
    # Pick a random creator to "post" (sampling uses the global NumPy RNG state)
    creator = df_creators.sample(1).iloc[0]
    cid = creator['creator_id']
    mkt = creator['market']
    cat = creator['category']
    f_count = creator['follower_count']

    # INDUSTRY LOGIC: Define base engagement rates
    # Beauty campaigns usually see 3% - 7% ER for Nano-influencers
    base_er = random.uniform(0.03, 0.07)

    # MARKET BIAS: UAE and France markets are currently high-growth
    if mkt in ['UAE', 'France']:
        base_er += 0.02

    # Total engagement = followers x rate, with +/-30% per-post noise
    engagement_vol = int(f_count * base_er * random.uniform(0.7, 1.3))

    # CATEGORY BIAS: Skincare skews toward saves; every other category
    # (Makeup, Haircare, Personal Care) shares the like-heavy split.
    if cat == 'Skincare':
        likes = int(engagement_vol * 0.5)
        saves = int(engagement_vol * 0.4) # High saves for skincare routines
        comments = int(engagement_vol * 0.1)
    else:
        likes = int(engagement_vol * 0.8)
        saves = int(engagement_vol * 0.05)
        comments = int(engagement_vol * 0.15)

    # Same "P-XXXXXX" format as before, but guaranteed unique within this run.
    post_id = f"P-{fake.uuid4()[:6].upper()}"
    while post_id in seen_post_ids:
        post_id = f"P-{fake.uuid4()[:6].upper()}"
    seen_post_ids.add(post_id)

    posts_data.append({
        'post_id': post_id,
        'creator_id': cid,
        # Relative window: the resulting dates shift with the day the notebook is run
        'date': fake.date_between(start_date='-180d', end_date='today'),
        'reward_type': random.choice(REWARD_TYPES),
        'likes': likes,
        'comments': comments,
        'saves': saves,
        # Impressions are 1.1x - 2.5x the follower count, drawn independently of engagement
        'impressions': int(f_count * random.uniform(1.1, 2.5))
    })

df_posts = pd.DataFrame(posts_data)

# Guard the key invariant before the table is written out.
assert df_posts['post_id'].is_unique, "duplicate post_id generated"

# Create the output folder if it is missing so the cell also works on a
# fresh checkout instead of failing inside to_csv().
posts_path = Path("data/raw/fact_campaign_posts.csv")
posts_path.parent.mkdir(parents=True, exist_ok=True)
df_posts.to_csv(posts_path, index=False)
print(f"Generated {num_posts} campaign posts. Saved to data/raw/fact_campaign_posts.csv")

Generated 2500 campaign posts. Saved to data/raw/fact_campaign_posts.csv
