In [74]:
# Prerequisite: Faker must be installed in the kernel's environment, e.g. `%pip install Faker`


In [75]:
# `os` has to be imported in this cell: the only other `import os` visible in this
# notebook is in the last cell, so on a fresh kernel this line raised NameError.
import os

# NOTE(review): machine-specific absolute path; the CSVs written later use relative
# filenames, so they land in this directory. Adjust when running elsewhere.
os.chdir('/Users/bsid24082/Documents/ROI/')

In [76]:
# Importing all the libraries
import numpy as np
import pandas as pd
from faker import Faker
import random

In [77]:
# Module-level Faker instance shared by the generate_* functions in this notebook;
# they use it for names, dates, URLs, captions and user UUIDs.
fake = Faker()

# --- 1. influencers dataset ---
def generate_influencers(num_influencers=50):
    """Build a DataFrame of synthetic influencer profiles.

    Args:
        num_influencers: How many profiles to create. IDs run from 1 to this
            value inclusive.

    Returns:
        DataFrame with one row per influencer and the columns ID, name,
        category, gender, follower_count and platform. The name comes from the
        module-level ``fake`` instance; category, gender and platform are picked
        at random from fixed option lists; follower_count is a random integer
        between 10000 and 5000000 inclusive.
    """
    category_options = ['Fitness', 'Nutrition', 'Lifestyle', 'Beauty', 'Health']
    platform_options = ['Instagram', 'YouTube', 'X', 'Threads']
    gender_options = ['Male', 'Female', 'Non-binary']

    # Dict values are evaluated top to bottom, so the random draws happen in the
    # order: category, gender, follower_count, platform.
    profiles = [
        {
            'ID': influencer_id,
            'name': fake.name(),
            'category': random.choice(category_options),
            'gender': random.choice(gender_options),
            'follower_count': random.randint(10000, 5000000),
            'platform': random.choice(platform_options),
        }
        for influencer_id in range(1, num_influencers + 1)
    ]
    return pd.DataFrame(profiles)



In [83]:
# --- 2. posts dataset ---
def generate_posts(influencers_df, num_posts=200):
    """Simulate social-media posts with engagement scaled to each influencer's audience.

    Args:
        influencers_df: DataFrame with 'ID', 'follower_count' and 'platform'
            columns. If an ID appears more than once, the first row's values
            are used for that ID.
        num_posts: Number of post rows to generate.

    Returns:
        DataFrame with columns influencer_id, platform, date, URL, caption,
        reach, likes, comments. The columns are present even when no rows are
        generated.

    Raises:
        IndexError: if num_posts > 0 and influencers_df has no rows
            (random.choice on an empty list).
    """
    columns = ['influencer_id', 'platform', 'date', 'URL', 'caption',
               'reach', 'likes', 'comments']
    influencer_ids = influencers_df['ID'].tolist()

    # Build the ID -> (follower_count, platform) lookup once instead of
    # re-filtering the whole DataFrame for every post. setdefault keeps the
    # first occurrence of an ID.
    profiles = {}
    for iid, followers, platform in zip(
        influencers_df['ID'],
        influencers_df['follower_count'],
        influencers_df['platform'],
    ):
        profiles.setdefault(iid, (followers, platform))

    posts = []
    for _ in range(num_posts):
        influencer_id = random.choice(influencer_ids)
        follower_count, platform = profiles[influencer_id]
        date = fake.date_between(start_date='-90d', end_date='today')

        # Engagement funnel: each stage is a random fraction of the previous one.
        reach = int(follower_count * random.uniform(0.1, 0.3))  # 10-30% of followers
        likes = int(reach * random.uniform(0.02, 0.1))  # 2-10% of reach
        comments = int(likes * random.uniform(0.05, 0.2))  # 5-20% of likes

        posts.append({
            'influencer_id': influencer_id,
            'platform': platform,
            'date': date,
            'URL': fake.url(),
            'caption': fake.sentence(),
            'reach': reach,
            'likes': likes,
            'comments': comments
        })
    # Explicit columns keep the schema stable when num_posts is 0.
    return pd.DataFrame(posts, columns=columns)

In [84]:
# --- 3. tracking_data dataset ---
def generate_tracking_data(influencers_df, num_records=500, num_organic=100):
    """Simulate order-tracking rows: an organic baseline plus influencer-attributed sales.

    Args:
        influencers_df: DataFrame with 'ID' and 'follower_count' columns. If an
            ID appears more than once, the first row's follower count is used.
        num_records: Number of influencer-attributed rows to generate.
        num_organic: Number of organic baseline rows to generate. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        DataFrame with columns source, campaign, influencer_id, user_id,
        product, date, orders, revenue. Organic rows come first, have NaN in
        'influencer_id', and are dated within the last 60 days; influencer rows
        are dated within the last 90 days. The columns are present even when no
        rows are generated.

    Raises:
        IndexError: if num_records > 0 and influencers_df has no rows
            (random.choice on an empty list).
    """
    columns = ['source', 'campaign', 'influencer_id', 'user_id',
               'product', 'date', 'orders', 'revenue']
    products = ['MuscleBlaze_Whey_Protein', 'HKVitals_Multivitamin', 'Gritzo_Super_Milk', 'MuscleTech_Nitrotech_Whey_Protein', 'HKVitals_ACV', 'HKVitals_Omega3', 'MuscleBlaze_Creatine']
    influencer_ids = influencers_df['ID'].tolist()

    # Build the ID -> follower_count lookup once instead of re-filtering the
    # DataFrame for every record. setdefault keeps the first occurrence of an ID.
    followers_by_id = {}
    for iid, followers in zip(influencers_df['ID'], influencers_df['follower_count']):
        followers_by_id.setdefault(iid, followers)

    tracking_data = []

    # Simulate a baseline of organic sales
    for _ in range(num_organic):
        tracking_data.append({
            'source': 'organic',
            'campaign': 'n/a',
            'influencer_id': np.nan,
            'user_id': fake.uuid4(),
            'product': random.choice(products),
            'date': fake.date_between(start_date='-60d', end_date='today'),
            'orders': random.randint(1, 3),
            'revenue': random.randint(400, 5000)
        })

    # Simulate influencer-driven sales, with revenue scaled by influencer size
    for _ in range(num_records):
        influencer_id = random.choice(influencer_ids)

        # Revenue scales linearly with follower count (factor 1.0 at 100k followers).
        follower_factor = followers_by_id[influencer_id] / 100000
        # NOTE(review): despite the name, this value is not multiplied by
        # 'orders' anywhere below - confirm whether that is intended.
        revenue_per_order = random.randint(400, 5000) * follower_factor

        tracking_data.append({
            'source': 'influencer',
            'campaign': f'Campaign_{random.randint(1, 5)}',
            'influencer_id': influencer_id,
            'user_id': fake.uuid4(),
            'product': random.choice(products),
            'date': fake.date_between(start_date='-90d', end_date='today'),
            'orders': random.randint(1, 5),
            # max() only sets a floor (a random 400-15000 draw); there is no
            # upper cap, so large accounts can exceed 15000.
            'revenue': max(random.randint(400, 15000), revenue_per_order)
        })
    # Explicit columns keep the schema stable when both counts are 0.
    return pd.DataFrame(tracking_data, columns=columns)



In [85]:
# --- 4. payouts dataset ---
def generate_payouts(influencers_df, tracking_data_df):
    """Compute a simulated payout record for every influencer.

    Each influencer is randomly assigned a payout basis:
      * 'post'  - rate is follower_count / 500 (truncated to int) and the total
        is that rate times a random post count between 2 and 10;
      * 'order' - rate is a random integer between 10 and 150 and the total is
        that rate times the influencer's summed orders.

    Args:
        influencers_df: DataFrame with 'ID' and 'follower_count' columns.
        tracking_data_df: DataFrame with 'influencer_id' and 'orders' columns.

    Returns:
        DataFrame with columns influencer_id, basis, rate, orders,
        total_payout, one row per row of influencers_df. 'orders' is 0 for
        influencers with no tracking rows. The columns are present even when
        influencers_df has no rows.
    """
    columns = ['influencer_id', 'basis', 'rate', 'orders', 'total_payout']

    # Total the orders per influencer once and look them up by ID, instead of
    # filtering the grouped frame twice for every influencer. groupby drops NaN
    # keys by default, so rows without an influencer_id are excluded.
    orders_by_influencer = (
        tracking_data_df.groupby('influencer_id')['orders'].sum().to_dict()
    )

    payouts = []
    for _, influencer in influencers_df.iterrows():
        basis = random.choice(['post', 'order'])
        orders = orders_by_influencer.get(influencer['ID'], 0)

        if basis == 'post':
            # A larger influencer costs more per post.
            rate = int(influencer['follower_count'] / 500)
            total_payout = rate * random.randint(2, 10)
        else:  # basis == 'order'
            rate = random.randint(10, 150)
            total_payout = rate * orders

        payouts.append({
            'influencer_id': influencer['ID'],
            'basis': basis,
            'rate': rate,
            'orders': orders,
            'total_payout': total_payout
        })
    # Explicit columns keep the schema stable when there are no influencers.
    return pd.DataFrame(payouts, columns=columns)


In [86]:
# --- Main execution ---
if __name__ == "__main__":
    # Seed both random sources so a Restart & Run All regenerates the same
    # datasets. `random` drives the numeric draws; Faker.seed() seeds the
    # generator shared by Faker instances. Dates are still relative to 'today',
    # so they shift with the run date. Set SEED = None for fresh data each run.
    SEED = 42
    random.seed(SEED)
    Faker.seed(SEED)

    # Generate dataframes
    influencers_df = generate_influencers()
    posts_df = generate_posts(influencers_df)
    tracking_data_df = generate_tracking_data(influencers_df)
    payouts_df = generate_payouts(influencers_df, tracking_data_df)

    # Save to CSV files (relative names, so they are written to the current
    # working directory)
    csv_outputs = {
        'influencers.csv': influencers_df,
        'posts.csv': posts_df,
        'tracking_data.csv': tracking_data_df,
        'payouts.csv': payouts_df,
    }
    for csv_name, frame in csv_outputs.items():
        frame.to_csv(csv_name, index=False)

    print("Simulated data has been generated and saved to CSV files.")


Simulated data has been generated and saved to CSV files.


In [87]:
# Sanity check: print the current working directory. The CSV files are saved with
# relative names, so this is the directory they were written to.
import os
print(os.getcwd())

/Users/bsid24082/Documents/ROI
