In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Synthetic penguin weigh-in generator: builds `num_entries` independent dummy
# readings, leaves them in the DataFrame `df`, and writes them to a CSV file.

# Number of entries
num_entries = 100_000

# Optional RNG seed. Set to an int for a reproducible dataset; None (default)
# produces a different dataset on every run. Both generators are seeded
# because this script draws from `random` as well as `np.random`.
seed = None
if seed is not None:
    random.seed(seed)
    np.random.seed(seed)

# Possible colony locations
locations = ["Boulders Beach", "Stony Point", "Dassen Island", "Robben Island", "Bird Island"]

# Generate random timestamps (minute resolution) within a 2-year range.
# randrange excludes its upper bound, so every timestamp lies in the half-open
# window [2022-01-01 00:00, 2024-01-01 00:00) and the end instant is never hit.
start_time = datetime(2022, 1, 1)
minutes_in_range = 2 * 365 * 24 * 60
timestamps = [
    start_time + timedelta(minutes=random.randrange(minutes_in_range))
    for _ in range(num_entries)
]

# Generate RFID tags (10-digit numbers, stored as strings).
# NOTE: each row draws an independent tag from ~9e9 possibilities, so almost
# every tag is unique and repeat readings of the same bird are essentially
# absent from this dataset.
rfids = [str(random.randint(1_000_000_000, 9_999_999_999)) for _ in range(num_entries)]

# Weights (in grams), normally distributed around 3000g ± 300g, then clamped
# to the [1800, 4500] range. Kept as floats here; rounded when stored below.
weights = np.random.normal(loc=3000, scale=300, size=num_entries).clip(1800, 4500)

# Sex (some may be unknown)
sexes = np.random.choice(['M', 'F', 'Unknown'], size=num_entries, p=[0.45, 0.45, 0.1])

# Age category
age_categories = np.random.choice(['Chick', 'Juvenile', 'Adult'], size=num_entries, p=[0.2, 0.3, 0.5])

# Measurement validity (simulate some invalid entries, ~5%)
valid_flags = np.random.choice([True, False], size=num_entries, p=[0.95, 0.05])

# Locations (uniformly chosen among the colonies)
colony_locations = np.random.choice(locations, size=num_entries)

# Construct DataFrame
df = pd.DataFrame({
    'timestamp': timestamps,
    'location': colony_locations,
    'rfid': rfids,
    # Round to the nearest gram before casting: a bare astype(int) truncates
    # toward zero and would bias the stored weights about half a gram low.
    'weight_g': np.rint(weights).astype(int),
    'sex': sexes,
    'age_category': age_categories,
    'valid_measurement': valid_flags,
})

# Save to CSV. The size label is derived from num_entries so the file name and
# the message stay accurate if the entry count is changed (100_000 -> "100k").
if num_entries >= 1000 and num_entries % 1000 == 0:
    size_label = f"{num_entries // 1000}k"
else:
    size_label = str(num_entries)
output_path = f"dummy_penguin_weights_{size_label}.csv"
df.to_csv(output_path, index=False)
print(f"{size_label}-entry dummy dataset saved as '{output_path}'")
