In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Build a synthetic penguin-weighing dataset and write it to CSV.
# Every random draw below is driven by SEED, so Restart & Run All regenerates
# exactly the same file (and therefore the same downstream anomaly results).
SEED = 42
random.seed(SEED)                  # drives the stdlib draws: timestamps and RFID tags
rng = np.random.default_rng(SEED)  # drives the NumPy draws: weights and categorical columns

# Number of entries
num_entries = 100_000

# File the dataset is written to (read back by the analysis cells)
output_csv = "dummy_penguin_weights_100k.csv"

# Possible colony locations
locations = ["Boulders Beach", "Stony Point", "Dassen Island", "Robben Island", "Bird Island"]

# Generate random timestamps within a 2-year range (minute resolution)
start_time = datetime(2022, 1, 1)
minutes_in_range = 2 * 365 * 24 * 60
timestamps = [start_time + timedelta(minutes=random.randint(0, minutes_in_range)) for _ in range(num_entries)]

# Generate RFID tags (10-digit numbers, kept as strings)
rfids = [f'{random.randint(1000000000, 9999999999)}' for _ in range(num_entries)]

# Weights (in grams), normally distributed around 3000g ± 300g,
# clipped so no value falls outside [1800, 4500]
weights = rng.normal(loc=3000, scale=300, size=num_entries).clip(1800, 4500)

# Sex (some may be unknown)
sexes = rng.choice(['M', 'F', 'Unknown'], size=num_entries, p=[0.45, 0.45, 0.1])

# Age category
age_categories = rng.choice(['Chick', 'Juvenile', 'Adult'], size=num_entries, p=[0.2, 0.3, 0.5])

# Measurement validity (simulate some invalid entries: ~5% flagged False)
valid_flags = rng.choice([True, False], size=num_entries, p=[0.95, 0.05])

# Locations (uniform over the colony list)
colony_locations = rng.choice(locations, size=num_entries)

# Construct DataFrame
df = pd.DataFrame({
    'timestamp': timestamps,
    'location': colony_locations,
    'rfid': rfids,
    'weight_g': weights.astype(int),  # truncates the fractional grams
    'sex': sexes,
    'age_category': age_categories,
    'valid_measurement': valid_flags
})

# Save to CSV
df.to_csv(output_csv, index=False)
print(f"100k-entry dummy dataset saved as '{output_csv}'")


100k-entry dummy dataset saved as 'dummy_penguin_weights_100k.csv'


In [3]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Univariate anomaly detection: flag unusual body weights with an Isolation Forest.

# 1. Read the generated dataset and keep only rows marked as valid measurements
df = pd.read_csv("dummy_penguin_weights_100k.csv")
is_valid = df['valid_measurement'].eq(True)
df = df.loc[is_valid]

# 2. Feature matrix: a single column for now (more features can be appended to this list)
X = df.loc[:, ['weight_g']]

# 3. Standardise the feature (optional for tree-based models, kept for consistency)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 4. Fit the Isolation Forest; contamination=0.01 asks it to flag roughly 1% of rows
model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42).fit(X_scaled)

# 5. Label every row: predict() returns -1 for anomalies and 1 for normal points
df = df.assign(anomaly=model.predict(X_scaled))
anomalies = df.loc[df['anomaly'].eq(-1)]

# 6. Preview the first few flagged measurements
preview_cols = ['timestamp', 'location', 'rfid', 'weight_g']
print(anomalies.head()[preview_cols])



               timestamp       location        rfid  weight_g
55   2022-08-01 16:24:00    Bird Island  4211421873      2141
252  2022-12-06 14:07:00    Stony Point  5641961670      3805
301  2022-12-27 13:55:00    Stony Point  3434443855      3807
583  2023-09-01 04:45:00    Bird Island  5440534126      1982
592  2023-03-03 21:49:00  Robben Island  4662180488      3828


In [5]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest

# Multivariate anomaly detection. This cell continues from the `df` built by the
# previous cell and expects these columns to be present:
# timestamp, location, rfid, weight_g, sex, age_category.

# 1. Parse the timestamp once and derive the time-based features from it
parsed_ts = pd.to_datetime(df['timestamp'])
df = df.assign(
    timestamp=parsed_ts,
    hour_of_day=parsed_ts.dt.hour,
    day_of_week=parsed_ts.dt.dayofweek,
)

# 2. Columns grouped by how they are preprocessed
categorical_cols = ['location', 'sex', 'age_category']
numerical_cols = ['weight_g', 'hour_of_day', 'day_of_week']

# 3. Preprocessing: one-hot encode the categorical columns, standardise the numerical ones
encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
scale_step = ('num', StandardScaler(), numerical_cols)
preprocessor = ColumnTransformer(transformers=[encode_step, scale_step])

# 4. Build the feature matrix; the transformer may hand back a sparse matrix,
#    in which case it is converted to a dense array
X = preprocessor.fit_transform(df)
X = X.toarray() if hasattr(X, "toarray") else X

# 5. Fit the Isolation Forest on the combined features
model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42).fit(X)

# 6. Label every row (-1 = anomaly, 1 = normal); this replaces any earlier 'anomaly' column
df = df.assign(anomaly=model.predict(X))

# 7. Keep only the flagged rows
anomalies = df.loc[df['anomaly'].eq(-1)]

# 8. Preview the first few flagged measurements
preview_cols = ['timestamp', 'location', 'rfid', 'weight_g']
print(anomalies.head()[preview_cols])

              timestamp        location        rfid  weight_g
128 2022-03-06 08:31:00     Bird Island  9090307981      2593
213 2022-10-31 23:56:00   Robben Island  3134069470      3087
338 2022-03-15 01:52:00     Stony Point  3559797170      2508
465 2023-07-30 21:08:00     Stony Point  6126293435      3301
723 2022-05-30 22:06:00  Boulders Beach  1312332384      2785
