In [21]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math

In [26]:
# Seed NumPy's global generator so repeated runs draw the same random numbers
np.random.seed(42)

# Sampling window: one row every 6 hours, first and last timestamps both included
start_date = datetime(2010, 1, 1)
end_date = datetime(2025, 5, 31, 18, 0)

# Number of 6-hourly rows in the window (22,520 for these dates).
# NOTE: despite its name, `hours` is a row count, not a duration; the name is
# kept because the cells below read it.
hours = (end_date - start_date) // timedelta(hours=6) + 1

# Number of grid collapse events to place in each calendar year
collapse_counts = {
    2010: 42, 2011: 19, 2012: 24, 2013: 24,
    2014: 13, 2015: 10, 2016: 28, 2017: 24,
    2018: 13, 2019: 10, 2020: 4, 2021: 4,
    2022: 12, 2023: 12, 2024: 4, 2025: 3,
}

# Target number of collapse rows across the whole dataset (246)
total_collapses = sum(collapse_counts.values())

In [27]:
# Display the sum of the per-year collapse counts as a quick sanity check
total_collapses

246

In [28]:
# Build the 6-hourly time axis. `hours` is the number of rows (defined in the
# parameters cell), so row i sits 6*i hours after start_date.
timestamps = [start_date + timedelta(hours=6 * i) for i in range(hours)]
df = pd.DataFrame({'Timestamp': timestamps})

# Independently sampled parameters (one draw per row)
df['Voltage (V)'] = np.random.normal(330000, 5000, hours)  # 330kV ±5kV
df['Power Factor'] = np.random.uniform(0.8, 0.95, hours)  # 0.8–0.95
df['Grid Supply (kW)'] = np.random.normal(4000000, 400000, hours)  # ~4,000 MW
df['Grid Frequency (Hz)'] = np.random.normal(50, 0.2, hours)  # 50 Hz ±0.2
df['Generation Capacity (MW)'] = np.random.normal(4000, 500, hours)  # ~4,000 MW

# Power Consumption (kW): supply scaled by a noisy factor centred on 1.05,
# i.e. demand averages about 5% above supply
df['Power Consumption (kW)'] = df['Grid Supply (kW)'] * np.random.normal(1.05, 0.1, hours)

# Current (A) from the three-phase relation I = P / (sqrt(3) * V * PF), with P in watts
df['Current (A)'] = (df['Power Consumption (kW)'] * 1000) / (df['Voltage (V)'] * df['Power Factor'] * np.sqrt(3))

# Reactive Power (kVAR): Q = P * tan(acos(PF))
df['Reactive Power (kVAR)'] = df['Power Consumption (kW)'] * np.tan(np.arccos(df['Power Factor']))

# Weather: wet season (Apr–Oct) ~28°C / ~70% humidity, dry season (Nov–Mar) ~32°C / ~35%.
# Passing a per-row mean array to np.random.normal still makes one draw per row,
# in row order, but avoids a Python-level loop over every timestamp.
wet_season = df['Timestamp'].dt.month.between(4, 10).to_numpy()
df['Temperature (°C)'] = np.random.normal(np.where(wet_season, 28, 32), 3)
# A normal distribution is unbounded, so a few draws fall outside the physically
# valid range for relative humidity; clamp them to [0, 100] %.
df['Humidity (%)'] = np.clip(np.random.normal(np.where(wet_season, 70, 35), 10), 0, 100)

# Faults and events: independent Bernoulli flags (2% and 5% of rows respectively)
df['Transformer Fault'] = np.random.choice([0, 1], hours, p=[0.98, 0.02])
df['Line Trip Events'] = np.random.choice([0, 1], hours, p=[0.95, 0.05])
# Overload when demand exceeds supply by more than 10%
df['Overload Condition'] = (df['Power Consumption (kW)'] > df['Grid Supply (kW)'] * 1.1).astype(int)

In [30]:
# Documented collapse dates. Each one is placed in a randomly drawn 6-hourly
# slot (00:00, 06:00, 12:00 or 18:00) of its day.
specific_collapses = [
    datetime(2024, 12, 11, 6 * np.random.randint(0, 4)),  # Omotosho-Ikeja West line trip, vandalism
    datetime(2025, 1, 11, 6 * np.random.randint(0, 4)),   # Line tripping
    datetime(2025, 2, 12, 6 * np.random.randint(0, 4)),   # Line tripping
    datetime(2025, 3, 7, 6 * np.random.randint(0, 4))     # Assumed line tripping
]

# Snap every documented collapse to the row whose timestamp is nearest to it.
pinned_indices = {
    df.index[(df['Timestamp'] - collapse_date).abs().argmin()]
    for collapse_date in specific_collapses
}
collapse_indices = set(pinned_indices)

# Distribute the remaining collapses year by year, honouring collapse_counts.
row_years = df['Timestamp'].dt.year
for year, count in collapse_counts.items():
    year_indices = set(df.index[(row_years == year).to_numpy()])
    # Collapses already placed in this year (the pinned ones) count towards its quota.
    year_remaining = count - len(year_indices & collapse_indices)
    if year_remaining <= 0:
        continue
    # Sort the candidate pool so the seeded draw does not depend on set
    # iteration order, which is an implementation detail of the interpreter.
    available_indices = sorted(year_indices - collapse_indices)
    # Best effort: a year without enough free rows is skipped here and the
    # shortfall is made up from the whole dataset below.
    if len(available_indices) >= year_remaining:
        selected = np.random.choice(available_indices, size=year_remaining, replace=False)
        collapse_indices.update(selected.tolist())

# Ensure exactly total_collapses rows are selected.
shortfall = total_collapses - len(collapse_indices)
if shortfall > 0:
    # Too few: top up from any row not yet selected (set difference, not a list scan).
    available_indices = sorted(set(df.index) - collapse_indices)
    extra_indices = np.random.choice(available_indices, size=shortfall, replace=False)
    collapse_indices.update(extra_indices.tolist())
elif shortfall < 0:
    # Too many: discard randomly placed collapses first so that the documented
    # (pinned) events are never the ones thrown away.
    removable = sorted(collapse_indices - pinned_indices)
    n_drop = min(-shortfall, len(removable))
    if n_drop:
        dropped = np.random.choice(removable, size=n_drop, replace=False)
        collapse_indices.difference_update(dropped.tolist())

# Hand later cells a sorted list so they visit the rows in a stable order.
# The slice only matters if the pinned events alone exceed the target.
collapse_indices = sorted(collapse_indices)[:total_collapses]

In [31]:
# Target label: 1 on the selected collapse rows, 0 everywhere else
df['Grid Collapse Events'] = 0
df.loc[collapse_indices, 'Grid Collapse Events'] = 1

# Perturb the measurements on each collapse row so the event is visible in the data
for row in collapse_indices:
    # Five random draws per row, always in this order:
    # voltage sag, frequency shift, demand surge, transformer coin, line-trip coin
    voltage_sag = np.random.uniform(0.85, 0.95)
    frequency_shift = np.random.uniform(-0.5, 0.5)
    demand_surge = np.random.uniform(1.1, 1.3)
    transformer_roll = np.random.rand()
    line_trip_roll = np.random.rand()

    df.loc[row, 'Voltage (V)'] = df.loc[row, 'Voltage (V)'] * voltage_sag                  # 5–15% drop
    df.loc[row, 'Grid Frequency (Hz)'] = df.loc[row, 'Grid Frequency (Hz)'] + frequency_shift  # up to ±0.5 Hz
    df.loc[row, 'Power Consumption (kW)'] = df.loc[row, 'Power Consumption (kW)'] * demand_surge  # 10–30% more demand

    # Flags are only ever switched on here; an existing 1 is left untouched
    if df.loc[row, 'Power Consumption (kW)'] > df.loc[row, 'Grid Supply (kW)'] * 1.1:
        df.loc[row, 'Overload Condition'] = 1
    if transformer_roll > 0.5:
        df.loc[row, 'Transformer Fault'] = 1
    if line_trip_roll > 0.4:
        df.loc[row, 'Line Trip Events'] = 1

    # Current and reactive power depend on the values just changed, so recompute them
    demand_kw = df.loc[row, 'Power Consumption (kW)']
    power_factor = df.loc[row, 'Power Factor']
    df.loc[row, 'Current (A)'] = (demand_kw * 1000) / (df.loc[row, 'Voltage (V)'] * power_factor * np.sqrt(3))
    df.loc[row, 'Reactive Power (kVAR)'] = demand_kw * np.tan(np.arccos(power_factor))

In [32]:
# Number of decimal places to keep for each numeric column
decimals_by_column = {
    'Voltage (V)': 2,
    'Current (A)': 2,
    'Power Consumption (kW)': 2,
    'Reactive Power (kVAR)': 2,
    'Power Factor': 3,
    'Grid Supply (kW)': 2,
    'Grid Frequency (Hz)': 2,
    'Generation Capacity (MW)': 2,
    'Temperature (°C)': 1,
    'Humidity (%)': 1,
}

# Round each listed column in place; columns not listed are left as they are
for column, decimals in decimals_by_column.items():
    df[column] = df[column].round(decimals)

In [36]:
# Class balance of the target column: rows without (0) and with (1) a collapse event
df['Grid Collapse Events'].value_counts()

0    22274
1      246
Name: Grid Collapse Events, dtype: int64

In [None]:
# Write the finished dataset to disk; index=False leaves the row index out of the file
df.to_csv('synthetic_nigeria_grid_data_2010_may2025_6hourly_derived.csv', index=False)

# One-line summary: row count, number of collapse rows, and where the file went
summary = f"Generated synthetic dataset with {len(df)} rows and {df['Grid Collapse Events'].sum()} collapse events, saved to 'synthetic_nigeria_grid_data_2010_may2025_6hourly_derived.csv'."
print(summary)