In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math

In [3]:
# Seed NumPy's global RNG so every np.random.* draw below is reproducible
np.random.seed(42)

# Parameters
start_date = datetime(2010, 1, 1)
end_date = datetime(2025, 5, 31)
# Inclusive day count: both start_date and end_date get a row, hence the +1.
# 2010-01-01 .. 2025-05-31 is 5,630 days.
days = (end_date - start_date).days + 1
# Number of grid collapse events to inject per calendar year
collapse_counts = {
    2010: 42, 2011: 19, 2012: 24, 2013: 24, 2014: 13, 2015: 10,
    2016: 28, 2017: 24, 2018: 13, 2019: 10, 2020: 4, 2021: 4,
    2022: 12, 2023: 12, 2024: 4, 2025: 3
}
total_collapses = sum(collapse_counts.values())  # 246 collapses

In [4]:
# Sanity check: display the sum of the per-year collapse counts
total_collapses

246

In [5]:
# Generate timestamps (daily, 00:00): one row per day, `days` rows starting at start_date
timestamps = pd.date_range(start=start_date, periods=days, freq='D')
df = pd.DataFrame({'Timestamp': timestamps})

# Generate independent parameters
df['Voltage (V)'] = np.random.normal(330000, 5000, days) # 330kV ±5kV
df['Power Factor'] = np.random.uniform(0.8, 0.95, days) # 0.8–0.95
df['Grid Supply (kW)'] = np.random.normal(4000000, 400000, days) # ~4,000 MW
df['Grid Frequency (Hz)'] = np.random.normal(50, 0.2, days) # 50 Hz ±0.2
df['Generation Capacity (MW)'] = np.random.normal(4000, 500, days) # ~4,000 MW

# Derive Power Consumption (kW): Grid Supply scaled by a random factor (mean 1.05, sd 0.1)
df['Power Consumption (kW)'] = df['Grid Supply (kW)'] * np.random.normal(1.05, 0.1, days)

# Derive Current (A): I = P / (V * PF * sqrt(3)); the * 1000 converts kW to W
df['Current (A)'] = (df['Power Consumption (kW)'] * 1000) / (df['Voltage (V)'] * df['Power Factor'] * np.sqrt(3))

# Derive Reactive Power (kVAR): Q = P * tan(acos(PF))
df['Reactive Power (kVAR)'] = df['Power Consumption (kW)'] * np.tan(np.arccos(df['Power Factor']))

# Weather: Wet (Apr–Oct: 28°C, 70%), Dry (Nov–Mar: 32°C, 35%)
# One draw per row with a season-dependent mean: all temperatures first, then all humidities.
is_wet = df['Timestamp'].dt.month.between(4, 10).to_numpy()
df['Temperature (°C)'] = np.random.normal(np.where(is_wet, 28, 32), 3)
# Humidity is a percentage: clip so the Gaussian tails cannot fall outside [0, 100].
df['Humidity (%)'] = np.clip(np.random.normal(np.where(is_wet, 70, 35), 10), 0, 100)

# Faults and events
df['Transformer Fault'] = np.random.choice([0, 1], days, p=[0.98, 0.02])  # 2% of days
df['Line Trip Events'] = np.random.choice([0, 1], days, p=[0.95, 0.05])  # 5% of days
# Overload when consumption exceeds supply by more than 10%
df['Overload Condition'] = (df['Power Consumption (kW)'] > df['Grid Supply (kW)'] * 1.1).astype(int)

In [6]:
# Specific collapse dates
specific_collapses = [
    datetime(2024, 12, 11),  # Omotosho-Ikeja West line trip, vandalism
    datetime(2025, 1, 11),   # Line tripping
    datetime(2025, 2, 12),   # Line tripping
    datetime(2025, 3, 7)     # Assumed line tripping
]
# Map every known date to the row whose timestamp is nearest to it.
specific_indices = {
    int((df['Timestamp'] - collapse_date).abs().idxmin())
    for collapse_date in specific_collapses
}
collapse_indices = set(specific_indices)

# Distribute remaining collapses by year
row_years = df['Timestamp'].dt.year
for year, count in collapse_counts.items():
    year_indices = set(df.index[row_years == year].tolist())
    # Rows of this year that are already marked count against the yearly quota.
    year_remaining = count - len(year_indices & collapse_indices)
    if year_remaining > 0:
        # Sort the candidate pool: set iteration order is an implementation detail,
        # and the seeded draw below should not depend on it.
        available_indices = sorted(year_indices - collapse_indices)
        # A year with too few free rows is skipped here and topped up by the block below.
        if len(available_indices) >= year_remaining:
            selected = np.random.choice(available_indices, size=year_remaining, replace=False)
            collapse_indices.update(int(i) for i in selected)

# Ensure exactly 246 collapses
chosen = set(collapse_indices)      # set copy for O(1) membership tests
collapse_indices = sorted(chosen)   # downstream cells expect a list
if len(collapse_indices) < total_collapses:
    available_indices = [i for i in df.index if i not in chosen]
    extra_indices = np.random.choice(available_indices, size=total_collapses - len(collapse_indices), replace=False)
    collapse_indices.extend(int(i) for i in extra_indices)
elif len(collapse_indices) > total_collapses:
    # When trimming, keep the explicitly listed historical dates first
    # (stable sort: specific rows, then the rest in ascending order).
    collapse_indices = sorted(collapse_indices, key=lambda i: i not in specific_indices)[:total_collapses]

In [7]:
# Label the selected rows as collapse events (1) and every other row as normal (0)
df['Grid Collapse Events'] = 0
df.loc[collapse_indices, 'Grid Collapse Events'] = 1

# Add anomalies for collapse events
# NOTE: the numeric columns are perturbed in place, so re-running this cell
# compounds the voltage/frequency/demand changes on the same rows.
voltage_col = 'Voltage (V)'
frequency_col = 'Grid Frequency (Hz)'
demand_col = 'Power Consumption (kW)'
supply_col = 'Grid Supply (kW)'
pf_col = 'Power Factor'

for row in collapse_indices:
    # Voltage drop to 85–95% of its value
    sagged_voltage = df.loc[row, voltage_col] * np.random.uniform(0.85, 0.95)
    df.loc[row, voltage_col] = sagged_voltage

    # Frequency deviation of up to ±0.5 Hz
    df.loc[row, frequency_col] = df.loc[row, frequency_col] + np.random.uniform(-0.5, 0.5)

    # Demand increase of 10–30%
    raised_demand = df.loc[row, demand_col] * np.random.uniform(1.1, 1.3)
    df.loc[row, demand_col] = raised_demand

    # Flags are only ever raised here; an existing 1 is never cleared.
    if raised_demand > df.loc[row, supply_col] * 1.1:
        df.loc[row, 'Overload Condition'] = 1
    if np.random.rand() > 0.5:
        df.loc[row, 'Transformer Fault'] = 1
    if np.random.rand() > 0.4:
        df.loc[row, 'Line Trip Events'] = 1

    # Recalculate derived parameters from the perturbed voltage and demand
    power_factor = df.loc[row, pf_col]
    df.loc[row, 'Current (A)'] = (raised_demand * 1000) / (sagged_voltage * power_factor * np.sqrt(3))
    df.loc[row, 'Reactive Power (kVAR)'] = raised_demand * np.tan(np.arccos(power_factor))

In [8]:
# Round numerical columns
df['Voltage (V)'] = df['Voltage (V)'].round(2)
df['Current (A)'] = df['Current (A)'].round(2)
df['Power Consumption (kW)'] = df['Power Consumption (kW)'].round(2)
df['Reactive Power (kVAR)'] = df['Reactive Power (kVAR)'].round(2)
df['Power Factor'] = df['Power Factor'].round(3)
df['Grid Supply (kW)'] = df['Grid Supply (kW)'].round(2)
df['Grid Frequency (Hz)'] = df['Grid Frequency (Hz)'].round(2)
df['Generation Capacity (MW)'] = df['Generation Capacity (MW)'].round(2)
df['Temperature (°C)'] = df['Temperature (°C)'].round(1)
df['Humidity (%)'] = df['Humidity (%)'].round(1)

In [9]:
# Display the generated DataFrame
df

Unnamed: 0,Timestamp,Voltage (V),Power Factor,Grid Supply (kW),Grid Frequency (Hz),Generation Capacity (MW),Power Consumption (kW),Current (A),Reactive Power (kVAR),Temperature (°C),Humidity (%),Transformer Fault,Line Trip Events,Overload Condition,Grid Collapse Events
0,2010-01-01,332483.57,0.942,4308020.04,49.87,5015.48,4447274.99,8199.24,1586454.64,33.1,28.5,0,0,0,0
1,2010-01-02,329308.68,0.820,4602458.90,50.15,3352.51,4419303.10,9450.10,3086000.31,29.7,26.2,0,0,0,0
2,2010-01-03,333238.44,0.916,3824460.00,50.17,4029.83,4591651.55,8682.85,2008252.36,33.1,48.3,0,0,1,0
3,2010-01-04,337615.15,0.840,3544375.40,50.12,3092.85,4391616.83,8943.47,2839887.80,27.8,32.4,0,0,1,0
4,2010-01-05,328829.23,0.940,3016732.23,49.79,3786.62,3116647.78,5821.18,1130789.14,41.8,31.5,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5625,2025-05-27,329512.37,0.882,3737369.21,49.98,4187.15,4052514.62,8051.83,2166841.57,31.4,62.5,0,0,0,0
5626,2025-05-28,331639.06,0.894,4363044.72,49.89,3946.00,4935999.93,9610.39,2471898.16,28.9,79.5,0,0,1,0
5627,2025-05-29,334762.39,0.940,3670311.02,49.74,2974.73,3253897.31,5968.23,1177900.65,32.9,73.2,0,0,0,0
5628,2025-05-30,325945.53,0.845,4317455.72,49.94,4653.05,3852725.77,8077.52,2439650.33,22.7,74.2,0,0,0,0


In [10]:
# Count rows per label: 1 = collapse event, 0 = no collapse
df['Grid Collapse Events'].value_counts()

0    5384
1     246
Name: Grid Collapse Events, dtype: int64

In [None]:






# Save to CSV (without the index column)
csv_path = 'synthetic_nigeria_grid_data_2010_may2025_daily_derived.csv'
df.to_csv(csv_path, index=False)

# Report the size of the dataset and the number of rows flagged as collapses
row_count = len(df)
collapse_total = df['Grid Collapse Events'].sum()
print(f"Generated synthetic dataset with {row_count} rows and {collapse_total} collapse events, saved to '{csv_path}'.")