In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import dask.dataframe as dd


In [5]:

# Load Eaglei outages (using Dask for large dataset handling)
eaglei = dd.read_csv('/home/adelechinda/home/projects/powerup/data/eaglei_outages.csv', 
                    dtype={'fips_code': 'int32',
                           'county': 'object',
                           'state': 'object',
                           'sum': 'float32',
                           'run_start_time': 'object'})

In [6]:

# Create date column from run_start_time
eaglei['date'] = dd.to_datetime(eaglei['run_start_time']).dt.date

# Aggregate outages by county and date
outage_events = eaglei.groupby(['fips_code', 'county', 'state', 'date'])\
                      .agg({'sum': 'sum'})\
                      .rename(columns={'sum': 'outage_duration'})\
                      .compute()

  return get_meta_library(args[0]).to_datetime(*args, **kwargs)


In [7]:

# Create binary target (1 if outage occurred)
outage_events['outage_occurred'] = (outage_events['outage_duration'] > 0).astype(int)

In [8]:

# Load VTEC data with proper datetime handling
vtec = pd.read_csv('/home/adelechinda/home/projects/powerup/data/vtec.csv', parse_dates=['ISSUED'])
vtec['date'] = vtec['ISSUED'].dt.date

  vtec = pd.read_csv('/home/adelechinda/home/projects/powerup/data/vtec.csv', parse_dates=['ISSUED'])


In [9]:

# Create weather event features
vtec_features = vtec.groupby(['WFO', 'date']).agg({
    'WINDTAG': 'max',
    'HAILTAG': 'max',
    'IS_EMERGENCY': 'sum',
    'PHENOM': lambda x: x.str.contains('Wind').sum()
}).rename(columns={'PHENOM': 'wind_events'}).reset_index()

In [10]:

# Load NOAA data with proper datetime handling
noaa = pd.read_csv('/home/adelechinda/home/projects/powerup/data/noaa.csv', parse_dates=['BGN_DATE'], encoding="ISO-8859-1")
noaa['date'] = noaa['BGN_DATE'].dt.date

  noaa = pd.read_csv('/home/adelechinda/home/projects/powerup/data/noaa.csv', parse_dates=['BGN_DATE'], encoding="ISO-8859-1")


In [11]:

# Create severity features from NOAA
noaa_features = noaa.groupby(['STATE', 'COUNTYNAME', 'date']).agg({
    'MAG': 'max',
    'FATALITIES': 'sum',
    'INJURIES': 'sum',
    'EVTYPE': lambda x: x.str.contains('Storm').sum()
}).rename(columns={'EVTYPE': 'storm_events'}).reset_index()


In [12]:

# Safe Data Merging
merged_data = pd.merge(
    outage_events.reset_index(),
    vtec_features,
    left_on=['state', 'date'],
    right_on=['WFO', 'date'],
    how='left'
)

In [13]:

final_data = pd.merge(
    merged_data,
    noaa_features,
    left_on=['state', 'county', 'date'],
    right_on=['STATE', 'COUNTYNAME', 'date'],
    how='left'
)

In [14]:

# Fill missing values with defaults
final_data.fillna({
    'WINDTAG': 0,
    'HAILTAG': 0,
    'IS_EMERGENCY': 0,
    'wind_events': 0,
    'MAG': 0,
    'FATALITIES': 0,
    'INJURIES': 0,
    'storm_events': 0
}, inplace=True)

In [15]:
# Feature Engineering

# Temporal Features
final_data['date'] = pd.to_datetime(final_data['date'])
final_data['day_of_year'] = final_data['date'].dt.dayofyear
final_data['month'] = final_data['date'].dt.month
final_data['is_weekend'] = final_data['date'].dt.dayofweek.isin([5, 6]).astype(int)

In [16]:

# Weather Severity Composite Score
final_data['weather_severity'] = (
    final_data['WINDTAG'] + 
    final_data['HAILTAG'] + 
    final_data['MAG'] +
    final_data['IS_EMERGENCY'] * 2
)

In [17]:

# Previous Outage Feature (Lag)
final_data = final_data.sort_values(['county', 'date'])
final_data['prev_outage'] = final_data.groupby('county')['outage_occurred'].shift(1).fillna(0)

In [18]:

# Encode Categorical Variables
for col in ['state', 'county', 'WFO']:
    le = LabelEncoder()
    final_data[col] = le.fit_transform(final_data[col].astype(str))

In [19]:

# Select Features and Target
features = [
    'WINDTAG', 'HAILTAG', 'IS_EMERGENCY', 'wind_events',
    'MAG', 'FATALITIES', 'INJURIES', 'storm_events',
    'day_of_year', 'month', 'is_weekend', 'weather_severity', 'prev_outage'
]
X = final_data[features]
y = final_data['outage_occurred']

In [20]:

# Normalize Features (Important for KNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [22]:

# Hyperparameter Tuning for KNN (Optimal K Selection)
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [23]:

# Best Model Training
best_k = grid_search.best_params_['n_neighbors']
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)

In [24]:
# Evaluate the Model
predictions = knn_best.predict(X_test)
print(f"Optimal K: {best_k}")
print("Model Accuracy:", accuracy_score(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Optimal K: 11
Model Accuracy: 0.9344894652416543

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.07      0.13     69844
           1       0.94      0.99      0.97   1039199

    accuracy                           0.93   1109043
   macro avg       0.67      0.53      0.55   1109043
weighted avg       0.91      0.93      0.91   1109043

