In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

storm_df = pd.read_csv("/kaggle/input/noaa-powout-prism-0-1-is-storm-lag/noaapowoutprism_01_Is_Storm_Lag (1).csv")

# Storm Prediction

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Define features
storm_features = [
    'DEATHS_INDIRECT', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'customers_out', 'duration_hours',
    'desc_word_count', 'desc_char_count',
    'has_tornado', 'has_hail', 'has_flood', 'has_wind', 'has_tree',
    'has_power', 'has_damage', 'has_outage', 'has_broken', 'has_blown',
    'tmin', 'tmax', 'tavg', 'ppt'
]

# Prepare X and y
X = storm_df[storm_features]
y = storm_df['is_storm_lagged'].astype(int)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
# Train Random Forest model on all data with best hyperparameters obtained using optuna
rf_model = RandomForestClassifier(
    n_estimators=188,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    bootstrap=False,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_scaled, y)

# Predict on all data
predictions = rf_model.predict(X_scaled)

# Create new DataFrame with predictions
storm_df_with_predictions = storm_df.copy()
storm_df_with_predictions['predicted_storm'] = predictions

In [4]:
storm_df_with_1_predictions = storm_df_with_predictions[storm_df_with_predictions['predicted_storm'] == 1].copy()

# Severity Prediction

In [8]:
# Filter out unexpected class labels
valid_classes = [0, 1, 2]
storm_df_with_1_predictions = storm_df_with_1_predictions[storm_df_with_1_predictions['severity_class'].isin(valid_classes)]

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

severity_features = [
    # Original impact features
    'DEATHS_INDIRECT', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'customers_out', 'duration_hours',

    # NLP-derived features
    'desc_word_count', 'desc_char_count',
    'has_tornado', 'has_hail', 'has_flood', 'has_wind', 'has_tree',
    'has_power', 'has_damage', 'has_outage', 'has_broken', 'has_blown',

    #prism features
    'tmin', 'tmax', 'tavg', 'ppt'
]

# Features and target
X = storm_df_with_1_predictions[severity_features]
y = storm_df_with_1_predictions['severity_class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = scaler.transform(X)

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

# Train XGBoost classifier on all data
xgb_model = XGBClassifier(
    n_estimators=403,
    max_depth=10,
    learning_rate=0.09065400280278058,
    subsample=0.933968095670629,
    colsample_bytree=0.5647574078202744,
    gamma=0.00017586655077512627,
    min_child_weight=2,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_scaled, y)

# Predict on the full dataset
predictions = xgb_model.predict(X_scaled)

storm_data_with_severity = storm_df_with_1_predictions.copy()
storm_data_with_severity['severity_predicted'] = predictions

In [12]:
# Create three DataFrames based on predicted severity
df_low = storm_data_with_severity[storm_data_with_severity['severity_predicted'] == 0]
df_medium = storm_data_with_severity[storm_data_with_severity['severity_predicted'] == 1]
df_high = storm_data_with_severity[storm_data_with_severity['severity_predicted'] == 2]

# Outage Prediction

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define outage features (common for all models)
outage_features = [
    'tmin', 'tmax', 'tavg', 'ppt',
    'has_tornado', 'has_hail', 'has_flood', 'has_wind', 'has_tree',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'duration_hours',
    'desc_word_count', 'desc_char_count'
]

# Derive is_outage target (assuming customers_out > 0 indicates an outage)
for df in [df_low, df_medium, df_high]:
    df['is_outage'] = (df['customers_out'] > 0).astype(int)

2025-05-02 13:14:33.495024: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746191673.717526      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746191673.786328      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_outage'] = (df['customers_out'] > 0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

In [14]:
# Extract non-storm rows from storm_df_with_predictions
non_storm_data = storm_df_with_predictions[storm_df_with_predictions['predicted_storm'] == 0].copy()
non_storm_data['is_outage'] = 0  # No outage for non-storm events
non_storm_data['severity_predicted'] = 10  # Placeholder for non-storm rows

In [15]:
# Randomly split non-storm rows across the three DataFrames
non_storm_split = np.array_split(non_storm_data.sample(frac=1, random_state=42), 3)
non_storm_low, non_storm_medium, non_storm_high = non_storm_split

  return bound(*args, **kwds)


In [16]:
# Augment DataFrames with non-storm rows
df_low = pd.concat([df_low, non_storm_low], ignore_index=True)
df_medium = pd.concat([df_medium, non_storm_medium], ignore_index=True)
df_high = pd.concat([df_high, non_storm_high], ignore_index=True)

### Low Severity

In [17]:
if not df_low.empty:
    X_low = df_low[outage_features]
    y_low = df_low['is_outage']
    
    # Train-test split
    X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(
        X_low, y_low, test_size=0.2, stratify=y_low, random_state=42
    )
    
    # Feature scaling
    scaler_low = StandardScaler()
    X_train_low_scaled = scaler_low.fit_transform(X_train_low)
    X_test_low_scaled = scaler_low.transform(X_test_low)

    # Best parameters from Optuna Trial 32
    best_params = {
        'n_estimators': 319,
        'learning_rate': 0.047847333909262976,
        'num_leaves': 281,
        'max_depth': 20,
        'min_child_samples': 22,
        'subsample': 0.527136639688917,
        'colsample_bytree': 0.8326768083509417,
        'random_state': 42
    }

    # Train final LightGBM model
    final_lgb_model = lgb.LGBMClassifier(**best_params)
    final_lgb_model.fit(X_train_low_scaled, y_train_low)
    
    # Predictions
    low_predictions = final_lgb_model.predict(X_test_low_scaled)
    
    # Evaluation
    print("Final Tuned LightGBM (Low Severity) Metrics:")
    print("Accuracy:", accuracy_score(y_test_low, low_predictions))
    print("Classification Report:\n", classification_report(y_test_low, low_predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test_low, low_predictions))

[LightGBM] [Info] Number of positive: 25874, number of negative: 27885
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2097
[LightGBM] [Info] Number of data points in the train set: 53759, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.481296 -> initscore=-0.074850
[LightGBM] [Info] Start training from score -0.074850
Final Tuned LightGBM (Low Severity) Metrics:
Accuracy: 0.9529761904761904
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95      6971
           1       0.95      0.96      0.95      6469

    accuracy                           0.95     13440
   macro avg       0.95      0.95      0.95     13440
weighted avg       0.95      0.95      0.95     13440

Confusion Matrix:

### Medium Severity

In [18]:
if not df_medium.empty:
    X_medium = df_medium[outage_features]
    y_medium = df_medium['is_outage']
    
    # Train-test split
    X_train_medium, X_test_medium, y_train_medium, y_test_medium = train_test_split(
        X_medium, y_medium, test_size=0.2, stratify=y_medium, random_state=42
    )
    
    # Scale features
    scaler_medium = StandardScaler()
    X_train_medium_scaled = scaler_medium.fit_transform(X_train_medium)
    X_test_medium_scaled = scaler_medium.transform(X_test_medium)
    
    # Train Random Forest
    rf_model = RandomForestClassifier(
        n_estimators=191,
        max_depth=28,
        min_samples_split=6,
        min_samples_leaf=2,
        max_features='log2',
        bootstrap=False,
        random_state=42,
        n_jobs=-1
    )
    rf_model.fit(X_train_medium_scaled, y_train_medium)
    
    # Predict on test set
    medium_predictions = rf_model.predict(X_test_medium_scaled)
    
    # Evaluate
    print("\nRandom Forest (Medium Severity) Metrics:")
    print("Accuracy:", accuracy_score(y_test_medium, medium_predictions))
    print("Classification Report:\n", classification_report(y_test_medium, medium_predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test_medium, medium_predictions))


Random Forest (Medium Severity) Metrics:
Accuracy: 0.9362767760826013
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94      6958
           1       0.92      0.95      0.93      6020

    accuracy                           0.94     12978
   macro avg       0.94      0.94      0.94     12978
weighted avg       0.94      0.94      0.94     12978

Confusion Matrix:
 [[6459  499]
 [ 328 5692]]


### High Severity

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

if not df_high.empty:
    X_high = df_high[outage_features]
    y_high = df_high['is_outage']

    # Train-test split
    X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(
        X_high, y_high, test_size=0.2, stratify=y_high, random_state=42
    )

    # Scale features
    scaler_high = StandardScaler()
    X_train_high_scaled = scaler_high.fit_transform(X_train_high)
    X_test_high_scaled = scaler_high.transform(X_test_high)

    # Optimized hyperparameters from Optuna
    n_units = [80, 112, 96]
    dropout_rate = 0.1677425146431672
    learning_rate = 0.001715383587455835
    batch_size = 64

    # Build optimized model
    model = Sequential()
    model.add(Dense(n_units[0], activation='relu', input_shape=(len(outage_features),)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_units[1], activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_units[2], activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(
        X_train_high_scaled, y_train_high,
        epochs=20,
        batch_size=batch_size,
        validation_split=0.2,
        verbose=0
    )

    # Predict on test set
    high_predictions = (model.predict(X_test_high_scaled) > 0.5).astype(int).flatten()

    # Evaluation
    print("\nOptimized FNN (High Severity) Metrics:")
    print("Accuracy:", accuracy_score(y_test_high, high_predictions))
    print("Classification Report:\n", classification_report(y_test_high, high_predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test_high, high_predictions))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-05-02 13:16:02.482375: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Optimized FNN (High Severity) Metrics:
Accuracy: 0.8671245147805315
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.84      0.87      6958
           1       0.84      0.90      0.87      6438

    accuracy                           0.87     13396
   macro avg       0.87      0.87      0.87     13396
weighted avg       0.87      0.87      0.87     13396

Confusion Matrix:
 [[5829 1129]
 [ 651 5787]]
