In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the three pre-split datasets
train_data = pd.read_csv('/kaggle/input/mlpr-data-split/train.csv')
test_data = pd.read_csv('/kaggle/input/mlpr-data-split/test.csv')
holdout_data = pd.read_csv('/kaggle/input/mlpr-data-split/holdout.csv')

# Capture the split sizes up front: the three names are rebound further down, and
# the split-back step should not depend on the order in which that happens.
n_train, n_test = len(train_data), len(test_data)

# Combine datasets so every split receives exactly the same encoding
data = pd.concat([train_data, test_data, holdout_data], ignore_index=True)

# Mean encoding: each category is replaced by the mean of `ppt` observed for that
# category in the TRAINING split only, so test/holdout rows never shape the mapping.
categorical_cols = ['EVENT_TYPE', 'stability', 'WFO']
ppt_train_mean = train_data['ppt'].mean()  # fallback for categories unseen in training
encoding_maps = {
    col: train_data.groupby(col)['ppt'].mean()
    for col in categorical_cols
    if col in train_data.columns
}

# Apply the encoding to the combined data. Iterating over `encoding_maps` (rather
# than re-checking `data.columns`) guarantees a mapping exists for every column we
# touch; the concatenated frame contains every training column, so `data[col]` is safe.
for col, mapping in encoding_maps.items():
    data[col + '_encoded'] = data[col].map(mapping).fillna(ppt_train_mean)

# Split back into train, test, holdout (rows are still in concatenation order).
# `.copy()` gives each split its own frame, so columns added to a split later on
# are written to an independent object instead of a slice of `data`.
train_data = data.iloc[:n_train].copy()
test_data = data.iloc[n_train:n_train + n_test].copy()
holdout_data = data.iloc[n_train + n_test:].copy()

In [2]:
# From here on `data` refers to the holdout split only (it previously named the
# combined train+test+holdout frame). This is an alias, not a copy: columns added
# to `data` below are added to `holdout_data` as well.
data = holdout_data

# 1) Storm Prediction

In [3]:
# Input columns for the storm classifier. They are handed to the scaler/model as a
# plain array, so the order of this list is significant.
storm_features = [
    'DEATHS_INDIRECT',
    'INJURIES_DIRECT',
    'INJURIES_INDIRECT',
    'DEATHS_DIRECT',
    'DAMAGE_PROPERTY',
    'DAMAGE_CROPS',
    'duration_hours',
    'desc_word_count',
    'has_tornado',
    'has_hail',
    'has_flood',
    'has_wind',
    'has_tree',
    'has_broken',
    'has_blown',
    'tmin',
    'tmax',
    'tavg',
    'ppt',
    'MAGNITUDE_IMPUTED',
    'STATE_FIPS',
]
target_col = 'is_storm_lagged'

# Fitted storm classifier and its matching scaler (joblib unpickles these files,
# so only trusted artifacts should be loaded here).
storm_model = joblib.load('/kaggle/input/storm_xgb_model/scikitlearn/default/1/storm_xgb_model (1).pkl')
scaler = joblib.load('/kaggle/input/storm_xgb_model/scikitlearn/default/1/storm_scaler (2).pkl')

# Scale a bare NumPy array (no column labels) to avoid the feature-name warning,
# then predict and store the result next to the inputs.
X = data[storm_features].to_numpy()
X_scaled = scaler.transform(X)
storm_preds = storm_model.predict(X_scaled)
data['predicted_storm'] = storm_preds

# Compare predictions against the ground-truth column
y_true, y_pred = data[target_col], data['predicted_storm']
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true, y_pred)

print("Storm Prediction Metrics:")
for label, value in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
):
    print(f"{label}: {value:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_report(y_true, y_pred, zero_division=0), sep="\n")

# Only rows predicted as storms continue to the severity stage
storm_data = data.loc[data['predicted_storm'] == 1].copy()



Storm Prediction Metrics:
Accuracy : 0.9294
Precision: 0.9345
Recall   : 0.9272
F1 Score : 0.9308

Confusion Matrix:
[[19091  1402]
 [ 1569 19993]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93     20493
           1       0.93      0.93      0.93     21562

    accuracy                           0.93     42055
   macro avg       0.93      0.93      0.93     42055
weighted avg       0.93      0.93      0.93     42055



# 2) Severity Prediction

In [4]:
# ----------------- SEVERITY PREDICTION -----------------
# Second stage: classify the severity of every row the storm model flagged.

# Input columns for the severity classifier; passed as a plain array, so order matters.
severity_features = [
    'DEATHS_INDIRECT',
    'INJURIES_DIRECT',
    'INJURIES_INDIRECT',
    'DEATHS_DIRECT',
    'DAMAGE_PROPERTY',
    'DAMAGE_CROPS',
    'duration_hours',
    'desc_word_count',
    'has_tornado',
    'has_hail',
    'has_flood',
    'has_wind',
    'has_tree',
    'has_broken',
    'has_blown',
    'tmin',
    'tmax',
    'tavg',
    'EVENT_TYPE_encoded',
    'stability_encoded',
    'CZ_FIPS',
    'WFO_encoded',
]
target_col = 'severity_class'

# Fitted severity classifier and its scaler (joblib unpickles these files, so only
# trusted artifacts should be loaded here).
severity_model = joblib.load('/kaggle/input/severity_lgb_model-1/scikitlearn/default/1/severity_lgb_model (1).pkl')
severity_scaler = joblib.load('/kaggle/input/severity_lgb_model-1/scikitlearn/default/1/severity_scaler (3).pkl')

# Scale the predicted-storm rows, predict, and keep the result with the inputs
X = storm_data[severity_features].to_numpy()
X_scaled = severity_scaler.transform(X)
severity_preds = severity_model.predict(X_scaled)
storm_data['predicted_severity'] = severity_preds

# Multi-class evaluation: precision/recall/F1 are support-weighted averages
y_true, y_pred = storm_data[target_col], storm_data['predicted_severity']
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true, y_pred)

print("Severity Prediction Metrics:")
for label, value in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
):
    print(f"{label}: {value:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_report(y_true, y_pred, zero_division=0), sep="\n")

# One frame per predicted severity tier (0 = low, 1 = medium, 2 = high).
# NOTE(review): rows predicted with any other label end up in none of these frames;
# the recorded run shows a class 10 — confirm that dropping those rows is intended.
low_severity_data, med_severity_data, high_severity_data = (
    storm_data[storm_data['predicted_severity'] == tier].copy()
    for tier in (0, 1, 2)
)



Severity Prediction Metrics:
Accuracy : 0.9328
Precision: 0.9330
Recall   : 0.9328
F1 Score : 0.9329

Confusion Matrix:
[[6437  151   86    0]
 [ 127 5667  526    0]
 [  63  478 6073    0]
 [   5    0    1 1781]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      6674
           1       0.90      0.90      0.90      6320
           2       0.91      0.92      0.91      6614
          10       1.00      1.00      1.00      1787

    accuracy                           0.93     21395
   macro avg       0.94      0.94      0.94     21395
weighted avg       0.93      0.93      0.93     21395



# 3) Outage Prediction

In [5]:
import numpy as np

# Derive the binary is_outage target for each severity tier: a row counts as an
# outage when customers_out > 0.
for df in [low_severity_data, med_severity_data, high_severity_data]:
    df['is_outage'] = (df['customers_out'] > 0).astype(int)

# Rows the storm model predicted as non-storm.
# NOTE(review): their is_outage label is forced to 0 without looking at
# customers_out — confirm this labelling assumption is intended.
non_storm_data = data[data['predicted_storm'] == 0].copy()
non_storm_data['is_outage'] = 0  # No outage for non-storm events
non_storm_data['predicted_severity'] = 10  # Placeholder for non-storm rows

# Shuffle the non-storm rows reproducibly, then cut them into three near-equal parts.
# The split is computed on positional indices and applied with `.iloc`: calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning. The partition itself is unchanged
# (the first `len % 3` parts receive one extra row).
non_storm_shuffled = non_storm_data.sample(frac=1, random_state=42)
split_positions = np.array_split(np.arange(len(non_storm_shuffled)), 3)
non_storm_split = [non_storm_shuffled.iloc[positions] for positions in split_positions]
non_storm_low, non_storm_medium, non_storm_high = non_storm_split

# Augment each severity tier with its share of the non-storm rows
low_severity_data = pd.concat([low_severity_data, non_storm_low], ignore_index=True)
med_severity_data = pd.concat([med_severity_data, non_storm_medium], ignore_index=True)
high_severity_data = pd.concat([high_severity_data, non_storm_high], ignore_index=True)

  return bound(*args, **kwds)


In [6]:
# Columns fed to the per-tier outage models. The list order is the column order of
# the frame handed to each scaler (presumably matching how the scalers were fitted
# — verify against the training notebook).
outage_features = [
    # weather
    'tmin',
    'tmax',
    'tavg',
    # mean-encoded categoricals
    'stability_encoded',
    'EVENT_TYPE_encoded',
    # event description
    'duration_hours',
    'desc_word_count',
    'has_tornado',
    'has_hail',
    'has_flood',
    'has_wind',
    'has_tree',
    'has_broken',
    'has_blown',
    # impact
    'DAMAGE_PROPERTY',
    'DAMAGE_CROPS',
    'INJURIES_DIRECT',
    'DEATHS_DIRECT',
    # location
    'CZ_FIPS',
]

# Binary label evaluated by the outage cells
target_col = 'is_outage'

### a. Low

In [7]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ----------------- LOAD MODEL AND SCALER -----------------
# Low-severity outage classifier and its scaler. joblib unpickles these files, so
# only trusted artifacts should be loaded. `scaler` is rebound here and no longer
# refers to the storm scaler.
outage_model = joblib.load('/kaggle/input/low_severity_lgm_model-1/scikitlearn/default/1/low_severity_lgm_model (1).pkl')
scaler = joblib.load('/kaggle/input/low_severity_lgm_model-1/scikitlearn/default/1/low_severity_scaler (3).pkl')

# ----------------- SELECT AND SCALE FEATURES -----------------
X = low_severity_data[outage_features]
X_scaled = scaler.transform(X)

# ----------------- OUTAGE PREDICTION -----------------
outage_pred = outage_model.predict(X_scaled)

# ----------------- TRUE AND PREDICTED LABELS -----------------
target_col = 'is_outage'
y_true_outage, y_pred_outage = low_severity_data[target_col], outage_pred

# ----------------- METRIC CALCULATIONS -----------------
# Binary averaging: precision/recall/F1 are reported for the positive class (1).
accuracy = accuracy_score(y_true_outage, y_pred_outage)
precision, recall, f1 = (
    metric(y_true_outage, y_pred_outage, average='binary', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true_outage, y_pred_outage)

# ----------------- OUTPUT -----------------
print("Outage Prediction Metrics:")
for label, value in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
):
    print(f"{label}: {value:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_report(y_true_outage, y_pred_outage, zero_division=0), sep="\n")

Outage Prediction Metrics:
Accuracy : 0.9879
Precision: 0.9799
Recall   : 0.9958
F1 Score : 0.9878

Confusion Matrix:
[[6781  135]
 [  28 6575]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      6916
           1       0.98      1.00      0.99      6603

    accuracy                           0.99     13519
   macro avg       0.99      0.99      0.99     13519
weighted avg       0.99      0.99      0.99     13519



### b. Medium

In [8]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ----------------- LOAD MODEL AND SCALER -----------------
# Medium-severity outage classifier and its scaler. joblib unpickles these files,
# so only trusted artifacts should be loaded. Both names are rebound by this cell.
outage_model = joblib.load('/kaggle/input/medium_severity_xgb_model/scikitlearn/default/1/medium_severity_xgb_model.pkl')
scaler = joblib.load('/kaggle/input/medium_severity_xgb_model/scikitlearn/default/1/medium_severity_scaler (1).pkl')

# ----------------- SELECT AND SCALE FEATURES -----------------
X = med_severity_data[outage_features]
X_scaled = scaler.transform(X)

# ----------------- OUTAGE PREDICTION -----------------
outage_pred = outage_model.predict(X_scaled)

# ----------------- TRUE AND PREDICTED LABELS -----------------
target_col = 'is_outage'
y_true_outage, y_pred_outage = med_severity_data[target_col], outage_pred

# ----------------- METRIC CALCULATIONS -----------------
# Binary averaging: precision/recall/F1 are reported for the positive class (1).
accuracy = accuracy_score(y_true_outage, y_pred_outage)
precision, recall, f1 = (
    metric(y_true_outage, y_pred_outage, average='binary', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true_outage, y_pred_outage)

# ----------------- OUTPUT -----------------
print("Outage Prediction Metrics:")
for label, value in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
):
    print(f"{label}: {value:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_report(y_true_outage, y_pred_outage, zero_division=0), sep="\n")

Outage Prediction Metrics:
Accuracy : 0.9747
Precision: 0.9587
Recall   : 0.9897
F1 Score : 0.9739

Confusion Matrix:
[[6630  268]
 [  65 6220]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      6898
           1       0.96      0.99      0.97      6285

    accuracy                           0.97     13183
   macro avg       0.97      0.98      0.97     13183
weighted avg       0.98      0.97      0.97     13183



### c. High

In [9]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ----------------- LOAD MODEL AND SCALER -----------------
# High-severity outage classifier and its scaler. joblib unpickles these files, so
# only trusted artifacts should be loaded. Both names are rebound by this cell.
outage_model = joblib.load('/kaggle/input/high_severity_rf_model-2/scikitlearn/default/1/high_severity_rf_model (2).pkl')
scaler = joblib.load('/kaggle/input/high_severity_rf_model-2/scikitlearn/default/1/high_severity_scaler (2).pkl')

# ----------------- SELECT AND SCALE FEATURES -----------------
X = high_severity_data[outage_features]
X_scaled = scaler.transform(X)

# ----------------- OUTAGE PREDICTION -----------------
outage_pred = outage_model.predict(X_scaled)

# ----------------- TRUE AND PREDICTED LABELS -----------------
target_col = 'is_outage'
y_true_outage, y_pred_outage = high_severity_data[target_col], outage_pred

# ----------------- METRIC CALCULATIONS -----------------
# Binary averaging: precision/recall/F1 are reported for the positive class (1).
accuracy = accuracy_score(y_true_outage, y_pred_outage)
precision, recall, f1 = (
    metric(y_true_outage, y_pred_outage, average='binary', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true_outage, y_pred_outage)

# ----------------- OUTPUT -----------------
print("Outage Prediction Metrics (High Severity):")
for label, value in (
    ("Accuracy ", accuracy),
    ("Precision", precision),
    ("Recall   ", recall),
    ("F1 Score ", f1),
):
    print(f"{label}: {value:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_report(y_true_outage, y_pred_outage, zero_division=0), sep="\n")

Outage Prediction Metrics (High Severity):
Accuracy : 0.9757
Precision: 0.9640
Recall   : 0.9874
F1 Score : 0.9755

Confusion Matrix:
[[6664  246]
 [  84 6578]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      6910
           1       0.96      0.99      0.98      6662

    accuracy                           0.98     13572
   macro avg       0.98      0.98      0.98     13572
weighted avg       0.98      0.98      0.98     13572

