In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Read the three pre-split partitions from the Kaggle input directory.
train_data, test_data, holdout_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mlpr-data-split/train.csv',
        '/kaggle/input/mlpr-data-split/test.csv',
        '/kaggle/input/mlpr-data-split/holdout.csv',
    )
)

# Stack the partitions in train -> test -> holdout order under a fresh
# 0..n-1 index so the categorical encoding can be applied in a single pass.
partitions = [train_data, test_data, holdout_data]
data = pd.concat(partitions, ignore_index=True)

# Mean-encode the categorical columns: every category is replaced by the mean
# of the numeric 'ppt' column over the TRAINING rows belonging to that
# category. Only the training partition feeds the statistics, so values from
# test/holdout rows never influence the encoding.
categorical_cols = ['EVENT_TYPE', 'stability', 'WFO']
encoding_maps = {}

# Fallback value for categories (or whole columns) that were never seen in
# training. It does not depend on the column, so compute it once.
global_ppt_mean = train_data['ppt'].mean()

# Learn the category -> mean('ppt') mapping from the training partition.
for col in categorical_cols:
    if col in train_data.columns:
        encoding_maps[col] = train_data.groupby(col)['ppt'].mean()

# Apply the learned mappings to the combined frame, writing '<col>_encoded'.
for col in categorical_cols:
    if col not in data.columns:
        continue
    mapping = encoding_maps.get(col)
    if mapping is None:
        # The column exists in the combined frame but not in train, so no
        # mapping was learned. Treat every value as an unseen category
        # instead of failing with a KeyError on encoding_maps[col].
        data[col + '_encoded'] = global_ppt_mean
    else:
        # Categories absent from the training mapping come back as NaN from
        # .map() and are filled with the training-wide mean.
        data[col + '_encoded'] = data[col].map(mapping).fillna(global_ppt_mean)

# Recover the three partitions from the combined frame by row position.
# The sizes are captured before the names are rebound so the slice bounds
# do not depend on the order of the assignments below.
n_train = len(train_data)
n_test = len(test_data)
test_start, test_stop = n_train, n_train + n_test

train_data = data.iloc[:test_start]
test_data = data.iloc[test_start:test_stop]
holdout_data = data.iloc[test_stop:]

In [2]:
import pandas as pd
import joblib
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Label column and the feature columns fed to the severity classifier.
target_col = 'severity_class'
severity_features = [
    'DEATHS_INDIRECT', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'duration_hours',
    'desc_word_count', 'has_tornado', 'has_hail', 'has_flood', 'has_wind',
    'has_tree', 'has_broken', 'has_blown', 'tmin', 'tmax', 'tavg',
    'EVENT_TYPE_encoded', 'stability_encoded', 'CZ_FIPS', 'WFO_encoded'
]

# Keep only the rows flagged is_storm_lagged == 1 in the train and test
# partitions (the holdout partition is not touched here).
train_data = train_data.loc[train_data['is_storm_lagged'].eq(1)]
test_data = test_data.loc[test_data['is_storm_lagged'].eq(1)]

# Feature matrices and label vectors for both partitions.
X_train, y_train = train_data[severity_features], train_data[target_col]
X_test, y_test = test_data[severity_features], test_data[target_col]

# Standardise features: statistics are fitted on the training rows only and
# then reused unchanged for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fixed hyperparameters for the final model (reported as the best Optuna
# trial; the search itself is not part of this cell).
# NOTE(review): per the LightGBM parameter docs, row subsampling only takes
# effect when subsample_freq (bagging_freq) is > 0. It is not set here, so
# 'subsample' may be a no-op -- confirm against the tuning setup.
# NOTE(review): the scikit-learn wrapper presumably derives the class count
# from the labels passed to fit(); verify whether 'num_class' is needed.
n_classes = len(y_train.unique())
final_params = dict(
    objective='multiclass',
    num_class=n_classes,
    metric='multi_logloss',
    boosting_type='gbdt',
    verbose=-1,
    random_state=42,
    n_estimators=943,
    max_depth=14,
    learning_rate=0.299798636793099,
    subsample=0.6815843772645557,
    colsample_bytree=0.967548003596196,
    min_child_weight=1.2892385914567714,
    reg_alpha=0.009927790289736168,
    reg_lambda=1.7396121269192825e-06,
)

# Fit the final LightGBM classifier on the scaled training features.
lgb_model = lgb.LGBMClassifier(**final_params)
lgb_model.fit(X_train_scaled, y_train)

# Score the fitted model on the scaled test features.
y_pred = lgb_model.predict(X_test_scaled)

# Precision/recall/F1 are support-weighted averages over the classes;
# zero_division=0 reports 0 for a metric whose denominator is zero.
weighted = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)

# Emit the summary; the text written to stdout matches one print per line.
summary_lines = [
    "Severity Prediction Test Metrics (LightGBM):",
    f"Accuracy : {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall   : {recall:.4f}",
    f"F1 Score : {f1:.4f}",
    "\nConfusion Matrix:",
]
print("\n".join(summary_lines))
print(conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Persist the fitted classifier and the scaler it was trained with, so both
# can be reloaded together at inference time. The scaler is written last,
# which keeps its dump result as the cell's displayed value.
model_path = 'severity_lgb_model.pkl'
scaler_path = 'severity_scaler.pkl'
joblib.dump(lgb_model, model_path)
joblib.dump(scaler, scaler_path)

Severity Prediction Test Metrics (LightGBM):
Accuracy : 0.9308
Precision: 0.9309
Recall   : 0.9308
F1 Score : 0.9308

Confusion Matrix:
[[3109   73   53    0]
 [  76 2698  257    0]
 [  32  239 2881    0]
 [   2    0    0 1152]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      3235
           1       0.90      0.89      0.89      3031
           2       0.90      0.91      0.91      3152
          10       1.00      1.00      1.00      1154

    accuracy                           0.93     10572
   macro avg       0.94      0.94      0.94     10572
weighted avg       0.93      0.93      0.93     10572



['severity_scaler.pkl']