In [None]:
import xarray as xr
import pandas as pd

# Open the lightning NetCDF file (hourly, July 2024 according to its file name)
# and print the dataset summary as a first sanity check.
lightning = xr.open_dataset(filename_or_obj='lightning_july2024_hourly.nc')
print(lightning)

In [None]:
# Zarr store holding the ERA5 data on Google Cloud Storage.
store1 = 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'

# Variables kept from the store; every other variable is dropped on selection.
era5_variables = [
    'convective_available_potential_energy',
    'total_precipitation',
    '2m_temperature',
    'total_column_water_vapour',
    'vertical_velocity',
]

# Open the store through the zarr engine with chunks={} and keep only the
# variables listed above.
ds1 = xr.open_dataset(store1, engine='zarr', chunks={})[era5_variables]
print(ds1)

In [None]:
# Study period: July 2024, first to last hourly step.
time_start = '2024-07-01'
time_end = '2024-07-31T23:00:00'

# Study area, with longitudes expressed in the 0-360 convention.
lat_min, lat_max = 20, 55
lon_min, lon_max = 230, 300  # equivalent to -130 to -60

# Subset the ERA5 dataset in time and space.
ds_july = ds1.sel({
    'time': slice(time_start, time_end),
    'latitude': slice(lat_max, lat_min),  # ERA5 latitudes run from 90 to -90
    'longitude': slice(lon_min, lon_max),
})

print(ds_july)

In [None]:
# Select only the 500 hPa level (used for vertical velocity).
ds_era5 = ds_july.sel({'level': 500})
print(ds_era5)

In [None]:
# `lightning` is the dataset opened in the first cell; it is reused here
# instead of opening the same NetCDF file a second time.

# Problem: the two datasets use different longitude conventions
#   ERA5:      230 to 300   (0..360)
#   Lightning: -130 to -60  (-180..180)

# Wrap ERA5 longitudes into [-180, 180). For the 230-300 selection this gives
# exactly the same values as subtracting 360, but it also stays correct when
# the selection contains longitudes below 180 (which "- 360" would push
# under -180).
# NOTE(review): a selection that crosses the 180 meridian would additionally
# need `.sortby('longitude')` to keep the coordinate monotonic.
wrapped_longitude = (((ds_era5.longitude + 180) % 360) - 180).values
ds_era5_aligned = ds_era5.assign_coords(longitude=wrapped_longitude)

print("ERA5 lon:", ds_era5_aligned.longitude.values[:3], "...", ds_era5_aligned.longitude.values[-3:])
print("Lightning lon:", lightning.longitude.values[:3], "...", lightning.longitude.values[-3:])

In [None]:
# Timestamps present in both datasets: set intersection of the two time axes,
# turned into a sorted DatetimeIndex.
era5_stamps = set(ds_era5_aligned.time.values)
shared_stamps = era5_stamps.intersection(lightning.time.values)
common_times = pd.DatetimeIndex(shared_stamps).sort_values()

print(f"ERA5 times: {len(ds_era5_aligned.time)}")
print(f"Lightning times: {len(lightning.time)}")
print(f"Common times: {len(common_times)}")

# Restrict both datasets to the shared timestamps so their time axes match.
era5_final, lightning_final = (
    dataset.sel(time=common_times)
    for dataset in (ds_era5_aligned, lightning)
)

print(f"ERA5 final shape: {era5_final.dims}")
print(f"Lightning final shape: {lightning_final.dims}")

In [None]:
# Merge ERA5 predictors and lightning observations into a single dataset.
# join='inner' keeps only coordinate labels present in BOTH inputs. The
# default outer join would build the union of the two grids and pad every
# non-matching cell with NaN, silently filling the table with unusable rows.
ds_merged = xr.merge([era5_final, lightning_final], join='inner')

# An empty intersection means the two grids (or time axes) share no labels
# along that dimension; fail loudly instead of training on nothing.
empty_dims = [name for name, size in ds_merged.sizes.items() if size == 0]
if empty_dims:
    raise ValueError(
        f"ERA5 and lightning datasets share no coordinates along: {empty_dims}"
    )

print(ds_merged)

In [None]:
# Flatten the merged dataset into a pandas table. reset_index() turns the
# index built by to_dataframe() into ordinary columns.
# This step loads all the data, so it takes a few minutes.

print("Loading data... (this may take a few minutes)")

df = ds_merged.to_dataframe().reset_index(drop=False)

print(f"Shape: {df.shape}")
print(df.head(n=5))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# =============================================================
# 5. SAMPLE - up to 3 million rows (keeps memory use manageable)
# =============================================================
# Cap the request at len(df): DataFrame.sample raises ValueError when n is
# larger than the table (sampling is done without replacement).
n_sample = min(3_000_000, len(df))
df_sample = df.sample(n=n_sample, random_state=42)
print(f"Sample shape: {df_sample.shape}")

# =============================================================
# 6. Prepare features
# =============================================================
features = ['convective_available_potential_energy',
            'total_precipitation',
            '2m_temperature',
            'total_column_water_vapour',
            'vertical_velocity']

X = df_sample[features]
# Binary target: 1 where lightning density is positive, else 0.
# NaN > 0 evaluates to False, so rows with a missing density are labelled 0.
Y = (df_sample['lightning_density'] > 0).astype(int)

print(f"No lightning: {(Y == 0).sum():,}")
print(f"Lightning: {(Y == 1).sum():,}")

# =============================================================
# 7. Split: 70/15/15 (stratified)
# =============================================================
# stratify keeps the lightning / no-lightning ratio identical in every split,
# which matters because the positive class is the minority.
test_fraction = 0.15
val_fraction = 0.15
X_temp, X_test, Y_temp, Y_test = train_test_split(
    X, Y, test_size=test_fraction, random_state=42, stratify=Y
)
# The validation share is expressed relative to what is left after removing
# the test split: 0.15 / 0.85 of the remainder is 15% of the full sample.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_temp, Y_temp,
    test_size=val_fraction / (1 - test_fraction),
    random_state=42,
    stratify=Y_temp,
)

print(f"\nTrain: {len(X_train):,}")
print(f"Validation: {len(X_val):,}")
print(f"Test: {len(X_test):,}")

# =============================================================
# 8. Train
# =============================================================
# class_weight='balanced' re-weights the classes inversely to their frequency.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("\nTraining... (5-10 min)")
rf.fit(X_train, Y_train)
print("Done!")

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt

# Hard class predictions for the two held-out splits.
Y_val_pred = rf.predict(X_val)
Y_test_pred = rf.predict(X_test)

class_names = ['No Lightning', 'Lightning']
# (report banner, confusion-matrix title, true labels, predicted labels)
evaluation_sets = [
    ("=== VALIDATION RESULTS ===", 'Validation Set', Y_val, Y_val_pred),
    ("\n=== TEST RESULTS ===", 'Test Set', Y_test, Y_test_pred),
]

# Accuracy plus the per-class report, validation first and test second.
for banner, panel_title, y_true, y_pred in evaluation_sets:
    print(banner)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print(classification_report(y_true, y_pred, target_names=class_names))

# Confusion matrices for both splits, side by side.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for panel, (banner, panel_title, y_true, y_pred) in zip(axes, evaluation_sets):
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred, display_labels=class_names, cmap='Blues', ax=panel
    )
    panel.set_title(panel_title, fontweight='bold')

plt.tight_layout()
plt.show()

# Feature importances of the fitted forest, largest first.
importance_table = pd.DataFrame({
    'Feature': features,
    'Importance': rf.feature_importances_
})
importance = importance_table.sort_values('Importance', ascending=False)

print(importance)

# Horizontal bars; the y axis is inverted so the top-ranked feature is on top.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(importance['Feature'], importance['Importance'], color='steelblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance', fontweight='bold')
ax.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# =============================================================
# COMPLETE STATISTICS: Classification + Regression
# =============================================================

from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, mean_squared_error, 
                             mean_absolute_error, r2_score)
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# =============================================================
# 1. CLASSIFICATION METRICS
# =============================================================
# Scores for ROC-AUC: second column of predict_proba on the test split.
test_lightning_scores = rf.predict_proba(X_test)[:, 1]

print("=== CLASSIFICATION METRICS ===")
print(f"Accuracy:              {accuracy_score(Y_test, Y_test_pred):.4f}")
print(f"Precision (Lightning): {precision_score(Y_test, Y_test_pred):.4f}")
print(f"Recall (Lightning):    {recall_score(Y_test, Y_test_pred):.4f}")
print(f"F1-Score (Lightning):  {f1_score(Y_test, Y_test_pred):.4f}")
print(f"ROC-AUC:               {roc_auc_score(Y_test, test_lightning_scores):.4f}")

# =============================================================
# 2. REGRESSION MODEL
# =============================================================
print("\nTraining regression model...")

# Continuous target: the lightning density itself, on the same sampled rows
# and the same feature matrix X used by the classifier.
Y_reg = df_sample['lightning_density']
regression_split = train_test_split(X, Y_reg, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, Y_train_reg, Y_test_reg = regression_split

regressor_params = dict(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_reg = RandomForestRegressor(**regressor_params)
rf_reg.fit(X_train_reg, Y_train_reg)
Y_pred_reg = rf_reg.predict(X_test_reg)

# RMSE is the square root of the mean squared error on the regression test split.
regression_mse = mean_squared_error(Y_test_reg, Y_pred_reg)

print("\n=== REGRESSION METRICS ===")
print(f"RMSE: {np.sqrt(regression_mse):.4f}")
print(f"MAE:  {mean_absolute_error(Y_test_reg, Y_pred_reg):.4f}")
print(f"R²:   {r2_score(Y_test_reg, Y_pred_reg):.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the classifier on the test split and the area under it.
test_scores = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, test_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 6))

# Model curve against the diagonal of a random classifier.
ax.plot(fpr, tpr, 'b-', linewidth=2, label=f'RF Model (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], 'r--', label='Random Guess (AUC = 0.5)')

ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')
ax.set_title('ROC Curve - Lightning Prediction', fontweight='bold')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Sort by location and time so that, within one grid cell, consecutive rows
# are consecutive available time steps. sort_values already returns a new
# DataFrame, so no extra .copy() is needed before adding a column.
df_sorted = df.sort_values(['latitude', 'longitude', 'time'])

# For every grid cell, look one row ahead: the next density and its timestamp.
by_cell = df_sorted.groupby(['latitude', 'longitude'])
next_density = by_cell['lightning_density'].shift(-1)
next_time = by_cell['time'].shift(-1)

# Keep the look-ahead value only when the next row really is one hour later.
# The time axis is an intersection of two datasets and may contain gaps; a
# bare shift(-1) would then pair a row with lightning several hours ahead.
is_one_hour_ahead = (next_time - df_sorted['time']) == pd.Timedelta(hours=1)
df_sorted['lightning_1h'] = next_density.where(is_one_hour_ahead)

# Drop rows without a valid +1 h target (last step of each cell, steps that
# precede a gap, and rows whose next density is missing).
df_forecast = df_sorted.dropna(subset=['lightning_1h'])

print(f"Original rows: {len(df):,}")
print(f"Forecast rows: {len(df_forecast):,}")

In [None]:
# Sample up to 3 million rows. The request is capped at the table size because
# DataFrame.sample raises ValueError when n exceeds the number of rows.
n_forecast_sample = min(3_000_000, len(df_forecast))
df_forecast_sample = df_forecast.sample(n=n_forecast_sample, random_state=42)

# Features (current ERA5)
X_forecast = df_forecast_sample[features]

# Target (lightning 1 hour later, binary)
Y_forecast = (df_forecast_sample['lightning_1h'] > 0).astype(int)

print(f"No lightning (1h ahead): {(Y_forecast == 0).sum():,}")
print(f"Lightning (1h ahead): {(Y_forecast == 1).sum():,}")

# Split 80/20, stratified so both parts keep the same lightning ratio.
# NOTE(review): rows are split at random, so neighbouring hours of the same
# grid cell can land on both sides; a split by date would be a stricter test
# of forecast skill.
X_train_f, X_test_f, Y_train_f, Y_test_f = train_test_split(
    X_forecast, Y_forecast, test_size=0.2, random_state=42, stratify=Y_forecast
)

print(f"\nTrain: {len(X_train_f):,}")
print(f"Test: {len(X_test_f):,}")

In [None]:
from sklearn.metrics import classification_report, recall_score, roc_auc_score

# Forecast classifier: current ERA5 features -> lightning one hour later.
forecast_params = dict(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_forecast = RandomForestClassifier(**forecast_params)

print("Training forecast model... (5-10 min)")
rf_forecast.fit(X_train_f, Y_train_f)
print("Done!")

# Hard predictions, plus the second predict_proba column as ROC-AUC scores.
Y_pred_f = rf_forecast.predict(X_test_f)
forecast_scores = rf_forecast.predict_proba(X_test_f)[:, 1]

# Headline metrics followed by the full per-class report.
print("\n=== FORECAST RESULTS (1 hour ahead) ===")
print(f"Recall:  {recall_score(Y_test_f, Y_pred_f):.4f}")
print(f"ROC-AUC: {roc_auc_score(Y_test_f, forecast_scores):.4f}")

forecast_report = classification_report(
    Y_test_f, Y_pred_f, target_names=['No Lightning', 'Lightning']
)
print("\n" + forecast_report)

In [None]:
# Comparison table: same-time model (rf) vs. 1-hour-ahead model (rf_forecast).
# Both rows are computed from the fitted models on their own test splits, so
# the table always reflects the current run instead of numbers typed in from
# an earlier one.
same_time_auc = roc_auc_score(Y_test, rf.predict_proba(X_test)[:, 1])
same_time_recall = recall_score(Y_test, Y_test_pred)
forecast_auc = roc_auc_score(Y_test_f, rf_forecast.predict_proba(X_test_f)[:, 1])
forecast_recall = recall_score(Y_test_f, Y_pred_f)

print("="*50)
print("MODEL COMPARISON")
print("="*50)
print(f"{'Model':<20} {'ROC-AUC':<10} {'Recall':<10}")
print("-"*50)
print(f"{'Same time (T→T)':<20} {same_time_auc:<10.3f} {same_time_recall:<10.3f}")
print(f"{'Forecast (T→T+1)':<20} {forecast_auc:<10.3f} {forecast_recall:<10.3f}")