In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from datetime import datetime

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Initialize Spark Session

In [None]:
# Build (or reuse) a session against the standalone Spark master
spark = (
    SparkSession.builder
    .master("spark://spark-spark-1:7077")
    .appName("Inference_Analysis")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Echo the version and master the session reports
print(f"Spark Version: {spark.version}")
print(f"Spark Master: {spark.sparkContext.master}")

## 2. Define LSTM Model Architecture

Must match the architecture used during training.

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a two-layer head that emits one sigmoid score.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer's hidden state.
        fc_size: width of the intermediate fully connected layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout rate used between LSTM layers (only when
            ``num_layers > 1``) and after the first fully connected layer.
    """

    def __init__(self, input_size, hidden_size=64, fc_size=32, num_layers=2, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Inter-layer dropout only exists when layers are stacked, so pass 0
        # for a single-layer LSTM.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

        # Classification head. Attribute names are kept as-is because they
        # form the state_dict keys of saved checkpoints.
        self.fc1 = nn.Linear(hidden_size, fc_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(fc_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid score per input sequence, shape ``(batch, 1)``.

        ``x`` is passed to an LSTM built with ``batch_first=True``.
        """
        # Only the final hidden states are needed; the per-step outputs and
        # the cell states are discarded.
        _, (final_hidden, _) = self.lstm(x)

        # Final hidden state of the top LSTM layer
        top_hidden = final_hidden[-1]

        # fc1 -> ReLU -> dropout -> fc2 -> sigmoid
        hidden = self.dropout(self.relu(self.fc1(top_hidden)))
        return self.sigmoid(self.fc2(hidden))

## 3. Load Labeled Data from HDFS

In [None]:
# Read the labeled parquet dataset from HDFS
labeled_path = "hdfs://namenode:9000/user/airflow/weather_data/labeled"
labeled_df = spark.read.parquet(labeled_path)

# Overview: row count and column names
print(f"Total records: {labeled_df.count()}")
print(f"Columns: {labeled_df.columns}")

# Schema, then the first few rows
labeled_df.printSchema()
labeled_df.show(5)

## 4. Prepare Test Data

Use the most recent 20% of data for testing.

In [None]:
# Convert to pandas for easier manipulation
df = labeled_df.toPandas()

if df.empty:
    raise ValueError("Labeled dataset is empty; nothing to split or evaluate.")

# Sort chronologically (district only breaks ties within a day) so the
# positional slice below is a split by date. Sorting by district first would
# put whole districts in the test set instead of the most recent dates.
# NOTE(review): confirm the training pipeline used the same chronological
# split, otherwise test rows may overlap with the training data.
df = df.sort_values(['date', 'district'])

# Split by date (first 80% of rows = train, most recent 20% = test)
train_size = int(len(df) * 0.8)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

print(f"Train size: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
print(f"Test size: {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")
print(f"\nTest date range: {test_df['date'].min()} to {test_df['date'].max()}")

## 5. Create Sequences for LSTM

In [None]:
def create_sequences(data, target_col, sequence_length=30):
    """
    Build sliding-window sequences per district for LSTM prediction.

    For each district (rows ordered by ``date``), every window of
    ``sequence_length`` consecutive rows becomes one sample and the value of
    ``target_col`` on the row immediately after the window becomes its label.
    Windows never span two districts.

    Args:
        data: pandas DataFrame containing ``date``, ``district``,
            ``target_col`` and the feature columns.
        target_col: name of the label column to predict.
        sequence_length: number of consecutive rows per input window.

    Returns:
        Tuple ``(X, y, feature_cols)`` where ``X`` has shape
        ``(n_samples, sequence_length, n_features)``, ``y`` has shape
        ``(n_samples,)`` and ``feature_cols`` lists the feature column names
        in the order used along the last axis of ``X``. When no district has
        more than ``sequence_length`` rows, ``X`` is an empty array that still
        has shape ``(0, sequence_length, n_features)``.
    """
    # Select feature columns (exclude date, district, and target columns)
    exclude_cols = ['date', 'district', 'heatwave_label', 'flood_label']
    feature_cols = [col for col in data.columns if col not in exclude_cols]

    X, y = [], []

    # One pass over the frame; sort=False keeps districts in order of first
    # appearance, the same order data['district'].unique() would give.
    for _, district_data in data.groupby('district', sort=False):
        district_data = district_data.sort_values('date')

        features = district_data[feature_cols].values
        targets = district_data[target_col].values

        # Window [i, i + sequence_length) predicts the row right after it
        for i in range(len(features) - sequence_length):
            X.append(features[i:i + sequence_length])
            y.append(targets[i + sequence_length])

    if not X:
        # Keep the 3-D layout so callers can still read X.shape[2]
        empty_X = np.empty((0, sequence_length, len(feature_cols)))
        return empty_X, np.empty((0,)), feature_cols

    return np.array(X), np.array(y), feature_cols

# Window length shared by both targets
seq_len = 30

# Heatwave target: windows, labels, and the feature column order
X_test_heat, y_test_heat, feature_cols = create_sequences(
    test_df, 'heatwave_label', sequence_length=seq_len
)

# Flood target: same windows, different label column
X_test_flood, y_test_flood, _ = create_sequences(
    test_df, 'flood_label', sequence_length=seq_len
)

# Shape and class-balance summary
print(f"Test sequences shape: {X_test_heat.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")
print(f"\nHeatwave labels - 0: {(y_test_heat==0).sum()}, 1: {(y_test_heat==1).sum()}")
print(f"Flood labels - 0: {(y_test_flood==0).sum()}, 1: {(y_test_flood==1).sum()}")

## 6. Load Trained Models from HDFS

In [None]:
# Check if models exist in HDFS
import subprocess

# List the model directory by running the HDFS CLI inside the namenode container
result = subprocess.run(
    ['docker', 'exec', 'namenode', 'hdfs', 'dfs', '-ls', '/user/airflow/models/'],
    capture_output=True,
    text=True
)
print("Models in HDFS:")
print(result.stdout)

# Report a failed listing (missing directory, container not running, ...)
# rather than showing only an empty result.
if result.returncode != 0:
    print(f"Listing failed (exit code {result.returncode}):")
    print(result.stderr)

In [None]:
# Download models from HDFS
import subprocess
import os

# Create local models directory
os.makedirs('models', exist_ok=True)

# Each model is copied HDFS -> namenode container /tmp -> local models/.
# check=True raises CalledProcessError on a failed step, so the success
# message below is only printed when every copy worked.
# '-f' lets 'hdfs dfs -get' overwrite the /tmp copy left by a previous run,
# which keeps this cell re-runnable.
for model_file in ('heatwave_lstm.pt', 'flood_lstm.pt'):
    subprocess.run(
        [
            'docker', 'exec', 'namenode', 'hdfs', 'dfs', '-get', '-f',
            f'/user/airflow/models/{model_file}',
            f'/tmp/{model_file}',
        ],
        check=True,
    )
    subprocess.run(
        ['docker', 'cp', f'namenode:/tmp/{model_file}', 'models/'],
        check=True,
    )

print("✅ Models downloaded successfully!")

In [None]:
# Load models
input_size = X_test_heat.shape[2]  # Number of features

# The files are passed straight to load_state_dict, so they are plain
# state_dicts. weights_only=True stops torch.load from unpickling arbitrary
# objects, and map_location='cpu' lets weights saved on a GPU load on a
# CPU-only host.

# Heatwave model
heatwave_model = LSTMModel(input_size=input_size, hidden_size=64, fc_size=32)
heatwave_model.load_state_dict(
    torch.load('models/heatwave_lstm.pt', map_location='cpu', weights_only=True)
)
heatwave_model.eval()  # disable dropout for inference
print("✅ Heatwave model loaded")

# Flood model
flood_model = LSTMModel(input_size=input_size, hidden_size=64, fc_size=32)
flood_model.load_state_dict(
    torch.load('models/flood_lstm.pt', map_location='cpu', weights_only=True)
)
flood_model.eval()  # disable dropout for inference
print("✅ Flood model loaded")

## 7. Run Inference on Test Data

In [None]:
# Convert test data to float32 tensors
X_test_heat_tensor = torch.as_tensor(X_test_heat, dtype=torch.float32)
X_test_flood_tensor = torch.as_tensor(X_test_flood, dtype=torch.float32)

# Run inference
with torch.no_grad():
    # squeeze(-1) drops only the trailing output dimension. A bare squeeze()
    # would also collapse a batch of one into a 0-d array and break the
    # len() calls below.

    # Heatwave predictions
    heatwave_probs = heatwave_model(X_test_heat_tensor).squeeze(-1).numpy()
    heatwave_preds = (heatwave_probs >= 0.5).astype(int)

    # Flood predictions
    flood_probs = flood_model(X_test_flood_tensor).squeeze(-1).numpy()
    flood_preds = (flood_probs >= 0.5).astype(int)

print("✅ Inference completed!")
print(f"\nHeatwave predictions: {heatwave_preds.sum()} positive out of {len(heatwave_preds)}")
print(f"Flood predictions: {flood_preds.sum()} positive out of {len(flood_preds)}")

## 8. Evaluation Metrics

In [None]:
# labels=[0, 1] pins both classes so the two target_names always line up,
# even when a class is missing from the test labels and predictions (without
# it classification_report raises on the name/label count mismatch).
# zero_division=0 reports 0 for undefined precision/recall instead of warning.

# Heatwave model evaluation
print("="*60)
print("HEATWAVE MODEL PERFORMANCE")
print("="*60)
print("\nClassification Report:")
print(classification_report(
    y_test_heat, heatwave_preds,
    labels=[0, 1], target_names=['No Heatwave', 'Heatwave'], zero_division=0
))

# ROC AUC is undefined when only one class is present in the true labels
if len(np.unique(y_test_heat)) > 1:
    roc_auc = roc_auc_score(y_test_heat, heatwave_probs)
    print(f"ROC AUC Score: {roc_auc:.4f}")

print("\n" + "="*60)
print("FLOOD MODEL PERFORMANCE")
print("="*60)
print("\nClassification Report:")
print(classification_report(
    y_test_flood, flood_preds,
    labels=[0, 1], target_names=['No Flood', 'Flood'], zero_division=0
))

# ROC AUC is undefined when only one class is present in the true labels
if len(np.unique(y_test_flood)) > 1:
    roc_auc = roc_auc_score(y_test_flood, flood_probs)
    print(f"ROC AUC Score: {roc_auc:.4f}")

## 9. Confusion Matrices

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# labels=[0, 1] forces a 2x2 matrix even when a class is absent from the test
# data, so the matrix always matches the two tick labels.

# Heatwave confusion matrix
cm_heat = confusion_matrix(y_test_heat, heatwave_preds, labels=[0, 1])
sns.heatmap(cm_heat, annot=True, fmt='d', cmap='Reds', ax=axes[0],
            xticklabels=['No Heatwave', 'Heatwave'],
            yticklabels=['No Heatwave', 'Heatwave'])
axes[0].set_title('Heatwave Prediction - Confusion Matrix', fontsize=14, fontweight='bold')
axes[0].set_ylabel('True Label')
axes[0].set_xlabel('Predicted Label')

# Flood confusion matrix
cm_flood = confusion_matrix(y_test_flood, flood_preds, labels=[0, 1])
sns.heatmap(cm_flood, annot=True, fmt='d', cmap='Blues', ax=axes[1],
            xticklabels=['No Flood', 'Flood'],
            yticklabels=['No Flood', 'Flood'])
axes[1].set_title('Flood Prediction - Confusion Matrix', fontsize=14, fontweight='bold')
axes[1].set_ylabel('True Label')
axes[1].set_xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. ROC Curves

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_heat, ax_flood = axes

# Heatwave panel -- left empty when the test labels contain a single class
if len(np.unique(y_test_heat)) > 1:
    fpr_heat, tpr_heat, _ = roc_curve(y_test_heat, heatwave_probs)
    roc_auc_heat = roc_auc_score(y_test_heat, heatwave_probs)

    ax_heat.plot(fpr_heat, tpr_heat, color='red', lw=2,
                 label=f'ROC curve (AUC = {roc_auc_heat:.3f})')
    # Diagonal reference line for a random classifier
    ax_heat.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Random')
    ax_heat.set(
        xlim=[0.0, 1.0],
        ylim=[0.0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
    )
    ax_heat.set_title('Heatwave Prediction - ROC Curve', fontsize=14, fontweight='bold')
    ax_heat.legend(loc='lower right')
    ax_heat.grid(alpha=0.3)

# Flood panel -- same layout, same single-class guard
if len(np.unique(y_test_flood)) > 1:
    fpr_flood, tpr_flood, _ = roc_curve(y_test_flood, flood_probs)
    roc_auc_flood = roc_auc_score(y_test_flood, flood_probs)

    ax_flood.plot(fpr_flood, tpr_flood, color='blue', lw=2,
                  label=f'ROC curve (AUC = {roc_auc_flood:.3f})')
    # Diagonal reference line for a random classifier
    ax_flood.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Random')
    ax_flood.set(
        xlim=[0.0, 1.0],
        ylim=[0.0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
    )
    ax_flood.set_title('Flood Prediction - ROC Curve', fontsize=14, fontweight='bold')
    ax_flood.legend(loc='lower right')
    ax_flood.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Prediction Probability Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per model: (axis, probabilities, true labels, event name, positive-class colour)
panel_specs = [
    (axes[0], heatwave_probs, y_test_heat, 'Heatwave', 'red'),
    (axes[1], flood_probs, y_test_flood, 'Flood', 'blue'),
]

for ax, probs, y_true, event, pos_color in panel_specs:
    # Overlaid histograms of predicted probability, split by the true class
    ax.hist(probs[y_true == 0], bins=50, alpha=0.6, label=f'No {event}', color='green')
    ax.hist(probs[y_true == 1], bins=50, alpha=0.6, label=event, color=pos_color)
    # Decision threshold used to turn probabilities into predictions
    ax.axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Threshold')
    ax.set_xlabel('Predicted Probability')
    ax.set_ylabel('Frequency')
    ax.set_title(f'{event} Prediction Probabilities', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('probability_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 12. Sample Predictions with Actual Labels

In [None]:
# Per-sample table: actual label, predicted label and probability for each model
result_columns = {
    'Actual_Heatwave': y_test_heat,
    'Predicted_Heatwave': heatwave_preds,
    'Heatwave_Probability': heatwave_probs,
    'Actual_Flood': y_test_flood,
    'Predicted_Flood': flood_preds,
    'Flood_Probability': flood_probs,
}
results_df = pd.DataFrame(result_columns)

# Preview the first 20 rows
banner = "=" * 80
print("\n" + banner)
print("SAMPLE PREDICTIONS")
print(banner)
print(results_df.head(20))

# Write the full table to disk
results_df.to_csv('inference_results.csv', index=False)
print("\n✅ Results saved to 'inference_results.csv'")

## 13. False Positives and False Negatives Analysis

In [None]:
rule = "=" * 60

# Heatwave: indices of false positives (true 0, predicted 1) and
# false negatives (true 1, predicted 0)
heat_fp = np.flatnonzero((y_test_heat == 0) & (heatwave_preds == 1))
heat_fn = np.flatnonzero((y_test_heat == 1) & (heatwave_preds == 0))
n_heat = len(y_test_heat)

print("HEATWAVE MODEL ERROR ANALYSIS")
print(rule)
print(f"False Positives: {len(heat_fp)} ({len(heat_fp)/n_heat*100:.2f}%)")
print(f"False Negatives: {len(heat_fn)} ({len(heat_fn)/n_heat*100:.2f}%)")

# Mean predicted probability of each error type, when any exist
if len(heat_fp) > 0:
    print(f"\nFalse Positive average probability: {heatwave_probs[heat_fp].mean():.3f}")
if len(heat_fn) > 0:
    print(f"False Negative average probability: {heatwave_probs[heat_fn].mean():.3f}")

# Flood: same breakdown
flood_fp = np.flatnonzero((y_test_flood == 0) & (flood_preds == 1))
flood_fn = np.flatnonzero((y_test_flood == 1) & (flood_preds == 0))
n_flood = len(y_test_flood)

print("\n" + rule)
print("FLOOD MODEL ERROR ANALYSIS")
print(rule)
print(f"False Positives: {len(flood_fp)} ({len(flood_fp)/n_flood*100:.2f}%)")
print(f"False Negatives: {len(flood_fn)} ({len(flood_fn)/n_flood*100:.2f}%)")

if len(flood_fp) > 0:
    print(f"\nFalse Positive average probability: {flood_probs[flood_fp].mean():.3f}")
if len(flood_fn) > 0:
    print(f"False Negative average probability: {flood_probs[flood_fn].mean():.3f}")

## 14. Export Results Summary

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# (true labels, hard predictions) per model, in the order of the 'Model' column
eval_pairs = [
    (y_test_heat, heatwave_preds),
    (y_test_flood, flood_preds),
]

# One row per model, one column per metric
summary = {
    'Model': ['Heatwave', 'Flood'],
    'Accuracy': [accuracy_score(truth, pred) for truth, pred in eval_pairs],
    'Precision': [precision_score(truth, pred, zero_division=0) for truth, pred in eval_pairs],
    'Recall': [recall_score(truth, pred, zero_division=0) for truth, pred in eval_pairs],
    'F1-Score': [f1_score(truth, pred, zero_division=0) for truth, pred in eval_pairs],
}

# The ROC-AUC column is added only when both targets contain both classes
both_have_two_classes = (
    len(np.unique(y_test_heat)) > 1 and len(np.unique(y_test_flood)) > 1
)
if both_have_two_classes:
    summary['ROC-AUC'] = [
        roc_auc_score(y_test_heat, heatwave_probs),
        roc_auc_score(y_test_flood, flood_probs),
    ]

summary_df = pd.DataFrame(summary)
divider = "=" * 80
print("\n" + divider)
print("MODEL PERFORMANCE SUMMARY")
print(divider)
print(summary_df.to_string(index=False))

# Write the summary table to disk
summary_df.to_csv('model_performance_summary.csv', index=False)
print("\n✅ Summary saved to 'model_performance_summary.csv'")

## 15. Cleanup

In [None]:
# Stop the Spark session now that the analysis is finished
spark.stop()
stop_message = "✅ Spark session stopped"
print(stop_message)