In [48]:
from google.colab import drive
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datetime import datetime, timedelta
import requests
import os
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium import plugins

In [49]:
def fetch_earthquake_data(timeframe='day', min_magnitude=2.5):
    """
    Fetch earthquake data from the USGS GeoJSON summary feed.

    Parameters:
    timeframe (str): 'hour', 'day', 'week', or 'month'
    min_magnitude (float): Minimum earthquake magnitude to include. The value
        is interpolated verbatim into the feed name (2.5 -> "2.5_day.geojson").

    Returns:
    pandas DataFrame with one row per event, sorted newest first. When the feed
    contains no events an empty DataFrame with the same columns is returned.
    None is returned when the HTTP request fails.
    """
    columns = ['time', 'magnitude', 'place', 'longitude', 'latitude',
               'depth', 'type', 'alert', 'tsunami', 'sig']

    # Define base URL for USGS GeoJSON feed and append the feed name
    base_url = "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/"
    url = base_url + f"{min_magnitude}_{timeframe}.geojson"

    try:
        print(f"Fetching data from {url}")
        # A timeout keeps a stalled connection from hanging the notebook forever
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

    processed_data = []
    for quake in data.get('features', []):
        properties = quake['properties']
        coordinates = quake['geometry']['coordinates']

        processed_data.append({
            # Feed timestamps are epoch milliseconds; fromtimestamp() produces a
            # naive datetime expressed in this machine's local timezone
            'time': datetime.fromtimestamp(properties['time'] / 1000),
            'magnitude': properties['mag'],
            'place': properties['place'],
            'longitude': coordinates[0],
            'latitude': coordinates[1],
            'depth': coordinates[2],
            'type': properties['type'],
            # The key is present with a null value when no alert was issued, so
            # dict.get's default alone would never apply
            'alert': properties.get('alert') or 'none',
            'tsunami': properties['tsunami'],
            'sig': properties['sig']  # Significance value
        })

    df = pd.DataFrame(processed_data, columns=columns)

    if df.empty:
        # Nothing to sort or summarize; hand back the empty, correctly-shaped frame
        print("\nNo earthquakes returned by the feed.")
        return df

    # Newest events first
    df = df.sort_values('time', ascending=False)

    # Print confirmation of successful data collection
    print("\nData Collection Summary:")
    print("-" * 30)
    print(f"Total earthquakes collected: {len(df)}")
    print(f"Date range: {df['time'].min()} to {df['time'].max()}")
    print(f"Magnitude range: {df['magnitude'].min():.1f} to {df['magnitude'].max():.1f}")
    print("-" * 30)

    return df

# Smoke-test the fetcher on a small sample
if __name__ == "__main__":
    # Fetch the last day's earthquakes of magnitude 2.5 or greater
    df = fetch_earthquake_data(timeframe='day', min_magnitude=2.5)

    # fetch_earthquake_data returns None when the request failed
    if df is not None:
        # Display the first few rows
        print("\nSample of collected data:")
        print("-" * 30)
        print(df.head())

        # Display data types and non-null counts
        print("\nDataset Information:")
        print("-" * 30)
        # DataFrame.info() writes its report itself and returns None, so wrapping
        # it in print() would append a stray "None" line to the output
        df.info()

Fetching data from https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson

Data Collection Summary:
------------------------------
Total earthquakes collected: 26
Date range: 2024-11-16 22:17:04.400000 to 2024-11-17 19:18:44.983000
Magnitude range: 2.6 to 6.1
------------------------------

Sample of collected data:
------------------------------
                     time  magnitude                                   place  \
0 2024-11-17 19:18:44.983        4.4           254 km S of Lembar, Indonesia   
1 2024-11-17 19:02:56.160        4.4     96 km SSW of Alo, Wallis and Futuna   
2 2024-11-17 18:55:37.688        5.2  122 km ESE of Kokopo, Papua New Guinea   
3 2024-11-17 18:07:09.067        4.2                               Banda Sea   
4 2024-11-17 15:42:33.864        3.2          44 km WNW of Ninilchik, Alaska   

   longitude  latitude    depth        type alert  tsunami  sig  
0   116.3326  -11.0094   10.000  earthquake  None        0  298  
1  -178.3951  -15.1

In [50]:
def setup_drive_directory(base_path='earthquake_data'):
    """Mount Google Drive and make sure the working folder exists.

    Parameters:
    base_path (str): Folder name (relative to "My Drive") used for all outputs

    Returns:
    str: Absolute path of the folder inside the mounted drive
    """
    drive.mount('/content/drive')
    full_path = f'/content/drive/My Drive/{base_path}'

    # Report an existing folder and leave it untouched
    if os.path.exists(full_path):
        print(f"Directory already exists: {full_path}")
        return full_path

    os.makedirs(full_path)
    print(f"Created directory: {full_path}")
    return full_path

In [51]:
class EarthquakeDataset(Dataset):
    """Sliding-window dataset that pairs a run of events with a later event.

    Sample ``idx`` consists of the feature rows ``idx .. idx + seq_length - 1``
    and the target row located ``horizon`` steps after the end of that window.

    Parameters:
    features: Indexable sequence of per-event feature rows (e.g. a 2-D tensor)
    targets: Indexable sequence of per-event target rows, aligned with features
    seq_length (int): Number of consecutive events in each input window
    horizon (int): How many events past the window the target lies (default 1,
        i.e. the event immediately following the window)
    """

    def __init__(self, features, targets, seq_length, horizon=1):
        if horizon < 1:
            raise ValueError("horizon must be at least 1")
        self.features = features
        self.targets = targets
        self.seq_length = seq_length
        self.horizon = horizon

    def __len__(self):
        # Every sample needs seq_length input rows plus `horizon` rows after them
        return max(0, len(self.features) - self.seq_length - self.horizon + 1)

    def __getitem__(self, idx):
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} is out of range for dataset of size {size}")

        window_end = idx + self.seq_length
        feature_seq = self.features[idx:window_end]
        # The target is the event AFTER the window. Using the window's own last
        # row would hand the model its answer, because the target columns are
        # also part of the input features.
        target = self.targets[window_end + self.horizon - 1]
        return feature_seq, target

In [52]:
class TransformerPredictor(nn.Module):
    """Transformer encoder that maps a sequence of events to one output vector.

    Parameters:
    input_dim (int): Number of features per event
    hidden_dim (int): Width of the feed-forward sub-layer in each encoder layer
    num_layers (int): Number of stacked encoder layers
    num_heads (int): Number of attention heads
    output_dim (int): Size of the prediction vector
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_heads, output_dim):
        super().__init__()

        # Model width: the next multiple of num_heads strictly above
        # input_dim // num_heads * num_heads (so an input_dim that is already a
        # multiple is still bumped up by one more num_heads). Kept as-is so the
        # layer shapes of previously saved checkpoints stay loadable.
        self.adjusted_dim = ((input_dim // num_heads) + 1) * num_heads

        # Initial projection to make the model width divisible by num_heads
        self.input_projection = nn.Linear(input_dim, self.adjusted_dim)

        # Learned per-event residual transform. It depends only on the event's
        # content, not on where the event sits in the sequence.
        self.pos_encoder = nn.Sequential(
            nn.Linear(self.adjusted_dim, self.adjusted_dim),
            nn.ReLU()
        )

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=self.adjusted_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim,
                dropout=0.1,
                batch_first=True  # Important: handle batch dimension first
            ),
            num_layers=num_layers
        )

        self.output_layer = nn.Linear(self.adjusted_dim, output_dim)

    def _sinusoidal_positions(self, seq_length, reference):
        """Build a [seq_length, adjusted_dim] sine/cosine position table on reference's device/dtype."""
        positions = torch.arange(seq_length, dtype=torch.float32, device=reference.device).unsqueeze(1)
        even_dims = torch.arange(0, self.adjusted_dim, 2, dtype=torch.float32, device=reference.device)
        inv_freq = torch.pow(10000.0, -even_dims / self.adjusted_dim)
        angles = positions * inv_freq

        table = torch.zeros(seq_length, self.adjusted_dim, dtype=torch.float32, device=reference.device)
        table[:, 0::2] = torch.sin(angles)
        # For an odd width there is one fewer cosine slot than sine slot
        table[:, 1::2] = torch.cos(angles[:, : self.adjusted_dim // 2])
        return table.to(reference.dtype)

    def forward(self, x):
        """Predict from the final position of each sequence.

        Parameters:
        x: Tensor of shape [batch_size, seq_length, input_dim]

        Returns:
        Tensor of shape [batch_size, output_dim]

        Raises:
        ValueError: if x is not a 3-D tensor
        """
        if x.dim() != 3:
            raise ValueError(
                f"Expected input of shape [batch_size, seq_length, input_dim], got {tuple(x.shape)}"
            )

        # Project input to adjusted dimension
        x = self.input_projection(x)

        # Content-based residual (learned)
        x = x + self.pos_encoder(x)

        # True positional signal. Self-attention without it is order-agnostic,
        # so the prediction could not depend on the order of earlier events.
        # The table has no parameters, so state_dict keys are unchanged.
        x = x + self._sinusoidal_positions(x.size(1), x)

        # Apply transformer
        x = self.transformer(x)

        # Take the last sequence element of each batch item and project it
        return self.output_layer(x[:, -1, :])

In [53]:
class EarthquakePipeline:
    def __init__(self, drive_path=None, seq_length=3, prediction_horizon=1):
        if drive_path is None:
            self.drive_path = setup_drive_directory()
        else:
            self.drive_path = drive_path
        self.seq_length = seq_length
        self.prediction_horizon = prediction_horizon
        self.feature_scaler = MinMaxScaler()
        self.target_scaler = MinMaxScaler()
        self.model = None
        self.feature_columns = ['magnitude', 'latitude', 'longitude', 'depth', 'sig']
        self.target_columns = ['magnitude', 'latitude', 'longitude']

    def fetch_historical_week(self):
        """Fetch the last 7 days of data"""
        all_data = []
        end_date = datetime.now()
        start_date = end_date - timedelta(days=7)

        df = fetch_earthquake_data(timeframe='week', min_magnitude=2.5)
        if df is not None:
            df = df[df['time'].between(start_date, end_date)]

            for i in range(7):
                day = end_date - timedelta(days=i)
                day_next = day + timedelta(days=1)
                day_data = df[df['time'].between(day, day_next)]

                filename = f'earthquake_data_{day.strftime("%Y%m%d")}.csv'
                filepath = os.path.join(self.drive_path, filename)
                day_data.to_csv(filepath, index=False)

                all_data.append(day_data)

        return all_data if all_data else None

    def split_validation_data(self, data_list):
        """Split the data into yesterday's data and training data with proper handling"""
        if not data_list or len(data_list) < 7:
            raise ValueError("Insufficient data for splitting")

        # Ensure data_list contains DataFrames and they're not empty
        data_list = [df for df in data_list if not df.empty]
        if not data_list:
            raise ValueError("No valid data found in data_list")

        # Yesterday's data is the first element (most recent)
        validation_data = data_list[0]

        # Training data is the rest (older data)
        training_data = pd.concat(data_list[1:])

        print(f"\nData Split Summary:")
        print(f"Training data size: {len(training_data)} events")
        print(f"Validation data size: {len(validation_data)} events")

        return validation_data, training_data

    def prepare_data(self, df):
      """Prepare data for the model with proper sequence handling"""
      df = df.sort_values('time')

      # Extract features and targets
      features = df[self.feature_columns].values
      targets = df[self.target_columns].values

      # Scale features and targets separately
      scaled_features = self.feature_scaler.fit_transform(features)
      scaled_targets = self.target_scaler.fit_transform(targets)

      # Convert to PyTorch tensors
      features_tensor = torch.FloatTensor(scaled_features)
      targets_tensor = torch.FloatTensor(scaled_targets)

      # Create dataset
      dataset = EarthquakeDataset(features_tensor, targets_tensor, self.seq_length)

      # Verify we have enough data
      if len(dataset) < 1:
          raise ValueError("Not enough data points to create sequences")

      return dataset

    def train_model(self, dataset, epochs=100, batch_size=32, learning_rate=0.001):
      """Train the transformer model with proper batch and sequence handling"""
      input_dim = len(self.feature_columns)
      output_dim = len(self.target_columns)

      if self.model is None:
          self.model = TransformerPredictor(
              input_dim=input_dim,
              hidden_dim=64,
              num_layers=2,
              num_heads=8,
              output_dim=output_dim
          )

      # Adjust batch size if dataset is small
      batch_size = min(batch_size, len(dataset))

      # Create data loader with collate function
      def collate_fn(batch):
          # Separate features and targets
          features = [item[0] for item in batch]
          targets = [item[1] for item in batch]

          # Stack them into tensors
          feature_tensor = torch.stack(features)
          target_tensor = torch.stack(targets)

          return feature_tensor, target_tensor

      dataloader = DataLoader(
          dataset,
          batch_size=batch_size,
          shuffle=True,
          collate_fn=collate_fn,
          drop_last=True
      )

      if len(dataloader) == 0:
          raise ValueError("Not enough data to create batches")

      optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
      criterion = nn.MSELoss()

      print(f"\nTraining with:")
      print(f"Batch size: {batch_size}")
      print(f"Sequence length: {self.seq_length}")
      print(f"Number of batches per epoch: {len(dataloader)}")
      print(f"Input dimension: {input_dim}")
      print(f"Output dimension: {output_dim}")

      best_loss = float('inf')
      patience = 5
      patience_counter = 0

      for epoch in range(epochs):
          total_loss = 0
          batch_count = 0
          self.model.train()

          for batch_features, batch_targets in dataloader:
              # Print shapes for debugging in first epoch
              if epoch == 0 and batch_count == 0:
                  print(f"\nBatch shapes:")
                  print(f"Features: {batch_features.shape}")
                  print(f"Targets: {batch_targets.shape}")

              optimizer.zero_grad()

              # Forward pass
              predictions = self.model(batch_features)

              # Compute loss
              loss = criterion(predictions, batch_targets)

              # Backward pass
              loss.backward()
              optimizer.step()

              total_loss += loss.item()
              batch_count += 1

          if batch_count == 0:
              print("No valid batches in epoch")
              continue

          avg_loss = total_loss / batch_count

          if (epoch + 1) % 10 == 0:
              print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

          if avg_loss < best_loss:
              best_loss = avg_loss
              patience_counter = 0
          else:
              patience_counter += 1
              if patience_counter >= patience:
                  print(f"Early stopping at epoch {epoch+1}")
                  break

    def optimize_model(self, validation_data, current_metrics, learning_rate=0.0001):
        """Optimize model based on validation performance"""
        print("\nOptimizing model based on validation results...")

        val_dataset = self.prepare_data(validation_data)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()

        best_loss = float('inf')
        patience = 3
        patience_counter = 0

        for epoch in range(50):
            total_loss = 0
            self.model.train()

            dataloader = DataLoader(val_dataset, batch_size=1, shuffle=True)

            for batch_features, batch_targets in dataloader:
                optimizer.zero_grad()
                predictions = self.model(batch_features)
                loss = criterion(predictions, batch_targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)

            if avg_loss < best_loss:
                best_loss = best_loss
                patience_counter = 0
                print(f"Optimization Epoch {epoch+1}: Loss improved to {avg_loss:.4f}")
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping optimization")
                    break

        return best_loss

    def predict_for_validation(self, training_data):
        """Generate predictions for validation period with error handling"""
        self.model.eval()
        try:
            with torch.no_grad():
                # Get the most recent sequence of data
                recent_data = training_data.sort_values('time').tail(self.seq_length)

                if len(recent_data) < self.seq_length:
                    raise ValueError(f"Insufficient data for sequence length {self.seq_length}")

                print(f"\nPrediction Data Summary:")
                print(f"Using {len(recent_data)} most recent events for prediction")
                print(f"Time range: {recent_data['time'].min()} to {recent_data['time'].max()}")

                scaled_features = self.feature_scaler.transform(recent_data[self.feature_columns].values)
                features_tensor = torch.FloatTensor(scaled_features)

                # Add batch dimension
                features_tensor = features_tensor.unsqueeze(0)

                # Get predictions
                predictions = self.model(features_tensor)
                unscaled_predictions = self.target_scaler.inverse_transform(predictions.numpy())

                return pd.DataFrame(
                    unscaled_predictions,
                    columns=['predicted_magnitude', 'predicted_latitude', 'predicted_longitude']
                )

        except Exception as e:
            print(f"Error in prediction: {str(e)}")
            return pd.DataFrame()

    def predict_next_day(self, recent_data):
        """Generate predictions for the next day"""
        self.model.eval()
        with torch.no_grad():
            recent_data = recent_data.sort_values('time').tail(self.seq_length)
            scaled_features = self.feature_scaler.transform(recent_data[self.feature_columns].values)
            features_tensor = torch.FloatTensor(scaled_features)

            predictions = self.model(features_tensor.unsqueeze(0))
            unscaled_predictions = self.target_scaler.inverse_transform(predictions.numpy())

            prediction_df = pd.DataFrame(
                unscaled_predictions,
                columns=['predicted_magnitude', 'predicted_latitude', 'predicted_longitude']
            )

            prediction_df['predicted_time'] = datetime.now() + timedelta(days=1)

            timestamp = datetime.now().strftime('%Y%m%d')
            prediction_file = os.path.join(self.drive_path, f'prediction_{timestamp}.csv')
            prediction_df.to_csv(prediction_file, index=False)

            return prediction_df

    def visualize_predictions(self, predictions, actual_data):
        """Create comprehensive visualizations of predictions vs actual values

        Writes three timestamped artifacts into ``<drive_path>/visualizations``:
        a PNG with comparison/error plots, an HTML folium map, and a CSV with
        summary error statistics.

        Parameters:
        predictions (DataFrame): Must contain 'predicted_magnitude',
            'predicted_latitude' and 'predicted_longitude'
        actual_data (DataFrame): Must contain 'magnitude', 'latitude' and
            'longitude'. Errors are computed by element-wise subtraction of the
            two frames' value arrays, so the row counts have to be compatible.

        Returns:
        dict with the paths of the saved files under the keys 'static_plot',
        'interactive_map' and 'summary'
        """
        # Create directory for visualizations if it doesn't exist
        vis_dir = os.path.join(self.drive_path, 'visualizations')
        os.makedirs(vis_dir, exist_ok=True)

        # 1. Static Plots
        # The figure handle is not used again; the pyplot calls below act on
        # the current figure
        fig = plt.figure(figsize=(15, 10))

        # Magnitude Comparison (top-left panel)
        plt.subplot(2, 2, 1)
        plt.title('Earthquake Magnitude: Predicted vs Actual')
        plt.plot(actual_data['magnitude'].values, label='Actual', marker='o')
        plt.plot(predictions['predicted_magnitude'].values, label='Predicted', marker='x')
        plt.xlabel('Event Index')
        plt.ylabel('Magnitude')
        plt.legend()
        plt.grid(True)

        # Error Distribution (top-right panel)
        plt.subplot(2, 2, 2)
        # Absolute per-event errors, paired by position (not by index label)
        errors = {
            'Magnitude Error': np.abs(predictions['predicted_magnitude'].values - actual_data['magnitude'].values),
            'Latitude Error': np.abs(predictions['predicted_latitude'].values - actual_data['latitude'].values),
            'Longitude Error': np.abs(predictions['predicted_longitude'].values - actual_data['longitude'].values)
        }

        sns.boxplot(data=pd.DataFrame(errors))
        plt.title('Prediction Error Distribution')
        plt.ylabel('Absolute Error')
        plt.xticks(rotation=45)

        # Time Series of Errors (bottom panel spanning the full width)
        plt.subplot(2, 1, 2)
        for col, err in errors.items():
            plt.plot(err, label=col, marker='o')
        plt.title('Prediction Errors Over Time')
        plt.xlabel('Event Index')
        plt.ylabel('Absolute Error')
        plt.legend()
        plt.grid(True)

        plt.tight_layout()

        # Save static plots
        # The same timestamp is reused for the map and summary file names below
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        static_plot_path = os.path.join(vis_dir, f'prediction_analysis_{timestamp}.png')
        plt.savefig(static_plot_path)
        plt.close()

        # 2. Interactive Map
        # Calculate center point for the map: the mean of the two column means
        center_lat = np.mean([actual_data['latitude'].mean(), predictions['predicted_latitude'].mean()])
        center_lon = np.mean([actual_data['longitude'].mean(), predictions['predicted_longitude'].mean()])

        # Create map
        m = folium.Map(location=[center_lat, center_lon], zoom_start=4)

        # Add actual earthquakes (blue markers)
        actual_fg = folium.FeatureGroup(name='Actual Earthquakes')
        for idx, row in actual_data.iterrows():
            folium.CircleMarker(
                location=[row['latitude'], row['longitude']],
                radius=row['magnitude'] * 2,  # Size based on magnitude
                popup=f"Actual: Mag {row['magnitude']:.1f}",
                color='blue',
                fill=True
            ).add_to(actual_fg)
        actual_fg.add_to(m)

        # Add predicted earthquakes (red markers)
        pred_fg = folium.FeatureGroup(name='Predicted Earthquakes')
        for idx, row in predictions.iterrows():
            folium.CircleMarker(
                location=[row['predicted_latitude'], row['predicted_longitude']],
                radius=row['predicted_magnitude'] * 2,
                popup=f"Predicted: Mag {row['predicted_magnitude']:.1f}",
                color='red',
                fill=True
            ).add_to(pred_fg)
        pred_fg.add_to(m)

        # Add connection lines between actual and predicted locations
        # Rows are paired by position, so actual_data needs at least
        # len(predictions) rows
        lines_fg = folium.FeatureGroup(name='Prediction Lines')
        for idx in range(len(predictions)):
            folium.PolyLine(
                locations=[
                    [actual_data.iloc[idx]['latitude'], actual_data.iloc[idx]['longitude']],
                    [predictions.iloc[idx]['predicted_latitude'], predictions.iloc[idx]['predicted_longitude']]
                ],
                weight=1,
                color='gray',
                opacity=0.5
            ).add_to(lines_fg)
        lines_fg.add_to(m)

        # Add layer control
        folium.LayerControl().add_to(m)

        # Save interactive map
        map_path = os.path.join(vis_dir, f'prediction_map_{timestamp}.html')
        m.save(map_path)

        print(f"\nVisualizations saved to:")
        print(f"1. Static plots: {static_plot_path}")
        print(f"2. Interactive map: {map_path}")

        # Create prediction summary
        # NOTE(review): the location error is the Euclidean distance in degrees
        # multiplied by 111. This treats a degree of longitude like a degree of
        # latitude and does not wrap at +/-180 degrees - confirm the
        # approximation is acceptable for the regions being evaluated.
        summary_df = pd.DataFrame({
            'Metric': ['Average Magnitude Error', 'Average Location Error (km)', 'Max Magnitude Error', 'Max Location Error (km)'],
            'Value': [
                np.mean(errors['Magnitude Error']),
                np.mean(np.sqrt(errors['Latitude Error']**2 + errors['Longitude Error']**2) * 111),
                np.max(errors['Magnitude Error']),
                np.max(np.sqrt(errors['Latitude Error']**2 + errors['Longitude Error']**2) * 111)
            ]
        })

        # Save summary
        summary_path = os.path.join(vis_dir, f'prediction_summary_{timestamp}.csv')
        summary_df.to_csv(summary_path, index=False)

        print(f"3. Summary statistics: {summary_path}")

        # Print summary to console
        print("\nPrediction Summary:")
        print("-" * 50)
        print(summary_df.to_string(index=False))

        return {
            'static_plot': static_plot_path,
            'interactive_map': map_path,
            'summary': summary_path
        }

    def evaluate_predictions(self, predictions, validation_data):
        """Enhanced evaluation with detailed statistics and visualizations"""
        if predictions.empty or validation_data.empty:
            print("Warning: Empty predictions or validation data")
            return None

        actual_data = validation_data[self.target_columns].head(len(predictions))

        if len(actual_data) == 0:
            print("Warning: No matching validation data found")
            return None

        # Calculate basic metrics
        mse = np.mean((actual_data.values - predictions.values) ** 2, axis=0)
        mae = np.mean(np.abs(actual_data.values - predictions.values), axis=0)

        # Calculate MAPE with handling for zero values
        mape = []
        for i in range(len(self.target_columns)):
            actual_vals = actual_data.values[:, i]
            pred_vals = predictions.values[:, i]
            valid_indices = actual_vals != 0
            if np.any(valid_indices):
                mape_val = np.mean(np.abs((actual_vals[valid_indices] - pred_vals[valid_indices]) /
                                        actual_vals[valid_indices])) * 100
            else:
                mape_val = np.nan
            mape.append(mape_val)

        # Create detailed comparison DataFrame
        comparison_df = pd.DataFrame({
            'Event_Time': validation_data['time'].head(len(predictions)),
            'Actual_Magnitude': actual_data['magnitude'],
            'Predicted_Magnitude': predictions['predicted_magnitude'],
            'Actual_Latitude': actual_data['latitude'],
            'Predicted_Latitude': predictions['predicted_latitude'],
            'Actual_Longitude': actual_data['longitude'],
            'Predicted_Longitude': predictions['predicted_longitude']
        })

        # Calculate errors
        comparison_df['Magnitude_Error'] = abs(comparison_df['Actual_Magnitude'] - comparison_df['Predicted_Magnitude'])
        comparison_df['Location_Error_km'] = np.sqrt(
            (comparison_df['Actual_Latitude'] - comparison_df['Predicted_Latitude'])**2 +
            (comparison_df['Actual_Longitude'] - comparison_df['Predicted_Longitude'])**2
        ) * 111  # Rough conversion to kilometers

        # Print detailed evaluation
        print("\nPrediction Evaluation:")
        print("-" * 50)
        print("\nSummary Statistics:")
        print(f"Number of predictions: {len(predictions)}")

        for i, feature in enumerate(self.target_columns):
            print(f"\n{feature.capitalize()} Metrics:")
            print(f"MSE: {mse[i]:.4f}")
            print(f"MAE: {mae[i]:.4f}")
            print(f"MAPE: {mape[i]:.2f}%")

        print("\nLocation Error Statistics (km):")
        print(comparison_df['Location_Error_km'].describe())

        # Generate visualizations
        self.visualize_predictions(predictions, actual_data)

        # Save detailed results
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        results_path = os.path.join(self.drive_path, f'prediction_results_{timestamp}.csv')
        comparison_df.to_csv(results_path, index=False)
        print(f"\nDetailed results saved to: {results_path}")

        return {
            'mse': mse,
            'mae': mae,
            'mape': mape,
            'comparison': comparison_df,
            'summary_stats': {
                'location_error_mean': comparison_df['Location_Error_km'].mean(),
                'location_error_median': comparison_df['Location_Error_km'].median(),
                'magnitude_error_mean': comparison_df['Magnitude_Error'].mean(),
                'magnitude_error_median': comparison_df['Magnitude_Error'].median()
            }
        }

    def save_model(self):
        """Save model and scalers to Google Drive"""
        model_path = os.path.join(self.drive_path, 'earthquake_model.pth')
        feature_scaler_path = os.path.join(self.drive_path, 'feature_scaler.pkl')
        target_scaler_path = os.path.join(self.drive_path, 'target_scaler.pkl')

        torch.save(self.model.state_dict(), model_path)
        with open(feature_scaler_path, 'wb') as f:
            pickle.dump(self.feature_scaler, f)
        with open(target_scaler_path, 'wb') as f:
            pickle.dump(self.target_scaler, f)

    def load_model(self):
        """Load model and scalers from Google Drive"""
        model_path = os.path.join(self.drive_path, 'earthquake_model.pth')
        feature_scaler_path = os.path.join(self.drive_path, 'feature_scaler.pkl')
        target_scaler_path = os.path.join(self.drive_path, 'target_scaler.pkl')

        if all(os.path.exists(p) for p in [model_path, feature_scaler_path, target_scaler_path]):
            self.model.load_state_dict(torch.load(model_path))
            with open(feature_scaler_path, 'rb') as f:
                self.feature_scaler = pickle.load(f)
            with open(target_scaler_path, 'rb') as f:
                self.target_scaler = pickle.load(f)

    def run_validation_pipeline(self):
        """Run the initial validation pipeline with comprehensive error handling"""
        try:
            print("1. Fetching historical data...")
            historical_data = self.fetch_historical_week()

            if not historical_data:
                raise ValueError("No historical data retrieved")

            print("2. Splitting data into validation and training sets...")
            validation_data, training_data = self.split_validation_data(historical_data)

            print("3. Preparing training data...")
            train_dataset = self.prepare_data(training_data)
            print(f"Dataset size: {len(train_dataset)} sequences")

            print("4. Training model...")
            self.train_model(train_dataset)

            print("5. Generating predictions...")
            predictions = self.predict_for_validation(training_data)

            if predictions.empty:
                raise ValueError("Failed to generate predictions")

            print("6. Evaluating predictions...")
            metrics = self.evaluate_predictions(predictions, validation_data)

            if metrics is None:
                raise ValueError("Failed to compute evaluation metrics")

            return predictions, validation_data, metrics

        except Exception as e:
            print(f"Error in validation pipeline: {str(e)}")
            return None, None, None

    def run_continuous_pipeline(self):
        """Run the continuous pipeline for iterative optimization and prediction"""
        try:
            print("\nFetching today's actual data...")
            today_data = fetch_earthquake_data(timeframe='day', min_magnitude=2.5)

            yesterday = datetime.now() - timedelta(days=1)
            prediction_file = os.path.join(
                self.drive_path,
                f'prediction_{yesterday.strftime("%Y%m%d")}.csv'
            )

            if os.path.exists(prediction_file):
                print("\nEvaluating yesterday's predictions...")
                yesterday_pred = pd.read_csv(prediction_file)

                # Evaluate prediction
                metrics = self.evaluate_predictions(yesterday_pred, today_data)

                # Optimize model based on performance
                print("\nOptimizing model...")
                optimization_result = self.optimize_model(today_data, metrics)

                print(f"\nModel optimization complete. Final loss: {optimization_result:.4f}")

            # Generate prediction for tomorrow
            print("\nGenerating prediction for tomorrow...")
            tomorrow_pred = self.predict_next_day(today_data)

            print("\nPrediction for tomorrow:")
            print(tomorrow_pred)

            # Save updated model
            self.save_model()

            return {
                'today_data': today_data,
                'yesterday_prediction': yesterday_pred if 'yesterday_pred' in locals() else None,
                'tomorrow_prediction': tomorrow_pred,
                'metrics': metrics if 'metrics' in locals() else None
            }

        except Exception as e:
            print(f"Error in continuous pipeline: {str(e)}")
            return None

    def run_complete_pipeline(self, is_initial=False):
        """Run either initial validation or continuous pipeline"""
        if is_initial:
            print("\nRunning initial validation pipeline...")
            return self.run_validation_pipeline()
        else:
            print("\nRunning continuous optimization pipeline...")
            return self.run_continuous_pipeline()

In [55]:
def run_pipeline():
    """Run the complete earthquake prediction pipeline with error handling

    Builds a baseline model first, then repeatedly asks on stdin whether the
    daily update should be run; any answer other than "yes" ends the loop.
    Each daily run's results are printed and written to a timestamped text
    file in the pipeline's output folder.

    Returns:
    bool: True when the session ended normally, False when an exception was
    caught (the error is printed)
    """
    try:
        # Initialize the pipeline
        pipeline = EarthquakePipeline()

        # First run: Establish baseline
        print("Establishing baseline model...")
        predictions, validation_data, metrics = pipeline.run_complete_pipeline(is_initial=True)

        if predictions is None:
            raise ValueError("Failed to establish baseline model")

        print("\nBaseline Model Performance:")
        print("-" * 50)
        print("Validation Data Statistics:")
        print(f"Number of events: {len(validation_data)}")
        print("\nPrediction Metrics:")
        for metric_name, values in metrics.items():
            print(f"{metric_name.upper()}: {values}")

        # After baseline is established, run daily pipeline
        while True:
            user_input = input("\nRun daily pipeline? (yes/no): ")
            if user_input.lower() != 'yes':
                print("\nExiting pipeline. Model and data saved.")
                break

            print("\nRunning daily pipeline update...")
            results = pipeline.run_complete_pipeline(is_initial=False)

            if results:
                print("\nPipeline Results:")
                print("-" * 50)

                # Print yesterday's performance if available
                if results['yesterday_prediction'] is not None:
                    print("\nYesterday's Prediction Performance:")
                    print("-" * 30)
                    print("Predicted vs Actual:")
                    # Take the first prediction ROW restricted to the predicted
                    # target columns. values[:, 0] would select the first
                    # column instead (one entry per row), which does not match
                    # the three-entry index; saved prediction files also carry
                    # an extra 'predicted_time' column.
                    prediction_columns = [f'predicted_{name}' for name in pipeline.target_columns]
                    comparison = pd.DataFrame({
                        'Predicted': results['yesterday_prediction'][prediction_columns].values[0],
                        'Actual': results['today_data'][pipeline.target_columns].values[0]
                    }, index=pipeline.target_columns)
                    print(comparison)
                    print("\nMetrics:")
                    # Metrics are None when the evaluation could not be computed
                    if results['metrics'] is not None:
                        for metric_name, values in results['metrics'].items():
                            print(f"{metric_name.upper()}: {values}")

                # Print tomorrow's predictions
                print("\nTomorrow's Predictions:")
                print("-" * 30)
                pred_df = results['tomorrow_prediction']
                for col in ['predicted_magnitude', 'predicted_latitude', 'predicted_longitude']:
                    print(f"{col}: {pred_df[col].values[0]:.4f}")

                # Save results
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                results_file = f'pipeline_results_{timestamp}.txt'
                with open(os.path.join(pipeline.drive_path, results_file), 'w') as f:
                    f.write(f"Pipeline Results - {timestamp}\n")
                    f.write("-" * 50 + "\n")
                    if results['yesterday_prediction'] is not None:
                        f.write("\nYesterday's Performance:\n")
                        f.write(str(results['metrics']))
                    f.write("\n\nTomorrow's Predictions:\n")
                    f.write(str(results['tomorrow_prediction']))

                print(f"\nResults saved to {results_file}")

            else:
                print("\nError: Failed to generate results for today")

            print("\nWaiting for next day's data...")

    except Exception as e:
        print(f"\nError running pipeline: {str(e)}")
        return False

    return True

if __name__ == "__main__":
    # run_pipeline() returns True when the session ended normally and False
    # when it caught an exception (the error itself is printed there)
    success = run_pipeline()
    if not success:
        print("\nPipeline execution failed. Please check the error messages above.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory already exists: /content/drive/My Drive/earthquake_data
Establishing baseline model...

Running initial validation pipeline...
1. Fetching historical data...
Fetching data from https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_week.geojson

Data Collection Summary:
------------------------------
Total earthquakes collected: 253
Date range: 2024-11-10 20:13:16.630000 to 2024-11-17 19:18:44.983000
Magnitude range: 2.5 to 6.6
------------------------------
2. Splitting data into validation and training sets...

Data Split Summary:
Training data size: 200 events
Validation data size: 26 events
3. Preparing training data...
Dataset size: 197 sequences
4. Training model...

Training with:
Batch size: 32
Sequence length: 3
Number of batches per epoch: 6
Input dimension: 5
Output dimension: 3

Batch shapes:
Features: torch.Size([32, 3, 5])
Targe