In [None]:
import tensorflow as tf
import ee
import pandas as pd
from google.cloud import storage

In [None]:
# Create a Google Cloud Storage client with default arguments.
client = storage.Client()
# Run the Earth Engine authentication flow, then initialise the session against
# the 'wildfire-lab' project. Authenticate must come before Initialize.
ee.Authenticate()
ee.Initialize(project='wildfire-lab')
# Load the request table; the CSV's first column becomes the index.
fires = pd.read_csv("request_data.csv", index_col=0)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): this repeats the read_csv call from the setup cell above; the
# path is relative, so it is resolved against the current working directory.
fires = pd.read_csv("request_data.csv", index_col=0)

In [None]:
# GCS bucket and folder used to build TFRecord paths of the form
# gs://{bucket}/{folder}/<cleaned fire id>.tfrecord.gz in the FireDataset cells.
bucket = "wildfire-lab"
folder = "batch_export_0422_test_3"

## Operational data

In [None]:
# Build the modelling table in one chain:
#   1. read the operational table (first CSV column is the index),
#   2. add `next_area_diff`: the following row's `area_diff` within the same fire `id`,
#   3. drop rows that have no following row,
#   4. keep only rows with `day_since_first_report` below 21.
df = (
    pd.read_csv("operational_table.csv", index_col=0)
    .assign(next_area_diff=lambda table: table.groupby('id')['area_diff'].shift(-1))
    .dropna(subset=['next_area_diff'])
    .loc[lambda table: table['day_since_first_report'] < 21]
)

In [None]:
import torch
from torch.utils.data import Dataset
import tensorflow as tf
import pickle
import numpy as np
from tqdm import tqdm
import re

class FireDataset(Dataset):
    """Torch dataset of fire reports: tabular features, a 7-band image stack, and a target.

    Each item is built from one row of ``dataframe``. The image stack for the row's
    ``(id, day_since_first_report)`` pair is read once, at construction time, from
    ``gs://{bucket}/{folder}/<cleaned id>.tfrecord.gz``. ``bucket`` and ``folder`` are
    module-level variables and must be defined before the class is instantiated.

    NOTE(review): later cells of this notebook define ``FireDataset`` again; whichever
    definition was executed last is the one in effect.

    Attributes:
        dataframe: the report table passed to the constructor.
        preloaded_images: dict mapping ``(fire id, int day)`` to a ``(7, 64, 64)``
            float tensor. Rows whose TFRecord could not be read have no entry.
        features_dict: parsing spec unpickled from ``features_path``.
        number_features: number of columns left after removing ``NON_FEATURE_COLUMNS``.
    """

    # Identifier and target columns that are excluded from the feature vector.
    NON_FEATURE_COLUMNS = ['export_index', 'report_id', 'id', 'next_area_diff']

    def __init__(self, dataframe, features_path):
        """Unpickle the parsing spec and eagerly load every row's image stack.

        Args:
            dataframe: report table; must contain the columns in
                ``NON_FEATURE_COLUMNS`` plus ``day_since_first_report``.
            features_path: path to a pickle holding the spec that is passed to
                ``tf.io.parse_single_example``.
        """
        self.dataframe = dataframe
        self.preloaded_images = {}
        # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
        with open(features_path, "rb") as f:
            self.features_dict = pickle.load(f)
        self._preload_images()
        # Count columns instead of inspecting row 0 so an empty table does not raise.
        self.number_features = len(dataframe.columns.drop(self.NON_FEATURE_COLUMNS))

    def __len__(self):
        """Return the number of rows in the table.

        ``__getitem__`` indexes the dataframe, so the length must follow the
        dataframe and not the image cache: rows without images are still served
        (with an all-zero image stack) and must stay reachable.
        """
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(features, images_tensor, target)`` for the row at position ``idx``.

        ``features`` is a float tensor of every column not in ``NON_FEATURE_COLUMNS``,
        ``images_tensor`` is the preloaded ``(7, 64, 64)`` stack (zeros when no image
        was loaded for the row) and ``target`` is the row's ``next_area_diff``.
        """
        row = self.dataframe.iloc[idx]
        features = torch.tensor(
            row.drop(self.NON_FEATURE_COLUMNS).values.astype(float), dtype=torch.float
        )
        target = torch.tensor(row['next_area_diff'], dtype=torch.float)
        images_key = (row['id'], int(row['day_since_first_report']))
        images_tensor = self.preloaded_images.get(images_key)
        if images_tensor is None:
            images_tensor = torch.zeros(7, 64, 64, dtype=torch.float)
        return features, images_tensor, target

    def _preload_images(self):
        """Fill ``preloaded_images`` with one image stack per dataframe row.

        Two object names are tried for every fire id: first with every character
        outside ``a-zA-Z0-9.,:_;-`` removed, then the same but keeping spaces. A row
        for which neither attempt succeeds is reported and skipped.
        """
        for _, row in tqdm(self.dataframe.iterrows(), total=self.dataframe.shape[0]):
            fire_id = row['id']
            day_since_first_report = row['day_since_first_report']

            candidate_paths = []
            for pattern in (r'[^a-zA-Z0-9.,:_;-]', r'[^a-zA-Z0-9.,:_; -]'):
                cleaned_fire_id = re.sub(pattern, '', fire_id)
                tfrecord_path = f"gs://{bucket}/{folder}/{cleaned_fire_id}.tfrecord.gz"
                # Both patterns give the same path when the id has no spaces;
                # do not read the same object twice.
                if tfrecord_path not in candidate_paths:
                    candidate_paths.append(tfrecord_path)

            for tfrecord_path in candidate_paths:
                try:
                    self.preloaded_images[(fire_id, int(day_since_first_report))] = \
                        self.load_and_parse_tfrecord(tfrecord_path, day_since_first_report)
                    break
                except Exception:
                    # Best effort: fall through to the next candidate. Catching
                    # Exception (not a bare except) keeps Ctrl-C working during
                    # this long-running loop.
                    continue
            else:
                print(tfrecord_path, "not found in GCS")

    def load_and_parse_tfrecord(self, tfrecord_path, day_since_first_report):
        """Read one day's seven layers from a GZIP-compressed TFRecord.

        Only the first record of the file is parsed. For each layer the key
        ``"<two-digit day>_<layer>"`` is looked up; a missing key is reported and
        that layer is left as zeros.

        Args:
            tfrecord_path: path of the TFRecord file.
            day_since_first_report: day number; converted with ``int()``.

        Returns:
            A ``(7, 64, 64)`` float tensor with layers in the order
            pr, sph, th, tmmn, tmmx, vs, erc.

        Raises:
            ValueError: if reading or parsing fails; the underlying error is
                attached as ``__cause__``.
        """
        layers = ['pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs', 'erc']
        day = f"{int(day_since_first_report):02}"
        images_array = np.zeros((7, 64, 64))
        try:
            raw_dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type='GZIP')
            for raw_record in raw_dataset.take(1):  # one record per dataset
                example = tf.io.parse_single_example(raw_record, self.features_dict)
                for i, layer in enumerate(layers):
                    key = f'{day}_{layer}'
                    if key in example:
                        images_array[i] = example[key].numpy()
                    else:
                        print(f"Key {key} not found in TFRecord.")
        except Exception as exc:
            raise ValueError("Path not found", tfrecord_path, day_since_first_report) from exc

        return torch.tensor(images_array, dtype=torch.float)



In [None]:

class FireDataset(Dataset):
    """Torch dataset of fire reports with eagerly loaded 7-band image stacks.

    Each item is built from one row of ``dataframe``. Image stacks are read at
    construction time from ``gs://{bucket}/{folder}/<cleaned id>.tfrecord.gz``;
    ``bucket`` and ``folder`` are module-level variables and must be defined before
    the class is instantiated.

    NOTE(review): this notebook defines ``FireDataset`` in several cells; whichever
    definition was executed last is the one in effect.

    Attributes:
        dataframe: the report table passed to the constructor.
        features_dict: parsing spec unpickled from ``features_path``.
        preloaded_images: dict mapping ``(fire id, int day)`` to a ``(7, 64, 64)``
            float tensor. Rows whose TFRecord could not be read have no entry.
    """

    # Identifier and target columns that are excluded from the feature vector.
    NON_FEATURE_COLUMNS = ['export_index', 'report_id', 'id', 'next_area_diff']

    def __init__(self, dataframe, features_path):
        """Store the table, unpickle the parsing spec and preload all images."""
        self.dataframe = dataframe
        self.features_dict = self._load_features(features_path)
        self.preloaded_images = self._preload_images()

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(features, images_tensor, target)`` for the row at position ``idx``.

        ``images_tensor`` is all zeros when no image was loaded for the row.
        """
        row = self.dataframe.iloc[idx]
        features = self._extract_features(row)
        target = torch.tensor(row['next_area_diff'], dtype=torch.float)
        images_tensor = self.preloaded_images.get((row['id'], int(row['day_since_first_report'])))
        if images_tensor is None:
            images_tensor = torch.zeros(7, 64, 64, dtype=torch.float)
        return features, images_tensor, target

    def _load_features(self, path):
        """Unpickle and return the TFRecord parsing spec stored at ``path``."""
        # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
        with open(path, "rb") as f:
            return pickle.load(f)

    def _extract_features(self, row):
        """Return the row's non-identifier, non-target columns as a float tensor."""
        feature_cols = row.drop(self.NON_FEATURE_COLUMNS).values.astype(float)
        return torch.tensor(feature_cols, dtype=torch.float)

    def _preload_images(self):
        """Load one image stack per dataframe row and return them as a dict.

        Each fire id is tried first with spaces stripped and then with spaces kept
        (object names containing spaces exist in the export folder). A row for
        which no candidate can be read is reported and skipped instead of aborting
        the whole preload; ``__getitem__`` then serves zeros for that row.
        """
        images = {}
        for _, row in tqdm(self.dataframe.iterrows(), total=self.dataframe.shape[0]):
            fire_id, day_since_first_report = row['id'], int(row['day_since_first_report'])
            # dict.fromkeys removes the duplicate path produced when the id has no spaces.
            candidate_paths = list(dict.fromkeys(
                self._construct_tfrecord_path(self._clean_id(fire_id, keep_spaces=keep))
                for keep in (False, True)
            ))
            for tfrecord_path in candidate_paths:
                try:
                    images[(fire_id, day_since_first_report)] = self._load_image_data(
                        tfrecord_path, day_since_first_report
                    )
                    break
                except ValueError:
                    continue
            else:
                print(f"TFRecord not found: {' / '.join(candidate_paths)}")
        return images

    def _clean_id(self, fire_id, keep_spaces=False):
        """Strip every character outside ``a-zA-Z0-9.,:_;-`` from ``fire_id``.

        With ``keep_spaces=True`` the space character is preserved as well.
        """
        pattern = r'[^a-zA-Z0-9.,:_; -]' if keep_spaces else r'[^a-zA-Z0-9.,:_;-]'
        return re.sub(pattern, '', fire_id)

    def _construct_tfrecord_path(self, cleaned_fire_id):
        """Return the TFRecord path for a cleaned id, using the global bucket/folder."""
        return f"gs://{bucket}/{folder}/{cleaned_fire_id}.tfrecord.gz"

    def _load_image_data(self, tfrecord_path, day_since_first_report):
        """Read one day's seven layers from a GZIP-compressed TFRecord.

        Only the first record of the file is parsed. A layer whose key
        ``"<two-digit day>_<layer>"`` is absent is reported and left as zeros.

        Returns:
            A ``(7, 64, 64)`` float tensor.

        Raises:
            ValueError: if reading or parsing fails; the underlying error is
                attached as ``__cause__``.
        """
        layers = ['pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs', 'erc']
        # int() makes the zero-padded key correct for float day values as well.
        day = f"{int(day_since_first_report):02}"
        images_array = np.zeros((7, 64, 64))
        try:
            raw_dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type='GZIP')
            for raw_record in raw_dataset.take(1):  # one record per dataset
                example = tf.io.parse_single_example(raw_record, self.features_dict)
                for i, layer in enumerate(layers):
                    key = f'{day}_{layer}'
                    if key in example:
                        images_array[i] = example[key].numpy()
                    else:
                        print(f"Key {key} not found in TFRecord.")
        except Exception as exc:
            # Exception, not a bare except, so KeyboardInterrupt still stops the load.
            raise ValueError("Path not found", tfrecord_path, day_since_first_report) from exc
        return torch.tensor(images_array, dtype=torch.float)

# NOTE: this version of FireDataset reads the module-level variables `bucket` and `folder` inside `_construct_tfrecord_path`; define them before instantiating the class, or pass them in as constructor parameters instead.


In [None]:
import torch
import tensorflow as tf
import pickle
import numpy as np
from tqdm import tqdm
import re

class FireDataset(Dataset):
    """Torch dataset of fire reports: tabular features, a 7-band image stack, and a target.

    Each item is built from one row of ``dataframe``. The image stack for the row's
    ``(id, day_since_first_report)`` pair is read once, at construction time, from
    ``gs://{bucket}/{folder}/<cleaned id>.tfrecord.gz``.

    Attributes:
        dataframe: the report table passed to the constructor.
        bucket: bucket name used to build TFRecord paths.
        folder: folder inside the bucket used to build TFRecord paths.
        preloaded_images: dict mapping ``(fire id, int day)`` to a ``(7, 64, 64)``
            float tensor. Rows whose TFRecord could not be read have no entry.
        features_dict: parsing spec unpickled from ``features_path``.
        number_features: number of columns left after removing ``NON_FEATURE_COLUMNS``.
    """

    # Identifier and target columns that are excluded from the feature vector.
    NON_FEATURE_COLUMNS = ['export_index', 'report_id', 'id', 'next_area_diff']

    def __init__(self, dataframe, features_path, bucket, folder):
        """Unpickle the parsing spec and eagerly load every row's image stack.

        Args:
            dataframe: report table; must contain the columns in
                ``NON_FEATURE_COLUMNS`` plus ``day_since_first_report``.
            features_path: path to a pickle holding the spec that is passed to
                ``tf.io.parse_single_example``.
            bucket: bucket name.
            folder: folder inside the bucket that holds the ``.tfrecord.gz`` files.
        """
        self.dataframe = dataframe
        self.preloaded_images = {}
        self.bucket = bucket
        self.folder = folder

        # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
        with open(features_path, "rb") as f:
            self.features_dict = pickle.load(f)

        self._preload_images()
        # A count, not the row itself: the feature vector length. Derived from the
        # columns so an empty table does not raise.
        self.number_features = len(dataframe.columns.drop(self.NON_FEATURE_COLUMNS))

    def __len__(self):
        """Return the number of rows in the table.

        ``__getitem__`` indexes the dataframe, so the length must follow the
        dataframe and not the image cache: rows without images are still served
        (with an all-zero image stack) and must stay reachable.
        """
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(features, images_tensor, target)`` for the row at position ``idx``.

        ``features`` is a float tensor of every column not in ``NON_FEATURE_COLUMNS``,
        ``images_tensor`` is the preloaded ``(7, 64, 64)`` stack (zeros when no image
        was loaded for the row) and ``target`` is the row's ``next_area_diff``.
        """
        row = self.dataframe.iloc[idx]
        features = torch.tensor(
            row.drop(self.NON_FEATURE_COLUMNS).values.astype(float), dtype=torch.float
        )
        target = torch.tensor(row['next_area_diff'], dtype=torch.float)
        images_key = (row['id'], int(row['day_since_first_report']))
        images_tensor = self.preloaded_images.get(images_key)
        if images_tensor is None:
            images_tensor = torch.zeros(7, 64, 64, dtype=torch.float)
        return features, images_tensor, target

    def _preload_images(self):
        """Fill ``preloaded_images`` with one image stack per dataframe row.

        The space-stripped object name is tried first, then the space-preserving
        one. A row for which neither can be read is reported and skipped.
        """
        for _, row in tqdm(self.dataframe.iterrows(), total=self.dataframe.shape[0]):
            fire_id = row['id']
            day_since_first_report = row['day_since_first_report']
            tfrecord_path = self._get_tfrecord_path(self._clean_fire_id(fire_id))
            tfrecord_path2 = self._get_tfrecord_path(self._clean_fire_id2(fire_id))
            images_key = (fire_id, int(day_since_first_report))

            # dict.fromkeys drops the second path when it equals the first (id
            # without spaces), so the same object is not requested twice.
            for candidate_path in dict.fromkeys((tfrecord_path, tfrecord_path2)):
                try:
                    self.preloaded_images[images_key] = self._load_and_parse_tfrecord(
                        candidate_path, day_since_first_report
                    )
                    break
                except FileNotFoundError:
                    continue
            else:
                print(f"TFRecord not found: {tfrecord_path}\n{tfrecord_path2}")

    def _clean_fire_id(self, fire_id):
        """Remove every character outside ``a-zA-Z0-9.,:_;-`` (spaces included)."""
        pattern = r'[^a-zA-Z0-9.,:_;-]'
        return re.sub(pattern, '', fire_id)

    def _clean_fire_id2(self, fire_id):
        """Like ``_clean_fire_id`` but keeps the space character."""
        pattern = r'[^a-zA-Z0-9.,:_; -]'
        return re.sub(pattern, '', fire_id)

    def _get_tfrecord_path(self, cleaned_fire_id):
        """Return ``gs://<bucket>/<folder>/<cleaned_fire_id>.tfrecord.gz``."""
        return f"gs://{self.bucket}/{self.folder}/{cleaned_fire_id}.tfrecord.gz"

    def _load_and_parse_tfrecord(self, tfrecord_path, day_since_first_report):
        """Read one day's seven layers from a GZIP-compressed TFRecord.

        Only the first record of the file is parsed. For each layer the key
        ``"<two-digit day>_<layer>"`` is looked up; a missing key is reported and
        that layer is left as zeros.

        Args:
            tfrecord_path: path of the TFRecord file.
            day_since_first_report: day number; converted with ``int()``.

        Returns:
            A ``(7, 64, 64)`` float tensor with layers in the order
            pr, sph, th, tmmn, tmmx, vs, erc.

        Raises:
            FileNotFoundError: if reading or parsing fails; the underlying error
                is attached as ``__cause__``.
        """
        layers = ['pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs', 'erc']
        day = f"{int(day_since_first_report):02}"
        images_array = np.zeros((7, 64, 64))

        try:
            raw_dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type='GZIP')
            for raw_record in raw_dataset.take(1):
                example = tf.io.parse_single_example(raw_record, self.features_dict)
                for i, layer in enumerate(layers):
                    key = f'{day}_{layer}'
                    if key in example:
                        images_array[i] = example[key].numpy()
                    else:
                        print(f"Key {key} not found in TFRecord.")
        except Exception as exc:
            # Exception, not a bare except: a bare except also turned Ctrl-C into
            # FileNotFoundError, which the preload loop then swallowed.
            raise FileNotFoundError(f"TFRecord not found: {tfrecord_path}") from exc

        return torch.tensor(images_array, dtype=torch.float)

In [None]:
# Build the full dataset from the prepared operational table and persist it.
dataframe = df
# Same value as the `folder` assigned in the configuration cell near the top.
folder = "batch_export_0422_test_3"
# Pickled parsing spec handed to FireDataset (used by tf.io.parse_single_example).
features_path = "/content/band_features.pkl"
# The constructor preloads one image stack per row; the recorded run below shows
# 25291 rows taking about 3h12m.
dataset = FireDataset(dataframe,  features_path, bucket, folder)

# Serialise the whole dataset object (table and preloaded images) to Google Drive.
torch.save(obj=dataset, f="/content/drive/My Drive/RA/full_dataset.pt")

100%|██████████| 25291/25291 [3:12:26<00:00,  2.19it/s]


In [None]:
# Full GCS path of one exported TFRecord whose object name contains spaces.
# NOTE(review): presumably kept for manual checks with load_and_parse_tfrecord — confirm.
path = "gs://wildfire-lab/batch_export_0422_test_3/2015_2714081_BOOT COVE FIRE.tfrecord.gz"

In [None]:
# Load the TFRecord parsing spec into a module-level `features_dict`, which the
# standalone load_and_parse_tfrecord function reads.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(features_path, "rb") as f:
  features_dict = pickle.load(f)

In [None]:

def load_and_parse_tfrecord(tfrecord_path, day_since_first_report):
    """Read one day's seven layers from a GZIP-compressed TFRecord.

    Only the first record of the file is parsed, using the module-level
    ``features_dict`` as the parsing spec. For each layer the key
    ``"<two-digit day>_<layer>"`` is looked up; a missing key is reported and that
    layer is left as zeros.

    Args:
        tfrecord_path: path of the TFRecord file.
        day_since_first_report: day number; converted with ``int()``.

    Returns:
        A ``(7, 64, 64)`` float tensor with layers in the order
        pr, sph, th, tmmn, tmmx, vs, erc.

    Raises:
        ValueError: if reading or parsing fails. The args are
            ``("Path not found", tfrecord_path, day_since_first_report)`` and the
            underlying error is attached as ``__cause__``.
    """
    layers = ['pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs', 'erc']
    day = f"{int(day_since_first_report):02}"
    images_array = np.zeros((7, 64, 64))
    try:
        raw_dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type='GZIP')
        for raw_record in raw_dataset.take(1):  # one record per dataset
            example = tf.io.parse_single_example(raw_record, features_dict)
            for i, layer in enumerate(layers):
                key = f'{day}_{layer}'
                if key in example:
                    images_array[i] = example[key].numpy()
                else:
                    print(f"Key {key} not found in TFRecord.")
    except Exception as exc:
        # Exception, not a bare except, so KeyboardInterrupt/SystemExit propagate;
        # "from exc" keeps the real cause visible in the traceback.
        raise ValueError("Path not found", tfrecord_path, day_since_first_report) from exc
    return torch.tensor(images_array, dtype=torch.float)

In [None]:
# Mount Google Drive at /content/drive (the same mount call also appears in an earlier cell).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Labels for the feature panel: every column of `df` except the first 3 and the last 8.
# NOTE(review): presumably this slice selects the indicator columns in the same order
# as the leading entries of the dataset's feature vector — confirm against the table schema.
feature_names = list(df.iloc[0].index[3:-8])

def visualize_dataset_item(dataset, layer_names, feature_names, idx):
    """Plot one dataset item: its image bands, its active features, and its target.

    One row of panels is drawn: one heat-map per image band, then a text panel
    listing the names of features whose value exceeds 0.1, then a text panel
    showing the target value.

    Args:
        dataset: indexable returning ``(features, images_tensor, target)`` tensors.
        layer_names: panel titles, one per image band.
        feature_names: labels for the leading entries of ``features``;
            ``feature_names[i]`` labels ``features[i]``.
        idx: index of the item to draw.
    """
    features, images_tensor, target = dataset[idx]
    images_np = images_tensor.numpy()
    n_bands = images_np.shape[0]

    # One panel per band plus one for the feature list and one for the target
    # (9 panels for the 7-band stacks used in this notebook).
    fig, axs = plt.subplots(1, n_bands + 2, figsize=(25, 3))
    features_ax, target_ax = axs[n_bands], axs[n_bands + 1]

    # images
    for i in range(n_bands):
        axs[i].imshow(images_np[i], cmap=plt.get_cmap('hot'))
        axs[i].set_title(layer_names[i])
        axs[i].axis('off')

    # one hot features
    active_features = [feat for i, feat in enumerate(feature_names) if features[i] > 0.1]
    # Style the panel once, outside the loop, so it is also titled and frameless
    # when no feature is active.
    features_ax.axis('off')
    features_ax.set_title('Features')
    # Keep the original 1/6 spacing for short lists; compress it for longer ones
    # so the last label does not fall below the panel.
    step = min(1 / 6, 0.9 / max(len(active_features) - 1, 1))
    for i, feature in enumerate(active_features):
        features_ax.text(0.1, 0.9 - i * step, feature, fontsize=9)

    target_ax.text(0.4, 0.5, target.item(), fontsize=15)
    target_ax.axis('off')
    target_ax.set_title('Diff area')
    plt.show()


# Band names, in the order the image stack is filled by FireDataset.
layers = ['pr', 'sph', 'th', 'tmmn', 'tmmx', 'vs', 'erc']
# Preview at most the first 50 reports; capping at len(df) avoids an IndexError
# from df.iloc when the table has fewer than 50 rows.
for i in range(min(50, len(df))):
    print(f"Report index: {i}, fire id: {df.iloc[i].loc['id']}")
    visualize_dataset_item(dataset, layers, feature_names, i)


In [None]:
# Reload the pickled FireDataset. torch.save() needs the object to write, so the
# original one-argument call raised TypeError; torch.load() is the matching read.
# weights_only=False is needed to unpickle an arbitrary Python object such as
# FireDataset (the class must already be defined). Unpickling can execute code, so
# only load files you trust.
# NOTE(review): the save cell writes "/content/drive/My Drive/RA/full_dataset.pt";
# confirm that "full_gee_dataset.pt" is the intended file.
dataset = torch.load("full_gee_dataset.pt", weights_only=False)