# H3 hurricane model pipeline

In [1]:
%%capture
!pip install rasterio # pip install necessary to avoid error in this package
!pip install geopandas
!pip install georaster
!pip install pandas --upgrade # make sure on latest version of pandas so you can open the pickled files
!pip install pytorch_lightning

In [2]:
import pytorch_lightning as pl
import torchvision.models as models
import torchvision
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch
import geopandas as gpd
import os
import torch.nn as nn
import torch.optim
from torchvision.io import read_image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from google.colab import drive
from PIL import Image
from sklearn.model_selection import train_test_split
import torchmetrics
from torchmetrics import Accuracy
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import torch.nn.functional as F 
from torchvision.models import ViT_L_16_Weights, vit_l_16
from torchvision.models import swin_v2_b, Swin_V2_B_Weights
from torchvision.models.swin_transformer import SwinTransformer
import random
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MinMaxScaler
import time

## Model pipeline

In [3]:
class DataAugmentation(nn.Module):
    """Random image augmentations, each switched on or off by a ``use_*`` flag.

    Every augmentation is built once in ``__init__``; ``forward`` applies the
    enabled ones in a fixed order: flipping, noise, rotation, zoom, solarize,
    colour jitter.

    The noise and colour-jitter steps operate on tensors, treat pixel values as
    lying in the 0-255 range, and return integer tensors.

    See https://pytorch.org/vision/main/transforms.html for the underlying
    torchvision transforms.

    Parameters
    ----------
    use_noise, use_flipping, use_rotation, use_zoom, use_solarize, use_colorjitter : bool
        Enable the corresponding augmentation.
    noise_amount : float
        Standard deviation of the additive Gaussian noise (0-255 pixel scale).
    flip_probability : float
        Probability used for both the horizontal and the vertical flip.
    scale_range : tuple
        ``scale`` range passed to ``RandomAffine`` for the zoom augmentation.
    solarize_threshold : float
        Pixels at or above this value are inverted when solarize triggers.
    solarize_probability : float
        Probability that solarize is applied.
    cj_brightness, cj_contrast, cj_saturation, cj_hue : tuple
        Ranges passed to ``ColorJitter``.
    """

    def __init__(self,
                 use_noise = False,
                 use_flipping = True,
                 use_rotation = True,
                 use_zoom = True,
                 use_solarize = False,
                 use_colorjitter = False,
                 noise_amount = 20, # the std of the noise level, assuming the image pixel value is between 0 to 255 
                 flip_probability = 0.5,
                 scale_range=(0.9,1.1),
                 solarize_threshold=128, # at/above this value the pixel is inverted with probability 'solarize_probability'
                 solarize_probability=0.5,
                 cj_brightness=(0.95,1), # colorjitter brightness
                 cj_contrast=(0.95,1),  # colorjitter contrast
                 cj_saturation=(0.95,1),  # colorjitter saturation
                 cj_hue=(-0.15,0.15)) -> None:  ## colorjitter hue
        super().__init__()

        # Additive Gaussian noise with std `noise_amount`, clamped back to
        # 0-255 and cast to int. The noise is created on the input's device so
        # the augmentation also works for tensors that live on the GPU.
        self.apply_noise = transforms.Compose([
            transforms.Lambda(lambda x: x + torch.randn(x.size(), device = x.device) * noise_amount),
            transforms.Lambda(lambda x: torch.clamp(x, 0, 255)),
            transforms.Lambda(lambda x: x.int()),
        ])

        # Independent horizontal and vertical flips.
        self.flipping = nn.Sequential(
            transforms.RandomHorizontalFlip(p = flip_probability),
            transforms.RandomVerticalFlip(p = flip_probability)
        )

        # Rotation by a random multiple of 90 degrees.
        self.rotation = transforms.Lambda(
            lambda x: transforms.functional.rotate(x, random.choice([0, 90, 180, 270]))
        )

        # Zoom: an affine transform with no rotation and no translation, only scaling.
        self.zoom = transforms.RandomAffine(degrees=(0, 0), translate=(0, 0), scale=scale_range)

        # Solarize: a very low threshold inverts almost every pixel, which
        # differs a lot from the original image; a threshold near the
        # brightest pixel value keeps the effect subtle.
        self.solarize = transforms.RandomSolarize(solarize_threshold, solarize_probability)

        # Colour jitter is applied on a 0-1 scale and the result is mapped back
        # to 0-255 integers. The ranges come from the cj_* arguments (they were
        # previously hard-coded to the default values, so the arguments had no
        # effect).
        self.colorjitter = transforms.Compose([
            transforms.Lambda(lambda x: x/255),
            transforms.ColorJitter(brightness=cj_brightness,
                                   contrast=cj_contrast,
                                   saturation=cj_saturation,
                                   hue=cj_hue),
            transforms.Lambda(lambda x: x*255),
            transforms.Lambda(lambda x: x.int())
        ])

        self.use_noise = use_noise
        self.use_flipping = use_flipping
        self.use_rotation = use_rotation
        self.use_zoom = use_zoom
        self.use_solarize = use_solarize
        self.use_colorjitter = use_colorjitter

    @torch.no_grad()  # disable gradients for efficiency
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply every enabled augmentation to ``x`` and return the result."""
        if self.use_flipping:
            x = self.flipping(x)

        if self.use_noise:
            x = self.apply_noise(x)

        if self.use_rotation:
            x = self.rotation(x)

        if self.use_zoom:
            x = self.zoom(x)

        if self.use_solarize:
            x = self.solarize(x)

        if self.use_colorjitter:
            x = self.colorjitter(x)

        return x

class ImageEncoder(pl.LightningModule):
    """Pretrained image embedding block (the CLIP box in GaLeNet Fig. 1).

    Wraps a torchvision backbone with its final layer dropped and returns one
    embedding vector per image.

    Parameters
    ----------
    image_embedding_architecture : str
        One of ``"ResNet18"``, ``"ViT_L_16"`` or ``"Swin_V2_B"``.

    Raises
    ------
    ValueError
        If ``image_embedding_architecture`` is not one of the supported names.
        Previously such a value produced a module without a feature extractor
        that only failed on the first forward pass.

    Notes
    -----
    ``forward`` always puts the feature extractor in eval mode and runs it
    under ``torch.no_grad()``, so no gradients reach the backbone weights from
    this module regardless of any learning rate assigned to them.
    """

    # architectures this encoder knows how to build
    SUPPORTED_ARCHITECTURES = ("ResNet18", "ViT_L_16", "Swin_V2_B")

    def __init__(self, image_embedding_architecture):
        super().__init__()

        if image_embedding_architecture == "ResNet18":
            # drop final layer since non-SSL trained model
            # with the below ResNet, num_image_encoder_features == 512
            backbone = models.resnet18(weights = "DEFAULT")
            layers = list(backbone.children())[:-1]
            self.feature_extractor = nn.Sequential(*layers)

        elif image_embedding_architecture == "ViT_L_16":
            # with the below ViT_L_16, num_image_encoder_features == 1024
            backbone = vit_l_16(weights=ViT_L_16_Weights.DEFAULT)
            layers = list(backbone.children())[:-1]
            self.feature_extractor = nn.Sequential(*layers)
            # the full model is kept because forward() needs its
            # _process_input() and class_token
            self.model = backbone

        elif image_embedding_architecture == "Swin_V2_B":
            # with the below Swin_V2_B, num_image_encoder_features == 1024
            backbone = swin_v2_b(weights=Swin_V2_B_Weights.DEFAULT)
            layers = list(backbone.children())[:-1]
            self.feature_extractor = nn.Sequential(*layers)

        else:
            # SatMAE is not wired in here yet
            raise ValueError(
                f"Unsupported image_embedding_architecture {image_embedding_architecture!r}; "
                f"expected one of {self.SUPPORTED_ARCHITECTURES}"
            )

        self.image_embedding_architecture = image_embedding_architecture

    def forward(self, x):
        """Return the backbone embedding for the image batch ``x``."""
        self.feature_extractor.eval()

        with torch.no_grad():
            if self.image_embedding_architecture == "ResNet18":
                embedding = self.feature_extractor(x).flatten(1)

            elif self.image_embedding_architecture == "ViT_L_16":
                # the following code is taken from
                # https://discuss.pytorch.org/t/feature-extraction-in-torchvision-models-vit-b-16/148029/2

                # This is the whole encoder sequence
                encoder = self.feature_extractor[1]

                # This is how the model preprocesses the image.
                x = self.model._process_input(x)

                n = x.shape[0]

                # prepend the class token to every sequence in the batch
                batch_class_token = self.model.class_token.expand(n, -1, -1)
                x = torch.cat([batch_class_token, x], dim = 1)
                x = encoder(x)

                # Classifier "token" as used by standard language architectures
                embedding = x[:, 0]

            else:  # "Swin_V2_B" (the constructor rejects everything else)
                embedding = self.feature_extractor(x)

        return embedding


class GenericEncoder(pl.LightningModule):
    """Generic encoder block (the small grey Encoder Blocks in GaLeNet Fig. 1).

    Applies, in order: a linear projection, 1-D batch normalisation, a SiLU
    activation and dropout.

    Parameters
    ----------
    num_input_features : int
        Size of the incoming feature vector.
    num_output_features : int
        Size of the encoded feature vector.
    dropout_rate : float
        Dropout probability applied after the activation.
    """

    def __init__(self, num_input_features, num_output_features, dropout_rate):
        super().__init__()

        self.l1 = nn.Linear(in_features = num_input_features, out_features = num_output_features)
        self.batchnorm = nn.BatchNorm1d(num_features = num_output_features)
        self.dropout = nn.Dropout(p = dropout_rate)
        self.activation = nn.SiLU()

    def forward(self, x):
        """Encode the batch ``x`` and return the encoded features."""
        projected = self.l1(x)
        normalised = self.batchnorm(projected)
        return self.dropout(self.activation(normalised))


class ClassificationLayer(pl.LightningModule):
    """Output layer used to predict the damage class.

    A single linear layer followed by the requested output activation.

    Parameters
    ----------
    num_input_features : int
        Size of the incoming feature vector.
    num_output_classes : int
        Number of outputs for the ``"softmax"`` and ``None`` activations. The
        ``"sigmoid"`` and ``"relu"`` activations always produce one output.
    output_activation : str or None
        - ``"sigmoid"`` : sigmoid on a single output, for binary classification
        - ``"softmax"`` : softmax over ``num_output_classes`` outputs
        - ``"relu"`` : ReLU on a single output, treating damage classes as a
          regression problem (unbounded above, so it can exceed the top class)
        - ``None`` : identity, i.e. unnormalised logits

    Raises
    ------
    ValueError
        If ``output_activation`` is none of the values above. Previously such a
        value left the layer without ``l1``/``activation`` and only failed on
        the first forward pass.
    """

    def __init__(self, num_input_features, num_output_classes, output_activation):
        super().__init__()

        if output_activation == "sigmoid":  # binary classification
            self.activation = nn.Sigmoid()
            self.l1 = nn.Linear(num_input_features, 1)

        elif output_activation == "softmax":  # multiclass classification
            self.activation = torch.nn.Softmax(dim = 1)
            self.l1 = nn.Linear(num_input_features, num_output_classes)

        elif output_activation == "relu":  # regression on the damage class
            self.activation = F.relu
            self.l1 = nn.Linear(num_input_features, 1)

        elif output_activation is None:  # raw logits
            self.activation = nn.Identity()
            self.l1 = nn.Linear(num_input_features, num_output_classes)

        else:
            raise ValueError(
                f"Unsupported output_activation {output_activation!r}; "
                "expected 'sigmoid', 'softmax', 'relu' or None"
            )

    def forward(self, x):
        """Return the activated output of the linear layer for the batch ``x``."""
        x = self.l1(x)
        x = self.activation(x)

        return x

class OverallModel(pl.LightningModule):
    """Overall architecture, i.e. the combination of the encoder blocks.

    Every zoom level's image goes through the shared ``ImageEncoder`` and then
    through its own ``GenericEncoder``; each encoded zoom level also gets its
    own ``ClassificationLayer``. Unless ``image_only_model`` is set, every
    group of environmental features (EFs) gets its own ``GenericEncoder`` as
    well. All embeddings are concatenated, encoded once more and classified.
    The loss is the sum of the loss on the concatenated prediction and the
    losses on each per-zoom prediction, as in GaLeNet.

    Parameters
    ----------
    training_dataset : torch.utils.data.Dataset
        Contains the data used for training

    validation_dataset : torch.utils.data.Dataset
        Contains the data used for validation

    image_embedding_architecture : str
        Determines the image embedding architecture used. Possible values are:
            - 'ResNet18'
            - 'ViT_L_16'
            - 'Swin_V2_B'

    num_input_channels : int
        The number of channels in the input images. Stored as a hyperparameter
        only; the model code does not read it.

    EF_features : dict(String: List(String))
        A dictionary mapping from type of EF to a list of strings of names of the EFs.
        E.g., {"weather": ["precip", "wind_speed"], "soil": ["clay", "sand"]}
        ``None`` is treated as "no EFs".

    dropout_rate : float
        The dropout probability

    image_encoder_lr : float
        The learning rate for the image encoder. If 0, then image encoder weights are frozen.

    general_lr : float
        The learning rate for all other parts of the model.

    batch_size : int
        The batch size used during training and validation steps.

    weight_decay : float
        Adam weight decay (L2 penalty)

    lr_scheduler_patience : int
        The number of epochs of validation loss plateau before lr is decreased.

    num_concat_encoder_features : int
        The number of features output from the encoder that operates on the
        concatenated image and EF embeddings.

    num_image_feature_encoder_features : int
        The number of features output from the encoder that operates on the 
        features produced by the image encoder

    num_output_classes : int
        The number of output classes. Set to 1 for regression.

    zoom_levels : List[str]
        A list containing the different image zoom levels.

    class_weights: torch.FloatTensor
        A tensor containing a weights to be applied to each class in the 
        cross entropy loss function. Only used by 'weighted_CELoss'.

    image_only_model: Boolean
        If true, then the model behaves as if there were no EFs, and only the
        images are used to make predictions.

    loss_function_str : str
        Determines the loss function used. Possible values are:
            - 'BCELoss' : Binary Cross Entropy Loss, for binary classification
            - 'CELoss' : Cross Entropy Loss, for multiclass classification
            - 'weighted_CELoss' : Cross Entropy Loss weighted by class_weights
            - 'MSE' : Mean Squared Error, for regression

    output_activation : str
        Determines the output activation function used. Possible values are:
            - 'sigmoid' : Sigmoid, for binary classification
            - 'softmax' : Softmax, for multiclass classification
            - 'relu' : ReLU, for regression
            - None : no activation (CELoss expects unnormalized logits)

    Raises
    ------
    ValueError
        If ``loss_function_str`` is not one of the values listed above.

    Attributes
    ----------
    augment : DataAugmentation
        Augmentation module with default settings (not applied in ``forward``).
    image_encoder : ImageEncoder
        Backbone shared by all zoom levels.
    image_feature_encoders, image_feature_classifiers : nn.ModuleList
        One encoder / classifier per zoom level, in ``zoom_levels`` order.
    ef_encoders : nn.ModuleDict
        One encoder per EF group; only present when ``image_only_model`` is false.
    concat_encoder, concat_classification
        Encoder and classifier applied to the concatenated embeddings.
    """

    def __init__(
        self,
        training_dataset, 
        validation_dataset,
        image_embedding_architecture = "ResNet18",
        dropout_rate: float = 0.2,
        general_lr: float = 1e-4,
        image_encoder_lr: float = 0,
        batch_size = 32,
        weight_decay = 0,
        lr_scheduler_patience = 2, 
        num_input_channels = 3,
        EF_features = None,
        num_concat_encoder_features = 100,
        num_image_feature_encoder_features = 56,
        num_output_classes = 4,
        zoom_levels = ["1"],
        class_weights = None,
        image_only_model = False,
        
        loss_function_str = "CELoss", # maybe use focal loss for unbalanced multiclass as in GaLeNet
        output_activation = None # CELoss expects unnormalized logits

    ) -> None:
        super().__init__()

        if image_embedding_architecture == "ResNet18":
            num_image_encoder_features = 512
        else: # every other case should be a ViT which outputs 1024 features
            num_image_encoder_features = 1024

        # EF_features defaults to None; treat that as an empty mapping so the
        # default (and image-only) configuration does not crash on .values().
        ef_features = {} if EF_features is None else EF_features

        # total number of EFs present in the EF_features dictionary
        total_num_EFs = sum(map(len, ef_features.values()))

        self.augment = DataAugmentation()

        # the image encoding architecture (e.g. ResNet)
        self.image_encoder = ImageEncoder(
            image_embedding_architecture
        )

        # nn.ModuleList gives a variable number of encoders depending on the
        # zoom levels supplied: one encoding block per zoom level ...
        self.image_feature_encoders = nn.ModuleList([
            GenericEncoder(num_image_encoder_features, num_image_feature_encoder_features, dropout_rate)
            for _ in zoom_levels
        ])

        # ... and one classification block for each embedded zoomed image
        self.image_feature_classifiers = nn.ModuleList([
            ClassificationLayer(num_image_feature_encoder_features, num_output_classes, output_activation)
            for _ in zoom_levels
        ])

        num_concat_inputs = num_image_feature_encoder_features * len(zoom_levels)

        if not image_only_model:
            # each EF group in the dictionary gets its own encoding block,
            # whose output has as many features as the group has EFs
            self.ef_encoders = nn.ModuleDict()
            for key in ef_features:
                num_EFs = len(ef_features[key]) # num EFs in modality
                self.ef_encoders[key] = GenericEncoder(num_EFs, num_EFs, dropout_rate)

            # the concat encoder then also receives the encoded EFs
            num_concat_inputs += total_num_EFs

        # the encoder that takes as input the encoded image features
        # (+ encoded EFs unless image_only_model)
        self.concat_encoder = GenericEncoder(
            num_concat_inputs, num_concat_encoder_features, dropout_rate
        )

        # the classification layer used with the concatenated embedded features
        self.concat_classification = ClassificationLayer(num_concat_encoder_features, num_output_classes, output_activation)

        if image_encoder_lr == 0:
            self.image_encoder.freeze()

        if loss_function_str == "BCELoss":
            self.loss_function = torch.nn.BCELoss()

        elif loss_function_str == "CELoss":
            self.loss_function = torch.nn.CrossEntropyLoss()

        elif loss_function_str == "MSE":
            self.loss_function = torch.nn.MSELoss()

        elif loss_function_str == "weighted_CELoss":
            self.loss_function = torch.nn.CrossEntropyLoss(weight = class_weights)

        else:
            # fail here rather than with an AttributeError in the first step
            raise ValueError(
                f"Unsupported loss_function_str {loss_function_str!r}; expected "
                "'BCELoss', 'CELoss', 'MSE' or 'weighted_CELoss'"
            )

        self.image_encoder_lr = image_encoder_lr
        self.general_lr = general_lr
        self.batch_size = batch_size
        self.lr_scheduler_patience = lr_scheduler_patience
        self.zoom_levels = zoom_levels
        self.weight_decay = weight_decay
        self.image_only_model = image_only_model
        self.EF_features = ef_features

        self.training_dataset = training_dataset
        self.validation_dataset = validation_dataset

        # macro-averaged multiclass accuracy
        self.accuracy = Accuracy(task = 'multiclass', average = 'macro', num_classes = num_output_classes)

        self.save_hyperparameters()


    def forward(self, inputs):
        """Predict from a batch of zoomed images and (optionally) EFs.

        ``inputs`` is indexed with ``"img_zoom_<level>"`` for every entry of
        ``zoom_levels`` and, unless ``image_only_model`` is set, with every
        key of ``EF_features``.

        Returns
        -------
        (concat_predictions, image_feature_predictions)
            The prediction from the concatenated embedding and a list with the
            prediction from each zoom level individually.
        """
        # a list of tensors to be concatenated
        embeddings_to_concat = []

        # for each zoom level, embed the image and encode the embedding
        for i, zoom_level in enumerate(self.zoom_levels):
            image_zoom_embedding = self.image_encoder(inputs["img_zoom_" + zoom_level])
            image_zoom_embedding = self.image_feature_encoders[i](image_zoom_embedding)
            embeddings_to_concat.append(image_zoom_embedding)

        # for each embedded zoom level, predict the output class (this runs
        # before the EF embeddings are appended, so it only sees the images)
        image_feature_predictions = [
            classifier(embedding)
            for classifier, embedding in zip(self.image_feature_classifiers, embeddings_to_concat)
        ]

        if not self.image_only_model:
            # put the embedded EFs into the list; all related EFs share one
            # vector and one encoding block per EF type
            for key in self.EF_features:
                embeddings_to_concat.append(self.ef_encoders[key](inputs[key]))

        # concats the EF and zoomed image embeddings. first dim is batch dimension, so concat along dim = 1
        concat_embedding = torch.concat(embeddings_to_concat, dim = 1)
        concat_embedding = self.concat_encoder(concat_embedding)

        concat_predictions = self.concat_classification(concat_embedding)

        return concat_predictions, image_feature_predictions


    def _compute_losses(self, concat_predictions, image_feature_predictions, y):
        """Sum the loss of the concat prediction and of every per-zoom prediction.

        As in GaLeNet, the losses from the concatenated embedding and from each
        of the zoomed image embeddings are combined by addition. ``y`` is
        flattened before being passed to the loss function.
        """
        targets = y.flatten()

        loss = self.loss_function(concat_predictions, targets)

        for image_feature_prediction in image_feature_predictions:
            loss = loss + self.loss_function(image_feature_prediction, targets)

        return loss


    def configure_optimizers(self):
        """Build Adam with per-block learning rates and a plateau LR scheduler.

        The image encoder uses ``image_encoder_lr``; every other block uses
        ``general_lr``. The EF encoders are only included when the model is
        not image-only. The scheduler monitors ``"val/loss"``.
        """
        # (block, learning rate) pairs, in the order of the parameter groups
        blocks = [
            (self.image_encoder, self.image_encoder_lr),
            (self.concat_classification, self.general_lr),
            (self.concat_encoder, self.general_lr),
        ]
        if not self.image_only_model:
            blocks.append((self.ef_encoders, self.general_lr))
        blocks += [
            (self.image_feature_classifiers, self.general_lr),
            (self.image_feature_encoders, self.general_lr),
        ]

        parameters = [
            {"params": block.parameters(), "lr": lr, "weight_decay": self.weight_decay}
            for block, lr in blocks
        ]

        optimizer = torch.optim.Adam(parameters)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode = "min",
            patience = self.lr_scheduler_patience
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": lr_scheduler,
            "monitor": "val/loss",
        }


    def _shared_step(self, batch, loss_name, accuracy_name):
        """Run one batch, log loss and accuracy under the given names, return the loss."""
        x, y = batch

        concat_predictions, image_feature_predictions = self.forward(x)
        loss = self._compute_losses(concat_predictions, image_feature_predictions, y).mean()

        acc = self.accuracy(concat_predictions, y)

        # all_gather collects the loss from every process; it is used for
        # logging only. The step returns this process' own scalar loss, which
        # is the tensor the backward pass needs.
        gathered_loss = self.all_gather(loss)
        self.log(loss_name, gathered_loss.mean(), logger = True, on_epoch = True)
        self.log(accuracy_name, acc, logger = True, on_epoch = True)

        return loss

    def training_step(self, batch, *args, **kwargs):
        """Compute and log the training loss and accuracy for one batch."""
        return self._shared_step(batch, "train/loss", "train accuracy")

    def validation_step(self, batch, *args, **kwargs):
        """Compute and log the validation loss and accuracy for one batch."""
        return self._shared_step(batch, "val/loss", "val accuracy")

    # NOTE(review): newer PyTorch Lightning releases replaced the two hooks
    # below with on_train_epoch_end / on_validation_epoch_end - confirm against
    # the installed version.
    def training_epoch_end(self, out):
        """Placeholder for work at the end of every training epoch (currently none)."""
        pass

    def validation_epoch_end(self, out):
        """Placeholder for work at the end of every validation epoch (currently none)."""
        pass

    def train_dataloader(self):
        """Return the training DataLoader (default settings, i.e. no shuffling)."""
        # NOTE(review): shuffle is left at its default; confirm a fixed batch
        # order across epochs is intended.
        loader = DataLoader(self.training_dataset, batch_size = self.batch_size)
        return loader

    def val_dataloader(self):
        """Return the validation DataLoader."""
        loader = DataLoader(self.validation_dataset, batch_size = self.batch_size)
        return loader

## Image processing

In [4]:
class HurricaneDataset(Dataset):
    """Dataset yielding zoomed images, EF vectors and the damage class per row.

    Parameters
    ----------
    dataframe : pd.DataFrame
        One row per sample. Must contain an ``"id"`` column (used to build the
        image file name), a ``"damage_class"`` column (the label) and every
        column named in ``EF_features``.
    img_path : str
        Folder containing one ``zoom_<level>`` sub-folder per zoom level, each
        holding ``<id>.png`` files.
    EF_features : dict(str: list(str))
        Maps an EF group name to the dataframe columns belonging to it.
        ``None`` is treated as "no EFs".
    image_embedding_architecture : str
        Selects the image preprocessing: ``"ResNet18"``, ``"ViT_L_16"``,
        ``"Swin_V2_B"`` or ``"SatMAE"``.
    augmentations : callable, optional
        Applied to the opened image before the preprocessing.
    zoom_levels : list(str)
        Zoom levels to load for every sample.

    Raises
    ------
    ValueError
        If ``image_embedding_architecture`` is not one of the names above.
    """

    def __init__(self, dataframe, img_path, EF_features, image_embedding_architecture, augmentations = None, zoom_levels = ["1"]):
        self.dataframe = dataframe
        self.img_path = img_path
        self.EF_features = EF_features
        self.zoom_levels = zoom_levels

        if image_embedding_architecture == "ResNet18":
            self.preprocessing = transforms.Compose([
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ])

        elif image_embedding_architecture == "ViT_L_16":
            self.preprocessing = ViT_L_16_Weights.IMAGENET1K_V1.transforms()

        elif image_embedding_architecture == "Swin_V2_B":
            self.preprocessing = Swin_V2_B_Weights.IMAGENET1K_V1.transforms()

        elif image_embedding_architecture == "SatMAE":
            # values from CustomDatasetFromImages() in https://github.com/sustainlab-group/SatMAE/blob/main/util/datasets.py
            self.preprocessing = transforms.Compose([
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.4182007312774658, 0.4214799106121063, 0.3991275727748871], 
                                         std=[0.28774282336235046, 0.27541765570640564, 0.2764017581939697]),
                ])

        else:
            # without this, self.preprocessing would be missing and the lines
            # below would fail with an AttributeError
            raise ValueError(
                f"Unsupported image_embedding_architecture {image_embedding_architecture!r}; "
                "expected 'ResNet18', 'ViT_L_16', 'Swin_V2_B' or 'SatMAE'"
            )

        if augmentations is not None:
            self.transform = transforms.Compose([augmentations, self.preprocessing])

        else:
            self.transform = self.preprocessing

    def __len__(self):
        """Number of samples, i.e. rows in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(x, label)`` for row ``idx``.

        ``x`` is a dict holding one float tensor per EF group (keyed by group
        name) and one array per zoom level (keyed ``"img_zoom_<level>"``);
        ``label`` is the row's ``damage_class`` as a long tensor.
        """
        image_id = self.dataframe["id"].iloc[idx]
        x = {}

        # for each of the different types of EF (e.g. weather, soil, DEM) grab
        # their associated values. The instance's own mapping is used, not a
        # notebook-level global that happens to share the name.
        ef_features = self.EF_features if self.EF_features is not None else {}
        for key in ef_features:
            x[key] = torch.as_tensor(self.dataframe[ef_features[key]].iloc[idx]).type(torch.FloatTensor)

        label = torch.as_tensor(self.dataframe["damage_class"].iloc[idx]).type(torch.LongTensor)

        for zoom_level in self.zoom_levels:
            path = os.path.join(self.img_path, "zoom_" + zoom_level, str(image_id) + ".png")

            # the context manager closes the file handle once the image has
            # been transformed
            with Image.open(path) as img:
                transformed = self.transform(img)

            # store an independent array copy of the transformed image
            x["img_zoom_" + zoom_level] = np.array(transformed)

        return x, label

## Load datasets

In [6]:
## Functions from Basic_models.ipynb
from typing import List, Union
from pathlib import Path
from functools import reduce

def check_files_in_list_exist(
    file_list: Union[List[str], List[Path]]
    ) -> List[Path]:
    """Return the entries of ``file_list`` that are existing files.

    Every entry is converted to a ``Path`` if it is not one already. Entries
    that cannot be converted, or that do not point to an existing file, are
    reported with ``print`` and left out of the result.

    Parameters
    ----------
    file_list : list of str or Path
        Candidate file paths.

    Returns
    -------
    list of Path
        The existing files, in their original order.
    """
    files_found = []
    for fl in file_list:
        # attempt conversion to Path object if necessary
        if not isinstance(fl, Path):
            try:
                fl = Path(fl)
            except TypeError:
                print(f'{fl} could not be converted to Path object')
                # nothing to check for this entry; without the skip the
                # is_file() call below would fail on the unconverted object
                continue

        if fl.is_file():
            files_found.append(fl)
        else:
            print(f'{fl} not found. Removing from list.')

    return files_found


def read_and_merge_pkls(
    pkl_paths: Union[List[str], List[Path]]
) -> pd.DataFrame:
    """Read in pkl files from list of file paths and merge on index.

    Paths that do not exist are reported and skipped by
    ``check_files_in_list_exist``; the remaining frames are merged left to
    right on their index (``pd.merge`` default, i.e. inner join).

    Raises
    ------
    FileNotFoundError
        If none of the supplied paths exists. Previously this case surfaced
        as an opaque ``TypeError`` from ``reduce`` on an empty list.
    """
    # check all files exist
    pkl_paths_present = check_files_in_list_exist(pkl_paths)
    if not pkl_paths_present:
        raise FileNotFoundError(f'None of the pickle files exist: {list(pkl_paths)}')

    # NOTE: unpickling can execute arbitrary code - only read trusted files
    df_list = [pd.read_pickle(pkl) for pkl in pkl_paths_present]

    return reduce(lambda df1,df2: pd.merge(df1,df2,left_index=True,right_index=True), df_list)


def rename_and_drop_duplicated_cols(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Drop columns which are copies of others and rename the 'asdf_x' headers which would have resulted.

    Steps:
    1. columns holding lists are removed first (they cannot be compared);
    2. of several columns with identical values only the first is kept;
    3. a trailing ``_x`` (the suffix ``pd.merge`` gives to shared column
       names) is removed from the remaining column names.

    Only the suffix is stripped: a name such as ``point_xy_x`` becomes
    ``point_xy``, whereas replacing every ``_x`` would have mangled it.
    Duplicates are dropped with a column mask, so the surviving columns keep
    their original dtypes instead of being turned into ``object`` columns by a
    double transpose.
    """
    # need to ensure no bad types first
    df = drop_cols_containing_lists(df)

    # True for every column whose values repeat an earlier column
    is_duplicate = df.T.duplicated().to_numpy()
    dropped_df = df.loc[:, ~is_duplicate]

    # rename columns for clarity (especially those which are shared between dfs). Will be able to remove most with better
    # column naming further up the process
    suffix = '_x'
    new_col_names = {col: col[:-len(suffix)] for col in dropped_df.columns if col.endswith(suffix)}

    return dropped_df.rename(columns=new_col_names)

def drop_cols_containing_lists(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Return ``df`` without the columns whose first-row value is a list.

    N.B. for speed, only the values in the first row are inspected, so a
    column that holds lists elsewhere but not in its first row is kept.
    A frame without rows is returned unchanged (there is no first row to
    inspect).
    """
    if len(df) == 0:
        return df

    # one flag per column, in column order
    keep_column = [not isinstance(value, list) for value in df.iloc[0]]

    return df.loc[:, keep_column]


In [15]:
# Mount Google Drive in the Colab runtime so the paths below resolve.
drive.mount("/content/drive/")
# Root folder of the project's datasets on Drive; the EF pickle paths are built from it.
data_dir = "/content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets"
# xBD data folder (a sub-folder of data_dir).
xbd_dir = "/content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets/xBD_data"
# Local folder the images are read from (not on Drive).
# NOTE(review): presumably created by the unzip cell further down - confirm.
img_path = "/content/images/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [32]:
# Paths of the pickled EF tables, all relative to data_dir.
# weather EFs
df_noaa_xbd_pkl_path = os.path.join(data_dir, "EFs/weather_data/xbd_obs_noaa_six_hourly_larger_dataset.pkl")
# terrain efs
df_terrain_efs_path = os.path.join(data_dir, "processed_data/Terrian_EFs.pkl")
# flood, storm surge and soil properties
df_topographic_efs_path = os.path.join(data_dir,"processed_data/df_points_posthurr_flood_risk_storm_surge_soil_properties.pkl")
# distance to track, interpolated to different resolutions (ADD LATER)
# df_distance_to_track = os.path.join(data_dir, "processed_data/shortest_dis2hurricanes_varying_res.pkl")

# Read every table that exists and merge them on their index, then drop
# duplicated columns and tidy the merge suffixes.
pkl_paths = [df_noaa_xbd_pkl_path, df_topographic_efs_path, df_terrain_efs_path]
EF_df = read_and_merge_pkls(pkl_paths)
EF_df_no_dups = rename_and_drop_duplicated_cols(EF_df)

# Expose the index as an "id" column; HurricaneDataset builds the image file
# name from dataframe["id"].
EF_df_no_dups["id"] = EF_df_no_dups.index

Copy the manually zipped image archives from Google Drive to the local Colab disk, then unzip them.

In [None]:
!cp -r /content/drive/MyDrive/images-001.zip /content
!cp -r /content/drive/MyDrive/images-002.zip /content

In [None]:
!unzip -q -o images-001.zip -d /content
!unzip -q -o images-002.zip -d /content

## Model loading

In [27]:
 EF_features_withtrack = ['damage_class', 'noaa_obs_date', 'max_sust_wind', 'min_p', 'r_ne_34', 'r_se_34',
                'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50', 'r_nw_50', 'r_sw_50',
                'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64', 'strength',
                'noaa_obs_geometry', 'shortest_distance_to_track', 'geometry',
                'polygon_lnglat', 'point_xy', 'polygon_xy','storm_surge', 'soil_density',
                'sand_content', 'clay_content','silt_content', 'elevation', 'slope',
                'aspect', 'dis2coast', 'dis2hurricane_res10000m', 'dis2hurricane_res8000m',
                'dis2hurricane_res6000m', 'dis2hurricane_res4000m','dis2hurricane_res2000m',
                'dis2hurricane_res1000m', 'dis2hurricane_res800m', 'dis2hurricane_res600m',
                'dis2hurricane_res400m', 'dis2hurricane_res200m']


EF_features_whatisthis = {"weather": ["max_sust_wind", "shortest_distance_to_track", "max_sust_wind", "min_p", 
                       "r_ne_34", "r_se_34", "r_nw_34", "r_sw_34", "r_ne_50",
                       "r_se_50", "r_nw_50", "r_sw_50", "r_ne_64", "r_se_64",
                       "r_nw_64", "r_sw_64", "strength"],
               "soil": ["soil_density", "sand_content", "clay_content", "silt_content"],
               "storm_surge": ["storm_surge"]}

EF_features = [
       'damage_class', 'max_sust_wind', 'min_p', 'r_ne_34',
       'r_se_34', 'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50', 'r_nw_50',
       'r_sw_50', 'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64', 'r_max_wind',
       'strength', 'shortest_distance_to_track',
       'disaster_name_y', 'storm_surge', 'soil_density', 'sand_content',
       'clay_content', 'silt_content', 'elevation', 'slope', 'aspect',
       'dis2coast']

In [None]:
# 80/20 train/validation split. No random_state is given, so the split differs
# between runs.
# NOTE(review): `df` is not defined in the visible cells - presumably this
# should be EF_df_no_dups; confirm.
train_df, val_df = train_test_split(df, test_size = 0.2)
scaled_train_df = train_df.copy()
scaled_val_df = val_df.copy()

# Min-max scale the EF columns: fit on the training split only, then apply the
# same scaling to the validation split.
# NOTE(review): EF_features as defined above contains 'damage_class' (the
# label) and 'disaster_name_y'; confirm these should be scaled.
norm = MinMaxScaler()
scaled_train_df[EF_features] = norm.fit_transform(scaled_train_df[EF_features])
scaled_val_df[EF_features] = norm.transform(scaled_val_df[EF_features])

# augmentations are switched off for now
#augmentations = DataAugmentation()
augmentations = None

# NOTE(review): `trainset` is not used in the visible code (train_dataset
# below is what the model receives) - confirm whether it is still needed.
trainset = HurricaneDataset(scaled_train_df, img_path, EF_features,
                            image_embedding_architecture = "ResNet18",
                            zoom_levels = ["1", "2", "4", "0.5"],
                            augmentations = augmentations)

zoom_levels = ["1", "2", "4", "0.5"]

# class weights for weighted cross-entropy loss
class_weights = compute_class_weight(class_weight = "balanced",
                                     classes = np.unique(scaled_train_df["damage_class"].to_numpy()),
                                     y = scaled_train_df["damage_class"])

class_weights = torch.as_tensor(class_weights).type(torch.FloatTensor)


# datasets handed to the model; no augmentations are passed here
train_dataset = HurricaneDataset(scaled_train_df, img_path, EF_features, image_embedding_architecture = "ResNet18", zoom_levels = zoom_levels)
val_dataset = HurricaneDataset(scaled_val_df, img_path, EF_features, image_embedding_architecture = "ResNet18", zoom_levels = zoom_levels)

# NOTE(review): OverallModel and HurricaneDataset index EF_features as a dict
# of groups (EF_features[key], EF_features.values()), but the EF_features
# defined above is a flat list - confirm which object should be passed.
model = OverallModel(training_dataset = train_dataset,
                     validation_dataset = val_dataset,
                     num_input_channels = 3,
                     EF_features = EF_features,
                     batch_size = 4,
                     image_embedding_architecture = "ResNet18",
                     image_encoder_lr = 0,
                     general_lr = 1e-3,
                     output_activation = None,
                     loss_function_str = "weighted_CELoss",
                     num_output_classes = 5,
                     lr_scheduler_patience = 3,
                     zoom_levels = zoom_levels,
                     class_weights = class_weights,
                     image_only_model = False,
                     weight_decay = 0.001)

max_epochs = 4
# logging interval in steps (presumably passed to the Trainer further down)
log_every_n_steps = 100 # every 100th step, log all the metrics


# stop training once "val/loss" has not improved for 5 validation checks
early_stop_callback = EarlyStopping(monitor="val/loss", patience=5, mode="min")
# start of the wall-clock timer
tic = time.perf_counter()

if torch.cuda.is_available():
    trainer = pl.Trainer(max_epochs = max_epochs, accelerator = 'gpu',
                         log_every_n_steps = log_every_n_steps,
                         callbacks = [early_stop_callback])
else:
    trainer = pl.Trainer(max_epochs = max_epochs,
                         log_every_n_steps = log_every_n_steps,
                         callbacks = [early_stop_callback])
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/
trainer.fit(model)

toc = time.perf_counter()
display(toc - tic)

# CREATE SAVE MODEL CODE

## Plot statistics

In [None]:
# Pseudocode (a plain string, nothing here is executed) for checking model performance.
# Placeholder until a real evaluation cell is written.
"""
model.eval()  # switch to inference mode; eval() takes no data and returns the module

with torch.no_grad():
    predictions = model(test_inputs)  # test_inputs: test samples prepared like the training data

score = metric(predictions, test_df["damage_class"])
"""