# Installs (this restarts the kernel)

In [1]:
%%capture
!pip install pytorch_lightning
!pip install geopandas
!pip install pandas --upgrade
!pip install rich --upgrade
!pip install timm

In [2]:
# The runtime has to restart after the installs above, otherwise the imports in the
# next section raise an error. exit() terminates the kernel; execution counts start
# again at In [1] afterwards.
exit()

# Imports

In [1]:
import os
import warnings
import pandas as pd
from google.colab import drive
import geopandas as gpd
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import torch
import time
import numpy as np
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Name of the GitHub repository that is cloned in the next cell.
repo_name = "hurricane-harm-herald"
# Work from the Colab root so that the clone ends up directly under /content.
target_dir = "/content/"
os.chdir(target_dir)
# Print the working directory to confirm the change.
print(os.getcwd())

/content


In [3]:
# Clone the dev branch of the project into the current working directory.
# The clone is skipped when the directory already exists so the cell can be re-run,
# and a non-zero exit status is raised instead of being silently displayed.
if os.path.isdir(repo_name):
    print(f"{repo_name} already present, skipping clone")
else:
    # The original URL had a stray "@" with no credentials before the host; it is dropped here.
    clone_status = os.system(f"git clone --branch dev https://github.com/ai4er-cdt/{repo_name}")
    if clone_status != 0:
        raise RuntimeError(f"git clone of {repo_name} failed with exit status {clone_status}")

0

In [4]:
%cd hurricane-harm-herald

/content/hurricane-harm-herald


In [5]:
from h3.dataprocessing.DataAugmentation import DataAugmentation
from h3.dataloading.HurricaneDataset import HurricaneDataset
from h3.models.multimodal import OverallModel
from h3.models.balance_process import main as balance_process_main

# Data Loading Functions

In [6]:
## Call function from basic models ipynb
from typing import List, Union
from pathlib import Path
from functools import reduce

def check_files_in_list_exist(
    file_list: Union[List[str], List[Path]]
    ) -> List[Path]:
    """Return the entries of `file_list` that exist as files, reporting the rest.

    Parameters
    ----------
    file_list : list of str or Path
        Candidate file paths. Entries that are not already `Path` objects are
        converted with `Path(...)`.

    Returns
    -------
    list of Path
        The existing files, as `Path` objects, in their original order.
        Entries that cannot be converted to a `Path` or that do not point to a
        file are printed and left out.
    """
    files_found = []
    for fl in file_list:
        # attempt conversion to Path object if necessary
        if not isinstance(fl, Path):
            try:
                fl = Path(fl)
            except TypeError:
                print(f'{fl} could not be converted to Path object')
                # Nothing usable to check: skip the entry rather than calling
                # .is_file() on an object that is not a Path.
                continue

        if fl.is_file():
            files_found.append(fl)
        else:
            print(f'{fl} not found. Removing from list.')

    return files_found


def read_and_merge_pkls(
    pkl_paths: Union[List[str], List[Path]]
) -> pd.DataFrame:
    """Load every pickle in `pkl_paths` that exists and join the frames on their index.

    Paths that do not exist are reported and skipped by
    `check_files_in_list_exist`. The remaining frames are merged pairwise, left
    to right, with `pd.merge(..., left_index=True, right_index=True)`.

    N.B. unpickling can execute arbitrary code, so only pass trusted files.
    """
    def _join_on_index(left, right):
        # pairwise index-on-index merge used to fold the list of frames
        return pd.merge(left, right, left_index=True, right_index=True)

    # keep only the paths that are actually present on disk
    existing_paths = check_files_in_list_exist(pkl_paths)
    frames = list(map(pd.read_pickle, existing_paths))

    return reduce(_join_on_index, frames)


def rename_and_drop_duplicated_cols(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Drop columns that are copies of others and strip the '_x' suffix from column names.

    Steps:
    1. Columns whose first-row value is a list are removed first via
       `drop_cols_containing_lists`, so `drop_duplicates` does not meet that type.
    2. Duplicate columns are removed by transposing, dropping duplicate rows
       and transposing back.
       NOTE(review): the double transpose may leave columns with object dtype — confirm.
    3. Column names ending in '_x' (presumably the suffix added by pandas when merged
       frames share a column name — confirm) lose that suffix. Only the trailing '_x'
       is removed; an '_x' elsewhere in the name is kept.

    Returns a new DataFrame with the remaining, renamed columns.
    """
    # need to ensure no bad types first
    df = drop_cols_containing_lists(df)
    # remove duplicated columns
    dropped_df = df.T.drop_duplicates().T
    # rename columns for clarity (especially those which are shared between dfs). Will be able to remove most with better
    # column naming further up the process
    suffix = '_x'
    new_col_names = {
        col: col[:-len(suffix)]
        for col in dropped_df.columns
        # non-string labels cannot carry the suffix, so they are left alone
        if isinstance(col, str) and col.endswith(suffix)
    }

    return dropped_df.rename(columns=new_col_names)

def drop_cols_containing_lists(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Return `df` without the columns whose first-row value is a list.

    N.B. for speed, only the values in the first row are inspected; a column
    that holds lists only in later rows is not detected.

    An empty frame (no rows or no columns) is returned unchanged, since there
    is no first row to inspect.
    """
    if df.empty:
        return df

    # Positional boolean mask, one entry per column: True means "keep".
    keep_column = [not isinstance(value, list) for value in df.iloc[0]]

    return df.loc[:, keep_column]

# Load dataframes, setup directories

In [7]:
# Mount Google Drive so the project data stored there can be read.
# note: the drive paths may be different for you (e.g. the "datasets" folder is
# called "xBD_data" in some copies), so adjust data_dir to your own layout.
drive.mount("/content/drive/")

# Root folder of the project datasets on Drive; later cells build their paths from it.
data_dir = "/content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets/"

Mounted at /content/drive/


In [8]:
# again, may need to change "xBD_data" to "datasets"

!cp -r /content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets/processed_data/processed_xbd/geotiffs_zoom/images/zoom_05.tar.gz /content
!cp -r /content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets/processed_data/processed_xbd/geotiffs_zoom/images/zoom_1.tar.gz /content
!cp -r /content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets/processed_data/processed_xbd/geotiffs_zoom/images/zoom_2.tar.gz /content
!cp -r /content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/datasets/processed_data/processed_xbd/geotiffs_zoom/images/zoom_4.tar.gz /content

In [9]:
!mkdir /content/images
!mkdir /content/checkpoints

In [10]:
!tar -xzf /content/zoom_05.tar.gz -C /content/images/
!rm /content/zoom_05.tar.gz

!tar -xzf /content/zoom_1.tar.gz -C /content/images/
!rm /content/zoom_1.tar.gz

!tar -xzf /content/zoom_2.tar.gz -C /content/images/
!rm /content/zoom_2.tar.gz

!tar -xzf /content/zoom_4.tar.gz -C /content/images/
!rm /content/zoom_4.tar.gz


In [11]:
# Local directory the zoom-level image archives were extracted into;
# it is passed to HurricaneDataset as the image location further down.
img_path = "/content/images/"

In [None]:
# NOTE(review): `df` is not defined in any visible cell above, so this cell fails on a
# fresh kernel (Restart & Run All); the recorded output presumably comes from an earlier
# session — TODO point this at the intended DataFrame or remove the cell.
df.columns

Index(['xbd_obs_geometry', 'polygon_lnglat', 'pointy', 'polygony',
       'disaster_name', 'image_name', 'capture_date', 'json_link',
       'damage_class', 'xbd_obs_lon', 'xbd_obs_lat', 'noaa_index', 'tag',
       'num_entries', 'noaa_obs_date', 'record_id', 'sys_status',
       'noaa_obs_lat', 'noaa_obs_lon', 'max_sust_wind', 'min_p', 'r_ne_34',
       'r_se_34', 'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50', 'r_nw_50',
       'r_sw_50', 'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64', 'r_max_wind',
       'strength', 'noaa_obs_geometry', 'shortest_distance_to_track',
       'disaster_name_y', 'storm_surge', 'soil_density', 'sand_content',
       'clay_content', 'silt_content', 'elevation', 'slope', 'aspect',
       'dis2coast', 'id'],
      dtype='object')

In [12]:
# Cached, pre-filtered metadata; rebuilt via balance_process_main when the pickle is missing.
filtered_pickle_path = os.path.join(data_dir, "processed_data/metadata_pickle/filtered_lnglat_pre_pol_post_damage.pkl")

if os.path.exists(filtered_pickle_path):
    balanced_df = pd.read_pickle(filtered_pickle_path)
else:
    balanced_df = balance_process_main(data_dir)

# remove unclassified class
# .copy() makes the filtered frame independent of the loaded one, so adding the
# "id" column below neither triggers SettingWithCopyWarning nor writes to a view.
balanced_df = balanced_df.loc[balanced_df.damage_class != 4].copy()
# keep the original index available as a regular column
balanced_df["id"] = balanced_df.index

# Choose EFs, train/val/test split, run model

In [13]:
# Environmental-feature (EF) columns, grouped by data source.
# Open questions kept from the original notes:
#   - it may be unwise to use all features; consider ranking them with
#     random-forest feature importance and keeping only the best ones
#   - n.b. r_max_wind is NaN, so it is deliberately left out

EF_features = dict(
    weather=[
        "max_sust_wind",
        "shortest_distance_to_track",
        "min_p",
        "r_nw_34",
        "r_sw_34",
    ],
    soil=[
        "soil_density",
        "sand_content",
        "clay_content",
        "silt_content",
    ],
    storm_surge=["storm_surge"],
    dem=[
        "elevation",
        "slope",
        "aspect",
        "dis2coast",
    ],
)

In [16]:
# TODO: insert Lisanne's df downsampling code here

# The same fixed seed is used for both splits so the partition is reproducible.
split_seed = 1

# First hold out 10% of the rows as the test set ...
train_val_df, test_df = train_test_split(balanced_df, test_size=0.1, random_state=split_seed)
# ... then take 0.2/0.9 of the remaining rows for validation, i.e. roughly 20% of the full data.
train_df, val_df = train_test_split(train_val_df, test_size=0.2/0.9, random_state=split_seed)

In [18]:
# Scale exactly the columns listed in EF_features: flattening the groups (in their
# defined order) keeps this list in sync with the feature selection instead of
# maintaining a second hand-written copy.
features_to_scale = [feature for group in EF_features.values() for feature in group]

# Work on copies so the unscaled splits stay available.
scaled_train_df = train_df.copy()
scaled_val_df = val_df.copy()
scaled_test_df = test_df.copy()

# Fit the scaler on the training split only, then apply the same transform to the
# validation and test splits so no information leaks from them into the scaling.
scaler = MinMaxScaler()
scaled_train_df[features_to_scale] = scaler.fit_transform(train_df[features_to_scale])
scaled_val_df[features_to_scale] = scaler.transform(val_df[features_to_scale])
scaled_test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])

In [19]:
# Augmentation object built with its default arguments; it is passed only to the training dataset below.
augmentations = DataAugmentation()

In [None]:
# Zoom levels used for this run; alternative kept from the original notes:
# zoom_levels = ["1", "2", "4", "0.5"]
zoom_levels = ["1"]
image_embedding_architecture = "SatMAE"

# class weights for weighted cross-entropy loss (currently disabled)
# class_weights = compute_class_weight(class_weight = "balanced",
#                                      classes = np.unique(train_df["damage_class"].to_numpy()),
#                                      y = train_df["damage_class"])

# class_weights = torch.as_tensor(class_weights).type(torch.FloatTensor)


# Options that are identical for the training and the validation dataset.
shared_dataset_options = dict(
    image_embedding_architecture = image_embedding_architecture,
    zoom_levels = zoom_levels,
)

# Only the training dataset receives the augmentations.
train_dataset = HurricaneDataset(scaled_train_df, img_path, EF_features,
                                 augmentations = augmentations,
                                 **shared_dataset_options)

val_dataset = HurricaneDataset(scaled_val_df, img_path, EF_features,
                               **shared_dataset_options)

# `cuda_device` was never assigned anywhere in the notebook; derive it from torch
# so this cell works on a fresh kernel.
cuda_device = torch.cuda.is_available()

if cuda_device:
    # allow lower-precision float32 matmuls on the GPU
    torch.set_float32_matmul_precision('medium')
    num_workers = 4
else:
    num_workers = 0

# persistent workers are only requested when worker processes are used at all
persistent_w = num_workers > 0

# All constructor arguments for the multimodal model, collected in one place.
model_options = dict(
    training_dataset = train_dataset,
    validation_dataset = val_dataset,
    num_input_channels = 3,
    EF_features = EF_features,
    batch_size = 64,
    image_embedding_architecture = image_embedding_architecture,
    image_encoder_lr = 0,
    general_lr = 1e-3,
    output_activation = None,
    loss_function_str = "CELoss",
    num_output_classes = 4,
    lr_scheduler_patience = 3,
    zoom_levels = zoom_levels,
    # class_weights = class_weights,   (disabled together with the class-weight computation)
    image_only_model = False,
    weight_decay = 0.001,
    num_workers = num_workers,
    persistent_w = persistent_w,
)

model = OverallModel(**model_options)

max_epochs = 1
log_every_n_steps = 100


# Stop training when the validation loss has not improved for 5 validation checks.
early_stop_callback = EarlyStopping(monitor="val/loss", patience=5, mode="min")

experiment_name = "experiment_test1"

# Keep only the checkpoint with the lowest validation loss.
# The metric name contains "/", which the automatic "name=value" insertion would copy
# into the file name and thereby create a nested "val/" directory. The name is therefore
# written out by hand and auto_insert_metric_name is switched off.
checkpoint_callback = ModelCheckpoint(
        monitor="val/loss",
        dirpath=os.path.join(data_dir, "checkpoints", experiment_name),
        filename="epoch={epoch}-val_loss={val/loss:.4f}",
        auto_insert_metric_name=False,
        save_top_k=1,        # save the best model
        mode="min",
        every_n_epochs=1
    )

tic = time.perf_counter()

if torch.cuda.is_available():
    trainer = pl.Trainer(max_epochs = max_epochs, accelerator = 'gpu',
                         log_every_n_steps = log_every_n_steps,
                         callbacks = [checkpoint_callback, early_stop_callback],
                         profiler = "simple")
else:
    trainer = pl.Trainer(max_epochs = max_epochs,
                         log_every_n_steps = log_every_n_steps,
                         callbacks = [checkpoint_callback, early_stop_callback],
                         profiler = "simple")
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/
trainer.fit(model)

toc = time.perf_counter()
display(toc - tic)

# Load a model from a saved checkpoint

In [None]:
# Load the best checkpoint written by checkpoint_callback during the training run above.
# To load a checkpoint from an earlier session, set ckpt_path to that file explicitly.
ckpt_path = checkpoint_callback.best_model_path
if not ckpt_path:
    raise FileNotFoundError("No checkpoint has been saved yet: run the training cell first or set ckpt_path manually.")

# training_dataset and validation_dataset still have to be supplied here
# (presumably they are not stored with the checkpoint — TODO confirm).
model_from_ckpt = OverallModel.load_from_checkpoint(ckpt_path,
                                                    training_dataset = train_dataset,
                                                    validation_dataset = val_dataset)
