# AIMS AI4Science Practical
This practical was developed by Kelsey Doerksen and Shreshth Malik from the [Oxford Applied and Theoretical Machine Learning Group](https://oatml.cs.ox.ac.uk/).



## Part 0: Add the data as a shortcut to your drive

> The data is accessible [here](https://drive.google.com/drive/folders/1-VcjxgyaATr1fRTpGnwVZRGi0RF7CeVI?usp=drive_link). Add this folder as a shortcut to your Google Drive so that you can access it from this notebook.

## Part 1: Data Exploration
An important part of any applied ML problem, particularly in scientific contexts, is a well-rounded understanding of the dataset you are working with. We've provided the initial code to begin exploring the Mars Frost dataset.


In [None]:
# Imports
import pandas as pd
import numpy as np
import os
import os.path as op
import json
from pathlib import Path
import shutil
from tqdm.autonotebook import tqdm
import logging
import zipfile
import imageio.v3 as iio
import matplotlib.pyplot as plt
from PIL import Image
import random
from tqdm import tqdm
import shutil
import shapely
from omegaconf import OmegaConf
from mars_frost.utils import DataUnit, get_metadata, get_sample, ImageMeta

%load_ext autoreload
%autoreload 2


### Step 1.0: Directory Setup

First, we will set up the correct data directories and become familiar with the data storage structure.
This data has been delivered directly from the science team, so we need to work around their existing data structure. Data is stored in subdirectories with the naming convention `OBSERVATION-ID_IMAGE-COORDINATES`. Each subdirectory contains a `labels` folder and a `tiles` folder, holding the JSON labels that describe each tile and the corresponding `.png` tile files, respectively.

In [124]:
# Set the path location to where you have added the shortcut directory of the Mars Frost dataset
data_dir = Path("practical_data")

# Load each split through DataUnit.from_ids, using the ID list
# `<split>_source_images.txt` that sits inside the data directory.
train_data = DataUnit.from_ids(data_dir, data_dir / "train_source_images.txt")
test_data = DataUnit.from_ids(data_dir, data_dir / "test_source_images.txt")
val_data = DataUnit.from_ids(data_dir, data_dir / "val_source_images.txt")


def restructure(new_dir: str):
    """Copy the directories of the train/test/val splits under ``new_dir``.

    Every directory listed in a split's ``dirs`` is copied to
    ``new_dir/<split>/<directory stem>``. Destination directories that
    already exist are reused instead of raising.

    Args:
        new_dir: Root directory of the restructured copy.
    """
    target_root = Path(new_dir)
    splits = {"train": train_data, "test": test_data, "val": val_data}

    for split_label, unit in splits.items():
        split_root = target_root / split_label
        for source_dir in unit.dirs:
            destination = split_root / source_dir.stem
            destination.mkdir(parents=True, exist_ok=True)
            shutil.copytree(source_dir, destination, dirs_exist_ok=True)


In [125]:
# Find all subframe directories
# subdirs = [Path(subdir.stem) for subdir in data_dir.iterdir() if subdir.is_dir()]
# # List of image ids
# src_image_ids = ["_".join(a_path.name.split("_")[:3]) for a_path in subdirs]
# # Check that our image ids are greater than 0
# print(len(src_image_ids))

### Step 2.0: Loading Data and Class Balance
We have prepared a list of IDs for a train, test and validation split for the dataset as text files `train_source_images.txt`, `test_source_images.txt`, `val_source_images.txt`. You can use these splits or choose your own.


> **Question:** What should you consider when splitting your data into train, test, and validation?

> **Task:** Run the following code to report the class balance.

In [None]:
# Report the frost and background counts from each split's class balance.
print(
    f"Train Frost Count: {train_data.class_balance.frost_count}",
    f"Train Background Count: {train_data.class_balance.background_count}",
    sep="\n",
)

print(
    f"Test Frost Count: {test_data.class_balance.frost_count}",
    f"Test Background Count: {test_data.class_balance.background_count}",
    sep="\n",
)

print(
    f"Validate Frost Count: {val_data.class_balance.frost_count}",
    f"Validate Background Count: {val_data.class_balance.background_count}",
    sep="\n",
)

> **Question:** What is the class balance of the dataset? Include the class balance for the total data as well as the train, test, and validation splits.

### Step 3.0: Exploring the Dataset: Metadata
There is helpful information in each tile's metadata. Run the code below to query subdirectory paths from the training set and print their metadata information.

In [127]:
def print_metadata(sample_filepath):
    """
    Print the keys and the full contents of one label metadata file.

    ``sample_filepath`` is passed unchanged to ``get_metadata``; the result
    is shown first as its key view and then pretty-printed as JSON.
    """
    metadata = get_metadata(sample_filepath)

    # Header, key overview and a spacer, each on its own print call
    for item in ("Metadata keys are:\n", metadata.keys(), "\n"):
        print(item)

    # Full record, indented for readability
    print("The data looks like:\n")
    pretty = json.dumps(metadata, indent=4)
    print(pretty)


# Printing Frost Metadata Sample


In [None]:
# Show the current `image` without any overlay.
# NOTE(review): read top to bottom, `image` is only assigned by the later
# get_sample(...) cell — run that cell first.
plt.imshow(image)

In [None]:
# Plot the image and overlay annotations


def plot_image_and_annotations(image, metadata, meters_per_pixel=0.5):
    """Show ``image`` with its annotations overlaid and axes labelled in meters.

    Args:
        image: Image array; ``image.shape[0]`` and ``image.shape[1]`` are used
            as the height and width in pixels.
        metadata: Label metadata passed to ``ImageMeta.from_dict`` together
            with ``image.shape``.
        meters_per_pixel: Factor that converts pixel tick positions to meters.
            Defaults to 0.5, the resolution this notebook has always used.

    Returns:
        The matplotlib figure that holds the plot.
    """
    height, width = image.shape[0], image.shape[1]

    ax = plt.imshow(image).axes

    image_meta = ImageMeta.from_dict(metadata, image.shape)
    color_map = {
        "unknown_type": "red",
        "co_2": "blue",
    }
    image_meta.plot(ax, "red", color_map)

    # One tick every 100 px, labelled with the corresponding distance in meters.
    # The same arrays feed positions and labels so their lengths always match.
    x_ticks = np.arange(0, width, 100)
    y_ticks = np.arange(0, height, 100)
    ax.set_xticks(x_ticks)
    ax.set_yticks(y_ticks)
    ax.set_xticklabels([f"{x * meters_per_pixel:.1f}" for x in x_ticks])
    ax.set_yticklabels([f"{y * meters_per_pixel:.1f}" for y in y_ticks])

    ax.set_xlabel("Relative width (m)")
    ax.set_ylabel("Relative height (m)")

    # Pin the view to the image extent; y runs top-down so row 0 stays on top.
    ax.set_xlim(0, width)
    ax.set_ylim(height, 0)

    fig = ax.figure
    # Lay out last so the axis labels are taken into account.
    fig.tight_layout()

    return fig

    # for k in range(100):
# Fetch one (image, metadata) pair from the training split and plot it.
# NOTE(review): the meaning of the second positional argument (True) is
# defined by get_sample — confirm against mars_frost.utils.
image, metadata = get_sample(train_data, True)

fig = plot_image_and_annotations(image, metadata)

In [None]:
# Build the ImageMeta wrapper for the sampled image from its label metadata.
image_meta = ImageMeta.from_dict(metadata, image.shape)

# NOTE(review): presumably the annotated (frost) coverage of the tile —
# confirm what get_coverage() actually returns.
coverage = image_meta.get_coverage()

print(coverage)


In [None]:
# Show the sampled `image` on its own, without the annotation overlay.
plt.imshow(image)

In [132]:
def plot_images(directory_path, file_list, file_ref, class_type, max_imgs_per_row=5):
    """
    Plot up to ``max_imgs_per_row`` images from one directory in a single row.

    Args:
        directory_path: Directory containing the files named in ``file_list``.
        file_list: File names relative to ``directory_path``; the first
            ``max_imgs_per_row`` entries are shown.
        file_ref: Identifier of where the images come from, used in the title.
        class_type: Class name shown in the figure title.
        max_imgs_per_row: Number of panels (columns) in the figure.
    """
    # squeeze=False keeps `axs` two-dimensional even for a single column.
    fig, axs = plt.subplots(
        1, max_imgs_per_row, figsize=(16, max_imgs_per_row), squeeze=False
    )
    panels = axs[0]

    # zip stops at the shorter input, so having fewer files than panels
    # leaves the remaining panels blank instead of raising IndexError.
    for ax, file_name in zip(panels, file_list):
        img = iio.imread("{}/{}".format(directory_path, file_name))
        ax.imshow(img, cmap="gray")

    # Hide the axes of every panel, used or not.
    for ax in panels:
        ax.axis("off")

    fig.suptitle("{} Examples from {}".format(class_type, file_ref), fontsize=16)
    fig.tight_layout()
    plt.show()

In [None]:
# Printing Background Metadata Sample
# Pick a random entry of train_subdirs via train_background_idxs and point at
# its background label folder.
meta_dir = (
    data_dir
    / train_subdirs[random.choice(train_background_idxs)]
    / "labels/background"
)
label_files = os.listdir(meta_dir)
# Choose any index you like
print_metadata(f"{meta_dir}/{label_files[10]}")

> **Question:** What contextual information is available in the data label metadata? What value could this information provide?

### Step 4.0: Visualizing the Data
Now that we have taken a look at some of the metadata and found the class balance, let's visualize some examples of frost and no frost (background). Run the code below to plot samples from both classes.
Note: We suggest plotting the data in format `cmap="gray"` to more easily visualize and better represent what the data looked like when collected by HiRISE.

In [57]:
def plot_images(directory_path, file_list, file_ref, class_type, max_imgs_per_row=5):
    """
    Plot up to ``max_imgs_per_row`` images from one directory in a single row.

    Args:
        directory_path: Directory containing the files named in ``file_list``.
        file_list: File names relative to ``directory_path``; the first
            ``max_imgs_per_row`` entries are shown.
        file_ref: Identifier of where the images come from, used in the title.
        class_type: Class name shown in the figure title.
        max_imgs_per_row: Number of panels (columns) in the figure.
    """
    # squeeze=False keeps `axs` two-dimensional even for a single column.
    fig, axs = plt.subplots(
        1, max_imgs_per_row, figsize=(16, max_imgs_per_row), squeeze=False
    )
    panels = axs[0]

    # Read from the directory passed in, not from a notebook global.
    # zip stops at the shorter input, so having fewer files than panels
    # leaves the remaining panels blank instead of raising IndexError.
    for ax, file_name in zip(panels, file_list):
        img = iio.imread("{}/{}".format(directory_path, file_name))
        ax.imshow(img, cmap="gray")

    # Hide the axes of every panel, used or not.
    for ax in panels:
        ax.axis("off")

    fig.suptitle("{} Examples from {}".format(class_type, file_ref), fontsize=16)
    fig.tight_layout()
    plt.show()

In [None]:
# Plot some Frost samples
# Pick a random entry of train_subdirs via train_frost_idxs and plot the
# tiles found in its frost folder.
sample = str(train_subdirs[random.choice(train_frost_idxs)])
meta_dir = data_dir / sample / "tiles/frost"
tile_files = os.listdir(meta_dir)
plot_images(
    directory_path=meta_dir,
    file_list=tile_files,
    file_ref=sample,
    class_type="Frost",
)

In [None]:
# Plot some Background samples
# Pick a random entry of train_subdirs via train_background_idxs and plot the
# tiles found in its background folder.
sample = str(train_subdirs[random.choice(train_background_idxs)])
meta_dir = data_dir / sample / "tiles/background"
tile_files = os.listdir(meta_dir)
plot_images(
    directory_path=meta_dir,
    file_list=tile_files,
    file_ref=sample,
    class_type="Background",
)

> **Question:** Plot examples of the background and frost classes. What differences do you notice?

many visible rocks
frost is smooth and flowy

> **Question:** How can you use visualization tools (plots, figures, diagrams) to communicate information about the dataset (data distribution, etc), that would provide scientific value? Create at least one example of a visual representation that tells something about the data.

### Don't need to run! Step 5.0: Preparing for Modelling (Part 2 prep)
The dataset currently consists of `.png` files with `json` counterpart labels. If you prefer to do further data manipulation, you can save the data as `.npy` files. The code below splits the train, test, and validation tiles into subfolders for easy loading using Keras in Part 2.

In [60]:
def organize_data(save_dir, idx_list, subdir_list, data_label, data_type):
    """
    Save data into train, test, val folders for further processing later

    For every index in ``idx_list`` the files found in
    ``data_dir / subdir_list[idx] / data_type / data_label`` are copied with
    ``shutil.copy2`` into ``save_dir / data_label``.

    Args:
        save_dir: Destination root (str or Path) for one split.
        idx_list: Indices into ``subdir_list`` selecting the subdirectories
            to copy from.
        subdir_list: Subdirectory names relative to the notebook-level
            ``data_dir``.
        data_label: Class folder name; callers pass "frost" or "background".
        data_type: Kind of file to copy; callers pass "labels" or "tiles".
    """
    target_dir = Path(save_dir) / data_label
    # copy2 does not create missing parent directories, so make sure the
    # class folder exists before the first copy.
    target_dir.mkdir(parents=True, exist_ok=True)

    for idx in tqdm(idx_list, desc="Processing {} indices".format(data_label)):
        source_dir = data_dir / subdir_list[idx] / data_type / data_label
        for file in source_dir.iterdir():
            shutil.copy2(file, target_dir / file.name)

In [None]:
# Prints an empty line. NOTE(review): looks like a leftover scratch cell.
print()

In [None]:
# --- Train Data: Frost ---
# Uses the same Drive location as the other label cells (no separate
# `drive_mount` variable). The split folder is created up front.
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_labels_organized/train"
)
save_dir.mkdir(parents=True, exist_ok=True)
organize_data(save_dir, train_frost_idxs, train_subdirs, "frost", "labels")

In [None]:
# Frost tiles of the training split
save_dir = Path("/content/drive/MyDrive/AIMS AI4Science/practical_data_organized/train")
organize_data(
    save_dir=save_dir,
    idx_list=train_frost_idxs,
    subdir_list=train_subdirs,
    data_label="frost",
    data_type="tiles",
)

In [None]:
# --- Train Data: Background ---
# Background label files of the training split first ...
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_labels_organized/train/"
)
organize_data(
    save_dir=save_dir,
    idx_list=train_background_idxs,
    subdir_list=train_subdirs,
    data_label="background",
    data_type="labels",
)

# ... then the matching tiles
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_data_organized/train/"
)
organize_data(
    save_dir=save_dir,
    idx_list=train_background_idxs,
    subdir_list=train_subdirs,
    data_label="background",
    data_type="tiles",
)

In [None]:
# --- Test Data: Frost ---
# Frost label files of the test split first ...
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_labels_organized/test/"
)
organize_data(
    save_dir=save_dir,
    idx_list=test_frost_idxs,
    subdir_list=test_subdirs,
    data_label="frost",
    data_type="labels",
)

# ... then the matching tiles
save_dir = Path("/content/drive/MyDrive/AIMS AI4Science/practical_data_organized/test/")
organize_data(
    save_dir=save_dir,
    idx_list=test_frost_idxs,
    subdir_list=test_subdirs,
    data_label="frost",
    data_type="tiles",
)

In [None]:
# --- Test Data: Background ---
# Background label files of the test split first ...
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_labels_organized/test"
)
organize_data(
    save_dir=save_dir,
    idx_list=test_background_idxs,
    subdir_list=test_subdirs,
    data_label="background",
    data_type="labels",
)

# ... then the matching tiles
save_dir = Path("/content/drive/MyDrive/AIMS AI4Science/practical_data_organized/test")
organize_data(
    save_dir=save_dir,
    idx_list=test_background_idxs,
    subdir_list=test_subdirs,
    data_label="background",
    data_type="tiles",
)

In [None]:
# --- Val Data: Frost ---
# Frost label files of the validation split first ...
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_labels_organized/validate"
)
organize_data(
    save_dir=save_dir,
    idx_list=validate_frost_idxs,
    subdir_list=validate_subdirs,
    data_label="frost",
    data_type="labels",
)

# ... then the matching tiles
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_data_organized/validate"
)
organize_data(
    save_dir=save_dir,
    idx_list=validate_frost_idxs,
    subdir_list=validate_subdirs,
    data_label="frost",
    data_type="tiles",
)

In [None]:
# --- Val Data: Background ---
# Background label files of the validation split first ...
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_labels_organized/validate"
)
organize_data(
    save_dir, validate_background_idxs, validate_subdirs, "background", "labels"
)

# ... then the matching tiles. The class label must be "background" here as
# well, so the tiles selected by the background indices land in the
# background folder.
save_dir = Path(
    "/content/drive/MyDrive/AIMS AI4Science/practical_data_organized/validate"
)
organize_data(
    save_dir, validate_background_idxs, validate_subdirs, "background", "tiles"
)