<a href="https://colab.research.google.com/github/btsmith29/AMLS_II_assignment23_24/blob/main/cld_data_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cassava Leaf Disease Data Download

Interactive Notebook to download the data from Kaggle and store it on Google Drive for the purposes of the assignment.

This notebook is not designed to be re-run as part of the assessment, because it depends on access to Google Drive and on Kaggle API secrets.

To limit the amount of storage required, the `tfrecords` files are left out: only the JPEG training images, `train.csv` and the label mapping file are kept.



In [1]:
# Install the Kaggle command-line client, which is used below to download the data.
!pip install -q kaggle

In [2]:
from google.colab import drive, files, userdata
from pathlib import Path
from PIL import Image

import json
import matplotlib.pyplot as plt
import shutil
import zipfile

In [3]:
# Mount Google Drive so that the zipped dataset can be copied onto it.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [4]:
def mkdir(path_str: str) -> Path:
  """
  Makes sure the directory at path_str exists, creating any missing parent
  directories along the way, and returns it as a Path.
  """
  directory = Path(path_str)
  directory.mkdir(exist_ok=True, parents=True)
  return directory


def load_kaggle_creds() -> None:
  """
  Creates the kaggle.json credentials file, which the API expects.

  The username and key are read from the Colab secrets store (secrets named
  "kaggle_username" and "kaggle_key") and written directly to
  ~/.kaggle/kaggle.json.  No copy of the credentials is left in the working
  directory, and the file is restricted to the owner (mode 600) before the
  secret is written to it.
  """
  # Stored in the Colab secrets store.
  creds_dict = {
      "username": userdata.get("kaggle_username"),
      "key": userdata.get("kaggle_key"),
  }

  creds_file = Path.home() / ".kaggle" / "kaggle.json"
  creds_file.parent.mkdir(parents=True, exist_ok=True)

  # Lock the file down first: touch() only applies the mode when it creates
  # the file, so chmod() covers the case where it already existed.
  creds_file.touch(mode=0o600, exist_ok=True)
  creds_file.chmod(0o600)
  creds_file.write_text(json.dumps(creds_dict))


def download_competition_data() -> Path:
  """
  Download & Extract the competition dataset from Kaggle.
  """
  load_kaggle_creds()
  !kaggle competitions download -c cassava-leaf-disease-classification
  path = mkdir("/content/data/cassava-leaf-disease-classification")
  with zipfile.ZipFile("cassava-leaf-disease-classification.zip", "r") as z:
    z.extractall(path)
  return path


def create_assignment_dataset(
    raw_path: Path,
    dataset_dir: str = "/content/data/cldc_assignment_data",
) -> Path:
  """
  Processes the raw competition dataset to make it more suitable for the
  purposes of the assignment.  Just take the training images (ignoring the
  tfrecords format) and metadata (labels etc).

  raw_path: directory holding the extracted competition data.  It must
    contain a "train_images" folder plus the "train.csv" and
    "label_num_to_disease_map.json" files.
  dataset_dir: directory in which the assignment dataset is built; it is
    created if it does not exist yet.

  Returns the path of the assignment dataset directory.

  Raises FileNotFoundError if raw_path has no "train_images" folder, or if
  one of the metadata files is missing.
  """
  raw_path = Path(raw_path)
  src_path = raw_path / "train_images"
  # Fail early: globbing a missing folder would silently yield no images.
  if not src_path.is_dir():
    raise FileNotFoundError(f"No train_images directory found in {raw_path}")

  dataset_path = Path(dataset_dir)
  dest_path = dataset_path / "train_images"
  # Creating the nested folder also creates dataset_path itself.
  dest_path.mkdir(parents=True, exist_ok=True)

  # The images are copied across unmodified.
  for img in src_path.glob("*.jpg"):
    shutil.copy(img, dest_path / img.name)

  # also need the labels and the mapping description
  for f in ["train.csv", "label_num_to_disease_map.json"]:
    shutil.copy(raw_path / f, dataset_path)

  return dataset_path


def zip_and_copy_to_drive(
    data: Path,
    drive_dir: str = "/content/gdrive/MyDrive/Study/ds_and_ml/UCL_AMLSII/assignment/",
    archive_base: str = "/content/assignment_data",
) -> None:
  """
  Zips the contents of the data directory and copies the archive into
  drive_dir.

  data: directory whose contents are placed at the root of the zip archive.
  drive_dir: existing directory to copy the archive into; defaults to the
    assignment folder on the mounted Google Drive.
  archive_base: path of the local archive to create, without the ".zip"
    extension.

  Raises FileNotFoundError if drive_dir is not an existing directory.
  """
  destination = Path(drive_dir)
  # Check the destination before archiving, so that a missing folder (for
  # example an unmounted Drive) fails straight away rather than after the
  # whole dataset has been zipped.
  if not destination.is_dir():
    raise FileNotFoundError(
        f"Destination directory not found (is Google Drive mounted?): "
        f"{destination}")

  zip_file = shutil.make_archive(archive_base, "zip", data)
  shutil.copy(zip_file, destination)


def main() -> None:
  """
  Runs the whole pipeline: download the raw competition data, build the
  assignment dataset from it, then zip that dataset and copy it to Drive.
  """
  zip_and_copy_to_drive(
      create_assignment_dataset(
          download_competition_data()))

In [5]:
# Run the full pipeline: download, build the assignment dataset, copy to Drive.
main()

Downloading cassava-leaf-disease-classification.zip to /content
100% 5.76G/5.76G [01:01<00:00, 161MB/s]
100% 5.76G/5.76G [01:01<00:00, 100MB/s]


In [6]:
#!rm -rf "/content/data"