# Setting up the project in Google Colab

In [None]:
# Cloning repository into current folder
!git clone https://github.com/andraspalasti/deeplearning-hw.git
!mv deeplearning-hw/* .
!rm -rf deeplearning-hw/

# Install the packages used
%pip install -r requirements.txt

In [None]:
from pathlib import Path

# Root folder for all datasets, relative to the current working directory
data_dir = Path() / 'data'

## Download raw dataset

To download the dataset from Kaggle you need a Kaggle account and its API credentials (username and API key).

What do these cells do?
1. Download raw dataset from kaggle
1. Unzip the downloaded dataset
1. Divide dataset into train, val, test datasets


In [None]:
#Set the enviroment variables for authentication
import os
os.environ['KAGGLE_USERNAME'] = "palstiandrs"
os.environ['KAGGLE_KEY'] = "fbdfe3ac6bdf77c68b2c3da0e8dedd47"

# Download the dataset
!mkdir -p data/raw/
!kaggle competitions download -c airbus-ship-detection -p data/raw/

In [None]:
# Unzipping the downloaded data
!unzip -u -d data/raw/ data/raw/airbus-ship-detection.zip

In [None]:
!echo "Number of images in raw dataset: $(ls -l data/raw/train_v2/ | wc -l)"

In [None]:
from math import floor

# Cap the number of images we use at 100,000,
# otherwise the dataset would be too big
num_images = min(
    sum(1 for _ in (data_dir / 'raw' / 'train_v2').glob('*.jpg')),
    100_000,
)

# 80% / 10% / 10% split; every size is rounded down
train_size, val_size, test_size = (
    floor(num_images * fraction) for fraction in (0.8, 0.1, 0.1)
)
num_images

In [None]:
# Split training images into train, val, test sets using images from the raw dataset
!mkdir -p data/processed/

# Create training dataset
!mkdir -p data/processed/train/
!find data/raw/train_v2/ -name "*.jpg" | head -n {train_size} | tr '\n' '\0' \
    | xargs -0 mv -t data/processed/train/
!cp data/raw/train_ship_segmentations_v2.csv data/processed/train_ship_segmentations.csv

# Create validation dataset
!mkdir -p data/processed/val/
!find data/raw/train_v2/ -name "*.jpg" | head -n {val_size} \
    | tr '\n' '\0' | xargs -0 mv -t data/processed/val/
!cp data/raw/train_ship_segmentations_v2.csv data/processed/val_ship_segmentations.csv

# Create test dataset
!mkdir -p data/processed/test/
!find data/raw/train_v2/ -name "*.jpg" | head -n {test_size} \
    | tr '\n' '\0' | xargs -0 mv -t data/processed/test/
!cp data/raw/train_ship_segmentations_v2.csv data/processed/test_ship_segmentations.csv

In [None]:
from src.data import filter_missing

proc_dir = data_dir / 'processed'

# Run filter_missing once per split, handing it that split's annotation CSV
# and the folder holding that split's images.
# NOTE(review): presumably this drops annotations whose image is not in the
# folder - confirm against src.data.filter_missing.
for dataset in ('train', 'val', 'test'):
    annotations_csv = proc_dir / f'{dataset}_ship_segmentations.csv'
    images_dir = proc_dir / dataset
    filter_missing(annotations_csv, images_dir)

In [None]:
# Sanity check of the result: count the .jpg files found under the
# sub-folders of data/processed/
!echo "Number of images in dataset: $(find data/processed/*/ -name "*.jpg" | wc -l)"
# Total disk usage of data/processed in human-readable units (du prints the
# size followed by the path)
!echo "Size of dataset on disk: $(du -sh data/processed)"