<a href="https://colab.research.google.com/github/amedyukhina/biomassters/blob/main/biomassters_dataprep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile requirements.txt

pandas==1.3.5
tqdm==4.64.0
numpy==1.21.6
scikit-image==0.18.3
matplotlib==3.2.2
rasterio==1.2.0
boto3==1.26.16

Writing requirements.txt


In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tqdm==4.64.0
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 2.7 MB/s 
Collecting rasterio==1.2.0
  Downloading rasterio-1.2.0-cp38-cp38-manylinux1_x86_64.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 29.5 MB/s 
[?25hCollecting boto3==1.26.16
  Downloading boto3-1.26.16-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 50.8 MB/s 
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Collecting affine
  Downloading affine-2.3.1-py2.py3-none-any.whl (16 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting botocore<1.30.0,>=1.29.16
  Downloading botocore-1.29.37-py3-none-any.whl (10.3 MB)
[K     |████████████████████████████████| 10.

In [20]:
from google.colab import drive
import os
import rasterio
import pandas as pd
from tqdm import tqdm
import numpy as np
from skimage import io

import boto3
from botocore import UNSIGNED
from botocore.config import Config

from cachetools import cached, TTLCache

import warnings
from tqdm.notebook import tqdm
import shutil

# Our rasters contain no geolocation info, so silence this warning from rasterio
warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)

In [4]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Prepare list of chip IDs

In [5]:
# Locations on the mounted Google Drive. These are relative paths, so they only
# resolve while the working directory is the parent of the mount point (/content).
feature_path = 'gdrive/MyDrive/biomassters/info/features_metadata.csv'
# NOTE: other cells may build paths by plain string concatenation, so keep the trailing '/'.
data_path = 'gdrive/MyDrive/biomassters/data/'
# Sub-directory names; the same names are used as key prefixes when downloading from S3.
train_img_dir = 'train_features'
test_img_dir = 'test_features'
label_dir = 'train_agbm'

In [6]:
# Load the feature metadata table; its 'split' and 'chip_id' columns are used below.
df = pd.read_csv(feature_path)

In [7]:
# Metadata rows of the test split, re-indexed from zero.
is_test_row = df['split'].eq('test')
df_pred = df.loc[is_test_row].reset_index(drop=True)
# Sorted array of the distinct chip IDs that need predictions.
pred_ids = np.unique(df_pred['chip_id'])
# Display (number of test rows, number of distinct test chips).
len(df_pred), len(pred_ids)

(63348, 2773)

In [8]:
# NOTE: `df` is rebound to the training rows only, so the rows of the other
# splits are no longer reachable through `df` after this cell has run.
is_train_row = df['split'].eq('train')
df = df.loc[is_train_row].reset_index(drop=True)
# Sorted array of the distinct training chip IDs.
all_ids = np.unique(df['chip_id'])
# Display (number of training rows, number of distinct training chips).
len(df), len(all_ids)

(189078, 8689)

In [9]:
# Shuffle the training chip IDs reproducibly.
# `np.random.shuffle` works in place, so re-running this cell would reshuffle the
# already shuffled array and produce a different order (and therefore a different
# train/validation split). Restoring sorted order first makes the cell idempotent.
# On the first run `all_ids` is already sorted (it comes from np.unique), so the
# resulting order is the same as without the sort.
np.random.seed(42)
all_ids = np.sort(all_ids)
np.random.shuffle(all_ids)

In [10]:
# Subsample the shuffled IDs: the first N_TRAIN chips are used for training and
# the N_VAL chips that follow them for validation.
N_TRAIN, N_VAL = 500, 50
train_ids = all_ids[:N_TRAIN]
val_ids = all_ids[N_TRAIN:N_TRAIN + N_VAL]

### Set up data loading

In [11]:
# Bucket that hosts the competition data.
BUCKET_NAME = 'drivendata-competition-biomassters-public-us'
# NOTE(review): presumably read by GDAL/rasterio for direct S3 access — confirm it is still needed.
os.environ["AWS_NO_SIGN_REQUEST"] = 'YES'
# The boto3 client is configured to send its requests unsigned.
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.client('s3', config=unsigned_config)

In [12]:
# In-memory cache for downloaded rasters: at most 1000 entries, each kept for one day.
ONE_DAY_SECONDS = 24 * 60 * 60
cache = TTLCache(maxsize=1000, ttl=ONE_DAY_SECONDS)

In [13]:
@cached(cache)
def get_image(fn):
    """Fetch a raster from the S3 bucket and return its pixel data.

    Results are memoised in `cache`, keyed by `fn`, so asking for the same key
    again returns the stored array instead of downloading it a second time.

    Parameters
    ----------
    fn : str
        Object key inside `BUCKET_NAME`.

    Returns
    -------
    The array produced by rasterio's `read()` for the opened dataset.
    """
    response = s3.get_object(Bucket=BUCKET_NAME, Key=fn)
    # rasterio opens the response body directly as a file-like object.
    with rasterio.open(response['Body']) as dataset:
        return dataset.read()

### Calculate mean and std of the dataset + save images to gdrive

In [25]:
# training data
# Download every training chip together with its label, mirror both to Google
# Drive as uint16 TIFFs, and keep the images in memory for the channel statistics.
train_out_dir = os.path.join(data_path, train_img_dir)
label_out_dir = os.path.join(data_path, label_dir)
os.makedirs(train_out_dir, exist_ok=True)
os.makedirs(label_out_dir, exist_ok=True)

imgs = []
for chip_id in tqdm(train_ids):
    img_name = f"{chip_id}_S2_10.tif"
    label_name = f"{chip_id}_agbm.tif"

    # S3 object keys use '/' as separator regardless of the local OS, so they are
    # built explicitly instead of with os.path.join.
    # Only the first 10 bands of each image are kept.
    img = get_image(f"{train_img_dir}/{img_name}")[:10].astype(np.float32)
    label = get_image(f"{label_dir}/{label_name}")

    imgs.append(img)
    # check_contrast=False switches off skimage's low-contrast warning for these
    # files without hiding any other warning raised while saving.
    io.imsave(os.path.join(train_out_dir, img_name),
              img.astype(np.uint16), check_contrast=False)
    # NOTE(review): if the labels are floating point, the uint16 cast drops the
    # fractional part — confirm this is intended.
    io.imsave(os.path.join(label_out_dir, label_name),
              label.astype(np.uint16), check_contrast=False)

# Stack the per-chip arrays into a single array with the chip index as first axis.
imgs = np.array(imgs)

  0%|          | 0/500 [00:00<?, ?it/s]

In [26]:
# Sanity check of the stacked training images: (chips, bands, height, width).
imgs.shape

(500, 10, 256, 256)

In [27]:
# Per-band normalisation statistics of the training images.
n_imgs, n_bands = imgs.shape[:2]

# MEANS[b]: mean of band b over every pixel of every image.
MEANS = np.array([imgs[:, band].mean() for band in range(n_bands)])

# STDS[b]: the standard deviation of band b is computed separately for each image
# and then averaged over the images (this is not the pooled std of all pixels).
per_image_std = np.array([[imgs[idx, band].std() for band in range(n_bands)]
                          for idx in range(n_imgs)])
STDS = per_image_std.mean(0)

MEANS, STDS

(array([ 380.4021 ,  535.0182 ,  421.57675,  817.7593 , 1889.2316 ,
        2205.7764 , 2321.24   , 2394.7937 , 1272.0664 ,  702.64954],
       dtype=float32),
 array([319.6245 , 349.22687, 364.47675, 454.4305 , 777.2495 , 899.8386 ,
        970.10986, 952.7745 , 607.3759 , 443.7978 ], dtype=float32))

In [28]:
# Store both statistics in one .npy file: row 0 holds the means, row 1 the stds.
stats_file = os.path.join(data_path, 'mean_std.npy')
np.save(stats_file, np.stack([MEANS, STDS]))

In [29]:
# validation data
# Download every validation chip together with its label and mirror both to
# Google Drive as uint16 TIFFs. Validation chips are stored in the same folders
# as the training chips.
val_img_out_dir = os.path.join(data_path, train_img_dir)
val_label_out_dir = os.path.join(data_path, label_dir)
# Create the output folders here as well, so this cell does not depend on
# another cell having created them.
os.makedirs(val_img_out_dir, exist_ok=True)
os.makedirs(val_label_out_dir, exist_ok=True)

for chip_id in tqdm(val_ids):
    img_name = f"{chip_id}_S2_10.tif"
    label_name = f"{chip_id}_agbm.tif"

    # S3 object keys use '/' as separator regardless of the local OS, so they are
    # built explicitly instead of with os.path.join.
    # Only the first 10 bands of each image are kept.
    img = get_image(f"{train_img_dir}/{img_name}")[:10].astype(np.float32)
    label = get_image(f"{label_dir}/{label_name}")

    # check_contrast=False switches off skimage's low-contrast warning for these
    # files without hiding any other warning raised while saving.
    io.imsave(os.path.join(val_img_out_dir, img_name),
              img.astype(np.uint16), check_contrast=False)
    # NOTE(review): if the labels are floating point, the uint16 cast drops the
    # fractional part — confirm this is intended.
    io.imsave(os.path.join(val_label_out_dir, label_name),
              label.astype(np.uint16), check_contrast=False)


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# inference data
# Download every test chip and mirror it to Google Drive as a uint16 TIFF.
test_out_dir = os.path.join(data_path, test_img_dir)
os.makedirs(test_out_dir, exist_ok=True)

for chip_id in tqdm(pred_ids):
    img_name = f"{chip_id}_S2_10.tif"

    # S3 object keys use '/' as separator regardless of the local OS, so the key
    # is built explicitly instead of with os.path.join.
    # Only the first 10 bands of each image are kept.
    img = get_image(f"{test_img_dir}/{img_name}")[:10].astype(np.float32)

    # check_contrast=False switches off skimage's low-contrast warning for these
    # files without hiding any other warning raised while saving.
    io.imsave(os.path.join(test_out_dir, img_name),
              img.astype(np.uint16), check_contrast=False)

  0%|          | 0/2773 [00:00<?, ?it/s]