In [1]:
import os
import random
from pathlib import Path
import pandas as pd

from tqdm.auto import tqdm

import boto3
from botocore import UNSIGNED
from botocore.client import Config

# from concurrent.futures import ThreadPoolExecutor
# import subprocess
# import multiprocessing

In [2]:
ROOT_DIR = Path(os.path.abspath('.'))

DATA_DIR = ROOT_DIR/'data'

TRAIN_FEAT_DIR = DATA_DIR/'train_features'
TRAIN_AGBM_DIR = DATA_DIR/'train_agbm'
TEST_FEAT_DIR = DATA_DIR/'test_features'

DIRS = [
    DATA_DIR,
    TRAIN_FEAT_DIR,
    TRAIN_AGBM_DIR,
    TEST_FEAT_DIR
    ]

for dir in DIRS:
    if not os.path.exists(dir):
        os.mkdir(dir)

In [3]:
RANDOM_SEED = 42

random.seed(RANDOM_SEED)

# num_workers = multiprocessing.cpu_count()//2

# # Create new `pandas` methods which use `tqdm` progress
# # (can use tqdm_gui, optional kwargs, etc.)
# tqdm.pandas()

In [4]:
server_location = 'as' # asia

AWS_BUCKET=f'drivendata-competition-biomassters-public-{server_location}'

In [5]:
session = boto3.Session()
client = session.client("s3", config=Config(signature_version=UNSIGNED))

# Download features_metadata.csv

In [6]:
metadata_filename = 'features_metadata.csv'

if not os.path.exists(DATA_DIR/metadata_filename):
    # # direct aws cli, take longer
    # metadata_download = f"aws s3 cp s3://drivendata-competition-biomassters-public-{server_location}/{metadata_filename} {DATA_DIR} --no-sign-request"
    # os.system(metadata_download)
    
    # use boto, faster
    client.download_file(
        Bukect=AWS_BUCKET,
        Key=metadata_filename,
        Filename=DATA_DIR/metadata_filename
    )

f_mdt = pd.read_csv(DATA_DIR/metadata_filename)

In [7]:
f_mdt.head(2)

Unnamed: 0,filename,chip_id,satellite,split,month,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm
0,0003d2eb_S1_00.tif,0003d2eb,S1,train,September,1049524,3953454613,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif
1,0003d2eb_S1_01.tif,0003d2eb,S1,train,October,1049524,3531005382,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif


In [8]:
f_mdt.split.value_counts()

split
train    189078
test      63348
Name: count, dtype: int64

In [9]:
f_mdt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252426 entries, 0 to 252425
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   filename            252426 non-null  object
 1   chip_id             252426 non-null  object
 2   satellite           252426 non-null  object
 3   split               252426 non-null  object
 4   month               252426 non-null  object
 5   size                252426 non-null  int64 
 6   cksum               252426 non-null  int64 
 7   s3path_us           252426 non-null  object
 8   s3path_eu           252426 non-null  object
 9   s3path_as           252426 non-null  object
 10  corresponding_agbm  252426 non-null  object
dtypes: int64(2), object(9)
memory usage: 21.2+ MB


# Downloading Train AGBM

In [10]:
# use 'sync' to make sure it won't redownload data that already exist in local
try:
    os.system(f"aws s3 sync s3://drivendata-competition-biomassters-public-{server_location}/{TRAIN_AGBM_DIR.name}/ {TRAIN_AGBM_DIR} --no-sign-request")
except:
    os.system(f"aws s3 cp s3://drivendata-competition-biomassters-public-{server_location}/{TRAIN_AGBM_DIR.name}/ {TRAIN_AGBM_DIR} --no-sign-request")

# Download Train and Test Feature

## Randomly select 10% from each split

In [11]:
# Function to sample 1% from each split
def sample_one_percent(df):
    # Sample 1% of each group (using ceil to ensure at least one sample from small groups), excluding the group keys in apply
    sampled_df = (
        df
        .groupby('split', group_keys=False)
        .apply(
            lambda x: x.sample(frac=0.1, random_state=42),
            # include_groups=True
            )
        .reset_index(drop=True)
        )
    return sampled_df

In [12]:
# Sampling from the DataFrame (this is just an illustrative example, not the actual sampling since we need more data)
f_mdt_onepct = sample_one_percent(f_mdt)

# Show the resulting sampled DataFrame
f_mdt_onepct

  df


Unnamed: 0,filename,chip_id,satellite,split,month,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm
0,3df1d9c2_S2_09.tif,3df1d9c2,S2,test,June,1443550,2005262886,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,3df1d9c2_agbm.tif
1,8ffd145e_S1_04.tif,8ffd145e,S1,test,January,1049524,410538992,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,8ffd145e_agbm.tif
2,0eeb64cd_S2_06.tif,0eeb64cd,S2,test,March,1443550,408583910,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0eeb64cd_agbm.tif
3,950ad0c4_S2_05.tif,950ad0c4,S2,test,February,1443550,3877475525,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,950ad0c4_agbm.tif
4,f5ec2df1_S2_08.tif,f5ec2df1,S2,test,May,1443550,1969163599,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,f5ec2df1_agbm.tif
...,...,...,...,...,...,...,...,...,...,...,...
25238,8be17de7_S1_02.tif,8be17de7,S1,train,November,1049524,4207869702,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,8be17de7_agbm.tif
25239,ecc20a65_S2_02.tif,ecc20a65,S2,train,November,1443550,987515178,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,ecc20a65_agbm.tif
25240,0e8b2ac7_S1_02.tif,0e8b2ac7,S1,train,November,1049524,2139342619,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0e8b2ac7_agbm.tif
25241,7519c115_S2_11.tif,7519c115,S2,train,August,1443550,235689151,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,7519c115_agbm.tif


In [13]:
f_mdt_onepct_dir = DATA_DIR/'f_mdt_onepct.csv'
if not os.path.exists(f_mdt_onepct_dir):
    f_mdt_onepct.to_csv(f_mdt_onepct_dir)
else:
    f_mdt_onepct = pd.read_csv(f_mdt_onepct_dir)

## Start downloading

In [14]:
f_mdt_onepct_train = f_mdt_onepct[f_mdt_onepct['split']=='train']
f_mdt_onepct_test = f_mdt_onepct[f_mdt_onepct['split']=='test']

In [15]:
f_mdt_onepct_train

Unnamed: 0.1,Unnamed: 0,filename,chip_id,satellite,split,month,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm
6335,6335,043d8d0c_S2_09.tif,043d8d0c,S2,train,June,1443550,2862574393,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,043d8d0c_agbm.tif
6336,6336,c5986c9f_S2_08.tif,c5986c9f,S2,train,May,1443550,3294598623,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,c5986c9f_agbm.tif
6337,6337,75fb4acf_S2_01.tif,75fb4acf,S2,train,October,1443550,1298117144,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,75fb4acf_agbm.tif
6338,6338,391fa553_S1_05.tif,391fa553,S1,train,February,1049524,4242341275,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,391fa553_agbm.tif
6339,6339,685f6a42_S2_08.tif,685f6a42,S2,train,May,1443550,3858241946,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,685f6a42_agbm.tif
...,...,...,...,...,...,...,...,...,...,...,...,...
25238,25238,8be17de7_S1_02.tif,8be17de7,S1,train,November,1049524,4207869702,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,8be17de7_agbm.tif
25239,25239,ecc20a65_S2_02.tif,ecc20a65,S2,train,November,1443550,987515178,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,ecc20a65_agbm.tif
25240,25240,0e8b2ac7_S1_02.tif,0e8b2ac7,S1,train,November,1049524,2139342619,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0e8b2ac7_agbm.tif
25241,25241,7519c115_S2_11.tif,7519c115,S2,train,August,1443550,235689151,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,7519c115_agbm.tif


In [17]:
# download training data
for i, row in tqdm(f_mdt_onepct_train.iterrows(), total=len(f_mdt_onepct_train)):
    url = row['s3path_as']
    f_dir = Path(row['s3path_as']).parent.name
    f_name = row['filename']
    f_loc = DATA_DIR / f_dir / f_name
    
    if not f_loc.exists():
        client.download_file(
            Bucket=AWS_BUCKET,
            Key=str(f_dir / Path(f_name)),
            Filename=f_loc
        )

  0%|          | 0/18908 [00:00<?, ?it/s]

In [18]:
f_mdt_onepct_test

Unnamed: 0.1,Unnamed: 0,filename,chip_id,satellite,split,month,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm
0,0,3df1d9c2_S2_09.tif,3df1d9c2,S2,test,June,1443550,2005262886,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,3df1d9c2_agbm.tif
1,1,8ffd145e_S1_04.tif,8ffd145e,S1,test,January,1049524,410538992,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,8ffd145e_agbm.tif
2,2,0eeb64cd_S2_06.tif,0eeb64cd,S2,test,March,1443550,408583910,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0eeb64cd_agbm.tif
3,3,950ad0c4_S2_05.tif,950ad0c4,S2,test,February,1443550,3877475525,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,950ad0c4_agbm.tif
4,4,f5ec2df1_S2_08.tif,f5ec2df1,S2,test,May,1443550,1969163599,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,f5ec2df1_agbm.tif
...,...,...,...,...,...,...,...,...,...,...,...,...
6330,6330,bf1e9090_S1_11.tif,bf1e9090,S1,test,August,1049524,1331376018,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,bf1e9090_agbm.tif
6331,6331,863d5b41_S2_03.tif,863d5b41,S2,test,December,1443550,4097865321,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,863d5b41_agbm.tif
6332,6332,ea25d3cb_S2_05.tif,ea25d3cb,S2,test,February,1443550,1356174826,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,ea25d3cb_agbm.tif
6333,6333,171ddcab_S1_04.tif,171ddcab,S1,test,January,1049524,3990734130,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,171ddcab_agbm.tif


In [19]:
# download test data
for i, row in tqdm(f_mdt_onepct_test.iterrows(), total=len(f_mdt_onepct_test)):
    url = row['s3path_as']
    f_dir = Path(row['s3path_as']).parent.name
    f_name = row['filename']
    f_loc = DATA_DIR / f_dir / f_name
    
    if not f_loc.exists():
        client.download_file(
            Bucket=AWS_BUCKET,
            Key=str(f_dir / Path(f_name)),
            Filename=f_loc
        )

  0%|          | 0/6335 [00:00<?, ?it/s]

In [None]:
# # download training data
# for i, row in tqdm(f_mdt_onepct_train.iterrows(), total=len(f_mdt_onepct_train)):
#     url = row['s3path_as']
#     f_dir = Path(row['s3path_as']).parent.name
#     f_name = row['filename']
#     f_loc = DATA_DIR / f_dir / f_name

#     if not os.path.exists(f_loc):
#         cmd = f"aws s3 cp {url} {f_loc} --no-sign-request"
#         os.system(cmd)
#     else:
#         print(f'{f_loc} already exist')