In [None]:
import ee
import pandas as pd

In [None]:
import sys
# Add the parent directory to the module search path so that modules located
# there can be imported (presumably where `data_utils` lives — confirm).
sys.path.append("..")

In [None]:
from data_utils import get_data_by_zone_year
from data_utils import save_regional_data
from data_utils import split_dataset

### Initialize Earth Engine

In [None]:
# Initialize the Earth Engine client library for this kernel session.
# NOTE(review): presumably requires credentials to have been set up beforehand
# (e.g. via `ee.Authenticate()`) — confirm for the target environment.
ee.Initialize()

### Select the S3 bucket used to store the dataset

In [None]:
# Name of the S3 bucket that the cells below write to and read from.
s3_bucket = "sagemaker-gis"

### Select the satellite dataset, year, and bands

In [None]:
# Source image collection ID, acquisition year, and band selector.
# NOTE(review): Landsat Collection 1 ("C01") assets have been deprecated by
# Earth Engine — confirm this collection ID is still available.
base_sat_data = "LANDSAT/LC08/C01/T1_SR"
year = 2013
# Presumably a band-name pattern covering B1 through B7 — confirm how
# `get_data_by_zone_year` interprets it.
bands = "B[1-7]"

# First and last calendar day of `year`, as ISO-formatted date strings.
date_range = ["{}-01-01".format(year), "{}-12-31".format(year)]

# Metadata describing the dataset; slashes in the collection ID are swapped
# for underscores.
meta_dict = dict(src_dataset="_".join(base_sat_data.split("/")), year=year)

### Read the representative coordinates for each region

In [None]:
# Load the zone table and key it by region name; the loop further down reads
# the "lon" and "lat" columns for each region.
zones_raw = pd.read_csv("../zones.csv")
df_zones = zones_raw.set_index("region")
df_zones.head()

### Create a dataset for each region

In [None]:
# Build and store one dataset per region listed in the zone table.
for area in df_zones.index:
    print(f"processing data for {area}...")

    # Representative [lon, lat] pair for the current region.
    lon_lat = df_zones.loc[area, ["lon", "lat"]].tolist()
    regional_data = get_data_by_zone_year(lon_lat, date_range, base_sat_data, bands)

    # Tag the shared metadata with the region name (spaces -> underscores).
    # Note that `meta_dict` is updated in place on every iteration.
    meta_dict["poi"] = area.replace(" ", "_")
    save_regional_data(regional_data, meta_dict, s3_bucket)

### Split the dataset into training and test sets

In [None]:
# Regions passed to `split_dataset` as the test areas.
areas_for_test = ["Vietnam2", "Myanmar3", "Cuba2", "India"]

# Folder of the form "<src_dataset>/Year<year>", built from the metadata.
folder = "{}/Year{}".format(meta_dict["src_dataset"], meta_dict["year"])

split_dataset(areas_for_test, s3_bucket, folder)

### Check the training and test datasets

In [None]:
# Both split files sit under the same S3 prefix: s3://<bucket>/<src_dataset>/Year<year>/
split_prefix = f"s3://{s3_bucket}/{meta_dict['src_dataset']}/Year{meta_dict['year']}"

# Training and test tables (presumably the files written by `split_dataset` — confirm).
df_tr = pd.read_csv(f"{split_prefix}/train.csv")
df_te = pd.read_csv(f"{split_prefix}/test.csv")

In [None]:
# Preview the first rows of the training table.
df_tr.head()

In [None]:
# Preview the first rows of the test table.
df_te.head()

In [None]:
# (rows, columns) of the training table followed by the test table.
df_tr.shape, df_te.shape

### Check the class composition

In [None]:
# Relative frequency of each value in the `label` column of the training table.
df_tr.label.value_counts(normalize=True)

In [None]:
# Relative frequency of each value in the `label` column of the test table.
df_te.label.value_counts(normalize=True)