In [1]:
from os import path, listdir

import pandas
import airathon.paths as paths 


In [2]:
# Location of the satellite metadata CSV inside the dataset's metadata directory.
# The bare name on the last line makes the notebook display the resolved path.
filepath = path.join(
    paths.dataset_metadata(),
    "satellite_metadata.csv",
)
filepath


'/home/zc2616/competition/dataset/metadata/satellite_metadata.csv'

# Overview


In [3]:
# Load the satellite metadata table and preview its first five rows.
df = pandas.read_csv(filepath)
df.head(n=5)

Unnamed: 0,granule_id,time_start,time_end,product,location,split,us_url,eu_url,as_url,cksum,granule_size
0,20180201T191000_maiac_la_0.hdf,2018-02-01T17:25:00.000Z,2018-02-01 19:10:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,911405771,10446736
1,20180202T195000_maiac_la_0.hdf,2018-02-02T18:05:00.000Z,2018-02-02 19:50:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2244451908,11090180
2,20180203T203000_maiac_la_0.hdf,2018-02-03T17:10:00.000Z,2018-02-03 20:30:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,3799527997,12468482
3,20180204T194000_maiac_la_0.hdf,2018-02-04T17:55:00.000Z,2018-02-04 19:40:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,4105997844,13064424
4,20180205T202000_maiac_la_0.hdf,2018-02-05T17:00:00.000Z,2018-02-05 20:20:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1805072340,12549313


# Features


## `location`

Note that `location` follows a different naming convention from the one used in
[Grid Metadata](GridMetadata.ipynb).

## `product`

- [`maiac`](https://sites.bu.edu/haqast-highrestt/research/about-maiac/):
  Multiangle Implementation of Atmospheric Correction
- [`misr`](https://terra.nasa.gov/about/terra-instruments/misr):
  Multi-angle Imaging SpectroRadiometer

### `maiac`

> The “Multiangle Implementation of Atmospheric Correction (MAIAC)” algorithm
> produces a 1-km global AOD from daily MODIS TERRA and AQUA satellites with
> 10:30am and 1:30pm equatorial crossing time. MAIAC is a new advanced NASA
> algorithm which uses the time series analysis and spatial processing to
> separate atmospheric and surface contributions, and improve quality of cloud
> detection and aerosol and surface characterization. MAIAC MODIS Collection 6
> suite of products (MCD19) is expected to be released in Spring 2018. A journal
> publication on the MAIAC C6 algorithm has been submitted (A. Lyapustin, Y.
> Wang, S. Korkin, D. Huang, “MODIS Collection 6 MAIAC Algorithm,” ACP).

### `misr`

> Most satellite instruments look only straight down, or toward the edge of the
> planet. To fully understand Earth’s climate, and to determine how it may be
> changing, we need to know the amount of sunlight that is scattered in different
> directions under natural conditions. MISR is a new type of instrument designed
> to address this need — it views the Earth with cameras pointed at nine
> different angles. One camera points toward nadir, and the others provide
> forward and aftward view angles, at the Earth’s surface, of 26.1°, 45.6°, 60.0°,
> and 70.5°. As the instrument flies overhead, each region of the Earth’s surface
> is successively imaged by all nine cameras in each of four wavelengths (blue, 
> green, red, and near-infrared).
>
> In addition to improving our understanding of the fate of sunlight in the
> Earth’s environment, MISR data can distinguish different types of clouds,
> aerosol particles, and surfaces. Specifically, MISR will monitor the monthly,
> seasonal, and long-term trends in:
>
> - the amount and type of atmospheric aerosol particles, including those formed
>   by natural sources and by human activities;
> - the amount, types, and heights of clouds; and
> - the distribution of land surface cover, including vegetation canopy structure.


In [4]:
# Distinct values that appear in the `product` column.
products = df["product"]
set(products)

{'maiac', 'misr'}

## `split`

- `train` is used with `train_labels.csv`, i.e. **the data we use to train our
  model**
- `test` is used with `submission_format.csv`, i.e. **the data behind the
  predictions we upload to the leaderboard**

In [5]:
# Distinct values that appear in the `split` column.
set(df["split"])

{'test', 'train'}

In [6]:
# Boolean masks over the metadata rows, one per split; summing a mask counts
# the rows that belong to that split (all products included).
train = df["split"].eq("train")
test = df["split"].eq("test")

print(f"# of train = {train.sum()}")
print(f"# of test = {test.sum()}")

# of train = 5048
# of test = 2673


In [7]:
def count_hdf(split: str, product: str = "maiac") -> int:
    """Count the ``.hdf`` files stored on disk for one dataset split.

    The files are looked up in ``<paths.dataset()>/<split>/<product>/<year>/``:
    every sub-directory of the product directory is scanned (non-recursively)
    and the files whose name ends with ``.hdf`` are counted.

    Args:
        split: Name of the split directory under the dataset root,
            e.g. ``"train"`` or ``"test"``.
        product: Name of the product directory under the split directory.
            Defaults to ``"maiac"``, the only product counted previously.

    Returns:
        Total number of ``.hdf`` files found across all sub-directories.

    Raises:
        FileNotFoundError: If the product directory does not exist
            (propagated from ``listdir``).
    """
    root = path.join(paths.dataset(), split, product)

    total = 0

    for year in listdir(root):
        year_dir = path.join(root, year)

        # Stray files next to the year directories would make listdir() raise
        # NotADirectoryError, so only descend into real directories.
        if not path.isdir(year_dir):
            continue

        # Match on the suffix: a substring test would also count names such as
        # "granule.hdf.xml" or "granule.hdf5".
        total += sum(1 for name in listdir(year_dir) if name.endswith(".hdf"))

    return total

# Count the files present on disk for each split. count_hdf only looks under
# the "maiac" directory, whereas the metadata row counts cover every product
# (maiac and misr), so the two sets of numbers are not expected to match.
train_hdf_count, test_hdf_count = count_hdf("train"), count_hdf("test")

print(f"# of train files = {train_hdf_count}")
print(f"# of test files = {test_hdf_count}")


# of train files = 4260
# of test files = 2444
