# Dataset Stats

With all of the public Earth Engine code scraped and parsed, let's start seeing what we can learn about specific datasets.

In [1]:
import os
import pandas as pd
import plotly.express as px
from collections import Counter
import datetime

## Collection Stats

First, let's load a count of how many times each image collection has been imported.

In [None]:
# Load the per-collection import counts, most-imported collections first.
collections_df = (
    pd.read_csv(os.path.join("..", "data", "collections.csv"))
    .sort_values("imports", ascending=False)
)

How many times have the top 10 image collections been imported?

In [None]:
# The frame is already sorted by imports, so the first ten rows are the top ten.
collections_df.head(10)

### By Platform

What if we break down imports by platform? Let's group some of the most popular collections together by platform.

In [None]:
def get_landsat_platform(s):
    """Parse the Landsat platform from a collection name.

    The platform number is read from the sensor code that opens the second
    path segment. This handles both the "LANDSAT/LC08/C01/T1_TOA" style and
    the legacy "LANDSAT/LC8_L1T_TOA" / "LANDSAT/LE7_TOA_1YEAR" style.

    Parameters
    ----------
    s : str
        Collection ID, e.g. "LANDSAT/LC08/C01/T1_TOA".

    Returns
    -------
    str
        "Landsat 5", "Landsat 7" or "Landsat 8" for those platforms, and the
        catch-all label "Landsat ..." for any other platform or for an ID
        whose second segment does not start with a sensor code.
    """
    other = "Landsat ..."

    segments = s.split("/")
    if len(segments) < 2:
        return other

    # A sensor code is "L", one more letter, then a 1-2 digit platform number
    # ("LC08", "LE7"). Legacy IDs append "_<product>" to it, so cut at the
    # first underscore before inspecting the code.
    sensor = segments[1].split("_")[0]
    digits = sensor[2:]
    is_sensor_code = (
        sensor[:1] == "L"
        and sensor[1:2].isalpha()
        and 1 <= len(digits) <= 2
        and digits.isascii()
        and digits.isdigit()
    )
    # Taking the trailing digits of the whole segment instead would turn an ID
    # such as "LANDSAT/GLS2005" into Landsat 5, so non-sensor segments are
    # rejected outright.
    if not is_sensor_code:
        return other

    platform = int(digits)
    if platform not in (5, 7, 8):
        return other

    return "Landsat " + str(platform)

def get_collection_program(collection):
    """Map a collection ID to the platform label used for grouping.

    Landsat IDs are delegated to ``get_landsat_platform``. Sentinel-2,
    Sentinel-1 and MODIS IDs get a fixed label, and anything else is "Other".
    Rules are checked in order and the first matching substring wins.
    """
    if "LANDSAT/" in collection:
        return get_landsat_platform(collection)

    fixed_labels = (
        ("COPERNICUS/S2", "Sentinel-2"),
        ("COPERNICUS/S1", "Sentinel-1"),
        ("MODIS/", "MODIS"),
    )
    for marker, label in fixed_labels:
        if marker in collection:
            return label

    return "Other"

In [None]:
# Label every collection with its platform (adds a "platform" column in place).
collections_df["platform"] = collections_df["collection"].apply(get_collection_program)

# Total imports per platform, largest first. Only "imports" is summed: calling
# .sum() on the whole grouped frame would also aggregate the string-valued
# "collection" column, which is meaningless and version-dependent in pandas.
collection_programs = (
    collections_df.groupby("platform")["imports"]
    .sum()
    .reset_index()
    .sort_values(by="imports", ascending=False)
)

In [None]:
# Bar chart of total imports per platform; each bar carries its own total as
# a bold, abbreviated text label.
fig = px.bar(
    collection_programs,
    x="platform",
    y="imports",
    color="platform",
    text="imports",
    template="ggplot2",
    labels={"imports": "Total Imports", "platform": "Platform"},
    color_discrete_sequence=px.colors.diverging.Temps,
)

fig.update_traces(texttemplate="<b>%{value:.2s}</b>")
fig.update_layout(font={"size": 24}, showlegend=False, height=500, width=800)

fig

In [None]:
# Combined imports across every Landsat bucket, including "Landsat ...".
collections_df.loc[collections_df["platform"].str.startswith("Landsat"), "imports"].sum()

In [None]:
# Save the platform bar chart at 2x scale.
fig.write_image(
    os.path.join("..", "figures", "platform_imports.png"),
    scale=2,
)

### Landsat Analysis

Let's break down the Landsat data further, comparing the number of imports between platforms.

In [None]:
def get_landsat_platform(s):
    """Parse the Landsat platform from a collection name.

    NOTE: this redefines the ``get_landsat_platform`` from the "By Platform"
    section. From this cell onward the name resolves to this version, which
    recognises more platforms and returns None (not "Landsat ...") for
    anything it cannot classify. Re-running the earlier labelling cell after
    this one will therefore use this version.

    The platform number is read from the sensor code that opens the second
    path segment. This handles both the "LANDSAT/LC08/C01/T1_TOA" style and
    the legacy "LANDSAT/LC8_L1T_TOA" / "LANDSAT/LE7_TOA_1YEAR" style.

    Parameters
    ----------
    s : str
        Collection ID, e.g. "LANDSAT/LC08/C01/T1_TOA".

    Returns
    -------
    str or None
        "Landsat N" for N in 1-5 and 7-9, otherwise None.
    """
    segments = s.split("/")
    if len(segments) < 2:
        return None

    # A sensor code is "L", one more letter, then a 1-2 digit platform number
    # ("LC08", "LE7"). Legacy IDs append "_<product>" to it, so cut at the
    # first underscore before inspecting the code.
    sensor = segments[1].split("_")[0]
    digits = sensor[2:]
    is_sensor_code = (
        sensor[:1] == "L"
        and sensor[1:2].isalpha()
        and 1 <= len(digits) <= 2
        and digits.isascii()
        and digits.isdigit()
    )
    # Taking the trailing digits of the whole segment instead would turn an ID
    # such as "LANDSAT/GLS2005" into Landsat 5, so non-sensor segments are
    # rejected outright.
    if not is_sensor_code:
        return None

    platform = int(digits)
    if platform not in (1, 2, 3, 4, 5, 7, 8, 9):
        return None

    return "Landsat " + str(platform)

In [None]:
# Keep only the rows labelled as Landsat and add a compact label ("Landsat 8"
# -> "L8") for the chart. `.assign` returns a new frame, so the extra column is
# not written onto a filtered slice of `collections_df` (which raises
# SettingWithCopyWarning).
landsat_df = collections_df[collections_df["platform"].str.startswith("Landsat")].assign(
    short_platform=lambda frame: frame["platform"].apply(lambda x: "L" + x.split(" ")[1])
)

In [None]:
# Donut chart of each Landsat platform's share of imports.
fig = px.pie(
    landsat_df,
    names="short_platform",
    values="imports",
    hole=0.6,
    color_discrete_sequence=px.colors.qualitative.T10,
)

# Slice labels: bold platform name with its rounded percentage underneath.
fig.update_traces(
    textposition="inside",
    texttemplate="<b>%{label}</b><br>(%{percent:.0%})",
    insidetextorientation="horizontal",
)

# Caption placed at the centre of the donut hole.
fig.update_layout(
    annotations=[
        dict(
            text='<b><span style="font-variant: small-caps;">Landsat<br>Platform</span></b>',
            x=0.5,
            y=0.5,
            font_size=36,
            showarrow=False,
        )
    ]
)

# Compact square canvas with tight margins; this call is the cell's last
# expression, so its return value is what the notebook displays.
fig.update_layout(
    height=400,
    width=400,
    showlegend=False,
    font={"size": 24},
    margin={"l": 10, "r": 10, "b": 10, "t": 10},
)

In [None]:
# Save the Landsat donut chart at 2x scale.
fig.write_image(
    os.path.join("..", "figures", "landsat_platforms.png"),
    scale=2,
)

## Image Stats

In [24]:
# Load the per-image import counts, most-imported images first.
image_df = (
    pd.read_csv(os.path.join("..", "data", "images.csv"))
    .sort_values("imports", ascending=False)
)

In [25]:
# The frame is already sorted by imports, so the first ten rows are the top ten.
image_df.head(10)

Unnamed: 0,image,imports
0,USGS/SRTMGL1_003,13293
10,CGIAR/SRTM90_V4,4640
4,JRC/GSW1_0/GlobalSurfaceWater,3065
9,UMD/hansen/global_forest_change_2015,1605
122,UMD/hansen/global_forest_change_2018_v1_6,1517
15,UMD/hansen/global_forest_change_2019_v1_7,1422
723,JAXA/ALOS/AW3D30_V1_1,1414
28,UMD/hansen/global_forest_change_2020_v1_8,1314
2,USGS/NED,1229
1073,WWF/HydroSHEDS/03VFDEM,1172


What's the most frequently imported Landsat image?

In [26]:
# Landsat images ranked by import count; rows with a missing image ID are
# treated as non-matches.
image_df[image_df["image"].str.startswith("LANDSAT", na=False)].sort_values(
    by="imports", ascending=False
)

Unnamed: 0,image,imports
1110,LANDSAT/LC08/C01/T1_TOA/LC08_044034_20140318,447
622,LANDSAT/LC08/C01/T1/LC08_044034_20140318,435
1239,LANDSAT/LE7_TOA_1YEAR/2001,187
1151,LANDSAT/LC08/C01/T1_TOA/LC08_123032_20140515,129
7965,LANDSAT/LC8_L1T_TOA/LC80440342013106LGN01,121
...,...,...
6079,LANDSAT/LC08/C01/T1_SR/LC08_033032_20170601,1
6080,LANDSAT/LC08/C01/T1_SR/LC08_033032_20171124,1
64062,LANDSAT/LC08/C01/T1_SR/LC08_232067_20190828,1
64061,LANDSAT/LC08/C01/T1_SR/LC08_232067_20130726,1


How about Sentinel-2?

In [35]:
# Sentinel-2 images ranked by import count; rows with a missing image ID are
# treated as non-matches.
image_df[image_df["image"].str.startswith("COPERNICUS/S2", na=False)].sort_values(
    by="imports", ascending=False
)

Unnamed: 0,image,imports
1767,COPERNICUS/S2/20180422T012719_20180422T012714_...,72
53431,COPERNICUS/S2/20180903T181921_20180903T182327_...,44
42380,COPERNICUS/S2/20190510T052651_20190510T053439_...,42
42379,COPERNICUS/S2/20180505T052651_20180505T053419_...,42
42377,COPERNICUS/S2/20160604T052652_20160604T053931_...,42
...,...,...
4874,COPERNICUS/S2/20191206T000239_20191206T000234_...,1
4873,COPERNICUS/S2/20191206T000239_20191206T000234_...,1
4872,COPERNICUS/S2/20200125T000239_20200125T000233_...,1
4871,COPERNICUS/S2/20200125T000239_20200125T000233_...,1


## CRS Stats

In [None]:
# Load the per-CRS export counts, most-used CRS first.
crs_df = (
    pd.read_csv(os.path.join("..", "data", "crs.csv"))
    .sort_values("exports", ascending=False)
)

In [None]:
# Fraction of all exports that used EPSG:4326 or EPSG:3857.
crs_df.loc[crs_df["crs"].isin(["EPSG:4326", "EPSG:3857"]), "exports"].sum() / crs_df["exports"].sum()

## Date Stats

In [2]:
# Load the per-date usage counts, ordered by the date string in descending order.
date_df = (
    pd.read_csv(os.path.join("..", "data", "dates.csv"))
    .sort_values("date", ascending=False)
)

In [4]:
def parse_date(date):
    """Parse a date in format yyyy-mm-dd into a datetime.

    Parameters
    ----------
    date : str
        Date string with year, month and day separated by "-".

    Returns
    -------
    datetime.datetime or None
        The parsed date at midnight, or None when the value is not a string,
        does not split into exactly three "-"-separated parts, has
        non-integer parts, or names a day that does not exist (e.g. Feb 30).
    """
    try:
        # Unpacking happens inside the try so a value with too few or too many
        # "-" parts is skipped like any other invalid date, not raised.
        y, m, d = date.split("-")
        return datetime.datetime(year=int(y), month=int(m), day=int(d))
    # Skip invalid dates (ValueError) and non-string values such as NaN
    # (AttributeError: no .split).
    except (ValueError, AttributeError):
        return None

In [26]:
# Parse each date string; values that cannot be parsed come back as missing.
date_df["datetime"] = date_df["date"].apply(parse_date)
# Drop the rows without a parsed date. `subset` is given as a list because
# older pandas releases do not accept a bare column label here.
date_df = date_df.dropna(subset=["datetime"])

In [10]:
# Extract the calendar year from each parsed date.
date_df["year"] = date_df["datetime"].apply(lambda d: d.year)
# Total usage per year. Only "used" is summed: calling .sum() on the whole
# grouped frame would also try to add up the "date" strings and the "datetime"
# values, which recent pandas versions reject with a TypeError.
year_df = date_df.groupby("year")["used"].sum().reset_index()

In [16]:
# Yearly usage totals, restricted to years strictly between 1980 and 2023.
px.bar(
    year_df[(year_df["year"] > 1980) & (year_df["year"] < 2023)],
    x="year",
    y="used",
)