In [None]:
import collections
import datetime
import os
import pprint
import random

import ee
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import openet.core

from IPython.display import Image, display
import ipyplot

# gsutil -m cp "gs://openet_temp/skip_scene_stats/2025/p1*.csv" ./stats/2025/


In [None]:
# Connect to Earth Engine through the high-volume endpoint
ee.Initialize(project='ee-cmorton', opt_url='https://earthengine-highvolume.googleapis.com')

# Local "stats" folder in the working directory (created if it is missing)
stats_ws = os.path.join(os.getcwd(), 'stats')
os.makedirs(stats_ws, exist_ok=True)
    

In [None]:
# Denmark WRS2 list (one tile per line, grouped by path)
wrs2_list = [
    # Path 198
    'p198r021',
    'p198r020',
    # Path 197
    'p197r022',
    'p197r021',
    'p197r020',
    # Path 196
    'p196r022',
    'p196r021',
    'p196r020',
    # Path 195
    'p195r022',
    'p195r021',
    # Path 194
    'p194r021',
    'p194r022',
]

# Tiles to leave out of the review loop (e.g. 'p010r030')
wrs2_skip_list = []

# Alternate approach: build the tile list from the custom WRS2 feature collection
# wrs2_list = sorted(
#     ee.FeatureCollection('projects/openet/assets/features/wrs2/custom')
#     .filterBounds(ee.Geometry.BBox(-124, 26, -67.9, 50))
#     .filter(ee.Filter.inList('wrs2_tile', wrs2_skip_list).Not())
#     .aggregate_histogram('wrs2_tile').keys().getInfo(),
#     reverse=True
# )
# # print(len(wrs2_list))

# Currently empty
ocean_wrs2_list = []


In [None]:
# Logical inverse of the OpenET water mask asset
land_mask = ee.Image('projects/openet/assets/features/water_mask').Not()
# Optional: also apply the NLCD/NALCMS water class
#   (anywhere NALCMS is class 18 or missing, set the land mask to 0)
# land_mask = land_mask.where(ee.Image("USGS/NLCD_RELEASES/2020_REL/NALCMS").unmask(18).eq(18), 0)
# land_mask = land_mask.And(ee.Image("USGS/NLCD_RELEASES/2020_REL/NALCMS").unmask(18).neq(18))

# Surface reflectance bands (in red, green, blue order) keyed by the scene ID prefix
_rgb_b321 = ['SR_B3', 'SR_B2', 'SR_B1']
_rgb_b432 = ['SR_B4', 'SR_B3', 'SR_B2']
rgb_bands = {
    **{prefix: list(_rgb_b321) for prefix in ('LT04', 'LT05', 'LE07')},
    **{prefix: list(_rgb_b432) for prefix in ('LC08', 'LC09')},
}

# Palette entries by fmask value:
#   0 - white
#   1 - no fill (green)
#   2 - shadow (dark blue)
#   3 - snow (light blue)
#   4 - cloud (light gray)
#   5 - water (purple)
#   6 - ocean mask
fmask_max = 6
fmask_palette = "ffffff, 9effa1, blue, 00aff2, dddddd, purple, bfbfbf"


In [None]:
def fmask(landsat_img):
    """Build a legacy "Fmask"-style class image from an image's QA_PIXEL band.

    Class values are assigned with successive ``where`` calls, so a later rule
    overrides an earlier one (cloud has the highest priority):

        1 - the SR_B4 band is masked
        5 - water (QA_PIXEL bit 7)
        2 - cloud shadow (QA_PIXEL bit 4)
        3 - snow (QA_PIXEL bit 5)
        4 - cloud (bit 3), dilated cloud (bit 1) or cirrus (bit 2)

    Pixels that match none of the rules keep the value 0 and are masked out of
    the returned image.

    Parameters
    ----------
    landsat_img : ee.Image
        Image with 'QA_PIXEL' and 'SR_B4' bands.

    Returns
    -------
    ee.Image
        Single band image named 'fmask'.

    Notes
    -----
    Earlier versions also built fill, clear, confidence and QA_RADSAT
    saturation masks that were never applied to the output; those unused
    intermediates were removed.

    """
    qa_img = landsat_img.select('QA_PIXEL')

    def qa_flag(bit):
        """Return an image that is 1 where the given QA_PIXEL bit is set."""
        return qa_img.rightShift(bit).bitwiseAnd(1).neq(0)

    dilate_mask = qa_flag(1)
    cirrus_mask = qa_flag(2)
    cloud_mask = qa_flag(3)
    shadow_mask = qa_flag(4)
    snow_mask = qa_flag(5)
    water_mask = qa_flag(7)

    # Old "Fmask" style image (the order of the where() calls sets the priority)
    fmask_img = (
        qa_img.multiply(0)
        .where(landsat_img.select(['SR_B4']).mask().eq(0), 1)
        .where(water_mask, 5)
        .where(shadow_mask, 2)
        .where(snow_mask, 3)
        .where(cloud_mask.Or(dilate_mask).Or(cirrus_mask), 4)
    )

    # Drop the unclassified (0) pixels so that only flagged pixels are drawn
    return fmask_img.updateMask(fmask_img.neq(0)).rename(['fmask'])


In [None]:
# # Clean up the scene skip list file
# skip_path = '../v2p1_denmark.csv'
# print(f'\n{skip_path}')

# with open(skip_path, 'r') as csv_f:
#     scene_skip_lines = csv_f.readlines()
# scene_skip_header = scene_skip_lines.pop(0)

# # Drop the comments and empty lines
# scene_skip_lines = [line.strip() for line in scene_skip_lines if line.strip() and line[0] != '#']

# # Sort by date then by tile
# scene_skip_lines = sorted(scene_skip_lines, key=lambda x:x.split(',')[0].split('_')[-1] + '_' + x.split(',')[0].split('_')[-2])

# # Identify duplicate scene IDs (as opposed to duplicate lines)
# # Note, this block is not removing any lines, just printing
# print('Duplicate Scene IDs:')

# if len({l.split(',')[0] for l in scene_skip_lines}) != len(scene_skip_lines):
#     for item, count in collections.Counter([l.split(',')[0] for l in scene_skip_lines]).items():
#         if count > 1:
#             print(item)

# # Identify lines with no reason
# print('\nMissing reason Scene IDs:')
# for l in scene_skip_lines:
#     if ',' not in l:
#         print(l)
#     elif l.split(',')[1].strip() == '':
#         print(l)
#     elif len(l.split(',')) > 2:
#         print(l)

# # # Identify duplicate lines (not duplicate SCENE IDs)
# # if len({line for line in scene_skip_lines}) != len(scene_skip_lines):
# #     print('Duplicate Lines:')
# #     for item, count in collections.Counter(scene_skip_lines).items():
# #         if count > 1:
# #             print(item)
# # 
# #     # # Uncomment to have the tool remove duplicate lines
# #     # scene_remove_lines = []
# #     # for item, count in collections.Counter(scene_skip_lines).items():
# #     #     if count > 1:
# #     #         scene_remove_lines.append(item)
# #     #         # print(item)
# #      
# #     # # Does this only remove the first one?
# #     # if scene_remove_lines:
# #     #     print(f'Removing {len(scene_remove_lines)} duplicate lines in file')
# #     #     for line in scene_remove_lines:
# #     #         print(line)
# #     #         scene_skip_lines.remove(line)
# # 
# # # Then recheck for duplicate SCENE_IDs (but different notes or dates)
# # scenes = {line.split(',')[0] for line in scene_skip_lines}           
# # if len(scenes) != len(scene_skip_lines):
# #     print('Duplicate scene IDs still in file')
    
# print('\nWriting updated scene skip list CSV')
# with open(skip_path.replace('.csv', '_sorted.csv'), 'w') as csv_f:
#     csv_f.write(scene_skip_header)
#     for i, line in enumerate(scene_skip_lines):
#         csv_f.write(line + '\n')

# print('\nDone')

In [None]:
# Remove existing images that are in the skip list
# WARNING: this cell permanently deletes assets from the collections listed below
scene_skip_url = '../v2p1_denmark.csv'
#scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_denmark.csv'
scene_skip_list = list(pd.read_csv(scene_skip_url)['SCENE_ID'].values)
# Order by the second-to-last "_" delimited field of the scene ID (descending)
scene_skip_list = sorted(scene_skip_list, key=lambda k: k.split('_')[-2], reverse=True)
print(f'Skip list images: {len(scene_skip_list)}')

# Named "coll_ids" (not "collections") so that the imported
#   "collections" module is not shadowed for the rest of the session
coll_ids = [
    'projects/openet/assets/ssebop/eu/era5land/landsat/v2_1',
]

for coll_id in coll_ids:
    print(f'\n{coll_id}')
    # Image IDs currently in the collection (a set for fast membership tests)
    scene_id_list = set(
        ee.ImageCollection(coll_id)
        # .filterDate('2022-09-01', '2025-01-01')
        # .filterBounds(ee.Geometry.BBox(-125, 25, -124, 42))
        # .filterBounds(ee.Geometry.BBox(-90, 25, -65, 50))
        .aggregate_array('system:index').getInfo()
    )
    print(f'  Images: {len(scene_id_list)}')

    for scene_id in scene_skip_list:
        # print(scene_id)
        # The skip list IDs are lower cased before comparing with the asset IDs
        if scene_id.lower() not in scene_id_list:
            continue
        image_id = f'{coll_id}/{scene_id.lower()}'
        print(f'Delete {image_id}')
        try:
            ee.data.deleteAsset(image_id)
        except Exception as e:
            # Best effort: report the reason and keep going with the next scene
            #   (Exception instead of a bare except so Ctrl-C still stops the loop)
            print(f'  could not delete asset, skipping ({e})')

print('\nDone')

In [None]:
### Print scenes with high masked count percentages
# Masked pixel percentage window of the scenes to review
#   (alternate window: min = 75, max = 80)
count_threshold_pct_min = 80
count_threshold_pct_max = 101

# Years of stats CSV files to read (inclusive)
start_year = 2022
end_year = 2025
years = [*range(start_year, end_year + 1)]

# Months to review, an empty list keeps all months (e.g. [6, 7, 8])
months = []

# Maximum number of scenes shown per tile and the thumbnail size
#   (alternate image_size: 700)
print_count = 10
image_size = 1200

# Read in the scene skip list
scene_skip_url = '../v2p1_denmark.csv'
# scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_denmark.csv'
scene_skip_df = pd.read_csv(scene_skip_url)
scene_skip_list = scene_skip_df['SCENE_ID'].tolist()
print(f'Skip list images: {len(scene_skip_list)}')

# Optional cloudscore skip list
# scene_cloudscore_url = '../v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# Read the per tile/year stats CSV files into a single dataframe
print('Reading image stats CSV files')
stats_df_list = []
for wrs2_tile in wrs2_list:
    # if int(wrs2_tile[1:4]) not in range(10, 25):
    #     continue

    for year in range(start_year, end_year + 1):
        wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
        if not os.path.isfile(wrs2_stats_path):
            # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
            continue
        try:
            wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
        except Exception as e:
            # Include the reason so that a bad file can be diagnosed
            print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping ({e})')
            continue
        if wrs2_stats_df.empty:
            continue
        # Derive the date and tile from fixed character positions in the scene ID
        wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
        wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
        stats_df_list.append(wrs2_stats_df)

# pd.concat() raises a cryptic "No objects to concatenate" ValueError on an
#   empty list, so fail here with a message that says what is actually missing
if not stats_df_list:
    raise ValueError(
        f'No non-empty stats CSV files were found in {stats_ws} '
        f'for years {start_year}-{end_year} and the tiles in wrs2_list'
    )

stats_df = pd.concat(stats_df_list)

# Scenes with CLOUD_COVER_LAND of 71 or more are treated as skipped,
#   but their rows are kept in the dataframe
high_cloud_rows = stats_df['CLOUD_COVER_LAND'] >= 71
scene_skip_list.extend(stats_df.loc[high_cloud_rows, 'SCENE_ID'].tolist())

# Drop the Landsat 7 (LE07) scenes dated 2022 or later
l7_2022_mask = (
    (stats_df['SCENE_ID'].str[:4] == 'LE07') &
    (stats_df['DATE'].str[:4] >= '2022')
)
stats_df = stats_df.loc[~l7_2022_mask]

# Optionally keep only the scenes in the requested months
if months:
    stats_df = stats_df.loc[stats_df['DATE'].str[4:6].astype(int).isin(months)]

# Compute the ratios
#   MASKED_PIXELS is the plain sum of the per class pixel counts
#   Other ratios that could be added the same way: ACCA_PIXELS, SHADOW_PIXELS,
#   or UNMASKED_PIXELS over TOTAL_PIXELS (SATURATED_PIXELS is not part of the sum)
stats_df = stats_df.assign(
    SNOW_COUNT_RATIO=lambda df: df['SNOW_PIXELS'] / df['TOTAL_PIXELS'],
    WATER_COUNT_RATIO=lambda df: df['WATER_PIXELS'] / df['TOTAL_PIXELS'],
    MASKED_PIXELS=lambda df: (
        df['CLOUD_PIXELS'] + df['CIRRUS_PIXELS'] + df['DILATE_PIXELS']
        + df['SHADOW_PIXELS']
        + df['SNOW_PIXELS']
        + df['WATER_PIXELS']
        + df['ACCA_PIXELS']
    ),
    CLOUD_COUNT_RATIO=lambda df: df['MASKED_PIXELS'] / df['TOTAL_PIXELS'],
)

# Number of scenes remaining
print(f'  {len(stats_df)}')

# Tiles that have scenes in the stats dataframe (most scenes first)
# Note, the loop below visits the tiles in a random order
wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# wrs2_tiles = ['']

# Stop after this many tiles have had scenes displayed
max_wrs2_count = 20

# Set copy of the skip list for constant time membership tests
scene_skip_set = set(scene_skip_list)


def _landsat_image(scene_id, landsat_type):
    """Return the Collection 2 Tier 1 Level 2 image for the scene ID."""
    return ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')


def _true_color_image(landsat_img, landsat_type):
    """Return the scaled RGB surface reflectance image with non-land pixels set to 0.25."""
    sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
    return sr_img.where(land_mask.unmask().eq(0), 0.25)


def _fmask_thumb_url(landsat_img, landsat_type):
    """Return a thumbnail URL of the true color image with the Fmask classes drawn on top."""
    fmask_vis_img = (
        fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max)
        .visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette)
    )
    return (
        _true_color_image(landsat_img, landsat_type).visualize(min=0, max=0.3, gamma=1.25)
        .blend(fmask_vis_img)
        .getThumbURL({
            'region': landsat_img.geometry().bounds(1, 'EPSG:4326'),
            'dimensions': image_size,
        })
    )


def _neighbor_thumb_url(scene_id, landsat_type):
    """Return the Fmask thumbnail URL for a neighboring scene, or None if the request fails."""
    try:
        return _fmask_thumb_url(_landsat_image(scene_id, landsat_type), landsat_type)
    except Exception:
        # The neighboring scene may not exist; Exception (instead of a bare
        #   except) keeps Ctrl-C working during the long review loop
        return None


def _cloud_cover_land(scene_id):
    """Return CLOUD_COVER_LAND for the scene from stats_df, or None if it has no row."""
    scene_stats_df = stats_df.loc[stats_df['SCENE_ID'] == scene_id]
    if len(scene_stats_df):
        return float(scene_stats_df.iloc[0]['CLOUD_COVER_LAND'])
    return None


new_skip_scenes = []
wrs2_i = 0

# Other tile orders to try: reversed(wrs2_tiles), sorted(wrs2_tiles), reversed(sorted(wrs2_tiles))
for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
    if wrs2_i >= max_wrs2_count:
        break
    if wrs2_skip_list and (wrs2 in wrs2_skip_list):
        continue
    # Optional path/row filters
    # if int(wrs2[1:4]) != 10:
    #     continue
    # if int(wrs2[5:8]) >= 30:
    #     continue
    print(wrs2)

    # Path/row strings used to build the scene IDs of the rows above and below
    wrs2_path = int(wrs2[1:4])
    wrs2_row = int(wrs2[5:8])
    wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
    wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
    wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'

    # The skip list is applied here so that the main stats DF keeps all scenes
    #   (the neighbor lookups below need the skipped scenes too)
    cloud_ratio = stats_df['CLOUD_COUNT_RATIO']
    wrs2_stats_df = stats_df[
        (stats_df['WRS2'] == wrs2)
        & ~stats_df['SCENE_ID'].isin(scene_skip_set)
        # & ~stats_df['SCENE_ID'].isin(scene_cloudscore_list)
        & (cloud_ratio >= (count_threshold_pct_min / 100))
        & (cloud_ratio < (count_threshold_pct_max / 100))
    ].sort_values('CLOUD_COUNT_RATIO', ascending=False)

    # # Filter on the CLOUD_COVER_LAND property instead
    # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]
    # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] >= 69]
    # wrs2_stats_df = wrs2_stats_df.sort_values('CLOUD_COVER_LAND', ascending=False)

    if wrs2_stats_df.empty:
        continue
    print(f'{wrs2} - {len(wrs2_stats_df)}')

    wrs2_skip_scenes = []

    # Show a random subset of the candidate scenes
    # (use wrs2_stats_df.iterrows() to walk them in sorted order instead)
    for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():

        scene_id = row["SCENE_ID"].upper()
        landsat_type = scene_id.split('_')[0]

        above_scene_id = scene_id.replace(wrs2_tgt, wrs2_above)
        below_scene_id = scene_id.replace(wrs2_tgt, wrs2_below)
        above_cloud_pct = _cloud_cover_land(above_scene_id)
        below_cloud_pct = _cloud_cover_land(below_scene_id)

        # # Only show scenes that have above & below both skipped or None
        # if (((above_scene_id not in scene_skip_set) and (above_cloud_pct is not None)) or
        #     ((below_scene_id not in scene_skip_set) and (below_cloud_pct is not None))):
        #     continue

        # # Only show scenes that have either above or below skipped or None
        # if (((above_scene_id not in scene_skip_set) and (above_cloud_pct is not None)) and
        #     ((below_scene_id not in scene_skip_set) and (below_cloud_pct is not None))):
        #     continue

        landsat_img = _landsat_image(scene_id, landsat_type)

        # Landsat true color image
        landsat_url = (
            _true_color_image(landsat_img, landsat_type)
            .getThumbURL({
                'min': 0.0, 'max': 0.30, 'gamma': 1.25,
                'region': landsat_img.geometry().bounds(1, 'EPSG:4326'),
                'dimensions': image_size,
            })
        )

        # Landsat true color with Fmask
        fmask_url = _fmask_thumb_url(landsat_img, landsat_type)

        print('#'*80)
        print(
            f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
            f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
            f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
        )
        ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)

        # Show the images below and above the target wrs2 (whichever could be loaded)
        neighbor_labels = []
        neighbor_urls = []
        for neighbor_id, neighbor_pct in [
            (below_scene_id, below_cloud_pct),
            (above_scene_id, above_cloud_pct),
        ]:
            neighbor_url = _neighbor_thumb_url(neighbor_id, landsat_type)
            if not neighbor_url:
                continue
            skipped = ' (skipped)' if neighbor_id in scene_skip_set else ''
            neighbor_labels.append(f'{neighbor_id} ({neighbor_pct}){skipped}')
            neighbor_urls.append(neighbor_url)

        if neighbor_urls:
            print('  '.join(neighbor_labels))
            ipyplot.plot_images(neighbor_urls, img_width=image_size)

        wrs2_skip_scenes.append(scene_id)
        # Only matters when iterating over all rows instead of the sample
        if len(wrs2_skip_scenes) >= print_count:
            break

    if wrs2_skip_scenes:
        wrs2_i += 1
        for scene_id in wrs2_skip_scenes:
            print(scene_id)
        new_skip_scenes.extend(wrs2_skip_scenes)

# Every displayed scene is listed as a candidate for the skip list
print('\nNew Skip Scenes')
for scene_id in new_skip_scenes:
    print(scene_id)

print('\nDone')
