In [None]:
import collections
import datetime
import os
import pprint
import random

import ee
import numpy as np
import pandas as pd
import openet.core

from IPython.display import Image, display
import ipyplot


In [None]:
# Connect to Earth Engine through the high-volume endpoint
ee.Initialize(project='ee-cmorton', opt_url='https://earthengine-highvolume.googleapis.com')

# Workspace for the stats CSV files that are read later in the notebook
#   (created in the current working directory if it does not exist yet)
stats_ws = os.path.join(os.getcwd(), 'stats')
os.makedirs(stats_ws, exist_ok=True)
    

In [None]:
# # Get the list of WRS2 tiles from the SSEBop collection
# wrs2_list = sorted(
#     # ee.ImageCollection('projects/openet/assets/intercomparison/ssebop/landsat/c02/v0p2p6')
#     ee.ImageCollection('projects/usgs-gee-nhm-ssebop/assets/ssebop/landsat/c02')
#     .filterDate('2020-01-01', '2024-01-01')
#     .aggregate_histogram('wrs2_tile').keys().getInfo(),
#     reverse=True
# )
# wrs2_list = wrs2_list + ['p018r028']
# # pprint.pprint(wrs2_list)

# WRS2 tiles that are excluded from the tile list (and from the review loop)
wrs2_skip_list = ['p010r027', 'p010r030']

# Distinct "wrs2_tile" values of the custom WRS2 features that intersect the
#   bounding box and are not in the skip list, sorted in descending order
wrs2_list = (
    ee.FeatureCollection('projects/openet/assets/features/wrs2/custom')
    .filterBounds(ee.Geometry.BBox(-124, 26, -68, 50))
    .filter(ee.Filter.inList('wrs2_tile', wrs2_skip_list).Not())
    .aggregate_histogram('wrs2_tile')
    .keys()
    .getInfo()
)
wrs2_list.sort(reverse=True)
print(len(wrs2_list))


# Tiles in the "ocean" group, one WRS2 path per line
# NOTE(review): presumably tiles that are mostly ocean - confirm with the author
ocean_wrs2_list = [
    'p048r027',
    'p047r031', 'p047r030', 'p047r029',
    'p046r033',
    'p045r034',
    'p044r035',
    'p043r036',
    'p041r037',
    'p040r038',
    'p038r041', 'p038r040',
    'p025r040',
    'p024r040', 'p024r027',
    'p023r040', 'p023r027',
    'p022r040',
    'p021r040',
    'p020r029',
    'p017r041',
    'p016r038',
    'p015r040', 'p015r037',
    'p013r033',
    'p012r032',
    'p011r031', 'p011r030',
]

# Tiles in the "california" group, built from (path, first row, last row)
#   with both rows inclusive
california_wrs2_list = [
    f'p{path:03d}r{row:03d}'
    for path, first_row, last_row in (
        (38, 36, 37),
        (39, 35, 37),
        (40, 34, 37),
        (41, 34, 37),
        (42, 33, 36),
        (43, 31, 35),
        (44, 31, 34),
        (45, 31, 33),
        (46, 31, 32),
        (47, 31, 31),
    )
    for row in range(first_row, last_row + 1)
]


In [None]:
# Land mask: the inverse of the OpenET water mask asset, then set to 0 anywhere
#   the NALCMS image is class 18 (the NLCD/NALCMS water mask)
# Pixels with no NALCMS value are unmasked to 18 first, so they are also set to 0
land_mask = (
    ee.Image('projects/openet/assets/features/water_mask').Not()
    .where(ee.Image("USGS/NLCD_RELEASES/2020_REL/NALCMS").unmask(18).eq(18), 0)
)
# land_mask = land_mask.And(ee.Image("USGS/NLCD_RELEASES/2020_REL/NALCMS").unmask(18).neq(18))
# # land_mask = ee.Image('projects/openet/assets/meteorology/conus404/ancillary/land_mask')

# ET fraction collection and band (alternate collections are left commented out)
# etf_coll_id = 'projects/openet/assets/ssebop/conus/gridmet/landsat/c02'
# etf_coll_id = 'projects/openet/assets/intercomparison/ssebop/landsat/c02/v0p2p6'
etf_coll_id = 'projects/usgs-gee-nhm-ssebop/assets/ssebop/landsat/c02'
band_name = 'et_fraction'

# Red, green, blue surface reflectance band names keyed by the Landsat
#   collection prefix of the scene ID
rgb_bands = {
    landsat_type: [f'SR_B{band_number}' for band_number in band_numbers]
    for landsat_type, band_numbers in (
        ('LT04', (3, 2, 1)),
        ('LT05', (3, 2, 1)),
        ('LE07', (3, 2, 1)),
        ('LC08', (4, 3, 2)),
        ('LC09', (4, 3, 2)),
    )
}

# Palette for the "fmask" band values 0 through fmask_max
fmask_palette = ", ".join([
    'ffffff',  # 0 - white
    '9effa1',  # 1 - no fill (green)
    'blue',    # 2 - shadow (dark blue)
    '00aff2',  # 3 - snow (light blue)
    'dddddd',  # 4 - cloud (light gray)
    'purple',  # 5 - water (purple)
    'bfbfbf',  # 6 - ocean mask
])
fmask_max = 6


In [None]:
# # Intercomparison sites and dates

# sites_csv = '/Users/Charles.Morton@dri.edu/Projects/openet-tools/intercomparison/master_flux_station_list.csv'
# sites_df = pd.read_csv(sites_csv)

# interp_days = 32
# site_keep_list = []
# wrs2_delimiter = ';'

# # Hardcoding the sites CSV field names for now
# start_field = 'START_DATE'
# end_field = 'END_DATE'
# site_field = 'SITE_ID'
# lat_field = 'LATITUDE'
# lon_field = 'LONGITUDE'
# wrs2_field = 'WRS2_TILES'

# from dateutil.relativedelta import relativedelta

# # Group the date ranges by WRS2 tile
# # print(f'\nGrouping overlapping dates')
# wrs2_dates = collections.defaultdict(list)
# wrs2_sites = collections.defaultdict(list)
# for (site_i, site) in sites_df.iterrows():
#     # print(site_i)
#     # print(site)
#     if site['RANDOM_SELECTION'] not in [0, 1]:
#         # print('  Unsupported RANDOM_SELECTION value')
#         input('ENTER')
#         continue
#     # if site['RANDOM_SELECTION'] != 1:
#     #     continue
#     if site_keep_list and site.loc[site_field] not in site_keep_list:
#         # print('  Site not in keep list - skipping')
#         continue

#     # Include all sites in INI file, even those outside the date range
#     for wrs2 in site.loc[wrs2_field].split(wrs2_delimiter):
#         wrs2_sites[wrs2.strip()].append([
#             round(site.loc[lon_field], 6), round(site.loc[lat_field], 6)
#         ])

#     start_dt = datetime.datetime.strptime(site.loc[start_field], '%Y-%m-%d')
#     end_dt = datetime.datetime.strptime(site.loc[end_field], '%Y-%m-%d')
#     # print(f'  Start Date: {start_dt.strftime("%Y-%m-%d")}')
#     # print(f'  End Date:   {end_dt.strftime("%Y-%m-%d")}')

#     # If start/end dates are within N gap days of the start/end of the month
#     #   consider it a "full" month
#     gap_days = 5
#     # print('  Snapping start date to month')
#     month_start_dt = datetime.datetime(start_dt.year, start_dt.month, 1)
#     if (start_dt - month_start_dt).days <= gap_days:
#         # print('    full month')
#         start_dt = month_start_dt
#     else:
#         # print('    not full month')
#         start_dt = month_start_dt + relativedelta(months=1)

#     # print('  Snapping end date to month')
#     month_end_dt = end_dt + relativedelta(months=1)
#     month_end_dt = datetime.datetime(month_end_dt.year, month_end_dt.month, 1)
#     month_end_dt = month_end_dt - relativedelta(days=1)
#     if (month_end_dt - end_dt).days <= gap_days:
#         # print('    full month')
#         end_dt = month_end_dt
#     else:
#         # print('    not full month')
#         end_dt = (month_end_dt + relativedelta(days=1) -
#                   relativedelta(months=1) - relativedelta(days=1))
#     # print(f'  Start Date: {start_dt.strftime("%Y-%m-%d")}')
#     # print(f'  End Date:   {end_dt.strftime("%Y-%m-%d")}')

#     if interp_days > 0:
#         # Buffer the date ranges by the interpolate days value if set
#         # print('  Buffering start/end dates')
#         start_dt = start_dt - datetime.timedelta(days=interp_days)
#         end_dt = end_dt + datetime.timedelta(days=interp_days)
#         # print(f'  Start Date: {start_dt.strftime("%Y-%m-%d")}')
#         # print(f'  End Date:   {end_dt.strftime("%Y-%m-%d")}')

#     # CM - Changing conditionals to get single date ranges to work
#     # if end_dt <= start_dt or start_dt >= end_dt:
#     if end_dt < start_dt or start_dt > end_dt:
#         # print(f'  Start: {start_dt.strftime("%Y-%m-%d")}')
#         # print(f'  End:   {end_dt.strftime("%Y-%m-%d")}')
#         # print('  Date range outside min/max, skipping')
#         continue
#     else:
#         # print(f'  Start: {start_dt.strftime("%Y-%m-%d")}')
#         # print(f'  End:   {end_dt.strftime("%Y-%m-%d")}')
#         pass

#     for wrs2 in site.loc[wrs2_field].split(wrs2_delimiter):
#         wrs2_dates[wrs2.strip()].append([start_dt, end_dt])

# # pprint.pprint(wrs2_dates)

# # Merge the date ranges that overlap
# print(f'\nMerging overlapping dates')
# merged_dates = {}
# for wrs2, dates in sorted(wrs2_dates.items()):
#     # print(f'  {wrs2}')
#     # pprint.pprint(sorted(dates))

#     # Push the first interval on to the stack
#     merged_dates[wrs2] = [sorted(dates)[0]]

#     # Only check for overlapping ranges if there is more than 1 range
#     if len(dates) == 1:
#         continue

#     for d in sorted(dates)[1:]:
#         # If the current date range doesn't overlap, add it to the stack
#         if d[0] > merged_dates[wrs2][-1][1]:
#             merged_dates[wrs2].append(d)
#         # If the ranges overlap and the end date is later,
#         #   update the end time of the stack value
#         elif ((d[0] <= merged_dates[wrs2][-1][1]) and
#               (d[1] > merged_dates[wrs2][-1][1])):
#             merged_dates[wrs2][-1][1] = d[1]

# # pprint.pprint(merged_dates)

# # # CGM - Splitting by year for DisALEXI is not needed if the NLCD
# # #   is set to the image collection instead of the image
# # # For DisALEXI split the date ranges by year after merging
# # # For other models, index by the first year in the range
# # # This may be functionality we will want for other models later
# # year_dates = collections.defaultdict(dict)
# # # if model in ['DISALEXI_TAIR_10K', 'DISALEXI_TAIR_1K', 'DISALEXI', 'DISALEXI_TAIR_DIRECT']:
# # #     for wrs2, dates in merged_dates.items():
# # #         # split_dates = {}
# # #         for date_i, date in enumerate(dates):
# # #             for year in range(date[0].year, date[1].year+1):
# # #                 year_date = [
# # #                     max(date[0], datetime.datetime(year, 1, 1)),
# # #                     min(date[1], datetime.datetime(year, 12, 31)),
# # #                 ]
# # #                 try:
# # #                     year_dates[wrs2][year].append(year_date)
# # #                 except:
# # #                     year_dates[wrs2][year] = [year_date]
# # # else:
# # for wrs2, dates in merged_dates.items():
# #     year_dates[wrs2][dates[0][0].year] = dates

# # pprint.pprint(year_dates)


In [None]:
def fmask(landsat_img):
    """Build an old "Fmask" style class band from the QA_PIXEL band of a Landsat image.

    Args:
        landsat_img: Earth Engine image with the QA_PIXEL, QA_RADSAT, and SR_B4
            bands and the SPACECRAFT_ID property.

    Returns:
        Single band image named "fmask" with the values:
            1 - SR_B4 is masked
            2 - shadow (QA_PIXEL bit 4)
            3 - snow (QA_PIXEL bit 5)
            4 - cloud, dilated cloud, or cirrus (QA_PIXEL bits 3, 1, 2)
            5 - water (QA_PIXEL bit 7)
        Each class is applied on top of the previous ones, so when more than one
        flag is set the order of precedence is cloud, snow, shadow, water.
        Pixels with none of the classes (value 0) are masked out.
    """
    # The fmask image is added on top of the true color image by the caller
    qa_img = landsat_img.select('QA_PIXEL')

    def qa_flag(bit):
        # Nonzero where the single QA_PIXEL bit at this position is set
        return qa_img.rightShift(bit).bitwiseAnd(1).neq(0)

    fill_mask = qa_img.bitwiseAnd(1).neq(0)  # bits: 0
    dilate_mask = qa_flag(1)
    cirrus_mask = qa_flag(2)
    cloud_mask = qa_flag(3)
    shadow_mask = qa_flag(4)
    snow_mask = qa_flag(5)
    clear_mask = qa_flag(6)
    water_mask = qa_flag(7)
    # Two bit confidence values (currently unused)
    # cloud_conf = qa_img.rightShift(8).bitwiseAnd(3)          # bits: 8, 9
    # shadow_conf = qa_img.rightShift(10).bitwiseAnd(3)        # bits: 10, 11
    # snow_conf = qa_img.rightShift(12).bitwiseAnd(3)          # bits: 12, 13
    # cirrus_conf = qa_img.rightShift(14).bitwiseAnd(3)        # bits: 14, 15

    # Saturated pixels (only used by the commented out .where() call below)
    # Flag as saturated if any of the RGB bands are saturated
    #   or change .gt(0) to .gt(7) to flag if all RGB bands are saturated
    # Comment out rightShift line to flag if saturated in any band
    # The QA_RADSAT bit shift depends on the spacecraft
    bitshift = ee.Dictionary({
        'LANDSAT_4': 0, 'LANDSAT_5': 0, 'LANDSAT_7': 0,
        'LANDSAT_8': 1, 'LANDSAT_9': 1,
    })
    spacecraft_id = ee.String(landsat_img.get('SPACECRAFT_ID'))
    saturated_mask = (
        landsat_img.select('QA_RADSAT')
        .rightShift(ee.Number(bitshift.get(spacecraft_id)))
        .bitwiseAnd(7)
        .gt(0)
    )

    # Later .where() calls overwrite the earlier ones
    any_cloud_mask = cloud_mask.Or(dilate_mask).Or(cirrus_mask)
    sr_masked = landsat_img.select(['SR_B4']).mask().eq(0)
    fmask_img = (
        qa_img.multiply(0)
        .where(sr_masked, 1)
        # .where(saturated_mask, 6)
        .where(water_mask, 5)
        .where(shadow_mask, 2)
        .where(snow_mask, 3)
        .where(any_cloud_mask, 4)
        # .add(shadow_mask.multiply(2))
        # .add(snow_mask.multiply(3))
        # .add(cloud_mask.Or(dilate_mask).Or(cirrus_mask).multiply(4))
        # .add(cloud_mask.Or(dilate_mask).multiply(4))
        # .add(cloud_mask.And(cloud_conf).multiply(4))
        # .add(water_mask.multiply(5))
    )

    # Only keep the pixels that were assigned a class
    classified = fmask_img.neq(0)
    return fmask_img.updateMask(classified).rename(['fmask'])


In [None]:
# Clean up the scene skip list file
# Reads the skip list CSV, drops the comment and empty lines, sorts the rows by
#   date then by tile, prints the rows that look wrong, and writes the result
#   to a "_sorted.csv" file next to the input (the input file is not modified)
skip_path = '../v2p1.csv'
print(f'\n{skip_path}')

with open(skip_path, 'r') as csv_f:
    scene_skip_lines = csv_f.readlines()
scene_skip_header = scene_skip_lines.pop(0)

# Drop the comments and empty lines
# Strip before checking for "#" so that indented comment lines are dropped too
#   (otherwise they reach the sort below and fail on the missing "_" parts)
scene_skip_lines = [line.strip() for line in scene_skip_lines]
scene_skip_lines = [line for line in scene_skip_lines if line and not line.startswith('#')]


def _date_tile_key(line):
    """Return "<date>_<tile>" from the last two "_" separated parts of the scene ID field."""
    scene_id_parts = line.split(',')[0].split('_')
    return scene_id_parts[-1] + '_' + scene_id_parts[-2]


# Sort by date then by tile
scene_skip_lines = sorted(scene_skip_lines, key=_date_tile_key)

# Identify duplicate scene IDs (as opposed to duplicate lines)
# Note, this block is not removing any lines, just printing
print('Duplicate Scene IDs:')
scene_id_counts = collections.Counter(line.split(',')[0] for line in scene_skip_lines)
for dup_scene_id, dup_count in scene_id_counts.items():
    if dup_count > 1:
        print(dup_scene_id)

# Identify lines with no reason
# Lines are printed if they do not have exactly two comma separated fields
#   or if the second field is empty
print('\nMissing reason Scene IDs:')
for line in scene_skip_lines:
    fields = line.split(',')
    if len(fields) != 2 or fields[1].strip() == '':
        print(line)

# # Identify duplicate lines (not duplicate SCENE IDs)
# if len({line for line in scene_skip_lines}) != len(scene_skip_lines):
#     print('Duplicate Lines:')
#     for item, count in collections.Counter(scene_skip_lines).items():
#         if count > 1:
#             print(item)
#
#     # # Uncomment to have the tool remove duplicate lines
#     # scene_remove_lines = []
#     # for item, count in collections.Counter(scene_skip_lines).items():
#     #     if count > 1:
#     #         scene_remove_lines.append(item)
#     #         # print(item)
#
#     # # Does this only remove the first one?
#     # if scene_remove_lines:
#     #     print(f'Removing {len(scene_remove_lines)} duplicate lines in file')
#     #     for line in scene_remove_lines:
#     #         print(line)
#     #         scene_skip_lines.remove(line)
#
# # Then recheck for duplicate SCENE_IDs (but different notes or dates)
# scenes = {line.split(',')[0] for line in scene_skip_lines}
# if len(scenes) != len(scene_skip_lines):
#     print('Duplicate scene IDs still in file')

print('\nWriting updated scene skip list CSV')
with open(skip_path.replace('.csv', '_sorted.csv'), 'w') as csv_f:
    # The header is written back exactly as it was read (including the newline)
    csv_f.write(scene_skip_header)
    csv_f.writelines(f'{line}\n' for line in scene_skip_lines)

print('\nDone')

In [None]:
# Build the EEMETRIC skip list by merging the EEMETRIC error list and the full skip list
scene_skip_path = '../v2p1_sorted.csv'
eemetric_error_path = '../v2p1_eemetric_error.csv'
eemetric_skip_path = '../v2p1_eemetric.csv'

with open(eemetric_error_path, 'r') as csv_f:
    eemetric_skip_lines = list(csv_f)

with open(scene_skip_path, 'r') as csv_f:
    scene_skip_lines = list(csv_f)

print('\nWriting eemetric scene skip list CSV')
with open(eemetric_skip_path, 'w') as csv_f:
    # All of the EEMETRIC error list lines are written first, unchanged
    csv_f.writelines(eemetric_skip_lines)
    # A newline is always written between the two lists
    csv_f.write('\n')
    # The first line of the sorted skip list is not written
    csv_f.writelines(scene_skip_lines[1:])

print('\nDone')

In [None]:
### Print scenes with high masked count percentages
# Masked pixel percentage window of the scenes to review
#   (the minimum is inclusive and the maximum is exclusive in the filter below)
#count_threshold_pct_min = 80
#count_threshold_pct_min = 89
#count_threshold_pct_max = 101
count_threshold_pct_min = 75
count_threshold_pct_max = 101

# Years of stats CSV files to read (the end year is included)
#start_year = 2003
#start_year = 2015
#start_year = 2025
start_year = 1984
end_year = 2025
years = [*range(start_year, end_year + 1)]

# Maximum number of scenes shown per tile
print_count = 10
# Thumbnail dimensions and display width
# image_size = 700
# image_size = 900
# image_size = 1024
image_size = 1400

# Read in the scene skip list
# scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
scene_skip_url = '../v2p1.csv'
scene_skip_df = pd.read_csv(scene_skip_url)
scene_skip_list = scene_skip_df['SCENE_ID'].tolist()
print(f'Skip list images: {len(scene_skip_list)}')

# Read in the cloudscore skip list
# scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
scene_cloudscore_url = '../v2p1_cloudscore.csv'
scene_cloudscore_list = pd.read_csv(scene_cloudscore_url)['SCENE_ID'].tolist()
print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


print('Reading image stats CSV files')
stats_df_list = []
for wrs2_tile in wrs2_list:
    # if int(wrs2_tile[1:4]) not in range(10, 25):
    #     continue

    for year in range(start_year, end_year + 1):
        # The stats files are stored as <stats_ws>/<year>/<tile>_<year>.csv
        wrs2_stats_path = os.path.join(stats_ws, str(year), f'{wrs2_tile}_{year}.csv')
        if not os.path.isfile(wrs2_stats_path):
            # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
            continue

        try:
            wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
        except Exception:
            print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
            continue

        if not wrs2_stats_df.empty:
            # The date and the path/row are parsed from fixed positions in the scene ID
            scene_ids = wrs2_stats_df['SCENE_ID']
            wrs2_stats_df['DATE'] = scene_ids.str[12:20]
            wrs2_stats_df['WRS2'] = 'p' + scene_ids.str[5:8] + 'r' + scene_ids.str[8:11]
            stats_df_list.append(wrs2_stats_df)

# Note, the original row index of each file is kept (the index is not reset)
stats_df = pd.concat(stats_df_list)

# Scenes with a CLOUD_COVER_LAND of 71 or more are added to the skip list
#   but they are not removed from the dataframe
high_cloud_df = stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]
scene_skip_list.extend(high_cloud_df['SCENE_ID'].values)

# Drop the Landsat 7 scenes from 2022 onward
l7_2022_mask = (
    (stats_df['SCENE_ID'].str[:4] == 'LE07')
    & (stats_df['DATE'].str[:4] >= '2022')
)
stats_df = stats_df[~l7_2022_mask]

# Compute the ratios (all are relative to TOTAL_PIXELS)
total_pixels = stats_df['TOTAL_PIXELS']
# stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / total_pixels
# stats_df['SHADOW_COUNT_RATIO'] = stats_df['SHADOW_PIXELS'] / stats_df['TOTAL_PIXELS']
stats_df['WATER_COUNT_RATIO'] = stats_df['WATER_PIXELS'] / total_pixels

# Pixel counts that are summed (in this order) into the masked pixel count
masked_pixel_columns = [
    'CLOUD_PIXELS', 'CIRRUS_PIXELS', 'DILATE_PIXELS',
    'SHADOW_PIXELS',
    'SNOW_PIXELS',
    # 'WATER_PIXELS',
    'ACCA_PIXELS',
    # 'SATURATED_PIXELS',
]
masked_pixels = stats_df[masked_pixel_columns[0]]
for masked_pixel_column in masked_pixel_columns[1:]:
    masked_pixels = masked_pixels + stats_df[masked_pixel_column]
stats_df['MASKED_PIXELS'] = masked_pixels

# Note, the "cloud" count ratio is computed from all of the masked pixel counts
stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / total_pixels
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# Number of scenes (rows) remaining in the dataframe
print(f'  {len(stats_df.index)}')

# Tiles ordered by the number of scenes in the stats dataframe (largest first)
# Note, the active loop below visits the tiles in random order, so this order
#   only matters for the commented out loop alternatives
wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# wrs2_tiles = ['']

# Stop after this many tiles have had at least one scene displayed
max_review_tiles = 20

# Set copy of the skip list for the "is the neighbor scene skipped" checks
scene_skip_set = set(scene_skip_list)


def _cloud_cover_land(scene_id):
    """Return CLOUD_COVER_LAND of the first stats row for scene_id, or None if there is no row."""
    scene_stats_df = stats_df.loc[stats_df['SCENE_ID'] == scene_id]
    if len(scene_stats_df):
        return float(scene_stats_df.iloc[0]['CLOUD_COVER_LAND'])
    return None


def _fmask_thumb_url(scene_img, landsat_type):
    """Return the thumbnail URL of the true color image with the Fmask classes blended on top.

    Pixels where the land mask is 0 (or masked) are drawn with a constant value
    in the true color image and with the fmask_max class in the Fmask overlay.
    """
    scene_region = scene_img.geometry().bounds(1, 'EPSG:4326')
    # Apply the Collection 2 Level 2 surface reflectance scale and offset
    scene_sr_img = scene_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
    not_land = land_mask.unmask().eq(0)
    return (
        scene_sr_img.where(not_land, 0.25).visualize(min=0, max=0.3, gamma=1.25)
        .blend(
            fmask(scene_img).where(not_land, fmask_max)
            .visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette)
        )
        .getThumbURL({'region': scene_region, 'dimensions': image_size})
    )


def _neighbor_thumb_url(scene_id, landsat_type):
    """Return the Fmask thumbnail URL for a neighboring scene, or None if it can't be built.

    The neighbor scene ID is derived from the target scene ID and the image may
    not exist, so request errors are treated as "no image".  Only Exception is
    caught so that interrupting the kernel still stops the review loop.
    """
    try:
        return _fmask_thumb_url(ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}'), landsat_type)
    except Exception:
        return None


new_skip_scenes = []
new_skip_count = 0

wrs2_i = 0

# for wrs2 in reversed(wrs2_tiles):
# for wrs2 in reversed(sorted(wrs2_tiles)):
# for wrs2 in sorted(wrs2_tiles):
for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
    if wrs2_i >= max_review_tiles:
        break
    if wrs2_skip_list and (wrs2 in wrs2_skip_list):
        continue
    # if california_wrs2_list and (wrs2 not in california_wrs2_list) and wrs2 not in ['p042r033']:
    #     continue
    # if int(wrs2[5:8]) >= 29:
    #     continue

    # Path/row strings (as they appear in the scene ID) for the target tile
    #   and for the adjacent rows in the same path
    wrs2_path = int(wrs2[1:4])
    wrs2_row = int(wrs2[5:8])
    wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
    wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
    wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'

    wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
    # Applying skip list here so that main stats DF has all scenes
    wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
    wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]

    # Only check winter scenes
    #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'].str.slice(4,6).astype(int).isin([11, 12, 1, 2, 3])]
    #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'].str.slice(4,6).astype(int).isin([10, 11, 12, 1, 2, 3, 4])]

    # Filter on the overall cloud count ratio
    wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
    wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
    wrs2_stats_df = wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False)

    # # Filter on the CLOUD_COVER_LAND property
    wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]
    #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] >= 60]
    #wrs2_stats_df.sort_values('CLOUD_COVER_LAND', ascending=False, inplace=True)

    if len(wrs2_stats_df) == 0:
        continue
    print(f'{wrs2} - {len(wrs2_stats_df)}')

    new_skip_scenes = []
    new_skip_count = 0

    # Review a random sample of the remaining scenes in the tile
    # for i, row in wrs2_stats_df.iterrows():
    for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():

        scene_id = row["SCENE_ID"].upper()

        # Same sensor and date scenes in the adjacent rows
        above_scene_id = scene_id.replace(wrs2_tgt, wrs2_above)
        below_scene_id = scene_id.replace(wrs2_tgt, wrs2_below)
        above_cloud_pct = _cloud_cover_land(above_scene_id)
        below_cloud_pct = _cloud_cover_land(below_scene_id)

        # # Only show scenes that have above & below both skipped or None
        # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or
        #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
        #     continue

        landsat_type = scene_id.split('_')[0].upper()
        landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
        landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
        landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

        # Landsat true color image
        # Errors for the target scene are not caught (only the neighbors are optional)
        landsat_url = (
            landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
            .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
        )

        # Landsat true color with Fmask
        fmask_url = _fmask_thumb_url(landsat_img, landsat_type)

        print('#'*80)
        print(
            f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
            f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
            f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
        )
        ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)

        # Show the images above and below the target wrs2
        above_url = _neighbor_thumb_url(above_scene_id, landsat_type)
        below_url = _neighbor_thumb_url(below_scene_id, landsat_type)

        above_skipped = ' (skipped)' if above_scene_id in scene_skip_set else ''
        below_skipped = ' (skipped)' if below_scene_id in scene_skip_set else ''

        if above_url and below_url:
            print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
            ipyplot.plot_images([below_url, above_url], img_width=image_size)
        elif above_url:
            print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
            ipyplot.plot_images([above_url], img_width=image_size)
        elif below_url:
            print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
            ipyplot.plot_images([below_url], img_width=image_size)

        new_skip_scenes.append(scene_id)
        new_skip_count += 1
        # The sample above is already limited to print_count scenes, but the check
        #   is still needed if the loop is switched to iterate over all of the rows
        if new_skip_count >= print_count:
            break

    # Print the displayed scene IDs so they can be copied into the skip list
    if new_skip_scenes:
        wrs2_i += 1
        for scene_id in new_skip_scenes:
            print(scene_id)

# print('\nNew Skip Scenes')
# if new_skip_scenes:
#     for scene_id in new_skip_scenes:
#         print(scene_id)

print('\nDone')


In [None]:
# ### Print scenes with high numbers of unmasked clouds
# #count_threshold_pct_min = 20
# #count_threshold_pct_min = 10
# count_threshold_pct_min = 5
# #count_threshold_pct_min = 2
# #count_threshold_pct_min = 1
# count_threshold_pct_max = 101

# count_threshold = 2000000

# start_year = 1984
# end_year = 2025
# years = list(range(start_year, end_year + 1))

# print_count = 20
# image_size = 1400

# # Read in the scene skip list
# scene_skip_url = '../v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_list = list(pd.read_csv(scene_skip_url)['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# scene_cloudscore_url = '../v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# red_band = 'SR_RED'
# green_band = 'SR_GREEN'
# blue_band = 'SR_BLUE'


# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2_tile in wrs2_list:
#     # if int(wrs2_tile[1:4]) not in range(10, 25):
#     #     continue
    
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path)
#         except Exception as e:
#             print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
#             os.remove(wrs2_stats_path)
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         wrs2_stats_df.drop(columns=['system:index', '.geo'], inplace=True)
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# stats_df = pd.concat(stats_df_list)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)
    
# # Skip the Landsat 7 scenes from 2022 onward
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]

# # Compute the ratios
# stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS']
#     + stats_df['CIRRUS_PIXELS']
#     + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     # + stats_df['WATER_PIXELS']
#     + stats_df['ACCA_PIXELS']
#     # + stats_df['SATURATED_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# print(f'  {len(stats_df.count(axis=1))}')

# # Work through the tiles based on which ones already have the most skipped scenes
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# # wrs2_tiles = ['']

# # new_skip_count = 0
# # new_skip_scenes = []

# # for wrs2 in reversed(wrs2_tiles):
# # for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
# # for wrs2 in sorted(wrs2_tiles):
# for wrs2 in reversed(sorted(wrs2_tiles)):
#     # if wrs2_i > 20:
#     #     break
#     if wrs2 in ['p021r040', 'p038r041']:
#         continue
#     # if wrs2 in ['p033r037', 'p039r031', 'p039r032', 'p039r033', 'p042r034', 'p042r032']:
#     #     continue
#     # if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#     #     continue
#     if ocean_wrs2_list and (wrs2 in ocean_wrs2_list):
#         continue
#     # if int(wrs2[1:4]) not in range(10, 24):
#     #     continue
#     # if int(wrs2[5:8]) not in range(35, 50):
#     #     continue

#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
#     # Applying skip list here so that main stats DF has all scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]
#     # # Only look at Landsat 8 and 9 for this test
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SCENE_ID'].str.slice(0,4).isin(['LC08', 'LC09'])]

#     # # Filter on the snow pixel count ratio
#     # wrs2_stats_df = wrs2_stats_df[stats_df['SNOW_COUNT_RATIO'] > 0.8]
#     # wrs2_stats_df.sort_values('SNOW_COUNT_RATIO', ascending=False, inplace=True)
    
#     # # Filter on the overall cloud count ratio
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
#     # # wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)
    
#     # # Only check summer scenes
#     # stats_df = stats_df[stats_df['DATE'].str.slice(4,6).astype(int).isin([5, 6, 7, 8, 9])]
    
#     # Only check winter scenes
#     # stats_df = stats_df[~stats_df['DATE'].str.slice(4,6).astype(int).isin(range(4, 11))]
#     # stats_df = stats_df[~stats_df['DATE'].str.slice(4,6).astype(int).isin(range(5, 10))]
#     # stats_df = stats_df[~stats_df['DATE'].str.slice(4,6).astype(int).isin(range(7, 10))]
    
#     # # Filter on the overall cloud count ratio
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
#     # wrs2_stats_df.sort_values('ACCA_COUNT_RATIO', ascending=False, inplace=True)

#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['UNMASKED_PIXELS'] < count_threshold]
#     # wrs2_stats_df.sort_values('UNMASKED_PIXELS', ascending=True, inplace=True)
#     wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)
    
#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     new_skip_count = 0
#     new_skip_scenes = []
    
#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(10, len(wrs2_stats_df.index))).iterrows():

#         scene_id = row["SCENE_ID"].upper()

#         # # Only review scenes that have the image above and below in the skip list
#         # if above_scene_id not in scene_skip_list:
#         #     continue
#         # if below_scene_id not in scene_skip_list:
#         #     continue

#         above_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_above)
#         above_stats_df = stats_df.loc[stats_df['SCENE_ID'] == above_scene_id]
#         if len(above_stats_df):
#             above_cloud_pct = float(above_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             above_cloud_pct = None
            
#         below_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_below)
#         below_stats_df = stats_df.loc[stats_df['SCENE_ID'] == below_scene_id]
#         if len(below_stats_df):
#             below_cloud_pct = float(below_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             below_cloud_pct = None

#         # # Only show scenes that have above & below both skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   
        
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row[red_band]:0.2f}  {row[green_band]:0.2f}  {row[blue_band]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
#         # Show the images above and below the target wrs2
#         above_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{above_scene_id}')
#         above_region = above_img.geometry().bounds(1, 'EPSG:4326')
#         above_sr_img = above_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             above_url = (
#                 above_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(above_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': above_region, 'dimensions': image_size})
#             )
#         except:
#             above_url = None
            
#         below_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{below_scene_id}')
#         below_region = below_img.geometry().bounds(1, 'EPSG:4326')
#         below_sr_img = below_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             below_url = (
#                 below_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(below_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': below_region, 'dimensions': image_size})
#             )
#         except:
#             below_url = None
    
#         above_skipped = f' (skipped)' if above_scene_id in scene_skip_list else ''   
#         below_skipped = f' (skipped)' if below_scene_id in scene_skip_list else ''
        
#         if above_url and below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([below_url, above_url], img_width=image_size)
#         elif above_url:
#             print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([above_url], img_width=image_size)
#         elif below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
#             ipyplot.plot_images([below_url], img_width=image_size)
    
#         new_skip_count += 1
#         new_skip_scenes.append(scene_id)
#         if new_skip_count >= print_count:
#             break

#     if new_skip_scenes:
#         for scene_id in new_skip_scenes:
#             print(scene_id)
#     if new_skip_count:
#         wrs2_i += 1
        
# # if new_skip_scenes:
# #     for scene_id in new_skip_scenes:
# #         print(scene_id)

# print('\nDone')


In [None]:
# ### Print scenes with high average reflectance
# # NOTE: every line of this cell is commented out, so nothing here runs as written.
# # When enabled, the cell reads the per-tile/per-year stats CSVs, keeps the scenes whose
# # average RGB reflectance lies between refl_min and refl_max, and displays thumbnails for
# # a random sample of those scenes together with the scenes in the adjacent WRS2 rows.
# # It uses names that are not defined in this cell: stats_ws, wrs2_list, wrs2_skip_list,
# # rgb_bands, land_mask, fmask, fmask_max and fmask_palette.

# # Lower reflectance bound for selecting scenes (alternate values kept for reference)
# # refl_min = 0.4
# # refl_min = 0.35
# # refl_min = 0.3
# # refl_min = 0.25
# refl_min = 0.2
# # refl_min = 0.19
# # refl_min = 0.18
# # refl_min = -0.1

# # Upper bound applied to the mean of the three band averages (REFL_SORT)
# refl_max = 1.5
# # refl_max = 0.5

# # image_size is used for the thumbnail 'dimensions' and the ipyplot image width;
# # print_count caps the number of scenes displayed per tile
# #image_size = 900
# #image_size = 1024
# image_size = 1400
# print_count = 10

# #start_year = 1984
# #start_year = 2015
# start_year = 2024
# end_year = 2025
# # NOTE(review): 'years' is not referenced again in this cell (the loop below rebuilds the range)
# years = list(range(start_year, end_year + 1))

# # # These are the average reflectance values after masking (for the unmasked pixels)
# red_band = 'UNMASKED_SR_RED'
# green_band = 'UNMASKED_SR_GREEN'
# blue_band = 'UNMASKED_SR_BLUE'
# # red_band = 'UNMASKED_TOA_RED'
# # green_band = 'UNMASKED_TOA_GREEN'
# # blue_band = 'UNMASKED_TOA_BLUE'

# # # These are the average reflectance values for the full scene before masking
# # red_band = 'SR_RED'
# # green_band = 'SR_GREEN'
# # blue_band = 'SR_BLUE'
# # red_band = 'TOA_RED'
# # green_band = 'TOA_GREEN'
# # blue_band = 'TOA_BLUE'

# # Read in the scene skip list
# # NOTE(review): the active path is an absolute path on one user's machine; the commented
# # GitHub URL on the following line is the portable alternative
# scene_skip_url = '/Users/Charles.Morton@dri.edu/Projects/scene-skip-list/v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_df = pd.read_csv(scene_skip_url)
# scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# # Read in the cloudscore skip list (same absolute-path caveat as above)
# scene_cloudscore_url = '/Users/Charles.Morton@dri.edu/Projects/scene-skip-list/v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# # NOTE(review): scene_keep_list is only referenced by the commented-out filter further down
# scene_keep_list = [
#     # 'LE07_039032_20000604',
#     # 'LE07_039032_19990805',
#     # 'LE07_039032_19990704',
#     # 'LE07_039032_20000604',
#     # 'LE07_037038_20110707',
#     # 'LE07_033038_20000525',
#     # 'LE07_040030_20190107',
#     # 'LE07_037032_20211006',
#     # 'LE07_041036_20001008',
# ]

# # Read every existing {wrs2}_{year}.csv under stats_ws/{year}/ and collect the dataframes
# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2 in wrs2_list:
#     # if int(wrs2[1:4]) not in range(31, 35):
#     #     continue
    
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path)
#         except Exception as e:
#             print(f'  {wrs2}_{year} - Error reading CSV, skipping')
#             # NOTE(review): any read error deletes the CSV from disk instead of only skipping it
#             os.remove(wrs2_stats_path)
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         wrs2_stats_df.drop(columns=['system:index', '.geo'], inplace=True)
#         # SCENE_ID looks like 'LE07_039032_20000604': chars 5-8 are the path, 8-11 the row,
#         # and 12-20 the acquisition date (YYYYMMDD)
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# # NOTE(review): pd.concat raises if stats_df_list is empty (no CSVs were found or readable)
# stats_df = pd.concat(stats_df_list)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# # # Skip and keep scenes will both be skipped in this processing since they have already been reviewed
# # if scene_skip_list:
# #     stats_df = stats_df[~stats_df['SCENE_ID'].isin(scene_skip_list)]
# # if scene_keep_list:
# #     stats_df = stats_df[~stats_df['SCENE_ID'].isin(scene_keep_list)]

# # Skip the Landsat 7 scenes from 2022 onward (the year is compared as a string)
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]

# # Compute the snow ratio and the masked (cloud) pixel count ratio
# stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['SNOW_COUNT_RATIO'] = (stats_df['SNOW_PIXELS'] + stats_df['SHADOW_PIXELS'] + stats_df['WATER_PIXELS'] + stats_df['DILATE_PIXELS']) / stats_df['TOTAL_PIXELS']
# # MASKED_PIXELS sums the cloud, cirrus, dilate, shadow and snow counts (water is excluded here)
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS']
#     + stats_df['CIRRUS_PIXELS']
#     + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     # + stats_df['WATER_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# # Print the number of rows (scenes) in the combined dataframe
# print(f'  {len(stats_df.count(axis=1))}')
# print('  Done\n')


# # List the tiles ordered by the number of scenes in stats_df (most first);
# # the loop below then visits them in random order
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# # wrs2_tiles = ['']

# # NOTE(review): new_skip_scenes is never appended to in this cell
# new_skip_scenes = []
# # Number of tiles that have displayed at least one scene
# wrs2_i = 0

# # for wrs2 in reversed(wrs2_tiles):
# # for wrs2 in reversed(sorted(wrs2_tiles)):
# # for wrs2 in sorted(wrs2_tiles):
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     # Stop once more than 20 tiles have had scenes displayed
#     if wrs2_i > 20:
#         break
#     if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#         continue
#     # if wrs2 in ['p040r036', 'p040r036', 'p039r032', 'p038r038']:
#     #     continue
#     # if california_wrs2_list and (wrs2 not in california_wrs2_list):
#     #     continue
#     # if int(wrs2[1:4]) not in range(29, 35):
#     #     continue
#     # Only review tiles whose WRS2 row is below 30
#     if int(wrs2[5:8]) >= 30:
#         continue

#     # Path/row strings as they appear inside SCENE_ID, for this tile and the rows above/below it
#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
#     # Applying skip list here so that main stats DF has all scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]
#     # # Only look at Landsat 8 and 9 for this test
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SCENE_ID'].str.slice(0,4).isin(['LC08', 'LC09'])]
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'] >= '19841201']

#     # Keep scenes where at least one of the three band averages exceeds refl_min
#     wrs2_stats_df = wrs2_stats_df[(wrs2_stats_df[red_band] > refl_min) | (wrs2_stats_df[green_band] > refl_min) | (wrs2_stats_df[blue_band] > refl_min)]
#     # wrs2_stats_df = wrs2_stats_df[(wrs2_stats_df[red_band] > refl_min) & (wrs2_stats_df[green_band] > refl_min) & (wrs2_stats_df[blue_band] > refl_min)]
#     # wrs2_stats_df.sort_values(red_band, ascending=False, inplace=True)
#     # Mean of the three band averages, used for the range filter and sort below
#     wrs2_stats_df['REFL_SORT'] = (wrs2_stats_df[red_band] + wrs2_stats_df[green_band] + wrs2_stats_df[blue_band]) / 3
    
#     # Filter on the mean RGB reflectance: refl_min <= REFL_SORT < refl_max
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['REFL_SORT'] < (refl_max)]
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['REFL_SORT'] >= (refl_min)]
#     # NOTE(review): this sort order is not kept by the random .sample() used in the loop below
#     wrs2_stats_df.sort_values('REFL_SORT', ascending=False, inplace=True)
#     # wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)
    
#     # Skip tiles with no remaining scenes; otherwise print the tile and its scene count
#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         # print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     # Number of scenes displayed for this tile
#     new_skip_count = 0
#     # Review a random sample of at most 10 scenes from this tile
#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(10, len(wrs2_stats_df.index))).iterrows():
        
#         scene_id = row["SCENE_ID"].upper()
        
#         # Build the scene ID for the row above by swapping the path/row substring, then look up
#         # its CLOUD_COVER_LAND in the full stats dataframe (None if it is not present)
#         above_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_above)
#         above_stats_df = stats_df.loc[stats_df['SCENE_ID'] == above_scene_id]
#         if len(above_stats_df):
#             above_cloud_pct = float(above_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             above_cloud_pct = None
            
#         # Same lookup for the row below
#         below_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_below)
#         below_stats_df = stats_df.loc[stats_df['SCENE_ID'] == below_scene_id]
#         if len(below_stats_df):
#             below_cloud_pct = float(below_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             below_cloud_pct = None

#         # # Only show scenes that have above & below both skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   
        
#         # The collection ID is built from the SCENE_ID prefix (e.g. 'LE07')
#         # NOTE(review): 0.0000275 and -0.2 look like the Landsat Collection 2 Level-2 surface
#         # reflectance scale and offset - confirm
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         # Pixels where the unmasked land_mask equals 0 are set to a constant 0.25
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Print the scene ID, pixel counts, ratios, CLOUD_COVER_LAND and the three band averages
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row[red_band]:0.2f}  {row[green_band]:0.2f}  {row[blue_band]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
    
#         # Show the images above and below the target wrs2
#         # NOTE(review): the bare except below turns any failure while building the thumbnail URL
#         # (presumably a neighbor scene that does not exist) into a None URL
#         above_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{above_scene_id}')
#         above_region = above_img.geometry().bounds(1, 'EPSG:4326')
#         above_sr_img = above_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             above_url = (
#                 above_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(above_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': above_region, 'dimensions': image_size})
#             )
#         except:
#             above_url = None
            
#         below_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{below_scene_id}')
#         below_region = below_img.geometry().bounds(1, 'EPSG:4326')
#         below_sr_img = below_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             below_url = (
#                 below_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(below_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': below_region, 'dimensions': image_size})
#             )
#         except:
#             below_url = None
    
#         # Label neighbors that are already in the skip list
#         above_skipped = f' (skipped)' if above_scene_id in scene_skip_list else ''   
#         below_skipped = f' (skipped)' if below_scene_id in scene_skip_list else ''
        
#         # Show whichever neighbor thumbnails were built (below first, then above)
#         if above_url and below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([below_url, above_url], img_width=image_size)
#         elif above_url:
#             print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([above_url], img_width=image_size)
#         elif below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
#             ipyplot.plot_images([below_url], img_width=image_size)
    
#         # Stop after print_count scenes have been displayed for this tile
#         new_skip_count += 1
#         if new_skip_count >= print_count:
#             break

#     # if new_skip_scenes:
#     #     for scene_id in new_skip_scenes:
#     #         print(scene_id)
#     # Count this tile only if at least one scene was displayed
#     if new_skip_count:
#         wrs2_i += 1

# print('\nDone')

In [None]:
# ### Print scenes with low pixel count ratios (few unmasked pixels)
# # NOTE: every line of this cell is commented out, so nothing here runs as written.
# # When enabled, the cell reads the per-tile/per-year stats CSVs, keeps the scenes with a high
# # masked-pixel ratio and fewer than count_threshold unmasked pixels, and displays thumbnails
# # for those scenes together with the scenes in the adjacent WRS2 rows.
# # It uses names that are not defined in this cell: stats_ws, wrs2_list, wrs2_skip_list,
# # ocean_wrs2_list, rgb_bands, land_mask, fmask, fmask_max and fmask_palette.

# # Percent bounds applied to CLOUD_COUNT_RATIO (divided by 100 below) and the
# # upper bound on UNMASKED_PIXELS
# count_threshold_pct_min = 70
# count_threshold_pct_max = 100
# count_threshold = 1000000

# start_year = 1984
# end_year = 2025
# # NOTE(review): 'years' is not referenced again in this cell (the loop below rebuilds the range)
# years = list(range(start_year, end_year + 1))

# # print_count caps the number of scenes displayed per tile; image_size is used for the
# # thumbnail 'dimensions' and the ipyplot image width
# print_count = 10
# # image_size = 700
# # image_size = 900
# # image_size = 1024
# image_size = 1400

# # Read in the scene skip list
# scene_skip_url = '../v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_df = pd.read_csv(scene_skip_url)
# scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# # Read in the cloudscore skip list
# scene_cloudscore_url = '../v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# # Read every existing {wrs2_tile}_{year}.csv under stats_ws/{year}/ and collect the dataframes
# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2_tile in wrs2_list:
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
#         except Exception as e:
#             print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         # DATE is taken from chars 12-20 of SCENE_ID; WRS2 is rebuilt as 'p' + chars 5-8 + 'r' + chars 8-11
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# # NOTE(review): pd.concat raises if stats_df_list is empty (no CSVs were found or readable)
# stats_df = pd.concat(stats_df_list)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)
    
# # Skip the Landsat 7 scenes from 2022 onward (the year is compared as a string)
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]

# # Compute the ratios
# # NOTE(review): WATER_COUNT_RATIO is not referenced again in this cell
# # stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['SHADOW_COUNT_RATIO'] = stats_df['SHADOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['WATER_COUNT_RATIO'] = stats_df['WATER_PIXELS'] / stats_df['TOTAL_PIXELS']
# # MASKED_PIXELS sums the cloud, cirrus, dilate, shadow, snow, water and ACCA counts
# # NOTE(review): if these categories can overlap, the ratio below could exceed 1 - confirm
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS'] + stats_df['CIRRUS_PIXELS'] + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     + stats_df['WATER_PIXELS']
#     + stats_df['ACCA_PIXELS']
#     # + stats_df['SATURATED_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# # Print the number of rows (scenes) in the combined dataframe
# print(f'  {len(stats_df.count(axis=1))}')

# # List the tiles ordered by the number of scenes in stats_df (most first);
# # the loop below then visits them in random order
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# # wrs2_tiles = ['']

# # new_skip_scenes = []
# # new_skip_count = 0

# # Counts tiles that displayed at least one scene (the early-exit check in the loop is commented out)
# wrs2_i = 0

# # for wrs2 in reversed(wrs2_tiles):
# # for wrs2 in sorted(wrs2_tiles):
# # for wrs2 in reversed(sorted(wrs2_tiles)):
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     # if wrs2_i > 20:
#     #     break
#     if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#         continue
#     # if wrs2 in ['p046r033', 'p047r031']:
#     # if wrs2 in ['p011r030', 'p011r031', 'p012r032', 'p015r040', 'p021r040', 'p024r040', 'p041r037']:
#     #     continue
#     # Skip the tiles listed in ocean_wrs2_list
#     if ocean_wrs2_list and (wrs2 in ocean_wrs2_list):
#         continue

#     # Path/row strings as they appear inside SCENE_ID, for this tile and the rows above/below it
#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
    
#     # Applying skip list here so that main stats DF has all scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]

#     # # Only look at Landsat 8 and 9 for this test
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SCENE_ID'].str.slice(0,4).isin(['LC08', 'LC09'])]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'] >= '19841201']

#     # Filter on the CLOUD_COVER_LAND property
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] >= 60]
#     # wrs2_stats_df.sort_values('CLOUD_COVER_LAND', ascending=False, inplace=True)

#     # Filter on the overall cloud count ratio
#     # NOTE(review): this sort is superseded by the UNMASKED_PIXELS sort a few lines below
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
#     wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)

#     # Filter on the unmasked pixel count and review the scenes with the fewest unmasked pixels first
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['UNMASKED_PIXELS'] < count_threshold]
#     wrs2_stats_df.sort_values('UNMASKED_PIXELS', ascending=True, inplace=True)

#     # Skip tiles with no remaining scenes; otherwise print the tile and its scene count
#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     # Scenes displayed for this tile (IDs and count)
#     new_skip_scenes = []
#     new_skip_count = 0
    
#     # for i, row in wrs2_stats_df.sample(n=min(10, len(wrs2_stats_df.index))).iterrows():
#     for i, row in wrs2_stats_df.iterrows():

#         scene_id = row["SCENE_ID"].upper()

#         # Build the scene ID for the row above by swapping the path/row substring, then look up
#         # its CLOUD_COVER_LAND in the full stats dataframe (None if it is not present)
#         above_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_above)
#         above_stats_df = stats_df.loc[stats_df['SCENE_ID'] == above_scene_id]
#         if len(above_stats_df):
#             above_cloud_pct = float(above_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             above_cloud_pct = None
            
#         # Same lookup for the row below
#         below_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_below)
#         below_stats_df = stats_df.loc[stats_df['SCENE_ID'] == below_scene_id]
#         if len(below_stats_df):
#             below_cloud_pct = float(below_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             below_cloud_pct = None

#         # # Only show scenes that have above & below both skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   
    
#         # # Only show scenes that have either the above or below scene skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) and 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue  

#         # The collection ID is built from the SCENE_ID prefix
#         # NOTE(review): 0.0000275 and -0.2 look like the Landsat Collection 2 Level-2 surface
#         # reflectance scale and offset - confirm
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         # Pixels where the unmasked land_mask equals 0 are set to a constant 0.25
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Print the scene ID, pixel counts, ratios, CLOUD_COVER_LAND and the SR band averages
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
    
#         # Show the images above and below the target wrs2
#         # NOTE(review): the bare except below turns any failure while building the thumbnail URL
#         # (presumably a neighbor scene that does not exist) into a None URL
#         above_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{above_scene_id}')
#         above_region = above_img.geometry().bounds(1, 'EPSG:4326')
#         above_sr_img = above_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             above_url = (
#                 above_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(above_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': above_region, 'dimensions': image_size})
#             )
#         except:
#             above_url = None
            
#         below_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{below_scene_id}')
#         below_region = below_img.geometry().bounds(1, 'EPSG:4326')
#         below_sr_img = below_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             below_url = (
#                 below_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(below_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': below_region, 'dimensions': image_size})
#             )
#         except:
#             below_url = None
    
#         # Label neighbors that are already in the skip list
#         above_skipped = f' (skipped)' if above_scene_id in scene_skip_list else ''   
#         below_skipped = f' (skipped)' if below_scene_id in scene_skip_list else ''
        
#         # Show whichever neighbor thumbnails were built (below first, then above)
#         if above_url and below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([below_url, above_url], img_width=image_size)
#         elif above_url:
#             print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([above_url], img_width=image_size)
#         elif below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
#             ipyplot.plot_images([below_url], img_width=image_size)
    
#         # Record the displayed scene and stop after print_count scenes for this tile
#         new_skip_scenes.append(scene_id)
#         new_skip_count += 1
#         if new_skip_count >= print_count:
#             break

#     # Print the IDs of the scenes that were displayed for this tile
#     if new_skip_scenes:
#         wrs2_i += 1
#         for scene_id in new_skip_scenes:
#             print(scene_id)
        
# # if new_skip_scenes:
# #     for scene_id in new_skip_scenes:
# #         print(scene_id)

# print('\nDone')


In [None]:
# ### Print scenes with high numbers of unmasked ACCA cloudscore pixels
# count_threshold_pct_min = 0.1
# count_threshold_pct_max = 100

# #count_threshold_min = 1000000
# #count_threshold_max = 2000000
# count_threshold_min = 10000000
# count_threshold_max = 100000000

# #image_size = 900
# #image_size = 1024
# image_size = 1400
# print_count = 10

# start_year = 1985
# #start_year = 2003
# #start_year = 2015
# start_year = 2024
# end_year = 2025
# years = list(range(start_year, end_year + 1))

# red_band = 'SR_RED'
# green_band = 'SR_GREEN'
# blue_band = 'SR_BLUE'

# # Read in the scene skip list
# scene_skip_url = '/Users/Charles.Morton@dri.edu/Projects/scene-skip-list/v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_df = pd.read_csv(scene_skip_url)
# scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# scene_cloudscore_url = '/Users/Charles.Morton@dri.edu/Projects/scene-skip-list/v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')

# # scene_keep_list = []

# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2 in wrs2_list:
#     # if int(wrs2[1:4]) not in range(10, 25):
#     #     continue
    
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path)
#         except Exception as e:
#             print(f'  {wrs2}_{year} - Error reading CSV, skipping')
#             os.remove(wrs2_stats_path)
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         wrs2_stats_df.drop(columns=['system:index', '.geo'], inplace=True)
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# stats_df = pd.concat(stats_df_list)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# # Drop the Landsat 7 (LE07) scenes dated 2022 or later
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]

# # Compute the cloud count ratios
# stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['SATURATED_COUNT_RATIO'] = stats_df['SATURATED_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['SNOW_COUNT_RATIO'] = (stats_df['SNOW_PIXELS'] + stats_df['SHADOW_PIXELS'] + stats_df['WATER_PIXELS'] + stats_df['DILATE_PIXELS']) / stats_df['TOTAL_PIXELS']
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS']
#     + stats_df['CIRRUS_PIXELS']
#     + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     # + stats_df['WATER_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# print(f'  {len(stats_df.count(axis=1))}')
# print('  Done\n')


# # List the tiles, ordered by the number of scenes each has in the stats dataframe
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)


# # for wrs2 in sorted(wrs2_tiles):
# for wrs2 in reversed(sorted(wrs2_tiles)):
#     if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#         continue
#     # if california_wrs2_list and (wrs2 not in california_wrs2_list):
#     #     continue

#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
#     # Applying skip list here so that main stats DF has all scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]

#     # Filter on the ACCA pixel count (exclusive min/max thresholds)
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_PIXELS'] > count_threshold_min]
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_PIXELS'] < count_threshold_max]
#     #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
#     #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
#     #wrs2_stats_df.sort_values('ACCA_COUNT_RATIO', ascending=False, inplace=True)

#     # # Filter on the saturated pixel count and saturated count ratio
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SATURATED_PIXELS'] > count_threshold]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SATURATED_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SATURATED_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
#     # wrs2_stats_df.sort_values('SATURATED_COUNT_RATIO', ascending=False, inplace=True)
    
#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         # print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():
        
#         scene_id = row["SCENE_ID"].upper()

#         # # Only review scenes that have the image above and below in the skip list
#         # if above_scene_id not in scene_skip_list:
#         #     continue
#         # if below_scene_id not in scene_skip_list:
#         #     continue
        
#         above_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_above)
#         above_stats_df = stats_df.loc[stats_df['SCENE_ID'] == above_scene_id]
#         if len(above_stats_df):
#             above_cloud_pct = float(above_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             above_cloud_pct = None
            
#         below_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_below)
#         below_stats_df = stats_df.loc[stats_df['SCENE_ID'] == below_scene_id]
#         if len(below_stats_df):
#             below_cloud_pct = float(below_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             below_cloud_pct = None

#         # # Only show scenes that have above & below both skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   
        
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row[red_band]:0.2f}  {row[green_band]:0.2f}  {row[blue_band]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
    
#         # Show the images above and below the target wrs2
#         above_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{above_scene_id}')
#         above_region = above_img.geometry().bounds(1, 'EPSG:4326')
#         above_sr_img = above_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             above_url = (
#                 above_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(above_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': above_region, 'dimensions': image_size})
#             )
#         except:
#             above_url = None
            
#         below_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{below_scene_id}')
#         below_region = below_img.geometry().bounds(1, 'EPSG:4326')
#         below_sr_img = below_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             below_url = (
#                 below_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(below_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': below_region, 'dimensions': image_size})
#             )
#         except:
#             below_url = None
    
#         above_skipped = f' (skipped)' if above_scene_id in scene_skip_list else ''   
#         below_skipped = f' (skipped)' if below_scene_id in scene_skip_list else ''
        
#         if above_url and below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([below_url, above_url], img_width=image_size)
#         elif above_url:
#             print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([above_url], img_width=image_size)
#         elif below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
#             ipyplot.plot_images([below_url], img_width=image_size)

# print('\nDone')

In [None]:
# ### Print scenes with lots of snowy pixels
# count_threshold_pct_min = 70
# count_threshold_pct_max = 101

# # snow_threshold_pct_min = 80
# # snow_threshold_pct_max = 101
# snow_threshold_pct_min = 70
# snow_threshold_pct_max = 101

# start_year = 1984
# #start_year = 2003
# #start_year = 2015
# #start_year = 2024
# end_year = 2025
# years = list(range(start_year, end_year + 1))

# print_count = 10
# # image_size = 700
# # image_size = 900
# # image_size = 1024
# image_size = 1400

# # Read in the scene skip list
# scene_skip_url = '../v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_df = pd.read_csv(scene_skip_url)
# scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# scene_cloudscore_url = '../v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2_tile in wrs2_list:
#     # if int(wrs2_tile[1:4]) not in range(10, 25):
#     #     continue
        
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
#         except Exception as e:
#             print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# stats_df = pd.concat(stats_df_list)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# # Drop the Landsat 7 (LE07) scenes dated 2022 or later
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]
# # # Filter by date
# # if start_date:
# #     stats_df = stats_df[stats_df['DATE'] >= start_date.replace('-', '')]
# # if end_date:
# #     stats_df = stats_df[stats_df['DATE'] < end_date.replace('-', '')]

# # Compute the ratios
# # stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['SHADOW_COUNT_RATIO'] = stats_df['SHADOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['WATER_COUNT_RATIO'] = stats_df['WATER_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS'] + stats_df['CIRRUS_PIXELS'] + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     + stats_df['WATER_PIXELS']
#     + stats_df['ACCA_PIXELS']
#     # + stats_df['SATURATED_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# print(f'  {len(stats_df.count(axis=1))}')


# # Work through the tiles based on which ones already have the most skipped scenes
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# # wrs2_tiles = ['']

# new_skip_scenes = []
# new_skip_count = 0

# wrs2_i = 0

# # for wrs2 in reversed(sorted(wrs2_tiles)):
# # for wrs2 in sorted(wrs2_tiles):
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     if wrs2_i > 10:
#         break
#     if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#         continue
#     # if int(wrs2[5:8]) >= 30:
#     #     continue

#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
#     # Applying skip list here so that main stats DF has all scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]

#     # # Only check winter scenes
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'].str.slice(4,6).astype(int).isin([11, 12, 1, 2])]

#     # Filter on the snow pixel count ratio
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SNOW_COUNT_RATIO'] < (snow_threshold_pct_max / 100)].copy()
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SNOW_COUNT_RATIO'] >= (snow_threshold_pct_min / 100)].copy()
#     # wrs2_stats_df.sort_values('SNOW_COUNT_RATIO', ascending=False, inplace=True)

#     # Filter on the overall masked pixel (cloud count) ratio
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] < (count_threshold_pct_max / 100)].copy()
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] >= (count_threshold_pct_min / 100)].copy()

#     # # Filter on the CLOUD_COVER_LAND property
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] >= 60]

#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     new_skip_scenes = []
#     new_skip_count = 0
    
#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():

#         scene_id = row["SCENE_ID"].upper()

#         above_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_above)
#         above_stats_df = stats_df.loc[stats_df['SCENE_ID'] == above_scene_id]
#         if len(above_stats_df):
#             above_cloud_pct = float(above_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             above_cloud_pct = None
            
#         below_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_below)
#         below_stats_df = stats_df.loc[stats_df['SCENE_ID'] == below_scene_id]
#         if len(below_stats_df):
#             below_cloud_pct = float(below_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             below_cloud_pct = None

#         # # Only show scenes that have above & below both skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   

#         # # Only show scenes that have either above & below skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) and 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   
            
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
#         # Show the images above and below the target wrs2
#         above_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{above_scene_id}')
#         above_region = above_img.geometry().bounds(1, 'EPSG:4326')
#         above_sr_img = above_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             above_url = (
#                 above_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(above_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': above_region, 'dimensions': image_size})
#             )
#         except:
#             above_url = None
            
#         below_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{below_scene_id}')
#         below_region = below_img.geometry().bounds(1, 'EPSG:4326')
#         below_sr_img = below_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             below_url = (
#                 below_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(below_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': below_region, 'dimensions': image_size})
#             )
#         except:
#             below_url = None
    
#         above_skipped = f' (skipped)' if above_scene_id in scene_skip_list else ''   
#         below_skipped = f' (skipped)' if below_scene_id in scene_skip_list else ''
        
#         if above_url and below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([below_url, above_url], img_width=image_size)
#         elif above_url:
#             print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([above_url], img_width=image_size)
#         elif below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
#             ipyplot.plot_images([below_url], img_width=image_size)
    
#         new_skip_scenes.append(scene_id)
#         new_skip_count += 1
#         if new_skip_count >= print_count:
#             break

#     if new_skip_scenes:
#         for scene_id in new_skip_scenes:
#             print(scene_id)
#     if new_skip_count:
#         wrs2_i += 1

# # print('\nNew Skip Scenes')
# # if new_skip_scenes:
# #     for scene_id in new_skip_scenes:
# #         print(scene_id)

# print('\nDone')


In [None]:
# ### Identify scenes with lots of ACCA pixels that aren't flagged as missing
# # acca_threshold_pct_min = 10
# # acca_threshold_pct_max = 101

# #acca_threshold_count_min = 10000000
# #acca_threshold_count_min = 5000000
# #acca_threshold_count_min = 2000000
# #acca_threshold_count_min = 1500000
# acca_threshold_count_min = 1000000

# cloud_threshold_pct_min = 0
# cloud_threshold_pct_max = 101

# start_year = 1984
# #start_year = 2015
# #start_year = 2024
# end_year = 2025
# years = list(range(start_year, end_year + 1))

# print_count = 10
# image_size = 1400

# # Read in the scene skip list
# scene_skip_url = '../v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_df = pd.read_csv(scene_skip_url)
# scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# scene_cloudscore_url = '../v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2_tile in wrs2_list:
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
#         except Exception as e:
#             print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# stats_df = pd.concat(stats_df_list)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# # Drop the Landsat 7 (LE07) scenes dated 2022 or later
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]

# # Compute the ratios
# stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS'] + stats_df['CIRRUS_PIXELS'] + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     + stats_df['WATER_PIXELS']
#     + stats_df['ACCA_PIXELS']
#     # + stats_df['SATURATED_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# print(f'  {len(stats_df.count(axis=1))}')

# # Work through the tiles based on which ones already have the most skipped scenes
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)
# # wrs2_tiles = ['']

# new_skip_scenes = []
# new_skip_count = 0

# wrs2_i = 0

# # for wrs2 in reversed(sorted(wrs2_tiles)):
# # for wrs2 in sorted(wrs2_tiles):
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     if wrs2_i > 20:
#         break
#     if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#         continue
#     if wrs2 in ['p039r032', 'p039r031']:
#         continue

#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
    
#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
    
#     # Check for scenes in the skip list that should be flagged as missing
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
    
#     # # Check for scenes not in the skip list that should be flagged as missing (and added to the skip list)
#     # wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]

#     # Skip all of the cloud score scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]

#     # Only check scenes that aren't flagged as missing
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_df[scene_skip_df['REASON'].str.contains('Missing')]['SCENE_ID'].values)].copy()

#     # Filter on the ACCA pixel count
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_PIXELS'] > (acca_threshold_count_min)].copy()
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['ACCA_PIXELS'] < (acca_threshold_count_max)].copy()

#     # Filter on the overall masked pixel (cloud count) ratio
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] < (cloud_threshold_pct_max / 100)].copy()
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] >= (cloud_threshold_pct_min / 100)].copy()
#     #wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)

#     # # Filter on the CLOUD_COVER_LAND property
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] >= 60]

#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     new_skip_scenes = []
#     new_skip_count = 0
    
#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():
#         scene_id = row["SCENE_ID"].upper()
            
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
#         new_skip_scenes.append(scene_id)
#         new_skip_count += 1
#         if new_skip_count >= print_count:
#             break

#     if new_skip_scenes:
#         for scene_id in new_skip_scenes:
#             print(scene_id)
#     if new_skip_count:
#         wrs2_i += 1

# print('\nDone')


In [None]:
# ### Print scenes that are totally masked
# count_threshold_pct_min = 90
# #count_threshold_pct_min = 95
# #count_threshold_pct_min = 99
# count_threshold_pct_max = 101

# #start_year = 1984
# #start_year = 2003
# #start_year = 2015
# #end_year = 2024
# start_year = 2024
# end_year = 2025
# years = list(range(start_year, end_year + 1))

# print_count = 20
# #image_size = 700
# #image_size = 1024
# image_size = 1400

# # Read in the scene skip list
# scene_skip_url = '../v2p1.csv'
# # scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
# scene_skip_df = pd.read_csv(scene_skip_url)
# scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
# print(f'Skip list images: {len(scene_skip_list)}')

# scene_cloudscore_url = '../v2p1_cloudscore.csv'
# # scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
# scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
# print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')


# print('Reading image stats CSV files')
# stats_df_list = []
# for wrs2_tile in wrs2_list:
#     # if int(wrs2_tile[1:4]) not in range(10, 25):
#     #     continue
        
#     for year in range(start_year, end_year + 1):
#         wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
#         if not os.path.isfile(wrs2_stats_path):
#             # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
#             continue
#         try:
#             wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
#         except Exception as e:
#             print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
#             continue
#         if wrs2_stats_df.empty:
#             continue
#         wrs2_stats_df['DATE'] = wrs2_stats_df['SCENE_ID'].str.slice(12, 20)
#         wrs2_stats_df['WRS2'] = 'p' + wrs2_stats_df['SCENE_ID'].str.slice(5, 8) + 'r' + wrs2_stats_df['SCENE_ID'].str.slice(8, 11)
#         stats_df_list.append(wrs2_stats_df)

# stats_df = pd.concat(stats_df_list)

# # skip_stats_df = stats_df[stats_df['SCENE_ID'].isin(scene_skip_list)]
# # wrs2_tiles = list(skip_stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)

# # Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
# scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# # Drop the Landsat 7 (LE07) scenes dated 2022 or later
# l7_2022_mask = (
#     (stats_df['DATE'].str.slice(0,4) >= '2022') &
#     (stats_df['SCENE_ID'].str.slice(0,4) == 'LE07')
# )
# stats_df = stats_df[~l7_2022_mask]

# # Compute the ratios
# # stats_df['ACCA_COUNT_RATIO'] = stats_df['ACCA_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['SNOW_COUNT_RATIO'] = stats_df['SNOW_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['WATER_COUNT_RATIO'] = stats_df['WATER_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['MASKED_PIXELS'] = (
#     stats_df['CLOUD_PIXELS'] + stats_df['CIRRUS_PIXELS'] + stats_df['DILATE_PIXELS']
#     + stats_df['SHADOW_PIXELS']
#     + stats_df['SNOW_PIXELS']
#     + stats_df['WATER_PIXELS']
#     + stats_df['ACCA_PIXELS']
#     # + stats_df['SATURATED_PIXELS']
# )
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# # stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# print(f'  {len(stats_df.count(axis=1))}')

# # Work through the tiles based on which ones already have the most skipped scenes
# wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)

# new_skip_scenes = []
# new_skip_count = 0
# wrs2_i = 0

# # for wrs2 in reversed(wrs2_tiles):
# # for wrs2 in reversed(sorted(wrs2_tiles)):
# # for wrs2 in sorted(wrs2_tiles):
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     if wrs2_i >= 10:
#         break
#     if wrs2_skip_list and (wrs2 in wrs2_skip_list):
#         continue
#     # if california_wrs2_list and (wrs2 not in california_wrs2_list) and wrs2 not in ['p042r033']:
#     #     continue
#     # if int(wrs2[5:8]) >= 30:
#     #     continue
    
#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
#     # Applying skip list here so that main stats DF has all scenes
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)]
#     wrs2_stats_df = wrs2_stats_df[~wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)]
    
#     # # Only check scenes from selected months (winter, extended winter, or summer variants below)
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'].str.slice(4,6).astype(int).isin([11, 12, 1, 2, 3])]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'].str.slice(4,6).astype(int).isin([10, 11, 12, 1, 2, 3, 4])]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['DATE'].str.slice(4,6).astype(int).isin([6, 7, 8])]

#     # Filter on the overall cloud count ratio
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] < (count_threshold_pct_max / 100)]
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COUNT_RATIO'] >= (count_threshold_pct_min / 100)]
#     wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)

#     # Filter on the CLOUD_COVER_LAND property (keep scenes below 71)
#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] >= 60]
#     # wrs2_stats_df.sort_values('CLOUD_COVER_LAND', ascending=False, inplace=True)

#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     wrs2_skip_scenes = []
#     wrs2_skip_count = 0
    
#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():

#         scene_id = row["SCENE_ID"].upper()

#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
        
#         new_skip_scenes.append(scene_id)
#         wrs2_skip_scenes.append(scene_id)

#     if wrs2_skip_scenes:
#         wrs2_i += 1
#         for scene_id in wrs2_skip_scenes:
#             print(scene_id)

# if new_skip_scenes:
#     print('')
#     for scene_id in new_skip_scenes:
#         print(scene_id)

# print('\nDone')
