In [None]:
import collections
import datetime
import os
import pprint
import random

import ee
import ipyplot
import numpy as np
import pandas as pd
import openet.core
from sklearn import tree
from sklearn.ensemble import IsolationForest
# from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from scipy import stats
from scipy.optimize import curve_fit

# %matplotlib nbagg
# %matplotlib ipympl
import matplotlib.pyplot as plt
import seaborn as sns
# import jscatter

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ipywidgets import Output, VBox

import pyperclip

#from IPython.display import Image, display
#from ipywidgets import widgets, interactive

# gsutil -m rm "gs://openet_temp/skip_scene_stats/2025/*.csv"
# gsutil -m cp "gs://openet_temp/skip_scene_stats/2025/*.csv" ./stats/2025/


In [None]:
# Connect to Earth Engine through the high volume endpoint
ee.Initialize(project='ee-cmorton', opt_url='https://earthengine-highvolume.googleapis.com')

# Local folder holding the per-year image stats CSV files (created on first run)
stats_ws = os.path.join(os.getcwd(), 'stats')
os.makedirs(stats_ws, exist_ok=True)
    

In [None]:
# WRS2 tiles that are always excluded (filtered out of wrs2_list below)
wrs2_skip_list = [
    'p010r030', 
]

# Tile IDs ('wrs2_tile' property) of the custom WRS2 features that intersect
#   the bounding box and are not in the skip list, sorted in descending order
# Note, this makes a blocking getInfo() request to Earth Engine
wrs2_list = sorted(
    ee.FeatureCollection('projects/openet/assets/features/wrs2/custom')
    .filterBounds(ee.Geometry.BBox(-124, 26, -67.9, 50))
    .filter(ee.Filter.inList('wrs2_tile', wrs2_skip_list).Not())
    .aggregate_histogram('wrs2_tile').keys().getInfo(),
    reverse=True
)
# print(len(wrs2_list))


# NOTE(review): presumably the tiles that are mostly ocean - this list is not
#   referenced in the cells visible here, confirm how it is used
ocean_wrs2_list = [
    'p048r027', 'p047r031', 'p047r030', 'p047r029', 'p046r033', 
    'p045r034', 'p044r035', 'p043r036', 'p041r037', 'p040r038', 
    'p038r041', 'p038r040',
    'p025r040', 'p024r040', 'p024r027', 'p023r040', 
    'p023r027', 'p022r040', 'p021r040', 'p020r029',
    'p017r041', 'p016r038', 'p015r040', 'p015r037', 
    'p013r033', 'p012r032', 'p011r031', 'p011r030', 
]

# NOTE(review): presumably the tiles covering California - this list is not
#   referenced in the cells visible here, confirm how it is used
california_wrs2_list = [
    'p038r036', 'p038r037', 
    'p039r035', 'p039r036', 'p039r037',
    'p040r034', 'p040r035', 'p040r036', 'p040r037',
    'p041r034', 'p041r035', 'p041r036', 'p041r037',
    'p042r033', 'p042r034', 'p042r035', 'p042r036',
    'p043r031', 'p043r032', 'p043r033', 'p043r034', 'p043r035',
    'p044r031', 'p044r032', 'p044r033', 'p044r034',
    'p045r031', 'p045r032', 'p045r033',
    'p046r031', 'p046r032', 'p047r031',
]


In [None]:
# Land mask is the inverse (.Not()) of the OpenET water mask asset
land_mask = ee.Image('projects/openet/assets/features/water_mask').Not()
# Apply the NLCD/NALCMS water mask (anywhere it is water, set the land mask to 0)
# Pixels with no NALCMS value are filled with class 18 by unmask(18),
#   so they are also set to 0 in the land mask
land_mask = land_mask.where(ee.Image("USGS/NLCD_RELEASES/2020_REL/NALCMS").unmask(18).eq(18), 0)
# land_mask = land_mask.And(ee.Image("USGS/NLCD_RELEASES/2020_REL/NALCMS").unmask(18).neq(18))
# # land_mask = ee.Image('projects/openet/assets/meteorology/conus404/ancillary/land_mask')

# ET fraction image collection ID and band name
# (the commented out IDs are alternate collections)
# etf_coll_id = 'projects/openet/assets/ssebop/conus/gridmet/landsat/c02'
etf_coll_id = 'projects/usgs-gee-nhm-ssebop/assets/ssebop/landsat/c02'
# etf_coll_id = 'projects/openet/assets/intercomparison/ssebop/landsat/c02/v0p2p6'
band_name = 'et_fraction'

# Band names for the true color (RGB) composite, keyed by the Landsat
#   collection prefix (the first component of the scene ID)
rgb_bands = {
    'LT04': ['SR_B3', 'SR_B2', 'SR_B1'],
    'LT05': ['SR_B3', 'SR_B2', 'SR_B1'],
    'LE07': ['SR_B3', 'SR_B2', 'SR_B1'],
    'LC08': ['SR_B4', 'SR_B3', 'SR_B2'],
    'LC09': ['SR_B4', 'SR_B3', 'SR_B2'],
}

# Palette for visualizing the fmask() class image (one color per class value, 0 through fmask_max)
# 0 - white, 1 - no fill (green), 2 - shadow (dark blue), 3 - snow (light blue), 4 - cloud (light gray), 5 - water (purple), 6 - ocean mask
fmask_palette = "ffffff, 9effa1, blue, 00aff2, dddddd, purple, bfbfbf"
fmask_max = 6


In [None]:
# # Intercomparison sites and dates

# sites_csv = '/Users/Charles.Morton@dri.edu/Projects/openet-tools/intercomparison/master_flux_station_list.csv'
# sites_df = pd.read_csv(sites_csv)

# interp_days = 32
# site_keep_list = []
# wrs2_delimiter = ';'

# # Hardcoding the sites CSV field names for now
# start_field = 'START_DATE'
# end_field = 'END_DATE'
# site_field = 'SITE_ID'
# lat_field = 'LATITUDE'
# lon_field = 'LONGITUDE'
# wrs2_field = 'WRS2_TILES'

# from dateutil.relativedelta import relativedelta

# # Group the date ranges by WRS2 tile
# # print(f'\nGrouping overlapping dates')
# wrs2_dates = collections.defaultdict(list)
# wrs2_sites = collections.defaultdict(list)
# for (site_i, site) in sites_df.iterrows():
#     # print(site_i)
#     # print(site)
#     if site['RANDOM_SELECTION'] not in [0, 1]:
#         # print('  Unsupported RANDOM_SELECTION value')
#         input('ENTER')
#         continue
#     # if site['RANDOM_SELECTION'] != 1:
#     #     continue
#     if site_keep_list and site.loc[site_field] not in site_keep_list:
#         # print('  Site not in keep list - skipping')
#         continue

#     # Include all sites in INI file, even those outside the date range
#     for wrs2 in site.loc[wrs2_field].split(wrs2_delimiter):
#         wrs2_sites[wrs2.strip()].append([
#             round(site.loc[lon_field], 6), round(site.loc[lat_field], 6)
#         ])

#     start_dt = datetime.datetime.strptime(site.loc[start_field], '%Y-%m-%d')
#     end_dt = datetime.datetime.strptime(site.loc[end_field], '%Y-%m-%d')
#     # print(f'  Start Date: {start_dt.strftime("%Y-%m-%d")}')
#     # print(f'  End Date:   {end_dt.strftime("%Y-%m-%d")}')

#     # If start/end dates are within N gap days of the start/end of the month
#     #   consider it a "full" month
#     gap_days = 5
#     # print('  Snapping start date to month')
#     month_start_dt = datetime.datetime(start_dt.year, start_dt.month, 1)
#     if (start_dt - month_start_dt).days <= gap_days:
#         # print('    full month')
#         start_dt = month_start_dt
#     else:
#         # print('    not full month')
#         start_dt = month_start_dt + relativedelta(months=1)

#     # print('  Snapping end date to month')
#     month_end_dt = end_dt + relativedelta(months=1)
#     month_end_dt = datetime.datetime(month_end_dt.year, month_end_dt.month, 1)
#     month_end_dt = month_end_dt - relativedelta(days=1)
#     if (month_end_dt - end_dt).days <= gap_days:
#         # print('    full month')
#         end_dt = month_end_dt
#     else:
#         # print('    not full month')
#         end_dt = (month_end_dt + relativedelta(days=1) -
#                   relativedelta(months=1) - relativedelta(days=1))
#     # print(f'  Start Date: {start_dt.strftime("%Y-%m-%d")}')
#     # print(f'  End Date:   {end_dt.strftime("%Y-%m-%d")}')

#     if interp_days > 0:
#         # Buffer the date ranges by the interpolate days value if set
#         # print('  Buffering start/end dates')
#         start_dt = start_dt - datetime.timedelta(days=interp_days)
#         end_dt = end_dt + datetime.timedelta(days=interp_days)
#         # print(f'  Start Date: {start_dt.strftime("%Y-%m-%d")}')
#         # print(f'  End Date:   {end_dt.strftime("%Y-%m-%d")}')

#     # CM - Changing conditionals to get single date ranges to work
#     # if end_dt <= start_dt or start_dt >= end_dt:
#     if end_dt < start_dt or start_dt > end_dt:
#         # print(f'  Start: {start_dt.strftime("%Y-%m-%d")}')
#         # print(f'  End:   {end_dt.strftime("%Y-%m-%d")}')
#         # print('  Date range outside min/max, skipping')
#         continue
#     else:
#         # print(f'  Start: {start_dt.strftime("%Y-%m-%d")}')
#         # print(f'  End:   {end_dt.strftime("%Y-%m-%d")}')
#         pass

#     for wrs2 in site.loc[wrs2_field].split(wrs2_delimiter):
#         wrs2_dates[wrs2.strip()].append([start_dt, end_dt])

# # pprint.pprint(wrs2_dates)

# # Merge the date ranges that overlap
# print(f'\nMerging overlapping dates')
# merged_dates = {}
# for wrs2, dates in sorted(wrs2_dates.items()):
#     # print(f'  {wrs2}')
#     # pprint.pprint(sorted(dates))

#     # Push the first interval on to the stack
#     merged_dates[wrs2] = [sorted(dates)[0]]

#     # Only check for overlapping ranges if there is more than 1 range
#     if len(dates) == 1:
#         continue

#     for d in sorted(dates)[1:]:
#         # If the current date range doesn't overlap, add it to the stack
#         if d[0] > merged_dates[wrs2][-1][1]:
#             merged_dates[wrs2].append(d)
#         # If the ranges overlap and the end date is later,
#         #   update the end time of the stack value
#         elif ((d[0] <= merged_dates[wrs2][-1][1]) and
#               (d[1] > merged_dates[wrs2][-1][1])):
#             merged_dates[wrs2][-1][1] = d[1]

# # pprint.pprint(merged_dates)

# # # CGM - Splitting by year for DisALEXI is not needed if the NLCD
# # #   is set to the image collection instead of the image
# # # For DisALEXI split the date ranges by year after merging
# # # For other models, index by the first year in the range
# # # This may be functionality we will want for other models later
# # year_dates = collections.defaultdict(dict)
# # # if model in ['DISALEXI_TAIR_10K', 'DISALEXI_TAIR_1K', 'DISALEXI', 'DISALEXI_TAIR_DIRECT']:
# # #     for wrs2, dates in merged_dates.items():
# # #         # split_dates = {}
# # #         for date_i, date in enumerate(dates):
# # #             for year in range(date[0].year, date[1].year+1):
# # #                 year_date = [
# # #                     max(date[0], datetime.datetime(year, 1, 1)),
# # #                     min(date[1], datetime.datetime(year, 12, 31)),
# # #                 ]
# # #                 try:
# # #                     year_dates[wrs2][year].append(year_date)
# # #                 except:
# # #                     year_dates[wrs2][year] = [year_date]
# # # else:
# # for wrs2, dates in merged_dates.items():
# #     year_dates[wrs2][dates[0][0].year] = dates

# # pprint.pprint(year_dates)


In [None]:
def fmask(landsat_img):
    """Build an old "Fmask" style class image from the Landsat QA_PIXEL band

    Parameters
    ----------
    landsat_img : ee.Image
        Landsat image with the QA_PIXEL and SR_B4 bands.

    Returns
    -------
    ee.Image
        Single band image named "fmask" with the class values listed below.
        Pixels that are not assigned to any class (value 0) are masked.

    Notes
    -----
    The classes are assigned in the following order, so a later class
    overwrites an earlier one where they overlap:
        1 - SR_B4 is masked
        5 - water
        2 - shadow
        3 - snow
        4 - cloud, dilated cloud, or cirrus

    Only the single bit QA_PIXEL flags used above are extracted.
    The other QA_PIXEL fields (bit 0 fill, bit 6 clear, bits 8-15 cloud,
    shadow, snow, and cirrus confidence) and the QA_RADSAT saturation flags
    are not used in the class image.

    """
    qa_img = landsat_img.select('QA_PIXEL')

    def qa_flag(bit):
        # True where the single bit flag at the given bit position is set
        return qa_img.rightShift(bit).bitwiseAnd(1).neq(0)

    dilate_mask = qa_flag(1)
    cirrus_mask = qa_flag(2)
    cloud_mask = qa_flag(3)
    shadow_mask = qa_flag(4)
    snow_mask = qa_flag(5)
    water_mask = qa_flag(7)

    # Pixels where the SR_B4 band has no valid data
    nodata_mask = landsat_img.select(['SR_B4']).mask().eq(0)

    # Start from an all zero image and paint the classes on in priority order
    fmask_img = (
        qa_img.multiply(0)
        .where(nodata_mask, 1)
        .where(water_mask, 5)
        .where(shadow_mask, 2)
        .where(snow_mask, 3)
        .where(cloud_mask.Or(dilate_mask).Or(cirrus_mask), 4)
    )

    # Mask the unclassified pixels so only the flagged classes are drawn
    return fmask_img.updateMask(fmask_img.neq(0)).rename(['fmask'])


In [None]:
# Clean up the scene skip list file
# Reads the skip list CSV, drops the comment and empty lines, sorts the
#   remaining lines by date then tile, reports suspect lines, and writes
#   the result to a "_sorted.csv" file next to the input
skip_path = '../v2p1.csv'
#skip_path = '../v2p1_eemetric_error.csv'
print(f'\n{skip_path}')

with open(skip_path, 'r') as csv_f:
    scene_skip_lines = csv_f.readlines()
# The first line is the header and is written back unchanged
scene_skip_header = scene_skip_lines.pop(0)

# Drop the comments and empty lines
# Strip the lines before checking for the comment character so that
#   indented comment lines are dropped also
scene_skip_lines = [line.strip() for line in scene_skip_lines]
scene_skip_lines = [line for line in scene_skip_lines if line and not line.startswith('#')]


def _scene_sort_key(line):
    """Return the sort key for a skip list line

    The key is built from the last two underscore separated components
    of the scene ID (the first CSV field), last component first.
    """
    scene_id_parts = line.split(',')[0].split('_')
    return scene_id_parts[-1] + '_' + scene_id_parts[-2]


# Sort by date then by tile
scene_skip_lines = sorted(scene_skip_lines, key=_scene_sort_key)

# Identify duplicate scene IDs (as opposed to duplicate lines)
# Note, this block is not removing any lines, just printing
print('Duplicate Scene IDs:')
scene_id_counts = collections.Counter(line.split(',')[0] for line in scene_skip_lines)
for scene_id, count in scene_id_counts.items():
    if count > 1:
        print(scene_id)

# Identify lines with no reason
# Lines with more than two fields are printed also since the reason can't be parsed cleanly
print('\nMissing reason Scene IDs:')
for line in scene_skip_lines:
    fields = line.split(',')
    if len(fields) != 2 or fields[1].strip() == '':
        print(line)

# # Identify duplicate lines (not duplicate SCENE IDs)
# if len({line for line in scene_skip_lines}) != len(scene_skip_lines):
#     print('Duplicate Lines:')
#     for item, count in collections.Counter(scene_skip_lines).items():
#         if count > 1:
#             print(item)
# 
#     # # Uncomment to have the tool remove duplicate lines
#     # scene_remove_lines = []
#     # for item, count in collections.Counter(scene_skip_lines).items():
#     #     if count > 1:
#     #         scene_remove_lines.append(item)
#     #         # print(item)
#      
#     # # Does this only remove the first one?
#     # if scene_remove_lines:
#     #     print(f'Removing {len(scene_remove_lines)} duplicate lines in file')
#     #     for line in scene_remove_lines:
#     #         print(line)
#     #         scene_skip_lines.remove(line)
# 
# # Then recheck for duplicate SCENE_IDs (but different notes or dates)
# scenes = {line.split(',')[0] for line in scene_skip_lines}           
# if len(scenes) != len(scene_skip_lines):
#     print('Duplicate scene IDs still in file')

print('\nWriting updated scene skip list CSV')
with open(skip_path.replace('.csv', '_sorted.csv'), 'w') as csv_f:
    csv_f.write(scene_skip_header)
    csv_f.writelines(f'{line}\n' for line in scene_skip_lines)

print('\nDone')

In [None]:
# Build the EEMETRIC skip list by merging the EEMETRIC error list and the full skip list
# The error list is written first (including its header line), followed by a
#   newline and then the sorted skip list without its header line
scene_skip_path = '../v2p1_sorted.csv'
eemetric_error_path = '../v2p1_eemetric_error.csv'
eemetric_skip_path = '../v2p1_eemetric.csv'

with open(eemetric_error_path, 'r') as csv_f:
    eemetric_skip_lines = csv_f.readlines()

with open(scene_skip_path, 'r') as csv_f:
    scene_skip_lines = csv_f.readlines()

print('\nWriting eemetric scene skip list CSV')
with open(eemetric_skip_path, 'w') as csv_f:
    csv_f.writelines(eemetric_skip_lines)
    csv_f.write('\n')
    # Skip the header line of the full skip list
    csv_f.writelines(scene_skip_lines[1:])

print('\nDone')

In [None]:
    
#     # for i, row in wrs2_stats_df.iterrows():
#     for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():

#         scene_id = row["SCENE_ID"].upper()

#         above_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_above)
#         above_stats_df = stats_df.loc[stats_df['SCENE_ID'] == above_scene_id]
#         if len(above_stats_df):
#             above_cloud_pct = float(above_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             above_cloud_pct = None
            
#         below_scene_id = scene_id.upper().replace(wrs2_tgt, wrs2_below)
#         below_stats_df = stats_df.loc[stats_df['SCENE_ID'] == below_scene_id]
#         if len(below_stats_df):
#             below_cloud_pct = float(below_stats_df.iloc[0]['CLOUD_COVER_LAND'])
#         else:
#             below_cloud_pct = None

#         # # Only show scenes that have above & below both skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   

#         # # Only show scenes that have either above & below skipped or None
#         # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) and 
#         #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
#         #     continue   
            
#         landsat_type = scene_id.split('_')[0].upper()
#         landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
#         landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
#         landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

#         # Landsat true color image
#         landsat_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
#             .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
#         )
    
#         # Landsat true color with Fmask
#         fmask_url = (
#             landsat_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#             .blend(fmask(landsat_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#             .getThumbURL({'region': landsat_region, 'dimensions': image_size})
#         )
    
#         print('#'*80)
#         print(
#             f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
#             f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
#             f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
#         )
#         ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)
    
#         # Show the images above and below the target wrs2
#         above_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{above_scene_id}')
#         above_region = above_img.geometry().bounds(1, 'EPSG:4326')
#         above_sr_img = above_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             above_url = (
#                 above_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(above_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': above_region, 'dimensions': image_size})
#             )
#         except:
#             above_url = None
            
#         below_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{below_scene_id}')
#         below_region = below_img.geometry().bounds(1, 'EPSG:4326')
#         below_sr_img = below_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
#         try:
#             below_url = (
#                 below_sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
#                 .blend(fmask(below_img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
#                 .getThumbURL({'region': below_region, 'dimensions': image_size})
#             )
#         except:
#             below_url = None
    
#         above_skipped = f' (skipped)' if above_scene_id in scene_skip_list else ''   
#         below_skipped = f' (skipped)' if below_scene_id in scene_skip_list else ''
        
#         if above_url and below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([below_url, above_url], img_width=image_size)
#         elif above_url:
#             print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
#             ipyplot.plot_images([above_url], img_width=image_size)
#         elif below_url:
#             print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
#             ipyplot.plot_images([below_url], img_width=image_size)
    
#         wrs2_skip_scenes.append(scene_id)
#         wrs2_skip_count += 1
#         if wrs2_skip_count >= print_count:
#             break

#     if wrs2_skip_scenes:
#         wrs2_i += 1
#         for scene_id in wrs2_skip_scenes:
#             print(scene_id)
#         new_skip_scenes.extend(wrs2_skip_scenes)

# print('\nNew Skip Scenes')
# if new_skip_scenes:
#     for scene_id in new_skip_scenes:
#         print(scene_id)

# print('\nDone')

In [None]:
# Year range of the stats CSV archive to read (inclusive)
start_year = 1985
#start_year = 2003
#start_year = 2015
#start_year = 2020
#start_year = 2024
#start_year = 2025
end_year = 2025
years = list(range(start_year, end_year + 1))


# Scene skip list (the dataframe is kept for looking up the skip reasons)
scene_skip_url = '../v2p1.csv'
# scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
scene_skip_df = pd.read_csv(scene_skip_url)
scene_skip_list = scene_skip_df['SCENE_ID'].tolist()
print(f'Skip list images: {len(scene_skip_list)}')

# Cloudscore scene skip list
scene_cloudscore_url = '../v2p1_cloudscore.csv'
# scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
scene_cloudscore_df = pd.read_csv(scene_cloudscore_url)
scene_cloudscore_list = scene_cloudscore_df['SCENE_ID'].tolist()
print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')

# Merge the cloudscore images into the skip list (unique scene IDs, sorted)
scene_skip_list = sorted(set(scene_skip_list).union(scene_cloudscore_list))


# Read the per tile/year image stats CSV files into a single dataframe
# Missing, unreadable, and empty files are skipped
print('Reading image stats CSV files')
stats_df_list = []
for wrs2_i, wrs2_tile in enumerate(wrs2_list):
    if wrs2_i % 100 == 0:
        print(f'  {wrs2_i:>3d}')

    for year in range(start_year, end_year + 1):
        wrs2_stats_path = os.path.join(stats_ws, f'{year}', f'{wrs2_tile}_{year}.csv')
        if not os.path.isfile(wrs2_stats_path):
            # print(f'  {wrs2_tile}_{year} - Missing stats CSV, skipping')
            continue
        try:
            wrs2_stats_df = pd.read_csv(wrs2_stats_path, index_col=False)
        except Exception:
            # Best effort read, report the file and keep going
            print(f'  {wrs2_tile}_{year} - Error reading CSV, skipping')
            continue
        if wrs2_stats_df.empty:
            continue

        # # Only keep images that have a cloud cover land below the default threshold
        # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['CLOUD_COVER_LAND'] < 71]

        # Add separate fields for the scene ID components
        # The fixed character positions assume scene IDs like LXXX_PPPRRR_YYYYMMDD
        scene_id_str = wrs2_stats_df['SCENE_ID'].str
        wrs2_stats_df['DATE'] = scene_id_str.slice(12, 16) + '-' + scene_id_str.slice(16, 18) + '-' + scene_id_str.slice(18, 20)
        wrs2_stats_df['YEAR'] = scene_id_str.slice(12, 16).astype(int)
        wrs2_stats_df['MONTH'] = scene_id_str.slice(16, 18).astype(int)
        wrs2_stats_df['LANDSAT'] = scene_id_str.slice(0, 4)
        wrs2_stats_df['WRS2'] = 'p' + scene_id_str.slice(5, 8) + 'r' + scene_id_str.slice(8, 11)
        wrs2_stats_df = wrs2_stats_df.drop(columns=['.geo', 'system:index'])

        # Skip the most recent images since the MORAN stats aren't being computed
        # Copy the filtered rows so that the new columns added below are written
        #   to an independent dataframe (avoids the pandas SettingWithCopyWarning)
        recent_mask = (wrs2_stats_df['YEAR'] >= 2025) & (wrs2_stats_df['MONTH'] >= 6)
        wrs2_stats_df = wrs2_stats_df[~recent_mask].copy()

        # # Remove all of the images with a negative moran value
        # # This will help catch images that are 100% cloud (because of shadow)
        # #   but will also remove any image that we haven't computed MORAN stats for
        # wrs2_stats_df[wrs2_stats_df['MORAN_1K'] <= -0.1]

        # # Overwrite the negative moran and reflectance values
        # # These seem to mainly happen when the image is 100% masked (because of shadow)
        # #   and could probably be combined into a single call based on the pixel count
        # wrs2_stats_df.loc[wrs2_stats_df['MORAN_1K'] < 0, 'MORAN_1K'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['MORAN_2K'] < 0, 'MORAN_2K'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['MORAN_4K'] < 0, 'MORAN_4K'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['MORAN_8K'] < 0, 'MORAN_8K'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['UNMASKED_SR_RED'] < -0.1, 'UNMASKED_SR_RED'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['UNMASKED_SR_GREEN'] < -0.1, 'UNMASKED_SR_GREEN'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['UNMASKED_SR_BLUE'] < -0.1, 'UNMASKED_SR_BLUE'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['UNMASKED_TOA_RED'] < -0.1, 'UNMASKED_TOA_RED'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['UNMASKED_TOA_GREEN'] < -0.1, 'UNMASKED_TOA_GREEN'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['UNMASKED_TOA_BLUE'] < -0.1, 'UNMASKED_TOA_BLUE'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['SR_RED'] < -0.1, 'SR_RED'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['SR_GREEN'] < -0.1, 'SR_GREEN'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['SR_BLUE'] < -0.1, 'SR_BLUE'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['TOA_RED'] < -0.1, 'TOA_RED'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['TOA_GREEN'] < -0.1, 'TOA_GREEN'] = 1
        # wrs2_stats_df.loc[wrs2_stats_df['TOA_BLUE'] < -0.1, 'TOA_BLUE'] = 1

        # Compute the average reflectance values for the RGB bands
        #   and then remove the separate RGB bands
        # Explicit addition is used (instead of .sum()) so that a missing value
        #   in any band makes the average missing also
        wrs2_stats_df['UNMASKED_SR'] = (wrs2_stats_df['UNMASKED_SR_RED'] + wrs2_stats_df['UNMASKED_SR_GREEN'] + wrs2_stats_df['UNMASKED_SR_BLUE']) / 3
        wrs2_stats_df['UNMASKED_TOA'] = (wrs2_stats_df['UNMASKED_TOA_RED'] + wrs2_stats_df['UNMASKED_TOA_GREEN'] + wrs2_stats_df['UNMASKED_TOA_BLUE']) / 3
        wrs2_stats_df['MASKED_SR'] = (wrs2_stats_df['SR_RED'] + wrs2_stats_df['SR_GREEN'] + wrs2_stats_df['SR_BLUE']) / 3
        wrs2_stats_df['MASKED_TOA'] = (wrs2_stats_df['TOA_RED'] + wrs2_stats_df['TOA_GREEN'] + wrs2_stats_df['TOA_BLUE']) / 3
        # The masked SR_RED/SR_GREEN/SR_BLUE columns are intentionally kept
        wrs2_stats_df = wrs2_stats_df.drop(columns=[
            'UNMASKED_SR_RED', 'UNMASKED_SR_GREEN', 'UNMASKED_SR_BLUE',
            'UNMASKED_TOA_RED', 'UNMASKED_TOA_GREEN', 'UNMASKED_TOA_BLUE',
            # 'SR_RED', 'SR_GREEN', 'SR_BLUE',
            'TOA_RED', 'TOA_GREEN', 'TOA_BLUE',
        ])

        # The recent image filter above may have removed all of the rows
        if not wrs2_stats_df.empty:
            stats_df_list.append(wrs2_stats_df)

if not stats_df_list:
    # pd.concat() raises a less helpful "No objects to concatenate" ValueError
    raise ValueError(f'No image stats CSV files were read from {stats_ws} for {start_year}-{end_year}')

stats_df = pd.concat(stats_df_list)

# Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# Drop the Landsat 7 scenes from 2022 onward
# Copy the filtered rows so that the new columns added below are written
#   to an independent dataframe (avoids the pandas SettingWithCopyWarning)
l7_2022_mask = (stats_df['DATE'].str.slice(0,4) >= '2022') & (stats_df['LANDSAT'] == 'LE07')
stats_df = stats_df[~l7_2022_mask].copy()

# Compute the ratios
for pixel_type in ['ACCA', 'SNOW', 'SHADOW', 'WATER', 'SATURATED']:
    stats_df[f'{pixel_type}_COUNT_RATIO'] = stats_df[f'{pixel_type}_PIXELS'] / stats_df['TOTAL_PIXELS']
# Composite masked pixel count (the saturated pixels are not included)
stats_df['MASKED_PIXELS'] = (
    stats_df['CLOUD_PIXELS'] + stats_df['CIRRUS_PIXELS'] + stats_df['DILATE_PIXELS']
    + stats_df['SHADOW_PIXELS']
    + stats_df['SNOW_PIXELS']
    + stats_df['WATER_PIXELS']
    + stats_df['ACCA_PIXELS']
    # + stats_df['SATURATED_PIXELS']
)
stats_df['CLOUD_COUNT_RATIO'] = stats_df['MASKED_PIXELS'] / stats_df['TOTAL_PIXELS']
# stats_df['CLOUD_COUNT_RATIO'] = stats_df['UNMASKED_PIXELS'] / stats_df['TOTAL_PIXELS']

# Only keep images that are less than 100% cloudy based on the composite cloud mask computed above
# This should catch the images that have the shadow mask set over all land pixels
# stats_df = stats_df[stats_df['CLOUD_COUNT_RATIO'] < 1]

# Set the skip reason in this order so that the more rare ones are written last
# (a scene matching more than one reason keeps the last matching reason)
stats_df['SKIPPED'] = 'None'
skip_reason_keywords = ['Cloud', 'Snow', 'Cirrus', 'Smoke', 'Shadow', 'Missing']
skip_ids = {
    # na=False treats rows with a missing REASON as not matching, otherwise
    #   the NaN values in the mask make the boolean indexing raise a ValueError
    keyword: scene_skip_df.loc[scene_skip_df['REASON'].str.contains(keyword, na=False), 'SCENE_ID'].values
    for keyword in skip_reason_keywords
}
skip_ids['Cloudscore'] = scene_cloudscore_list
# 'Weird' and 'Bad' reasons are not currently assigned
for key, values in skip_ids.items():
    stats_df.loc[stats_df['SCENE_ID'].isin(values), 'SKIPPED'] = key

# Number of images (rows) in the stats dataframe
print(f'  {len(stats_df.index)}')

# Order the tiles by the number of scenes in the stats dataframe (most scenes first)
wrs2_tiles = list(stats_df.groupby(['WRS2'])['SCENE_ID'].count().sort_values(ascending=False).index)

print('\nDone')


In [None]:
# Plotting functions
# Skip reasons (values of the SKIPPED column) that are drawn as separate traces in the plots
# NOTE(review): the stats loading cell assigns a 'Cloud' reason that is not listed here
#   (so those scenes get no highlighted trace) and never assigns 'Error' - confirm this is intended
reasons = ['None', 'Snow', 'Cirrus', 'Shadow', 'Error', 'Smoke', 'Missing', 'Cloudscore']

def _plot_skip_scatter(data_df, x, y, reasons, height, width, yaxis_range=None):
    """Build the clickable scatter plot shared by plot_timeseries() and plot_square()

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must have the x and y columns along with SCENE_ID and SKIPPED columns.
    x, y : str
        Column names for the plot axes.
    reasons : list of str
        SKIPPED values to draw as separate traces (values with no rows are skipped).
    height, width : int
        Figure size in pixels.
    yaxis_range : list, optional
        Fixed [min, max] range for the y axis.

    Returns
    -------
    fig : plotly.graph_objects.FigureWidget
    out : ipywidgets.Output
        Widget that shows the scene ID of the last clicked point.

    """
    fig = go.FigureWidget()
    fig.update_layout(height=height, width=width, hovermode='closest')
    if yaxis_range is not None:
        fig.update_layout(yaxis_range=yaxis_range)

    # Fully transparent trace of all the points
    # This is the trace that provides the hover text and click events,
    #   so the clicked point index maps directly to a data_df row position
    fig.add_trace(go.Scatter(
        x=data_df[x], y=data_df[y], text=data_df['SCENE_ID'],
        name='Full', mode='markers', showlegend=False,
        marker=dict(size=10, color='rgba(0, 0, 0, 0)'),
    ))

    # Visible traces, one per skip reason (hover is disabled on these)
    for reason in reasons:
        reason_df = data_df.loc[data_df['SKIPPED'] == reason]
        if len(reason_df.index) == 0:
            continue
        fig.add_trace(go.Scatter(
            x=reason_df[x], y=reason_df[y], text=reason_df['SCENE_ID'],
            name=reason, mode='markers', hoverinfo='skip',
            marker=dict(
                symbol='circle' if reason == 'None' else 'triangle-up',
                size=8,
            ),
        ))

    out = Output()

    @out.capture(clear_output=True)
    def do_click(trace, points, selector):
        # Copy the clicked scene ID to the clipboard and show it in the output widget
        if points.point_inds:
            scene_id = data_df.SCENE_ID.iloc[points.point_inds[0]]
            pyperclip.copy(scene_id)
            print(f'{scene_id}')

    fig.data[0].on_click(do_click)
    return fig, out


def plot_timeseries(data_df, x, y, reasons, height=800, width=1600):
    """Wide clickable scatter plot of y versus x with one trace per skip reason

    The y axis is fixed to [-0.01, 0.31] for the UNMASKED_SR and UNMASKED_TOA
    columns. The rows for each reason are selected from data_df (not from the
    global wrs2_stats_df), so the function works for any dataframe passed in.

    Returns the (FigureWidget, Output) pair, display with VBox([fig, out]).
    """
    yaxis_range = [-0.01, 0.31] if y in ['UNMASKED_SR', 'UNMASKED_TOA'] else None
    return _plot_skip_scatter(data_df, x, y, reasons, height, width, yaxis_range=yaxis_range)


def plot_square(data_df, x, y, reasons, height=800, width=800):
    """Square clickable scatter plot of y versus x with one trace per skip reason

    Returns the (FigureWidget, Output) pair, display with VBox([fig, out]).
    """
    return _plot_skip_scatter(data_df, x, y, reasons, height, width)


# Stats for a single WRS2 tile (copied so it is independent of stats_df)
wrs2_stats_df = stats_df[stats_df['WRS2'] == 'p042r032'].copy()


In [None]:
# NOTE(review): presumably the thumbnail size in pixels - in the cells visible
#   here it is only referenced from commented out code, confirm usage
image_size = 1400
# The tile loop below stops once wrs2_i reaches this count
print_count = 10

# Year range (inclusive)
start_year = 1985
#start_year = 2003
#start_year = 2015
#start_year = 2025
end_year = 2025
years = list(range(start_year, end_year + 1))

# NOTE(review): presumably the stats column checked for outliers - confirm against the loop body
#outlier_band = 'UNMASKED_SR'
outlier_band = 'UNMASKED_TOA'
#outlier_band = 'UNMASKED_LST'  

# NOTE(review): presumably the z-score range used to flag outlier scenes - confirm against the loop body
z_score_min = 5
z_score_max = 100

# NOTE(review): presumably the feature columns for the outlier detection - confirm against the loop body
outlier_cols = [
    #'UNMASKED_SR',
    'UNMASKED_TOA',
    #'UNMASKED_LST',
    # 'MORAN_1K',
    # 'CLOUD_COVER_LAND',
    # 'CLOUD_COUNT_RATIO',
    # 'SNOW_COUNT_RATIO',
    # 'UNMASKED_PIXELS',
    # 'ACCA_PIXELS',
    # 'SATURATED_PIXELS',
    # 'ACCA_COUNT_RATIO',
    # 'SATURATED_COUNT_RATIO',
    'MONTH',
]


# Read in the scene skip list
scene_skip_url = '../v2p1.csv'
# scene_skip_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1.csv'
scene_skip_df = pd.read_csv(scene_skip_url)
scene_skip_list = list(scene_skip_df['SCENE_ID'].values)
print(f'Skip list images: {len(scene_skip_list)}')

scene_cloudscore_url = '../v2p1_cloudscore.csv'
# scene_cloudscore_url = 'https://raw.githubusercontent.com/cgmorton/scene-skip-list/main/v2p1_cloudscore.csv'
scene_cloudscore_list = list(pd.read_csv(scene_cloudscore_url)['SCENE_ID'].values)
print(f'Skip cloudscore images: {len(scene_cloudscore_list)}')

# Add the high CLOUD_COVER_LAND scenes to the skip list but don't remove from the dataframe
scene_skip_list.extend(stats_df[stats_df['CLOUD_COVER_LAND'] >= 71]['SCENE_ID'].values)

# Add the cloudscore images to the skip list
scene_skip_list = set(scene_skip_list + scene_cloudscore_list)


new_skip_scenes = []
new_skip_count = 0

# Length of the annual cycle in days.  365.25 keeps the fitted sine in phase
# with the calendar across leap years (a 365 day period drifts ~1 day every 4 years).
DAYS_PER_YEAR = 365.25

# curve_fit cannot fit more parameters than there are samples (annual_fit has 4)
MIN_FIT_SAMPLES = 4

# CLOUD_COVER_LAND indexed by SCENE_ID (first row wins for duplicate IDs),
# built once so neighbor lookups do not rescan stats_df for every outlier
cloud_cover_by_scene = (
    stats_df.drop_duplicates(subset='SCENE_ID')
    .set_index('SCENE_ID')['CLOUD_COVER_LAND']
)


def annual_fit(x, A, phi, m, C):
    """
    Model function for a time series with an annual frequency and a linear trend.

    Args:
        x (array): Time data in days.
        A (float): Amplitude of the annual cycle.
        phi (float): Phase shift of the sine wave.
        m (float): Slope of the linear trend.
        C (float): Intercept (offset) of the linear trend.
    """
    return A * np.sin(2 * np.pi * x / DAYS_PER_YEAR + phi) + m * x + C


def _neighbor_scene_id(scene_id, tgt_pathrow, neighbor_pathrow):
    """Return scene_id with its path/row field swapped for the neighbor path/row.

    Only the first occurrence is replaced.  The path/row field comes before the
    date in the scene ID, and the same six digits can also occur inside the date
    (e.g. '015030' in '20150301'), which an unrestricted replace would corrupt.
    """
    return scene_id.replace(tgt_pathrow, neighbor_pathrow, 1)


def _scene_cloud_pct(scene_id):
    """Return CLOUD_COVER_LAND for scene_id as a float, or None if it is not in stats_df."""
    cloud_pct = cloud_cover_by_scene.get(scene_id)
    return None if cloud_pct is None else float(cloud_pct)


def _fmask_thumb_url(img, landsat_type, dimensions):
    """Return a thumbnail URL of the true color image blended with its Fmask overlay.

    Args:
        img (ee.Image): Landsat image to render.
        landsat_type (str): Key into rgb_bands selecting the RGB band names.
        dimensions (int): Thumbnail dimensions passed to getThumbURL.
    """
    region = img.geometry().bounds(1, 'EPSG:4326')
    sr_img = img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])
    return (
        # Pixels where the unmasked land_mask is 0 are filled with constant values
        sr_img.where(land_mask.unmask().eq(0), 0.25).visualize(min=0, max=0.3, gamma=1.25)
        .blend(fmask(img).where(land_mask.unmask().eq(0), fmask_max).visualize(bands='fmask', min=0, max=fmask_max, palette=fmask_palette))
        .getThumbURL({'region': region, 'dimensions': dimensions})
    )


def _neighbor_thumb_url(scene_id, landsat_type, dimensions):
    """Return the Fmask thumbnail URL for a neighboring scene, or None if it can't be built."""
    try:
        return _fmask_thumb_url(
            ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}'), landsat_type, dimensions
        )
    except Exception:
        # Best effort: the neighboring scene may not exist in the collection.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        return None


wrs2_i = 0

# for wrs2 in reversed(wrs2_tiles):
# for wrs2 in sorted(wrs2_tiles):
# for wrs2 in reversed(sorted(wrs2_tiles)):
for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
    # Stop once enough tiles with reviewable outliers have been shown
    if wrs2_i >= print_count:
        break
    if wrs2_skip_list and (wrs2 in wrs2_skip_list):
        continue
    # if int(wrs2[1:4]) not in range(45, 48):
    #     continue
    #if int(wrs2[5:8]) not in range(27, 50):
    #    continue
    # if wrs2 != 'p031r026':
    #     continue
    # print(wrs2)

    # Select the tile rows first, then apply all of the row filters at once and
    # copy a single time so later column assignments don't raise SettingWithCopyWarning
    tile_df = stats_df[stats_df['WRS2'] == wrs2]
    keep_mask = (
        # Remove the high CLOUD_COVER_LAND scenes before computing outliers
        (tile_df['CLOUD_COVER_LAND'] < 71)
        # Remove the high CLOUD_COUNT_RATIO scenes before computing outliers
        & (tile_df['CLOUD_COUNT_RATIO'] < 90)
        # Should the known outliers be removed before checking for outliers?
        # Remove the scenes that are already in the skip list before computing outliers
        & ~tile_df['SCENE_ID'].isin(scene_skip_list)
        # curve_fit raises a ValueError on NaN input, so drop rows without a band value
        & tile_df[outlier_band].notna()
    )
    if years:
        keep_mask &= tile_df['YEAR'].isin(years)
    wrs2_stats_df = tile_df[keep_mask].copy()

    if wrs2_stats_df.empty:
        continue
    print(f'{wrs2} - {len(wrs2_stats_df.index)}')
    if len(wrs2_stats_df.index) < MIN_FIT_SAMPLES:
        print(f'  {wrs2} - too few scenes to fit the annual curve, skipping')
        continue

    wrs2_path = int(wrs2[1:4])
    wrs2_row = int(wrs2[5:8])
    wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
    wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
    wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'

    # Fit an annual function to the data
    wrs2_stats_df = wrs2_stats_df.sort_values('DATE', ascending=True)
    wrs2_stats_df['DAYS'] = (pd.to_datetime(wrs2_stats_df['DATE']) - datetime.datetime(start_year, 1, 1)).dt.days
    days = wrs2_stats_df['DAYS'].values
    data = wrs2_stats_df[outlier_band].values

    # Initial guesses for the parameters
    # Guard the slope guess against a zero day span (all scenes on the same date)
    day_span = days[-1] - days[0]
    p0 = [
        (np.max(data) - np.min(data)) / 2,                    # Amplitude
        0,                                                    # Phase
        (data[-1] - data[0]) / day_span if day_span else 0.0, # Slope
        np.mean(data)                                         # Intercept
    ]

    try:
        popt, pcov = curve_fit(annual_fit, days, data, p0=p0)
    except RuntimeError as e:
        # Raised by curve_fit when the least-squares minimization fails
        print(f'  {wrs2} - annual fit failed, skipping ({e})')
        continue
    fitted_amplitude, fitted_phase, fitted_slope, fitted_intercept = popt
    fit_curve = annual_fit(days, *popt)
    # Residual of each scene relative to the fitted annual curve
    wrs2_stats_df['ERROR'] = data - fit_curve

    # # Plot the original data and the fitted curve
    # plt.figure(figsize=(12, 6))
    # plt.scatter(days, data, label='Original Data', alpha=0.6)
    # plt.plot(days, fit_curve, label='Fitted Curve', color='red', linewidth=2)
    # plt.title('Time Series with Annual Frequency Fit')
    # plt.xlabel('Time (years)')
    # plt.ylabel('Data')
    # plt.legend()
    # plt.grid(True)
    # plt.show()

    # display(VBox(plot_timeseries(wrs2_stats_df, x='DATE', y='ERROR', reasons=['None'])))

    # # scikit-learn outlier detection
    # # IsolationForest
    # model = IsolationForest(random_state=42)
    # wrs2_stats_df['OUTLIER_IF_SCORE'] = model.fit_predict(wrs2_stats_df[outlier_cols])

    # # OneClassSVM
    # # nu is an upper bound on the fraction of training errors and a lower bound of the fraction of support vectors.
    # model = OneClassSVM(nu=0.1)
    # wrs2_stats_df['OUTLIER_SVM_SCORE'] = model.fit_predict(wrs2_stats_df[outlier_cols])

    # outlier_mask = (
    #     ((wrs2_stats_df['OUTLIER_IF_SCORE'] <= 0) | (wrs2_stats_df['OUTLIER_SVM_SCORE'] <= 0))
    #     & (wrs2_stats_df['SKIPPED'] == 'None')
    # )
    # outlier_df = wrs2_stats_df[outlier_mask].copy()

    # Identify outliers based on the z-scores of the annual fit residuals
    # Test out different combinations of bands and z-score values
    z_scores_error = stats.zscore(wrs2_stats_df['ERROR'])
    abs_z_scores = np.abs(np.asarray(z_scores_error))
    outlier_mask = abs_z_scores >= z_score_min
    if z_score_max:
        outlier_mask &= abs_z_scores < z_score_max
    # Test out masking based on combinations of z-score masks
    # outlier_mask = (np.abs(stats.zscore(wrs2_stats_df['UNMASKED_LST'])) > 3) & (np.abs(stats.zscore(wrs2_stats_df['UNMASKED_SR'])) > 3)
    outlier_df = wrs2_stats_df[outlier_mask].copy()
    if outlier_df.empty:
        continue

    # Plot the timeseries of reflectance grouped by outlier or not
    wrs2_stats_df.loc[outlier_mask, 'SKIPPED'] = 'Outlier'
    display(VBox(plot_timeseries(wrs2_stats_df, x='DATE', y=outlier_band, reasons=['None', 'Outlier'])))
    # display(VBox(plot_timeseries(wrs2_stats_df, x='DATE', y='SATURATED_PIXELS', reasons=['None', 'Outlier'])))
    # display(VBox(plot_timeseries(wrs2_stats_df, x='DATE', y='SATURATED_COUNT_RATIO', reasons=['None', 'Outlier'])))

    # Sort by overall reflectance
    # wrs2_stats_df.sort_values('UNMASKED_LST', ascending=True, inplace=True)
    # wrs2_stats_df.sort_values('UNMASKED_SR', ascending=False, inplace=True)
    # wrs2_stats_df.sort_values('UNMASKED_SR', ascending=True, inplace=True)
    # wrs2_stats_df.sort_values('CLOUD_COUNT_RATIO', ascending=False, inplace=True)

    wrs2_skip_scenes = []
    wrs2_skip_count = 0

    # for i, row in wrs2_stats_df.sample(n=min(print_count, len(wrs2_stats_df.index))).iterrows():
    # for i, row in wrs2_stats_df.iterrows():
    for i, row in outlier_df.iterrows():

        scene_id = row["SCENE_ID"].upper()

        # Scene IDs and cloud cover of the same-date scenes in the adjacent rows
        above_scene_id = _neighbor_scene_id(scene_id, wrs2_tgt, wrs2_above)
        below_scene_id = _neighbor_scene_id(scene_id, wrs2_tgt, wrs2_below)
        above_cloud_pct = _scene_cloud_pct(above_scene_id)
        below_cloud_pct = _scene_cloud_pct(below_scene_id)

        # Only show scenes that have above & below both skipped or None
        if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) or
            ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
            continue

        # # Only show scenes that have either above & below skipped or None
        # if (((above_scene_id not in scene_skip_list) and (above_cloud_pct is not None)) and
        #     ((below_scene_id not in scene_skip_list) and (below_cloud_pct is not None))):
        #     continue

        landsat_type = scene_id.split('_')[0].upper()
        landsat_img = ee.Image(f'LANDSAT/{landsat_type}/C02/T1_L2/{scene_id}')
        landsat_region = landsat_img.geometry().bounds(1, 'EPSG:4326')
        landsat_sr_img = landsat_img.select(rgb_bands[landsat_type]).multiply([0.0000275]).add([-0.2])

        # Landsat true color image
        landsat_url = (
            landsat_sr_img.where(land_mask.unmask().eq(0), 0.25)
            .getThumbURL({'min': 0.0, 'max': 0.30, 'gamma': 1.25, 'region': landsat_region, 'dimensions': image_size})
        )

        # Landsat true color with Fmask
        # (not wrapped in a try: a failure for the target scene should be visible)
        fmask_url = _fmask_thumb_url(landsat_img, landsat_type, image_size)

        print('#'*80)
        print(
            f'  {scene_id}  {row["TOTAL_PIXELS"]:>10d}  {row["UNMASKED_PIXELS"]:>10d}'
            f'  ({row["CLOUD_COUNT_RATIO"]:>0.2f}) ({row["SNOW_COUNT_RATIO"]:>0.2f}) {row["CLOUD_COVER_LAND"]}'
            f'  {row["SR_RED"]:0.2f}  {row["SR_GREEN"]:0.2f}  {row["SR_BLUE"]:0.2f}'
        )
        ipyplot.plot_images([landsat_url, fmask_url], img_width=image_size)

        # Show the images above and below the target wrs2
        above_url = _neighbor_thumb_url(above_scene_id, landsat_type, image_size)
        below_url = _neighbor_thumb_url(below_scene_id, landsat_type, image_size)

        above_skipped = ' (skipped)' if above_scene_id in scene_skip_list else ''
        below_skipped = ' (skipped)' if below_scene_id in scene_skip_list else ''

        if above_url and below_url:
            print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}  {above_scene_id} ({above_cloud_pct}){above_skipped}')
            ipyplot.plot_images([below_url, above_url], img_width=image_size)
        elif above_url:
            print(f'{above_scene_id} ({above_cloud_pct}){above_skipped}')
            ipyplot.plot_images([above_url], img_width=image_size)
        elif below_url:
            print(f'{below_scene_id} ({below_cloud_pct}){below_skipped}')
            ipyplot.plot_images([below_url], img_width=image_size)

        wrs2_skip_scenes.append(scene_id)
        wrs2_skip_count += 1
        if wrs2_skip_count >= print_count:
            break

    # Only tiles that showed at least one scene count toward the tile limit
    if wrs2_skip_scenes:
        wrs2_i += 1
        for scene_id in wrs2_skip_scenes:
            print(scene_id)
        new_skip_scenes.extend(wrs2_skip_scenes)

    # break

print('\nDone')

In [None]:
# wrs2_i = 0
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     if wrs2 not in ['p042r033']:
#         continue
#     print(wrs2)
    
#     wrs2_path = int(wrs2[1:4])
#     wrs2_row = int(wrs2[5:8])
#     wrs2_tgt = f'{wrs2_path:03d}{wrs2_row:03d}'
#     #wrs2_above = f'{wrs2_path:03d}{wrs2_row-1:03d}'
#     #wrs2_below = f'{wrs2_path:03d}{wrs2_row+1:03d}'    

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()

#     wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['YEAR'].astype(int).isin(range(2016, 2026))]
#     #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['YEAR'].astype(int).isin(range(2020, 2026))]
#     #wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['YEAR'].astype(int).isin(range(2024, 2026))]

#     # Set True for scenes that have already been added to the skip list
#     # wrs2_stats_df['SKIPPED'] = wrs2_stats_df['SCENE_ID'].isin(scene_skip_list)
#     # wrs2_stats_df['CLOUDSCORE'] = wrs2_stats_df['SCENE_ID'].isin(scene_cloudscore_list)

#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} - {len(wrs2_stats_df.count(axis=1))}')

#     wrs2_stats_df.sort_values("DATE", inplace=True)
#     #wrs2_stats_df["DATE"] = wrs2_stats_df["DATE"].astype(int)
#     #wrs2_stats_df["DATE"] = pd.to_datetime(wrs2_stats_df["DATE"])
#     #wrs2_stats_df.set_index('DATE', inplace=True)

#     # wrs2_stats_df.drop(['YEAR', 'WRS2', 'DILATE_PIXELS'], axis=1, inplace=True)
#     # wrs2_stats_df.drop(['MORAN_2K', 'MORAN_4K', 'MORAN_8K'], axis=1, inplace=True)
#     # wrs2_stats_df.drop(['CIRRUS_PIXELS', 'SHADOW_PIXELS', 'SNOW_PIXELS', 'WATER_PIXELS'], axis=1, inplace=True)
    
#     # print(len(wrs2_stats_df.index))
#     # print(wrs2_stats_df.head(1))

#     # # Seaborn works but is not interactive
#     # sns.pairplot(
#     #     wrs2_stats_df[['CLOUD_COUNT_RATIO', 'CLOUD_COVER_LAND', 'SNOW_COUNT_RATIO', 'UNMASKED_SR', 'UNMASKED_TOA', 'SKIPPED']], 
#     #     hue="SKIPPED"
#     # )
#     # sns.relplot(
#     #     data=wrs2_stats_df,
#     #     x="DATE", 
#     #     y="CLOUD_COUNT_RATIO", 
#     #     #col="time",
#     #     hue="SKIPPED", 
#     #     #style="SKIPPED", 
#     #     #size="size",
#     # )
#     # sns.relplot(
#     #     data=wrs2_stats_df,
#     #     x="DATE", 
#     #     y="UNMASKED_TOA", 
#     #     #col="time",
#     #     hue="SKIPPED", 
#     #     #style="SKIPPED", 
#     #     #size="size",
#     # )

#     # # Jupyter-Scatter doesn't do anything
#     # jscatter.plot(
#     #     data=wrs2_stats_df,
#     #     x="CLOUD_COVER_LAND", 
#     #     y="CLOUD_COUNT_RATIO", 
#     #     # color_by="SKIPPED", 
#     # )

#     break

# print('\nDone')

In [None]:
# import plotly.express as px

# fig = px.scatter(wrs2_stats_df, x="DATE", y="CLOUD_COVER_LAND", color="SKIPPED", symbol="SKIPPED", hover_data=['SCENE_ID'])
# fig.update_traces(marker_size=8)
# fig.update_layout(height=800, width=1600)
# fig.show()


In [None]:
# # This wouldn't attach the on_click to all traces
# import plotly.express as px
# from ipywidgets import Output, VBox

# fig = px.scatter(wrs2_stats_df, x="DATE", y="CLOUD_COVER_LAND", color="SKIPPED", symbol="SKIPPED", hover_data=['SCENE_ID'])
# fig.update_traces(marker_size=8)
# fig.update_layout(height=800, width=1600)

# fig = go.FigureWidget(fig.data, fig.layout)
# #print(fig)

# out = Output()
# @out.capture(clear_output=True)
# def do_click(trace, points, selector):
#     #print(trace)
#     print(points)
#     #if points.point_inds and points.trace_index==0:
#     if points.point_inds:
#         index = points.point_inds[0]
#         print(f'Selected SCENE_ID: {wrs2_stats_df.SCENE_ID.iloc[index]}')
    
# fig.data[0].on_click(do_click)

# VBox([fig, out])


In [None]:
# # This wouldn't attach the on_click to all traces
# import plotly.express as px
# from ipywidgets import Output, VBox

# # fig = px.scatter(wrs2_stats_df, x="DATE", y="CLOUD_COVER_LAND", color="SKIPPED", symbol="SKIPPED", hover_data=['SCENE_ID'])
# # fig.update_traces(marker_size=8)
# # fig.update_layout(height=800, width=1600)

# fig = go.FigureWidget(fig.data, fig.layout)
# #print(fig)

# out = Output()
# @out.capture(clear_output=True)
# def do_click(trace, points, selector):
#     print(points.trace_name)
#     #print(trace)
#     #print(points)
#     #if points.point_inds and points.trace_index==0:
#     if points.point_inds:
#         index = points.point_inds[0]
#         print(f'Selected SCENE_ID: {wrs2_stats_df.SCENE_ID.iloc[index]}')
    
# fig.data[0].on_click(do_click)

# VBox([fig, out])


In [None]:
# wrs2_i = 0
# for wrs2 in random.sample(wrs2_tiles, len(wrs2_tiles)):
#     if wrs2_i >= 100:
#         break
#     # if wrs2 not in ['p042r033']:
#     #     continue
#     if int(wrs2[5:8]) not in [25]:
#         continue
#     # print(wrs2)

#     wrs2_tgt = f'{int(wrs2[1:4]):03d}{int(wrs2[5:8]):03d}'

#     wrs2_stats_df = stats_df[stats_df['WRS2'] == wrs2].copy()
#     # wrs2_stats_df = wrs2_stats_df[wrs2_stats_df['YEAR'].astype(int).isin(range(2016, 2026))]

#     if len(wrs2_stats_df.count(axis=1)) == 0:
#         continue
#     print(f'{wrs2} ({wrs2_i}) - {len(wrs2_stats_df.count(axis=1))}')

#     wrs2_stats_df.sort_values("DATE", inplace=True)

#     display(VBox(plot_timeseries(wrs2_stats_df, x='DATE', y='UNMASKED_SR', reasons=reasons)))
#     # display(plot_timeseries(x='DATE', y='UNMASKED_SR', reasons=reasons))
#     # display(plot_square(x='CLOUD_COVER_LAND', y='CLOUD_COUNT_RATIO', reasons=reasons))
#     # display(plot_square(x='UNMASKED_SR', y='UNMASKED_TOA', reasons=reasons))

#     wrs2_i += 1

#     # break

# print('\nDone')

In [None]:
# display(VBox(plot_timeseries(wrs2_stats_df, x='DATE', y='CLOUD_COVER_LAND', reasons=reasons)))

In [None]:
# VBox(plot_timeseries(x='DATE', y='CLOUD_COUNT_RATIO', reasons=reasons))

In [None]:
# display(plot_timeseries(x='DATE', y='UNMASKED_SR', reasons=reasons))

In [None]:
# display(plot_timeseries(x='DATE', y='UNMASKED_TOA', reasons=reasons))

In [None]:
# display(plot_square(x='CLOUD_COVER_LAND', y='CLOUD_COUNT_RATIO', reasons=reasons))

In [None]:
# display(plot_square(x='CLOUD_COUNT_RATIO', y='UNMASKED_SR', reasons=reasons))

In [None]:
# display(plot_square(x='CLOUD_COUNT_RATIO', y='UNMASKED_TOA', reasons=reasons))

In [None]:
# display(plot_square(x='UNMASKED_SR', y='UNMASKED_TOA', reasons=reasons))