**Name**:
Create_ML_data_frame

**Description**:  
This note book create a dateframe for ML model training. The data required is GOES FDC (fire product), MCMI (multispectral image) and ACM (Clear sky mask). In addition it will require VIIRS point data

**Date created**:  
`2024-12-01`

**Author**:  
Asaf Vanunu

---

# Import libraries

In [1]:
import GOES_VIIRS_tools
import os
import rioxarray
import pandas as pd
import geopandas as gpd
import numpy as np
import rasterio
import matplotlib.pyplot as plt
from rasterio.plot import show
import xarray as xr
from datetime import datetime, timedelta, time

# Now we will set the main path

In [2]:
main_dir = "F:\\ML_project" ## Here the data is stored
## Get the subdirectories
sub_dir = [os.path.join(main_dir, folder) for folder in os.listdir(main_dir) if os.path.isdir(os.path.join(main_dir, folder))]
print("Here are the subdirectories: ")
print(sub_dir) 

## Now we will set GOES main directory
for folder in sub_dir:
    s = os.path.basename(folder)
    if s == "GOES_16":
        GOES_dir = folder
print("\n")
print("GOES directory is: ")
print(GOES_dir)

## Now we will set GOES subdirectories
GOES_sub_dir = [os.path.join(GOES_dir, folder) for folder in os.listdir(GOES_dir) if os.path.isdir(os.path.join(GOES_dir, folder))]
for folder in GOES_sub_dir:
    s = os.path.basename(folder)
    if s == "MCMI":
        GOES_MCMI_dir = folder
    elif s == "ACM":
        GOES_ACM_dir = folder
    elif s == "FDC":
        GOES_FDC_dir = folder
## Print the subdirectories
print("\n")
print("GOES MCMI directory is: ")
print(GOES_MCMI_dir)
print("GOES ACM directory is: ")
print(GOES_ACM_dir)
print("GOES FDC directory is: ")
print(GOES_FDC_dir)
        
    


Here are the subdirectories: 
['F:\\ML_project\\brazil', 'F:\\ML_project\\califronia', 'F:\\ML_project\\canada', 'F:\\ML_project\\east_us', 'F:\\ML_project\\GOES_16', 'F:\\ML_project\\mexico', 'F:\\ML_project\\patagonia']


GOES directory is: 
F:\ML_project\GOES_16


GOES MCMI directory is: 
F:\ML_project\GOES_16\MCMI
GOES ACM directory is: 
F:\ML_project\GOES_16\ACM
GOES FDC directory is: 
F:\ML_project\GOES_16\FDC


# Now we will make sure we use only MCMI files that have matching FDC and ACM files
* Since MCMI files are multispectral images, we will use ACM for cloud masking and FDC for compersion of fire detections

In [3]:
## Take all netCDF files in the subdirectories
MCMI_files = np.array([os.path.join(GOES_MCMI_dir, file) for file in os.listdir(GOES_MCMI_dir) if file.endswith(".nc")])
FDC_files = np.array([os.path.join(GOES_FDC_dir, file) for file in os.listdir(GOES_FDC_dir) if file.endswith(".nc")])
ACM_files = np.array([os.path.join(GOES_ACM_dir, file) for file in os.listdir(GOES_ACM_dir) if file.endswith(".nc")])

In [4]:
## a function that take the last part of the NetCDF file name
def get_NetCDF_name(NetCDF_path):
    """This function takes the last part of the NetCDF file name

    Args:
        NetCDF_path (string): The path of the NetCDF file for example 'F:\\ML_project\\GOES_16\\MCMI\\OR_ABI-L2-MCMIPC-M6_G16_s202301010751.nc'
    """
    
    base_name = os.path.basename(NetCDF_path) ## Get the base name of the file
    file_name = base_name.split("_")[-1] ## Get the last part of the file name
    return file_name ## Return the last part of the file name
    

In [5]:
def get_MCMI_time_from_file_name(MCMI_file, format):
    """This function gets a path of MCMI NetCDF file and returns the time of the file in a string format

    Args:
        MCMI_file (string): for example 'F:\\ML_project\\GOES_16\\MCMI\\OR_ABI-L2-MCMIPC-M6_G16_s202301010751.nc'
        format (string): The format of the time for example "string" or "time" for string format or time format respectively
    """
    if format not in ["string", "time"]: ## If the format is not string or time
        raise ValueError("The format should be either 'string' or 'time'") ## Raise an error
    
    file_name = MCMI_file.split("_")[-1].split(".")[0] ## Get the last part of the file name
    Year = file_name[1:5] ## Get the year
    Month = file_name[5:7] ## Get the month
    Day = file_name[7:9] ## Get the day
    Hour = file_name[9:11] ## Get the hour
    Minute = file_name[11:13] ## Get the minute
    date_string = f"{Year}-{Month}-{Day} {Hour}:{Minute}" ## Create the date string
    
    if format == "string": ## If the format is string
        return date_string
    elif format == "time": ## If the format is time
        time_format = '%Y-%m-%d %H:%M' ## set a time format
        converted_time = datetime.strptime(date_string, time_format) ## convert to time
        return converted_time ## return the time
    

In [6]:
def create_matching_MCMI_df(MCMI_file_list, FDC_file_list, ACM_file_list):
    """This function creates a pandas dataframe that contains the matching MCMI, FDC, and ACM files

    Args:
        MCMI_file_list (list): A list of MCMI files
        FDC_file_list (list): A list of FDC files
        ACM_file_list (list): A list of ACM files
    """
    FDC_NC_name_list = list(map(get_NetCDF_name, FDC_file_list)) ## Get the last part of the FDC file names
    ACM_NC_name_list = list(map(get_NetCDF_name, ACM_file_list)) ## Get the last part of the ACM file names
    
    NC_name_list = [] ## A list to store the last part of the MCMI file names
    NC_time_list = [] ## A list to store the time of the MCMI files
    MCMI_match_list = [] ## A list to store the matching MCMI files
    FDC_match_list = [] ## A list to store the matching FDC files
    ACM_match_list = [] ## A list to store the matching ACM
    for MCMI_file in MCMI_file_list: ## Loop over the MCMI files
        MCMI_name = get_NetCDF_name(MCMI_file) ## Get the last part of the MCMI file name
        con = (np.isin(MCMI_name, FDC_NC_name_list)) & (np.isin(MCMI_name, ACM_NC_name_list)) ## Check if the last part of the MCMI file name is in the FDC and ACM file names
        if con == True: ## If the last part of the MCMI file name is in the FDC and ACM file names
            file_time = get_MCMI_time_from_file_name(MCMI_file=MCMI_file, format="string") ## Get the time of the MCMI file
            NC_time_list.append(file_time) ## Append the time of the MCMI file to the list
            NC_name_list.append(MCMI_name) ## Append the last part of the MCMI file name to the list
            MCMI_match_list.append(MCMI_file) ## Append the MCMI file to the list
            location_FDC = np.where(np.array(FDC_NC_name_list) == MCMI_name)[0][0] ## Get the location of the last part of the MCMI file name in the FDC file names
            FDC_match_list.append(FDC_file_list[location_FDC]) ## Append the FDC file to the list
            location_ACM = np.where(np.array(ACM_NC_name_list) == MCMI_name)[0][0] ## Get the location of the last part of the MCMI file name in the ACM file names
            ACM_match_list.append(ACM_file_list[location_ACM]) ## Append the ACM file to the list
            
    ## Create a pandas dataframe
    df = pd.DataFrame({"GOES_file_name":NC_name_list, "GOES_date_time":NC_time_list ,"MCMI": MCMI_match_list,
                       "FDC": FDC_match_list, "ACM": ACM_match_list})
    return df
        

In [7]:
NC_df = create_matching_MCMI_df(MCMI_files, FDC_files, ACM_files)
NC_df[:5]

Unnamed: 0,GOES_file_name,GOES_date_time,MCMI,FDC,ACM
0,s202201010731.nc,2022-01-01 07:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...
1,s202201010911.nc,2022-01-01 09:11,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...
2,s202201011846.nc,2022-01-01 18:46,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...
3,s202201012021.nc,2022-01-01 20:21,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...
4,s202201012031.nc,2022-01-01 20:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...


# Now we will load our VIIRS files so we can add their time to the data frame

In [7]:
AOI_list = ["east_us", "mexico"] ## The area of interest

In [8]:
for folder in sub_dir: ## Loop over the subdirectories
    folder_name = os.path.basename(folder) ## Get the base name of the folder for example "east_us"
    if folder_name in AOI_list: ## If the folder name is in the area of interest list
        ## Get the subdirectories of the AOI
        AOI_folders = [os.path.join(folder, sub_folder) for sub_folder in os.listdir(folder) if os.path.isdir(os.path.join(folder, sub_folder))]
        ## Loop over the subdirectories of the AOI
        for sub_folder in AOI_folders:
            if os.path.basename(sub_folder) == "VIIRS": ## If the subdirectory is VIIRS
                VIIRS_dir = sub_folder ## Set the VIIRS directory
                ## Get the subdirectories of the VIIRS directory
                VIIRS_folders = [os.path.join(VIIRS_dir, f) for f in os.listdir(VIIRS_dir) if os.path.isdir(os.path.join(VIIRS_dir, f))]
                ## Loop over the subdirectories of the VIIRS directory
                for VIIRS_folder in VIIRS_folders: ##
                    if os.path.basename(VIIRS_folder) == "VIIRS_points": ## If the subdirectory is VIIRS_points
                        if folder_name == "east_us": ## If the folder name is east_us
                            VIIRS_points_path_east_us = os.path.join(VIIRS_folder, "VIIRS_points.shp") ## Set the VIIRS points path for east US
                        elif folder_name == "mexico": ## If the folder name is mexico
                            VIIRS_points_path_mexico = os.path.join(VIIRS_folder, "VIIRS_points.shp")
                            
print(f"The VIIRS points path for east US is: {VIIRS_points_path_east_us}")
print(f"The VIIRS points path for Mexico is: {VIIRS_points_path_mexico}")

The VIIRS points path for east US is: F:\ML_project\east_us\VIIRS\VIIRS_points\VIIRS_points.shp
The VIIRS points path for Mexico is: F:\ML_project\mexico\VIIRS\VIIRS_points\VIIRS_points.shp


* Now we can open the VIIRS files

In [9]:
VIIRS_east_us = gpd.read_file(VIIRS_points_path_east_us) ## Read the VIIRS points shapefile for east US
VIIRS_mexico = gpd.read_file(VIIRS_points_path_mexico) ## Read the VIIRS points shape

In [10]:
VIIRS = pd.concat([VIIRS_east_us, VIIRS_mexico], ignore_index=True) ## Concatenate the VIIRS points for east US and Mexico

In [11]:
VIIRS[:5]

Unnamed: 0,latitude,longitude,Fire_file,Unique_dat,Unique_tim,DATE,TIME,DATE_TIME,fire_radia,fire_pixel,fire_pix_1,scan_line,grid_sampl,View_Zenit,night/day,region,geometry
0,34.642311,-87.08979,VNP14IMG.A2022001.0724.002.2024075110909.nc,A2022001,730,2022-01-01,07:30,2022-01-01 07:30,0.697141,8,n,1847,2511,20.98,Night,east_US,POINT (-87.08979 34.64231)
1,34.63818,-87.085693,VNP14IMG.A2022001.0724.002.2024075110909.nc,A2022001,730,2022-01-01,07:30,2022-01-01 07:30,3.107789,8,n,1848,2512,20.949999,Night,east_US,POINT (-87.08569 34.63818)
2,33.995903,-88.471245,VNP14IMG.A2022001.0724.002.2024075110909.nc,A2022001,730,2022-01-01,07:30,2022-01-01 07:30,1.867678,8,n,2092,2275,28.279999,Night,east_US,POINT (-88.47124 33.9959)
3,33.992199,-88.471901,VNP14IMG.A2022001.0724.002.2024075110909.nc,A2022001,730,2022-01-01,07:30,2022-01-01 07:30,1.867678,8,n,2093,2275,28.279999,Night,east_US,POINT (-88.4719 33.9922)
4,33.448055,-88.569901,VNP14IMG.A2022001.0724.002.2024075110909.nc,A2022001,730,2022-01-01,07:30,2022-01-01 07:30,1.585221,8,n,2258,2281,28.099998,Night,east_US,POINT (-88.5699 33.44806)


* Now we can get the VIIRS file that match the GOES files

In [12]:
time_format = '%Y-%m-%d %H:%M' ## set a time format
VIIRS_unqiue_time_list = list(map(lambda x: datetime.strptime(x, time_format), np.unique(VIIRS["DATE_TIME"]))) ## Get the unique time of the VIIRS points
VIIRS_unqiue_time_list[:5]

[datetime.datetime(2022, 1, 1, 7, 30),
 datetime.datetime(2022, 1, 1, 9, 12),
 datetime.datetime(2022, 1, 1, 18, 48),
 datetime.datetime(2022, 1, 1, 20, 24),
 datetime.datetime(2022, 1, 1, 20, 30)]

In [13]:
def time_delta(date_time_obj, time_detla):
    """This function gets a single date time object and a time delta and returns a new date time object.

    Args:
        date_time_obj (datetime): a single date time object
        time_detla (int): a time delta in minutes
    """
    new_date_time_obj = date_time_obj + timedelta(minutes = time_detla) ## add the time delta to the date time object
    return new_date_time_obj

In [14]:
def time_in_range(start, end, current_time):
    """Return true if current_time is in the range [start, end]"""
    if start <= end:
        return start <= current_time <= end
    else:
        return start <= current_time or current_time <= end

In [15]:
len(np.unique(VIIRS["DATE_TIME"]))

4495

In [16]:
len(np.unique(VIIRS["Fire_file"]))

4495

In [17]:
def match_GOES_VIIRS_files(GOES_NC_df, VIIRS_file):
    """This function gets GOES NC df and a VIIRS shapefile and returns the matching GOES files inside the df

    Args:
        GOES_NC_df (DataFrame): GOES_nc_df
        VIIRS_file (GeoDataFrame): VIIRS GeoDataFrame
    """
    if not isinstance(GOES_NC_df, pd.DataFrame): ## If the GOES_NC_df is not a pandas DataFrame
        raise ValueError("GOES_NC_df should be a pandas DataFrame") ## Raise an error
    elif not isinstance(VIIRS_file, gpd.GeoDataFrame): ## If the VIIRS_file is not a GeoDataFrame
        raise ValueError("VIIRS_file should be a GeoDataFrame") ## Raise an error
    
    time_format = '%Y-%m-%d %H:%M' ## set a time format
    
    df = GOES_NC_df.copy() ## Set the GOES NC df
    df["VIIRS_file"] = np.repeat(0, len(df)) ## Create a new column for the VIIRS file
    ## Create an array of unique time of the VIIRS file
    VIIRS_unique_time_list = np.array(list(map(lambda x: datetime.strptime(x, time_format), np.unique(VIIRS_file["DATE_TIME"]))))
    ## Create an array of unique fire file of the VIIRS file
    VIIRS_unique_fire_file_list = np.array(np.unique(VIIRS_file["Fire_file"]))
    
    GOES_time_list = list(map(lambda x: datetime.strptime(x, time_format), list(df["GOES_date_time"]))) ## Get the GOES time list
    
    if len(VIIRS_unique_time_list) != len(VIIRS_unique_fire_file_list): ## If the length of the unique time and fire file are not equal
        raise ValueError("The length of the unique time and fire file should be equal") ## Raise an error
        
    ## Now we will match GOES time
    
    for i in range(len(VIIRS_unique_time_list)): ## Loop over the unique time of the VIIRS file
        VIIRS_time = VIIRS_unique_time_list[i] ## Get the VIIRS time
        VIIRS_fire_file = VIIRS_unique_fire_file_list[i] ## Get the VIIRS fire file
        ## Get the time delta of the VIIRS time
        start_time = time_delta(VIIRS_time, -3)
        end_time = time_delta(VIIRS_time, 3)
        ## determine which GOES file it matches
        GOES_match_list = list(map(lambda x: time_in_range(start_time, end_time, x), GOES_time_list)) ## Check if the GOES time is in the range of the VIIRS time
        if np.sum(GOES_match_list) > 0: ## If there is a match
            GOES_match_index = np.where(GOES_match_list)[0] ## Get the index of the GOES match
            df["VIIRS_file"].loc[GOES_match_index] = VIIRS_fire_file ## Set the VIIRS file to the GOES match
            print(f"{i+1}/{len(VIIRS_unique_time_list)}") ## Print the progress
    if np.any(df["VIIRS_file"] == 0): ## If there are some GOES files that do not have a matching VIIRS file
        raise ValueError("There are some GOES files that do not have a matching VIIRS file") ## Raise an error
    return df ## Return the GOES NC df
    

* Here is the GOES files with the matching VIIRS files

In [19]:
NC_VIIRS_df = match_GOES_VIIRS_files(NC_df, VIIRS)
NC_VIIRS_df[:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["VIIRS_file"].loc[GOES_match_index] = VIIRS_fire_file ## Set the VIIRS file to the GOES match


1/4495
2/4495
3/4495
4/4495
5/4495
6/4495
7/4495
8/4495
9/4495
10/4495
11/4495
12/4495
13/4495
14/4495
15/4495
16/4495
17/4495
18/4495
19/4495
20/4495
21/4495
22/4495
23/4495
24/4495
25/4495
26/4495
27/4495
28/4495
29/4495
30/4495
31/4495
32/4495
33/4495
34/4495
35/4495
36/4495
37/4495
38/4495
39/4495
40/4495
41/4495
42/4495
43/4495
44/4495
45/4495
46/4495
47/4495
48/4495
49/4495
50/4495
51/4495
52/4495
53/4495
54/4495
55/4495
56/4495
57/4495
58/4495
59/4495
60/4495
61/4495
62/4495
63/4495
64/4495
65/4495
66/4495
67/4495
68/4495
69/4495
70/4495
71/4495
72/4495
73/4495
74/4495
75/4495
76/4495
77/4495
78/4495
79/4495
80/4495
81/4495
82/4495
83/4495
84/4495
85/4495
86/4495
87/4495
88/4495
89/4495
90/4495
91/4495
92/4495
93/4495
94/4495
95/4495
96/4495
97/4495
98/4495
99/4495
100/4495
101/4495
102/4495
103/4495
104/4495
105/4495
106/4495
107/4495
108/4495
109/4495
110/4495
111/4495
112/4495
113/4495
114/4495
115/4495
116/4495
117/4495
118/4495
119/4495
120/4495
121/4495
122/4495
123/4495
1

Unnamed: 0,GOES_file_name,GOES_date_time,MCMI,FDC,ACM,VIIRS_file
0,s202201010731.nc,2022-01-01 07:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.0724.002.2024075110909.nc
1,s202201010911.nc,2022-01-01 09:11,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.0906.002.2024075110907.nc
2,s202201011846.nc,2022-01-01 18:46,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.1842.002.2024075110906.nc
3,s202201012021.nc,2022-01-01 20:21,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.2018.002.2024075110907.nc
4,s202201012031.nc,2022-01-01 20:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.2024.002.2024075110907.nc


In [20]:
## Alternatively we can Load the dataframe after we have created it
for file in os.listdir(os.getcwd()):
    if file.endswith(".csv"):
        NC_VIIRS_df = pd.read_csv(f"{os.path.join(os.getcwd(), file)}")

In [21]:
NC_VIIRS_df[:5]

Unnamed: 0,GOES_file_name,GOES_date_time,MCMI,FDC,ACM,VIIRS_file,VIIRS_file_full_path
0,s202201010731.nc,2022-01-01 07:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.0724.002.2024075110909.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
1,s202201010911.nc,2022-01-01 09:11,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.0906.002.2024075110907.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
2,s202201011846.nc,2022-01-01 18:46,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.1842.002.2024075110906.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
3,s202201012021.nc,2022-01-01 20:21,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.2018.002.2024075110907.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
4,s202201012031.nc,2022-01-01 20:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.2024.002.2024075110907.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...


# Now we will create a function that will crop GOES image to the matching VIIRS image

- For that we will need to get the VIIRS fire product full path

In [20]:
for dir in sub_dir: ## Loop over the subdirectories
    if np.isin(os.path.basename(dir), AOI_list) == True: ## If the base name of the directory is in the AOI list
        VIIRS_dir = [os.path.join(dir, folder) for folder in os.listdir(dir) if folder == "VIIRS"][0] ## Get the VIIRS directory
        VIIRS_fire_dir = [os.path.join(VIIRS_dir, folder) for folder in os.listdir(VIIRS_dir) if folder == "VIIRS_fire"][0] ## Get the VIIRS fire directory
        if os.path.basename(dir) == "east_us": ## If the base name of the directory is east_us
            ## Get the VIIRS fire files for east US
            VIIRS_fire_east_us = [os.path.join(VIIRS_fire_dir, file) for file in os.listdir(VIIRS_fire_dir) if file.endswith(".nc")]
        elif os.path.basename(dir) == "mexico": ## If the base name of the directory is mexico
            ## Get the VIIRS fire files for Mexico
            VIIRS_fire_mexico = [os.path.join(VIIRS_fire_dir, file) for file in os.listdir(VIIRS_fire_dir) if file.endswith(".nc")]
        print(f"The VIIRS fire directory for {os.path.basename(dir)} is: {VIIRS_fire_dir}") ## Print the VIIRS fire directory

The VIIRS fire directory for east_us is: F:\ML_project\east_us\VIIRS\VIIRS_fire
The VIIRS fire directory for mexico is: F:\ML_project\mexico\VIIRS\VIIRS_fire


* Now we have the VIIRS fire product full path for both us_east and mexico

In [21]:
VIIRS_fire_product_list = VIIRS_fire_east_us + VIIRS_fire_mexico ## Concatenate the VIIRS fire files for east US and Mexico
VIIRS_fire_product_list = np.array(VIIRS_fire_product_list) ## Convert the list to an array
VIIRS_fire_product_list[:5]

array(['F:\\ML_project\\east_us\\VIIRS\\VIIRS_fire\\VNP14IMG.A2022001.0718.002.2024075110906.nc',
       'F:\\ML_project\\east_us\\VIIRS\\VIIRS_fire\\VNP14IMG.A2022001.0724.002.2024075110909.nc',
       'F:\\ML_project\\east_us\\VIIRS\\VIIRS_fire\\VNP14IMG.A2022001.0900.002.2024075110907.nc',
       'F:\\ML_project\\east_us\\VIIRS\\VIIRS_fire\\VNP14IMG.A2022001.0906.002.2024075110907.nc',
       'F:\\ML_project\\east_us\\VIIRS\\VIIRS_fire\\VNP14IMG.A2022001.1042.002.2024075110909.nc'],
      dtype='<U82')

In [22]:
NC_VIIRS_df["VIIRS_file_full_path"] = np.repeat(0, len(NC_VIIRS_df)) ## Create a new column for the VIIRS file full path

In [23]:
VIIRS_fire_product_list_base_name = np.array(list(map(os.path.basename, VIIRS_fire_product_list))) ## Get the base name of the VIIRS fire files
VIIRS_fire_product_list_base_name

array(['VNP14IMG.A2022001.0718.002.2024075110906.nc',
       'VNP14IMG.A2022001.0724.002.2024075110909.nc',
       'VNP14IMG.A2022001.0900.002.2024075110907.nc', ...,
       'VNP14IMG.A2023365.1912.002.2024001102105.nc',
       'VNP14IMG.A2023365.2048.002.2024001140929.nc',
       'VNP14IMG.A2023365.2054.002.2024001140931.nc'], dtype='<U43')

In [24]:
VIIRS_fire_product_list_base_name = np.array(list(map(os.path.basename, VIIRS_fire_product_list))) ## Get the base name of the VIIRS fire files
for i in range(len(NC_VIIRS_df)): ## Loop over the VIIRS fire files
    VIIRS_file_name = NC_VIIRS_df["VIIRS_file"].iloc[i] ## Get the VIIRS fire file name
    location = np.where(VIIRS_fire_product_list_base_name==VIIRS_file_name)[0][0]
    full_VIIRS_file = VIIRS_fire_product_list[location] ## Get the full path of the VIIRS fire file
    NC_VIIRS_df["VIIRS_file_full_path"].iloc[i] = full_VIIRS_file ## Set the full
    print(f"{i+1}/{len(NC_VIIRS_df)}") ## Print the progress 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NC_VIIRS_df["VIIRS_file_full_path"].iloc[i] = full_VIIRS_file ## Set the full


1/4460
2/4460
3/4460
4/4460
5/4460
6/4460
7/4460
8/4460
9/4460
10/4460
11/4460
12/4460
13/4460
14/4460
15/4460
16/4460
17/4460
18/4460
19/4460
20/4460
21/4460
22/4460
23/4460
24/4460
25/4460
26/4460
27/4460
28/4460
29/4460
30/4460
31/4460
32/4460
33/4460
34/4460
35/4460
36/4460
37/4460
38/4460
39/4460
40/4460
41/4460
42/4460
43/4460
44/4460
45/4460
46/4460
47/4460
48/4460
49/4460
50/4460
51/4460
52/4460
53/4460
54/4460
55/4460
56/4460
57/4460
58/4460
59/4460
60/4460
61/4460
62/4460
63/4460
64/4460
65/4460
66/4460
67/4460
68/4460
69/4460
70/4460
71/4460
72/4460
73/4460
74/4460
75/4460
76/4460
77/4460
78/4460
79/4460
80/4460
81/4460
82/4460
83/4460
84/4460
85/4460
86/4460
87/4460
88/4460
89/4460
90/4460
91/4460
92/4460
93/4460
94/4460
95/4460
96/4460
97/4460
98/4460
99/4460
100/4460
101/4460
102/4460
103/4460
104/4460
105/4460
106/4460
107/4460
108/4460
109/4460
110/4460
111/4460
112/4460
113/4460
114/4460
115/4460
116/4460
117/4460
118/4460
119/4460
120/4460
121/4460
122/4460
123/4460
1

- and we will create a function to open GOES MCMI image in a requested band

In [22]:
def open_MCMI(MCMI_path, band_number):
    """This function opens the MCMI NetCDF file and returns the band of the given band number

    Args:
        MCMI_path (string): The path of the MCMI NetCDF file for example 'F:\\ML_project\\GOES_16\\MCMI\\OR_ABI-L2-MCMIPC-M6_G16_s202301010751.nc'
        band_number (int): The band number. for example 1

    Returns:
        xarray.DataArray: The band of the given band number
    """
    if (band_number < 1) or (band_number > 16): ## Check if the band number is between 1 and 16
        raise ValueError("The band number should be between 1 and 16") ## Raise an error
    
    band = f"CMI_C{band_number:02d}" ## The band name
    try:
        GOES_file = rioxarray.open_rasterio(MCMI_path) ## Open the MCMI NetCDF file
        GOES_CRS = GOES_file.rio.crs ## Get the CRS of the file
        MCMI = GOES_file.copy() ## Copy the MCMI file
    except: ## If there is an error
        print(f"Error in opening the MCMI file: {MCMI_path}") ## Print an error message
        return None ## Return None
    MCMI = MCMI.astype("float32") ## Convert the MCMI to float32
    MCMI_add_factor = MCMI[band].attrs["add_offset"] ## Get the add offset
    MCMI_scale_factor = MCMI[band].attrs["scale_factor"] ## Get the scale factor
    MCMI_fill_value = MCMI[band].attrs["_FillValue"] ## Get the fill value
    MCMI_values = MCMI[band].values[0] ## Get the values of the band
    MCMI_values[MCMI_values == MCMI_fill_value] = np.nan ## set the fill value to nan
    MCMI[band].values[0] = MCMI_values * MCMI_scale_factor + MCMI_add_factor ## Get the values of the band
    MCMI[band] = MCMI[band].rio.write_crs(GOES_CRS) ## Write the CRS of the band
    return MCMI[band] ## Return the values of the band
        
    

In [23]:
def open_FDC(FDC_path, product_name):
    """This function opens the FDC NetCDF file and returns the fire detection confidence

    Args:
        FDC_path (string): The path of the FDC NetCDF file for example 'F:\\ML_project\\GOES_16\\FDC\\OR_ABI-L2-FDCC-M6_G16_s202301010751.nc'
        product_name (string): The product name. for example 'Mask' or "Temp" or "Power"

    Returns:
        xarray.DataArray: The fire product values
    """
    try:
        GOES_file = rioxarray.open_rasterio(FDC_path) ## Open the FDC NetCDF file
        GOES_CRS = GOES_file.rio.crs ## Get the CRS of the file
        FDC = GOES_file.copy() ## Copy the FDC file
        FDC = FDC.astype("float32") ## Convert the FDC to float32
        FDC_add_factor = FDC[product_name].attrs["add_offset"] ## Get the add offset
        FDC_scale_factor = FDC[product_name].attrs["scale_factor"] ## Get the scale factor
        FDC_fill_value = FDC[product_name].attrs["_FillValue"] ## Get the fill value
        FDC_values = FDC[product_name].values[0] ## Get the values of the band
        FDC_values[FDC_values == FDC_fill_value] = np.nan ## set the fill value to nan
        FDC[product_name].values[0] = FDC_values * FDC_scale_factor + FDC_add_factor ## Get the values of the fire detection confidence
        FDC[product_name] = FDC[product_name].rio.write_crs(GOES_CRS) ## Write the CRS of the fire detection confidence
        return FDC[product_name] ## Return the values of the fire detection confidence
    except: ## If there is an error
        print(f"Error in opening the FDC file: {FDC_path}") ## Print an error message
        return None ## Return None

In [24]:
def open_ACM(ACM_path, product_name):
    """This function opens the ACM NetCDF file and returns the clear sky mask

    Args:
        ACM_path (string): The path of the ACM NetCDF file for example 'F:\\ML_project\\GOES_16\\ACM\\OR_ABI-L2-ACMC-M6_G16_s202301010751.nc'
        product_name (string): The product name. for example 'ACM' for 4 level classification where:
        0: Clear, 1: Probably Clear, 2: Probably Cloudy, 3: Cloudy
        and BCM for 2 level classification where:
        0: Clear, 1: Cloudy 

    Returns:
        xarray.DataArray: clear sky mask values
    """
    try:
        GOES_image = rioxarray.open_rasterio(ACM_path) ## Open the ACM NetCDF file
        GOES_CRS = GOES_image.rio.crs ## Get the CRS of the file
        ACM = GOES_image.copy() ## Copy the ACM file
        ACM = ACM.astype("float32") ## Convert the ACM to float32
        ACM_add_factor = ACM[product_name].attrs["add_offset"] ## Get the add offset
        ACM_scale_factor = ACM[product_name].attrs["scale_factor"] ## Get the scale factor
        ACM_values = ACM[product_name].values[0] * ACM_scale_factor + ACM_add_factor ## Get the values of the active fire pixels
        ACM_fill_value = ACM[product_name].attrs["_FillValue"] ## Get the fill value
        ACM_values = ACM[product_name].values[0] ## Get the values of the band
        ACM_values[ACM_values == ACM_fill_value] = np.nan ## set the fill value to nan
        ACM[product_name].values[0] = ACM_values * ACM_scale_factor + ACM_add_factor ## Get the values of the fire detection confidence
        ACM[product_name] = ACM[product_name].rio.write_crs(GOES_CRS) ## Write the CRS of the band
        return ACM[product_name] ## Return the values of the fire detection confidence
    except: ## If there is an error
        print(f"Error in opening the ACM file: {ACM_path}") ## Print an error message
        return None ## Return None

In [25]:
def fix_fill_values(cropped_GOES_image):
    """This function gets a cropped GOES image and replace the fill values with nan and return the fixed image

    Args:
        cropped_GOES_image (xr.DataArray): a rioxarray DataArray
    """
    if not isinstance(cropped_GOES_image, xr.DataArray):
        raise ValueError("cropped_GOES_image should be a rioxarray DataArray")
    else:
        GOES_fill_value = cropped_GOES_image.attrs["_FillValue"] ## Get the fill value
        cropped_GOES_image.values[0][cropped_GOES_image.values[0] == GOES_fill_value] = np.nan ## Set the fill value to nan
        return cropped_GOES_image ## Return the fixed image

In [26]:

def crop_GOES_using_VIIRS(GOES_path, GOES_band, VIIRS_path):
    """This function crops the GOES file using the VIIRS file. It returns the cropped GOES file

    Args:
        GOES_path (string): The path of the GOES file. Can be MCMI, FDC, or ACM files. for example 'F:\\ML_project\\GOES_16\\MCMI\\OR_ABI-L2-MCMIPC-M6_G16_s202301010751.nc
        GOES_band (string\int): The band of the GOES file. can be 7 for MCMI. For FDC can be "Mask", "Temp", "Power". For ACM can be "Mask", "Temp", "Power". For ACM can be "ACM" or "BCM"
        VIIRS_path (string): The path of the VIIRS file. for example 'F:\\ML_project\\east_us\\VIIRS\\VIIRS_fire\\VNP14IMG.nc'
    """
    CMI_bands = list(range(1,17)) ## The CMI bands
    FDC_bands = ["Mask", "Temp", "Power"] ## The FDC bands
    ACM_bands = ["ACM", "BCM"] ## The ACM bands
    band_types = CMI_bands + FDC_bands + ACM_bands ## The band types
    file_name = os.path.basename(GOES_path) ## Get the base name of the GOES file
    file_type = file_name.split("-")[2] ## Get the file type of the GOES file
    if file_type not in ["MCMIPC", "FDCC", "ACMC"]: ## If the file type is not MCMIPC, FDCC, or ACMC
        raise ValueError("The GOES file should be either MCMI, FDC, or ACM files") ## Raise an error
    if GOES_band not in band_types:
        raise ValueError("The GOES band should be either CMI for MCMI, Mask, Temp, Power for FDC, and ACM, BCM for ACM")
    
    if file_type == "MCMIPC": ## If the file type is MCMIPC
        GOES_image = open_MCMI(MCMI_path=GOES_path, band_number=GOES_band) ## Open the MCMI file
    elif file_type == "FDCC": ## If the file type is FDCC
        GOES_image = open_FDC(FDC_path=GOES_path, product_name=GOES_band) ## Open the FDC file
    elif file_type == "ACMC": ## If the file type is ACMC
        GOES_image = open_ACM(ACM_path=GOES_path, product_name=GOES_band) ## Open the ACM file
        
    try: ## Try to get the VIIRS polygon
        GOES_CRS = GOES_image.rio.crs ## Get the CRS of the GOES file
        VIIRS_polygon = GOES_VIIRS_tools.get_VIIRS_bounds_polygon(VIIRS_path) ## Get the VIIRS polygon
    except: ## If there is an error
        print(f"Error in getting the VIIRS polygon for the VIIRS file: {VIIRS_path}") ## Print an error message
        return None
    
    try: ## Try to crop the GOES image
        VIIRS_polygon = VIIRS_polygon.to_crs(GOES_CRS) ## Convert the VIIRS polygon to the CRS of the GOES image
        GOES_cropped = GOES_image.rio.clip(VIIRS_polygon.geometry) ## Clip the GOES image using the VIIRS polygon
        corrected_GOES_cropped = fix_fill_values(GOES_cropped) ## Fix the fill values of the cropped GOES image
        return corrected_GOES_cropped ## Return the cropped GOES image
    except: ## If there is an error
        print(f"Error in cropping the GOES image: {GOES_path}")
        return None
    
    

- let's test the function

In [28]:
cliped_GOES = crop_GOES_using_VIIRS(NC_VIIRS_df["MCMI"].iloc[0], 7, NC_VIIRS_df["VIIRS_file_full_path"].iloc[0])

In [31]:
cliped_GOES.rio.to_raster("F:\\ML_project\\GOES_16\\cliped_GOES.tif") ## Save the clipped GOES image as an example image

# next we will see how many FDC fire pixels have cloud cover from matching ACM image

- for this we will create a new notebook to not make this one too long

In [33]:
NC_VIIRS_df.to_csv("NC_VIIRS_df.csv", index=False)