**Name**:
Create_ML_data_frame

**Description**:  
This notebook creates a dataframe of GOES images together with the 4 temporally preceding images of each one.

**Date created**:  
`2024-12-24`

**Author**:  
Asaf Vanunu

---

In [1]:
import GOES_VIIRS_tools
import os
import rioxarray
import pandas as pd
import geopandas as gpd
import numpy as np
import rasterio
import matplotlib.pyplot as plt
from rasterio.plot import show
import xarray as xr
from datetime import datetime, timedelta, time

* Open the CSV that holds the matched GOES and VIIRS dataframe

In [2]:
## collect every CSV in the working directory; sorting makes the choice deterministic
csv_files = sorted(name for name in os.listdir(os.getcwd()) if name.endswith(".csv"))
if not csv_files: ## fail loudly instead of silently reusing a stale `df` (or hitting a NameError below)
    raise FileNotFoundError(f"No CSV file found in {os.getcwd()}")
if len(csv_files) > 1: ## only one table is expected; tell the reader which one was picked
    print(f"Found {len(csv_files)} CSV files, reading the last one: {csv_files[-1]}")
df = pd.read_csv(csv_files[-1]) ## read the csv (file name is relative to the working directory)
df.head() ## preview the first 5 rows

Unnamed: 0,GOES_file_name,GOES_date_time,MCMI,FDC,ACM,VIIRS_file,VIIRS_file_full_path
0,s202201010731.nc,2022-01-01 07:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.0724.002.2024075110909.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
1,s202201010911.nc,2022-01-01 09:11,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.0906.002.2024075110907.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
2,s202201011846.nc,2022-01-01 18:46,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.1842.002.2024075110906.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
3,s202201012021.nc,2022-01-01 20:21,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.2018.002.2024075110907.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...
4,s202201012031.nc,2022-01-01 20:31,F:\ML_project\GOES_16\MCMI\OR_ABI-L2-MCMIPC-M6...,F:\ML_project\GOES_16\FDC\OR_ABI-L2-FDCC-M6_G1...,F:\ML_project\GOES_16\ACM\OR_ABI-L2-ACMC-M6_G1...,VNP14IMG.A2022001.2024.002.2024075110907.nc,F:\ML_project\east_us\VIIRS\VIIRS_fire\VNP14IM...


* Now we will set the dir of the temporal GOES images

In [3]:
## root folder of the temporal GOES images; it is joined below with one sub-folder per GOES time
## (named like "2022-01-01_07-31") and then with the product folders FDC / ACM / MCMI
## NOTE(review): machine-specific absolute Windows path - adjust when running elsewhere
temporal_GOES_dir = "F:\\ML_project\\GOES_16\\temporal_GOES"

* Create a function that converts a GOES date into the directory-name format used for the temporal images

In [4]:
def GOES_date_to_dir_format(goes_date):
    """Convert a GOES date string into the naming format used by the temporal directories.

    Args:
        goes_date (str): GOES date in a string for example "2022-01-01 07:31"

    Returns:
        str: the date with the space turned into an underscore and the colon
            turned into a dash, for example "2022-01-01_07-31"
    """
    char_map = str.maketrans({" ": "_", ":": "-"}) ## space -> underscore, colon -> dash
    return goes_date.translate(char_map) ## apply both substitutions in a single pass
    

* now we can create an array of this date format

In [5]:
## convert every GOES date of the dataframe into the directory naming format
GOES_time_dir_format_list = np.array(
    [GOES_date_to_dir_format(goes_date) for goes_date in df["GOES_date_time"]]
)
GOES_time_dir_format_list[:5]

array(['2022-01-01_07-31', '2022-01-01_09-11', '2022-01-01_18-46',
       '2022-01-01_20-21', '2022-01-01_20-31'], dtype='<U16')

In [6]:
def GOES_dir_format_to_str(goes_dir_format):
    """Turn a GOES directory name back into the plain date-time string.

    Args:
        goes_dir_format (str): for example "2022-01-01_07-31"

    Returns:
        str: date and time separated by a space, with the dashes of the time
            part turned back into colons, for example "2022-01-01 07:31"
    """
    pieces = goes_dir_format.split("_") ## first piece is the date, second piece is the time
    date_part, time_part = pieces[0], pieces[1]
    return date_part + " " + time_part.replace("-", ":") ## only the time part gets its dashes swapped

* Now we can create a function to get the files in each folder

In [7]:
def get_GOES_temproal_df(goes_time_in_dir_format, temporal_GOES_dir, n_images=4):
    """Build a dataframe listing the temporal FDC, ACM and MCMI images stored for one GOES time.

    The images are looked up in ``<temporal_GOES_dir>/<goes_time_in_dir_format>/<product>``
    for each of the three products. Every product folder must hold exactly
    ``n_images`` entries, otherwise the GOES time is treated as incomplete.

    Args:
        goes_time_in_dir_format (str): GOES dir format, for example "2022-01-01_07-31"
        temporal_GOES_dir (str): Directory where the GOES images are stored
        n_images (int, optional): number of entries expected in every product folder. Defaults to 4.

    Returns:
        pandas.DataFrame or None: one row per image with the columns "GOES_date_time",
            "MCMI", "FDC" and "ACM" (full paths, sorted in descending name order).
            None when the time directory or a product folder is missing, or when a
            product folder does not hold exactly ``n_images`` entries.
    """
    product_list = ["FDC", "ACM", "MCMI"] ## list of products
    dir_path = os.path.join(temporal_GOES_dir, goes_time_in_dir_format) ## get the directory path
    if not os.path.isdir(dir_path): ## nothing stored for this GOES time
        print(f"Directory {dir_path} does not exist") ## print that the directory does not exist
        return None
    print(f"Directory {dir_path} exists") ## print that the directory exists

    files_by_product = {} ## product name -> sorted list of image paths
    for product in product_list: ## loop through the products
        product_dir = os.path.join(dir_path, product) ## get the product directory
        if not os.path.isdir(product_dir): ## a missing product folder would make os.listdir raise
            print(f"Directory {product_dir} does not exist")
            return None
        ## full paths in descending name order
        ## presumably the file names sort chronologically, so this is newest first - verify
        image_paths = sorted(
            (os.path.join(product_dir, image) for image in os.listdir(product_dir)),
            reverse=True,
        )
        if len(image_paths) != n_images: ## incomplete (or over-full) product folder
            print(f"Missing images in {product_dir}") ## print that the images are missing
            return None
        files_by_product[product] = image_paths

    file_name_time = GOES_dir_format_to_str(goes_time_in_dir_format) ## get the file name in str
    d = {
        "GOES_date_time": np.repeat(file_name_time, n_images), ## same GOES time on every row
        "MCMI": files_by_product["MCMI"],
        "FDC": files_by_product["FDC"],
        "ACM": files_by_product["ACM"],
    }
    return pd.DataFrame(data=d) ## create a dataframe
        

In [10]:
df_temporal_list = [] ## one dataframe per GOES time that has all of its temporal images
no_4_files = [] ## GOES times for which no dataframe could be built
for i, goes_time in enumerate(GOES_time_dir_format_list): ## walk the GOES times together with their position
    df_temporal = get_GOES_temproal_df(goes_time, temporal_GOES_dir) ## dataframe, or None when incomplete
    if not isinstance(df_temporal, pd.DataFrame): ## incomplete time -> remember it in plain string format
        no_4_files.append(GOES_dir_format_to_str(goes_time))
        continue
    df_temporal_list.append(df_temporal)
    print(f"Appended {i}")

Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-01_07-31 exists
Appended 0
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-01_09-11 exists
Appended 1
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-01_18-46 exists
Appended 2
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-01_20-21 exists
Appended 3
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-01_20-31 exists
Appended 4
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_07-11 exists
Appended 5
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_08-46 exists
Appended 6
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_08-51 exists
Appended 7
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_18-21 exists
Appended 8
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_18-31 exists
Appended 9
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_20-06 exists
Appended 10
Directory F:\ML_project\GOES_16\temporal_GOES\2022-01-02_20-11 exists
Appended 11
Directory F:\ML_project\GO

In [11]:
d = dict(No_4_files=no_4_files) ## single column holding the GOES times without a complete image set
df_no_4_files = pd.DataFrame(d) ## wrap it in a dataframe
df_no_4_files

Unnamed: 0,No_4_files
0,2022-03-05 17:21
1,2022-04-26 17:51
2,2022-04-26 19:31
3,2022-09-25 20:21
4,2022-09-27 18:06
5,2022-10-05 18:51
6,2023-03-08 17:21
7,2023-05-01 20:36
8,2023-11-02 17:41
9,2023-11-06 18:06


In [12]:
## stack the per-time dataframes into one table with a fresh 0..n-1 index
df_temporal_list_concat = pd.concat(df_temporal_list, ignore_index=True)

In [13]:
## output directory next to the notebook; os.path.join uses the right separator on every OS
out_dir = os.path.join(os.getcwd(), "temporal_df")
if os.path.exists(out_dir): ## already there from a previous run
    print(f"Directory {out_dir} exists") ## print that the directory exists
else:
    os.makedirs(out_dir) ## make the directory

Directory c:\Users\asaf_rs\Dropbox\Fire_Detection\python_ML_project\create_ML_df\temporal_df exists


In [14]:
## build the output paths with os.path.join so the separator is correct on every OS
df_no_4_files.to_csv(os.path.join(out_dir, "no_4_files.csv"), index=False) ## save the no 4 files dataframe
df_temporal_list_concat.to_csv(os.path.join(out_dir, "temporal_df.csv"), index=False) ## save the temporal dataframe