#### Extracting Training Data Chunks

This notebook creates the arrays in `Wildfires/data-training/` that hold the bounds and dates of the chunks used to fetch data from NASA's API. It is a copy of the original `extract_training_chunks.ipynb`, modified to generate the lagged (previous-day) data introduced during the methodology revision.

In [11]:
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
from scipy import sparse
import random
import os
from sys import getsizeof
from shapely.geometry import Polygon
from shapely import Point
from shapely.geometry import MultiPoint
from shapely import ops
from datetime import datetime, timedelta

# Named coordinate limits shared by the project; each entry is read below
# through its "xmin"/"xmax"/"ymin"/"ymax" keys.
# Use a context manager so the file handle is closed deterministically.
with open('../coords.json') as coords_file:
    coords = json.load(coords_file)

mpl.rcParams['figure.dpi'] = 120

random.seed(10) # Fixed seed so the same dataset is generated on every run

# Original author's note: every file was manually processed using this format.
# NOTE(review): `directory` and `data` are not used by the cells visible in this
# notebook excerpt; the path is absolute and machine-specific.
directory = r"D:\Users\xubil\OneDrive\Documents\Wildfires Data NPZ\Training"
data = {}

In [2]:
# Key of the coords.json entry that holds the study-area limits
# (alternative kept from the original: "limits_testing_9_chunks").
target_limit = "limits_4326"
xmin, xmax, ymin, ymax = (
    coords[target_limit][edge] for edge in ("xmin", "xmax", "ymin", "ymax")
)

# The string below is a no-op note recording how the canvas crop indices
# further down were measured; it is not executed as code.
'''
where new_image is a figure.canvas.buffer_rgba() turned into a np matrix: 

nonzero_rows, nonzero_cols = np.nonzero(new_image) # Get all nonzero rows & collumns 

min_row, max_row = np.min(nonzero_rows), np.max(nonzero_rows)
min_col, max_col = np.min(nonzero_cols), np.max(nonzero_cols)
# After a series of test, (0, 575, 3, 764) was the exact fit of the canvas when the ration between width:height = 2:1

'''

# Measured canvas crop (rows, then columns) for a 2:1 width:height figure.
min_row, max_row, min_col, max_col = 0, 575, 3, 764
xyratio = 2 / 1

# Chunk resolution along x, and along y derived from the aspect ratio.
resx = 0.2
resy = resx / xyratio

# Number of chunks per axis. The quotient is rounded to one decimal before
# truncation so float noise such as 2.9999999 becomes 3; the limits must be an
# exact multiple of the resolution, otherwise the fractional part is dropped.
# Original note for the final extraction: x: 19.8/99 = 0.2 per chunk (99 chunks),
# y: 8/80 = 0.1 per chunk (80 chunks).
n_chunkx = int(round((xmax - xmin) / resx, 1))
n_chunky = int(round((ymax - ymin) / resy, 1))

# Height and width of the area extracted around each fire point.
# (The original note mentions 0.05 by 0.05; the values actually used are 0.045.)
mat_h = 0.045
mat_w = 0.045

In [3]:
# Load the fire-origin point shapefile into a GeoDataFrame.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# elsewhere without editing it.
gdf = gpd.read_file(r"D:\Users\xubil\OneDrive\Documents\Wildfires Data\Feux_pt_ori_SHP\FEUX_PT_ORI_1972_2022.shp") # Path to the shapefile

In [4]:
# Values of the CAUSE column for which training chunks are generated.
causes = ["Humaine", "Foudre"]
# Reproject to EPSG:4326 — presumably so point coordinates are comparable with
# the "limits_4326" bounds read from coords.json (confirm).
gdf = gdf.to_crs(4326)

In [5]:
# Keep only fires from the year 2000 onward (ANNEE >= 2000, i.e. 2000 included).
gdf = gdf.loc[gdf["ANNEE"] >= 2000]

In [6]:
# Display the reprojected, year-filtered GeoDataFrame as a sanity check.
gdf

Unnamed: 0,OBJECTID,CLE,ANNEE,NOFEU,SECTION,CAUSE,DATE_DEBUT,DATE_RAPPO,DATE_ETEIN,SUP_HA,LATITUDE,LONGITUDE,geometry
0,1,20221080001,2022,1,Intensive,Humaine,2022-04-06,2022-04-06,2022-04-06,0.4,45.1289,-72.1135,POINT (-72.11350 45.12890)
1,2,20221080002,2022,2,Intensive,Humaine,2022-04-12,2022-04-12,2022-04-12,0.5,45.6624,-74.3352,POINT (-74.33520 45.66240)
2,3,20221080003,2022,3,Intensive,Humaine,2022-04-12,2022-04-12,2022-04-12,0.0,45.6734,-74.3641,POINT (-74.36410 45.67340)
3,4,20221080004,2022,4,Intensive,Humaine,2022-04-12,2022-04-12,2022-04-12,0.6,45.9701,-77.1305,POINT (-77.13050 45.97010)
4,5,20221080005,2022,5,Intensive,Humaine,2022-04-18,2022-04-18,2022-04-18,0.4,45.7227,-74.3818,POINT (-74.38180 45.72270)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43235,0,20211080621,2021,621,Intensive,Humaine,2021-10-28,2021-10-28,2021-10-28,0.0,45.4606,-72.1554,POINT (-72.15540 45.46060)
43236,0,20211080622,2021,622,Intensive,Humaine,2021-11-09,2021-11-09,2021-11-11,2.0,45.9512,-75.8547,POINT (-75.85470 45.95120)
43237,0,20211080623,2021,623,Intensive,Humaine,2021-11-11,2021-11-11,2021-11-11,0.0,45.9812,-74.2104,POINT (-74.21040 45.98120)
43238,0,20211080624,2021,624,Intensive,Humaine,2021-11-10,2021-11-16,2021-11-16,0.0,45.3931,-72.1209,POINT (-72.12090 45.39310)


In [7]:
# (rows, columns) of the fires whose CAUSE is "Foudre" (lightning), after the year filter.
gdf[gdf["CAUSE"] == "Foudre"].shape

(4349, 13)

In [8]:
# (rows, columns) of the fires whose CAUSE is "Humaine" (human), after the year filter.
gdf[gdf["CAUSE"] == "Humaine"].shape

(9498, 13)

### Process Training Data (YESTERDAY)

In [22]:
# Maximum number of chunks kept per cause. The original loop stopped on
# `count > 1000`, i.e. after 1001 chunks; that size is preserved here.
MAX_CHUNKS_PER_CAUSE = 1001


def _chunk_bounds(x, y, rand, width, height, limits):
    """Return ``(min_x, max_x, min_y, max_y)`` of a box placed around the point (x, y).

    The lower-left corner is shifted left/down from the point by the same
    fraction ``rand`` of the box width and height, so the point lands at a
    random position inside the box instead of always sitting on its corner:

        # # # # # # # # #
        #               #
        #               #
        #               #
        #    P          #  ^ rand * height
        # # # # # # # # #  v
        <-->
        rand * width

    If the box sticks out of ``limits`` it is shifted back inside while
    keeping its size.

    Parameters:
        x, y: coordinates of the point.
        rand: offset fraction, expected in [0, 1).
        width, height: size of the box along x and y.
        limits: ``(xmin, xmax, ymin, ymax)`` of the allowed area.
    """
    lim_xmin, lim_xmax, lim_ymin, lim_ymax = limits

    # No int() here: rand * width is far below 1, so truncating it to an
    # integer would always give 0 and silently disable the random offset.
    min_x = round(x - rand * width, 3)
    min_y = round(y - rand * height, 3)
    max_x = min_x + width
    max_y = min_y + height

    # Shift the box back inside the limits, preserving its width and height.
    if min_x < lim_xmin:
        min_x, max_x = lim_xmin, lim_xmin + width
    if min_y < lim_ymin:
        min_y, max_y = lim_ymin, lim_ymin + height
    if max_x > lim_xmax:
        min_x, max_x = lim_xmax - width, lim_xmax
    if max_y > lim_ymax:
        min_y, max_y = lim_ymax - height, lim_ymax

    return min_x, max_x, min_y, max_y


random.seed(10) # Re-seed so this cell yields the same bounds every time it is run

for cause in causes:
    print(cause)
    trainingInputCoords = []
    trainingInputDates = []

    for _, fire in gdf[gdf["CAUSE"] == cause].iterrows():
        # Drawn for every row, including skipped ones, so the random sequence
        # does not depend on which points fall inside the limits.
        rand = random.random()

        # For a point geometry the first two bounds are its x and y.
        rxcoord, rycoord = fire["geometry"].bounds[:2]

        # Ignore fires located outside the study area.
        if not (xmin <= rxcoord <= xmax and ymin <= rycoord <= ymax):
            continue

        trainingInputCoords.append(
            _chunk_bounds(rxcoord, rycoord, rand, mat_w, mat_h, (xmin, xmax, ymin, ymax))
        )

        # Lagged date: the day before the fire started.
        fire_start = datetime.strptime(fire["DATE_DEBUT"], "%Y-%m-%d")
        trainingInputDates.append((fire_start - timedelta(days=1)).strftime("%Y-%m-%d"))

        if len(trainingInputCoords) >= MAX_CHUNKS_PER_CAUSE:
            break

    count = len(trainingInputCoords)
    print(count)
    np.save('../../data-training/'+cause+"-yesterday", trainingInputCoords)
    np.save('../../data-training/'+cause+"-yesterday-dates", trainingInputDates)
    del trainingInputDates
    del trainingInputCoords

Humaine
1001
Foudre
1001


### Merge all the coords together (need to fetch for all)

In [29]:
# Concatenate every per-cause "yesterday" coordinate file, together with its
# matching "-dates" file, into two flat lists.
totalInputCoords = []
totalInputDates = []
directory_save = "../../data-training/"

# os.listdir returns names in arbitrary order; sorting keeps the order of the
# merged arrays identical from one run (and one machine) to the next.
for filename in sorted(os.listdir(directory_save)):

    # Keep only per-cause coordinate files: skip unrelated files, previously
    # merged "ALL" outputs, and the "-dates" companions (loaded explicitly below).
    if "yesterday" not in filename or "ALL" in filename or "dates" in filename:
        continue

    # Companion file name: strip the ".npy" extension and append "-dates.npy".
    dates_filename = filename[:-4]+"-dates.npy"

    print(filename)
    print(dates_filename)

    # NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted file; acceptable here only because these files are written by
    # this notebook.
    nextCoords = np.load(os.path.join(directory_save, filename), allow_pickle=True)
    nextDates = np.load(os.path.join(directory_save, dates_filename), allow_pickle=True)
    print(nextCoords.shape)

    # Coordinates and dates are paired by position, so a length mismatch would
    # silently misalign every following sample.
    if len(nextCoords) != len(nextDates):
        raise ValueError(
            f"{filename} has {len(nextCoords)} coords but {dates_filename} has {len(nextDates)} dates"
        )

    totalInputCoords.extend(nextCoords)
    totalInputDates.extend(nextDates)

Foudre-yesterday.npy
Foudre-yesterday-dates.npy
(1001, 4)
Humaine-yesterday.npy
Humaine-yesterday-dates.npy
(1001, 4)


In [30]:
# Total number of merged samples across all causes.
len(totalInputDates)

2002

In [31]:
# Persist the merged bounds and their lagged dates; the two arrays are paired by position.
np.save('../../data-training/ALL-yesterday', totalInputCoords)
np.save('../../data-training/ALL-yesterday-dates', totalInputDates)