#### Extracting Training Data Chunks

This code creates the arrays in Wildfires/data-training/ which represent the bounds and dates of the chunks that are used to fetch data from NASA's API. 

In [1]:
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
from scipy import sparse
import random
import os
from sys import getsizeof
from shapely.geometry import Polygon
from shapely import Point
from shapely.geometry import MultiPoint
from shapely import ops

coords = json.load(open('../coords.json'))
mpl.rcParams['figure.dpi'] = 120
# mpl.rcParams['savefig.pad_inches'] = 0

random.seed(10) # SO WE GET THE SAME DATASET EVERY TIME

# This code is processed 
# I have manually processed every file using this format

directory = r"D:\Users\xubil\OneDrive\Documents\Wildfires Data NPZ\Training"
data = {}

In [2]:
target_limit =  "limits_4326" #"limits_testing_9_chunks"
xmin, xmax, ymin, ymax = (coords[target_limit]["xmin"]), (coords[target_limit]["xmax"]), (coords[target_limit]["ymin"]), (coords[target_limit]["ymax"])

'''
where new_image is a figure.canvas.buffer_rgba() turned into a np matrix: 

nonzero_rows, nonzero_cols = np.nonzero(new_image) # Get all nonzero rows & collumns 

min_row, max_row = np.min(nonzero_rows), np.max(nonzero_rows)
min_col, max_col = np.min(nonzero_cols), np.max(nonzero_cols)
# After a series of test, (0, 575, 3, 764) was the exact fit of the canvas when the ration between width:height = 2:1

'''

min_row, max_row, min_col, max_col = 0, 575, 3, 764 
xyratio = 2/1

resx = 0.2
resy = resx/xyratio
n_chunkx = int(round((xmax-xmin)/resx, 1)) # MAKE SURE YOU CAN MATH: because we convert to int if you get 0.1232131 sketch 
n_chunky = int(round((ymax-ymin)/resy, 1)) # We're using round to not get like 2.9999999999999999997 make sure to get 0.3

# For Final Extraction: 
# x: 19.8/99 = 0.2 per chunk for 99 chunks
# y: 8/80 = 0.1 per chunk for 80 chunks

# For this file more specifically, we want to process an area around the point. Let us use the size of a chunk: 0.05 by 0.05
mat_h, mat_w = 0.045, 0.045

In [3]:
gdf = gpd.read_file(r"D:\Users\xubil\OneDrive\Documents\Wildfires Data\Feux_pt_ori_SHP\FEUX_PT_ORI_1972_2022.shp") # Path to the shapefile 

In [4]:
causes = ["Humaine", "Foudre"]
gdf = gdf.to_crs(4326)

In [14]:
gdf = gdf.loc[gdf["ANNEE"] >= 2000] # Only taking years after 2000

In [15]:
gdf

Unnamed: 0,OBJECTID,CLE,ANNEE,NOFEU,SECTION,CAUSE,DATE_DEBUT,DATE_RAPPO,DATE_ETEIN,SUP_HA,LATITUDE,LONGITUDE,geometry
0,1,20221080001,2022,1,Intensive,Humaine,2022-04-06,2022-04-06,2022-04-06,0.4,45.1289,-72.1135,POINT (-72.11350 45.12890)
1,2,20221080002,2022,2,Intensive,Humaine,2022-04-12,2022-04-12,2022-04-12,0.5,45.6624,-74.3352,POINT (-74.33520 45.66240)
2,3,20221080003,2022,3,Intensive,Humaine,2022-04-12,2022-04-12,2022-04-12,0.0,45.6734,-74.3641,POINT (-74.36410 45.67340)
3,4,20221080004,2022,4,Intensive,Humaine,2022-04-12,2022-04-12,2022-04-12,0.6,45.9701,-77.1305,POINT (-77.13050 45.97010)
4,5,20221080005,2022,5,Intensive,Humaine,2022-04-18,2022-04-18,2022-04-18,0.4,45.7227,-74.3818,POINT (-74.38180 45.72270)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43235,0,20211080621,2021,621,Intensive,Humaine,2021-10-28,2021-10-28,2021-10-28,0.0,45.4606,-72.1554,POINT (-72.15540 45.46060)
43236,0,20211080622,2021,622,Intensive,Humaine,2021-11-09,2021-11-09,2021-11-11,2.0,45.9512,-75.8547,POINT (-75.85470 45.95120)
43237,0,20211080623,2021,623,Intensive,Humaine,2021-11-11,2021-11-11,2021-11-11,0.0,45.9812,-74.2104,POINT (-74.21040 45.98120)
43238,0,20211080624,2021,624,Intensive,Humaine,2021-11-10,2021-11-16,2021-11-16,0.0,45.3931,-72.1209,POINT (-72.12090 45.39310)


In [16]:
gdf[gdf["CAUSE"] == "Foudre"].shape

(4349, 13)

In [17]:
gdf[gdf["CAUSE"] == "Humaine"].shape

(9498, 13)

### Process Training Data

In [18]:
count = 0

random.seed(10) # MAKE SURE WE GET THE SAME BOUNDS EVERY TIME :))

for cause in causes:
    print(cause)
    trainingInputCoords = []
    trainingInputDates = []

    for point in gdf[gdf["CAUSE"] == cause].iterrows():
        rand = random.random() # Set seed makes the dataset generated always the same <3
        # rand 0-1 number determines the where exactly the grid cropped (only if possible if we are at bounds it will auto-adjust to fit bounds)

        # # # # # # # # #^
        #               #✓ The length of height and width offset is the same!!! 
        #    P          #
        #               #     
        #               #
        #               #
        # # # # # # # # #
        #<->

        # print(point)
        # print(count)

        rxcoord, rycoord = point[1]["geometry"].bounds[0], point[1]["geometry"].bounds[1]

        # print(rxcoord, rycoord)

        if not (rxcoord >= xmin and rxcoord <= xmax and rycoord >= ymin and rycoord <= ymax):
            continue

        # print(xcoord, ycoord)

        min_x, min_y = round(rxcoord-int(rand*mat_w), 3), round(rycoord-int(rand*mat_h), 3)
        max_x, max_y = min_x+mat_w, min_y+mat_h

        # print(min_x, max_x, min_y, max_y)

        # Shift the square if it is out of boundsss 
        if (min_x < xmin):
            min_x = xmin
            max_x = xmin+mat_w
        if (min_y < ymin):
            min_y = ymin
            max_y = ymin+mat_w
        if (max_x > xmax):
            max_x = xmax
            min_x = xmax-mat_h
        if (max_y >= ymax):
            max_y = ymax-1
            min_y = ymax-mat_h
        
        trainingInputCoords.append((min_x, max_x, min_y, max_y))
        trainingInputDates.append(point[1]["DATE_DEBUT"])

        print(min_x, max_x, min_y, max_y)

        count+=1

        # if (count > 100):
        #     break
        # count+=1
    # break
    np.save('../../data-training/'+cause, trainingInputCoords)
    np.save('../../data-training/'+cause+"-dates", trainingInputDates)
    del trainingInputDates
    del trainingInputCoords

    # break

Humaine
-72.113 -72.068 45.129 45.174
-74.335 -74.28999999999999 45.662 45.707
-74.364 -74.319 45.673 45.718
-77.13 -77.085 45.97 46.015
-74.382 -74.337 45.723 45.768
-74.41 -74.365 45.668 45.713
-72.791 -72.746 45.319 45.364000000000004
-73.1 -73.05499999999999 46.003 46.048
-77.89 -77.845 48.23 48.275
-74.467 -74.422 45.928 45.973
-74.485 -74.44 45.628 45.673
-72.69 -72.645 45.708 45.753
-72.288 -72.243 45.173 45.218
-73.561 -73.516 46.328 46.373000000000005
-72.273 -72.228 46.129 46.174
-72.034 -71.989 46.913 46.958
-72.713 -72.66799999999999 46.466 46.511
-73.422 -73.377 45.887 45.932
-72.288 -72.243 46.523 46.568000000000005
-76.433 -76.388 45.875 45.92
-76.113 -76.068 46.0 46.045
-72.744 -72.699 46.319 46.364000000000004
-79.095 -79.05 46.717 46.762
-72.937 -72.892 47.038 47.083
-79.52 -79.475 47.369 47.414
-74.12 -74.075 45.799 45.844
-74.876 -74.831 45.893 45.938
-72.721 -72.676 46.323 46.368
-76.001 -75.956 46.279 46.324000000000005
-76.343 -76.298 45.677 45.722
-76.187 -76.14

### Similarly, we can create a code to generate training data where no fire events occured

In [19]:
def pointInRect(point,rect):
    x1, y1, x2, y2 = rect
    # x2, y2 = x1+w, y1+h
    x, y = point
    if (x1 < x and x < x2):
        if (y1 < y and y < y2):
            return True
    return False

In [20]:
points = []

for point in gdf.iterrows():
    checkx, checky = point[1]["geometry"].bounds[0], point[1]["geometry"].bounds[1]
    points.append([checkx, checky])

In [21]:
ob = MultiPoint(points)

In [22]:
from random import randrange
from datetime import timedelta, datetime

In [24]:
def random_date(start, end):
    """
    This function will return a random datetime between two datetime 
    objects.
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)

In [25]:
d1 = datetime.strptime('1/1/2000', '%m/%d/%Y')
d2 = datetime.strptime('12/31/2022', '%m/%d/%Y')


In [26]:
count = 0

random.seed(11) # MAKE SURE WE GET THE SAME BOUNDS EVERY TIME :))
trainingInputCoords = []
trainingInputDates = []

for i in range(15000): # About the same number of entries as Humaine and Foudre combined

    movingOn = False
    
    while(not movingOn):
        rand = random.random() 

        rxcoord, rycoord = (random.random()*(xmax-xmin)+xmin), (random.random()*(ymax-ymin)+ymin) # Generate two points between the bounds

        min_x, min_y = round(rxcoord-int(rand*mat_w), 3), round(rycoord-int(rand*mat_h), 3)
        max_x, max_y = min_x+mat_w, min_y+mat_h

        # print(min_x, max_x, min_y, max_y)

        # Shift the square if it is out of boundsss 
        if (min_x < xmin):
            min_x = xmin
            max_x = xmin+mat_w
        if (min_y < ymin):
            min_y = ymin
            max_y = ymin+mat_w
        if (max_x > xmax):
            max_x = xmax
            min_x = xmax-mat_h
        if (max_y >= ymax):
            max_y = ymax-1
            min_y = ymax-mat_h

        poly = Polygon([[min_x, min_y], [min_x, max_y], [max_x, max_y], [max_x, min_y]])

        # print(gdf.bounds)


        if (poly.intersects(ob)):
            print("Contains Point!")
        else:
            # print("Doesn't Contain!")
            movingOn = True
        
        # for point in gdf.iterrows():
        #     checkx, checky = point[1]["geometry"].bounds[0], point[1]["geometry"].bounds[1]
        #     if not pointInRect((checkx, checky), (min_x, max_x, min_y, max_y)):
        #         movingOn = True
            
    trainingInputCoords.append((min_x, max_x, min_y, max_y))
    trainingInputDates.append(random_date(d1, d2).strftime('%Y-%m-%d'))


    # print(min_x, max_x, min_y, max_y)

    # if count == 0:
    #     break

    count+=1
    print(count)

np.save('../../data-training/Sans-Feu', trainingInputCoords)
np.save('../../data-training/Sans-Feu-dates', trainingInputDates)
del trainingInputCoords

Contains Point!
1
2
Contains Point!
3
4
5
6
7
8
Contains Point!
9
10
11
12
13
Contains Point!
14
Contains Point!
15
Contains Point!
16
17
18
19
20
21
22
Contains Point!
23
Contains Point!
24
25
26
27
28
29
Contains Point!
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
Contains Point!
47
48
49
50
51
52
Contains Point!
53
54
55
Contains Point!
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
Contains Point!
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
Contains Point!
Contains Point!
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
Contains Point!
127
128
Contains Point!
129
130
131
132
Contains Point!
133
134
135
136
137
138
139
140
141
142
Contains Point!
143
144
145
146
147
Contains Point!
148
149
150
151
152
153
154
155
156
157
Contains Point!
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
Contains Point!


### And one more time for Data with Temporal Data

In [28]:
count = 0

random.seed(11) # MAKE SURE WE GET THE SAME BOUNDS EVERY TIME :))
trainingInputCoords = []
trainingInputDates = []

for i in range(15000): # About the same number of entries as Humaine and Foudre combined

    movingOn = False
    
    rand = random.random() 

    rxcoord, rycoord = (random.random()*(xmax-xmin)+xmin), (random.random()*(ymax-ymin)+ymin) # Generate two points between the bounds

    min_x, min_y = round(rxcoord-int(rand*mat_w), 3), round(rycoord-int(rand*mat_h), 3)
    max_x, max_y = min_x+mat_w, min_y+mat_h

        # print(min_x, max_x, min_y, max_y)

        # Shift the square if it is out of boundsss 
    if (min_x < xmin):
        min_x = xmin
        max_x = xmin+mat_w
    if (min_y < ymin):
        min_y = ymin
        max_y = ymin+mat_w
    if (max_x > xmax):
        max_x = xmax
        min_x = xmax-mat_h
    if (max_y >= ymax):
        max_y = ymax-1
        min_y = ymax-mat_h
            
    trainingInputCoords.append((min_x, max_x, min_y, max_y))
    trainingInputDates.append(random_date(d1, d2).strftime('%Y-%m-%d'))

    print(min_x, max_x, min_y, max_y)

    count+=1

np.save('../../data-training/Sans-Feu-Temp', trainingInputCoords)
np.save('../../data-training/Sans-Feu-Temp-dates', trainingInputDates)
del trainingInputCoords

-69.317 -69.27199999999999 51.994 52.039
-63.47 -63.425 46.119 46.164
-68.244 -68.199 46.089 46.134
-78.605 -78.56 51.077 51.122
-60.953 -60.908 52.318 52.363
-67.529 -67.484 49.585 49.63
-79.694 -79.649 51.637 51.682
-71.214 -71.169 48.124 48.169000000000004
-67.722 -67.67699999999999 48.598 48.643
-67.435 -67.39 47.854 47.899
-63.764 -63.719 50.262 50.307
-70.245 -70.2 44.838 44.883
-72.472 -72.42699999999999 51.373 51.418
-80.066 -80.021 50.079 50.124
-62.377 -62.332 48.36 48.405
-69.188 -69.143 46.188 46.233000000000004
-78.675 -78.63 47.261 47.306000000000004
-78.064 -78.01899999999999 46.571 46.616
-71.194 -71.149 48.495 48.54
-70.324 -70.279 52.482 52.527
-72.802 -72.757 47.759 47.804
-63.286 -63.241 52.399 52.444
-62.88 -62.835 46.286 46.331
-68.48 -68.435 49.216 49.261
-76.178 -76.133 46.666 46.711
-74.533 -74.488 45.187 45.232
-67.796 -67.751 44.724 44.769
-71.426 -71.381 52.273 52.318000000000005
-77.712 -77.667 47.689 47.734
-62.413 -62.367999999999995 51.142 51.18700000000

### Merge all the coords together (need to fetch for all)

In [29]:
totalInputCoords = []
totalInputDates = []
directory_save = "../../data-training/"

for filename in os.listdir(directory_save):

    if (filename.__contains__("dates")):
        continue
    
    print(filename)
    print(filename[:-4]+"-dates.npy")

    nextCoords = np.load(os.path.join(directory_save, filename), allow_pickle=True)
    nextDates = np.load(os.path.join(directory_save, filename[:-4]+"-dates.npy"), allow_pickle=True)
    totalInputCoords.extend(nextCoords)
    totalInputDates.extend(nextDates)

Foudre.npy
Foudre-dates.npy
Humaine.npy
Humaine-dates.npy
Sans-Feu-Temp.npy
Sans-Feu-Temp-dates.npy
Sans-Feu.npy
Sans-Feu-dates.npy


In [31]:
totalInputDates

['2022-05-14',
 '2022-05-14',
 '2022-05-14',
 '2022-05-15',
 '2022-05-15',
 '2022-05-21',
 '2022-06-11',
 '2022-06-15',
 '2022-06-30',
 '2022-07-16',
 '2022-07-16',
 '2022-07-16',
 '2022-07-16',
 '2022-07-16',
 '2022-07-16',
 '2022-07-12',
 '2022-07-19',
 '2022-07-12',
 '2022-07-21',
 '2022-07-22',
 '2022-07-21',
 '2022-07-23',
 '2022-08-15',
 '2022-08-15',
 '2022-08-15',
 '2022-08-15',
 '2022-08-15',
 '2022-09-02',
 '2022-08-22',
 '2022-10-07',
 '2022-11-07',
 '2000-05-07',
 '2000-06-03',
 '2000-06-08',
 '2000-06-17',
 '2000-06-30',
 '2000-06-30',
 '2000-07-09',
 '2000-07-14',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-15',
 '2000-07-16',
 '2000-07-15',
 '2000-07-15',
 '2000-07-17',
 '2000-07-18',
 '2000-07-18',
 '2000-07-19',
 '2000-07-18',
 '2000-07-25',
 '2000-07-25',
 '2000-07-

In [30]:
np.save('../../data-training/ALL', totalInputCoords)
np.save('../../data-training/ALL-dates', totalInputDates)