<a href="https://colab.research.google.com/github/b-fatma/S2I-DM/blob/master/src/merge/fire_create_non_fire_instances.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fire Data Preprocessing

In [None]:
import sys
from pathlib import Path

# sys.path.append(str(Path.cwd().parent.parent))

# from config import raw_files, processed_files

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Loading the Data

In [None]:
# Load the fire detections; 'acq_date_acq_time' is parsed into datetime values.
filepath = 'fire.csv'
df = pd.read_csv(filepath, parse_dates=['acq_date_acq_time'])
# reset_index returns a new frame, so the result must be bound back to df to have any effect.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,acq_date_acq_time,latitude,longitude,fire
0,2024-01-01 01:41:00,35.70751,5.53337,1
1,2024-01-01 01:42:00,32.13579,6.46961,1
2,2024-01-01 01:42:00,32.35563,6.9763,1
3,2024-01-01 01:43:00,28.19791,9.39581,1
4,2024-01-01 01:43:00,28.12826,9.49323,1


In [None]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12748 entries, 0 to 12747
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   acq_date_acq_time  12748 non-null  datetime64[ns]
 1   latitude           12748 non-null  float64       
 2   longitude          12748 non-null  float64       
 3   fire               12748 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 398.5 KB


In [None]:
# Keep only the detections inside a small longitude/latitude window;
# this subset is what the scatter plot in the next cell draws.
df_viz = df.loc[
    df['latitude'].between(36.7, 37) & df['longitude'].between(7.7, 8)
]


In [None]:
import seaborn as sns

# Plot the windowed detections. `data` is passed by keyword because seaborn only accepts it
# positionally from 0.12 on; the keyword form works on older releases too.
ax = sns.scatterplot(data=df_viz, x='longitude', y='latitude')
# A title lets the figure stand alone; the trailing ';' hides the text repr of the return value.
ax.set_title('Fire detections, longitude 7.7 to 8.0 / latitude 36.7 to 37.0');

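## No missing values

`df.info()` above reports 12748 non-null entries for every one of the 12748 rows, so the dataset contains no missing values.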

## Creating non-fire instances

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

# Turn the tabular detections into a GeoDataFrame: one point per row built from the
# longitude/latitude columns, tagged as WGS84 (EPSG:4326).
gdf_original = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(x=df["longitude"], y=df["latitude"]),
    crs="EPSG:4326",
)

In [None]:
# Reproject the fire points from EPSG:4326 to EPSG:3857 so coordinates are in projected units.
# NOTE(review): EPSG:3857 (Web Mercator) units are presumably not true ground metres at these
# latitudes; confirm this is acceptable for the grid cell size chosen later.
gdf_m = gdf_original.to_crs("EPSG:3857")

In [None]:
# Display the reprojected frame (pandas truncates the rendering for long frames).
gdf_m

Unnamed: 0,acq_date_acq_time,latitude,longitude,fire,geometry
0,2024-01-01 01:41:00,35.70751,5.53337,1,POINT (615971.931 4260449.476)
1,2024-01-01 01:42:00,32.13579,6.46961,1,POINT (720193.691 3781148.403)
2,2024-01-01 01:42:00,32.35563,6.97630,1,POINT (776598.164 3810083.655)
3,2024-01-01 01:43:00,28.19791,9.39581,1,POINT (1045936.785 3273948.704)
4,2024-01-01 01:43:00,28.12826,9.49323,1,POINT (1056781.53 3265154.086)
...,...,...,...,...,...
12743,2024-12-19 12:15:00,36.54327,8.95251,1,POINT (996588.855 4375634.291)
12744,2024-12-22 01:40:00,33.87632,9.98897,1,POINT (1111967.054 4012206.889)
12745,2024-12-23 01:21:00,34.93834,8.53005,1,POINT (949560.822 4155504.95)
12746,2024-12-27 00:06:00,31.64689,9.16305,1,POINT (1020026.06 3717048.182)


In [None]:
!pip install --upgrade shapely
!pip install rasterio
!pip install xarray

Collecting rasterio
  Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.3/22.3 MB[0m [31m120.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1.2 cligj-0.7.2 rasterio-1.4.3


In [None]:
import numpy as np
import shapely
import geopandas as gpd
from tqdm import tqdm
from shapely import centroid

# Build a regular grid of square cells covering the bounding box of the projected fire
# points, together with the centroid of every cell.
#
# Outputs:
#   grid         - GeoDataFrame (EPSG:3857) with one square polygon per cell
#   centroids_df - DataFrame with a single 'centroid' column, row i matching grid row i
minx, miny, maxx, maxy = gdf_m.total_bounds
cell_size = 1000  # cell edge length, in units of the EPSG:3857 projection

# Lower-left corner coordinates of the cells along each axis
xs = np.arange(minx, maxx, cell_size)
ys = np.arange(miny, maxy, cell_size)

total_cells = len(xs) * len(ys)

# Number of polygons built per batch, to bound the size of the temporary corner array
chunk_size = 1000

# Flatten the 2-D grid of lower-left corners so it can be sliced into chunks
X, Y = np.meshgrid(xs, ys)
x0 = X.ravel()
y0 = Y.ravel()

# Per-chunk results are collected in lists and concatenated once after the loop;
# growing a DataFrame with pd.concat on every iteration re-copies all previous rows.
geoms_list = []
centroids_list = []

# tqdm reads the number of iterations from the range itself, so no manual total is needed
for start in tqdm(range(0, total_cells, chunk_size)):
    end = start + chunk_size

    x_chunk = x0[start:end]
    y_chunk = y0[start:end]

    # Ring coordinates per cell, shape (n_cells_in_chunk, 5, 2): four corners plus the
    # first corner repeated to close the ring
    corners = np.stack([
        np.column_stack([x_chunk, y_chunk]),
        np.column_stack([x_chunk + cell_size, y_chunk]),
        np.column_stack([x_chunk + cell_size, y_chunk + cell_size]),
        np.column_stack([x_chunk, y_chunk + cell_size]),
        np.column_stack([x_chunk, y_chunk]),
    ], axis=1)

    # Vectorized polygon and centroid creation for the whole chunk
    geoms_chunk = shapely.polygons(corners)

    geoms_list.append(geoms_chunk)
    centroids_list.append(centroid(geoms_chunk))

# Concatenate all chunks into flat arrays, in the same cell order
geoms = np.concatenate(geoms_list)
centroids_df = pd.DataFrame({'centroid': np.concatenate(centroids_list)})

# Create GeoDataFrame
grid = gpd.GeoDataFrame(geometry=geoms, crs="EPSG:3857")

100%|██████████| 2854/2854 [01:32<00:00, 30.73it/s]


In [None]:
from google.colab import drive
# Mount Google Drive and write the grid GeoDataFrame to a CSV file there.
drive.mount('/content/drive')
output_path = '/content/drive/MyDrive/dm_fire_prediction/fire_grid.csv'
# The frame index is written as the first column because no index argument is passed.
grid.to_csv(output_path)

In [None]:
# Show the first rows and the shape of both the grid and the centroid frame as one tuple.
grid.head(), grid.shape, centroids_df.head(), centroids_df.shape

(                                            geometry
 0  POLYGON ((-903697.192 3151332.844, -902697.192...
 1  POLYGON ((-902697.192 3151332.844, -901697.192...
 2  POLYGON ((-901697.192 3151332.844, -900697.192...
 3  POLYGON ((-900697.192 3151332.844, -899697.192...
 4  POLYGON ((-899697.192 3151332.844, -898697.192...,
 (2853953, 1),
                                        centroid
 0  POINT (-903197.1922343344 3151832.844125225)
 1  POINT (-902197.1922343344 3151832.844125225)
 2  POINT (-901197.1922343344 3151832.844125225)
 3  POINT (-900197.1922343344 3151832.844125225)
 4  POINT (-899197.1922343344 3151832.844125225),
 (2853953, 1))

In [None]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs row i of the grid
# with row i of the centroids. (The previous `pd.cgrid = ...` chained assignment set a stray
# attribute on the pandas module and never reset `grid` itself.)
grid = grid.reset_index(drop=True)
centroids_df = centroids_df.reset_index(drop=True)

# With ignore_index=True on axis=1 the columns are relabelled 0 (cell polygon) and
# 1 (cell centroid); later cells read the centroids through label 1.
grid_centroids = pd.concat([grid, centroids_df], axis=1, ignore_index=True)

In [None]:
# Preview the combined frame: column 0 holds the cell polygons, column 1 the centroids.
grid_centroids.head()

Unnamed: 0,0,1
0,"POLYGON ((-903697.192 3151332.844, -902697.192...",POINT (-903197.1922343344 3151832.844125225)
1,"POLYGON ((-902697.192 3151332.844, -901697.192...",POINT (-902197.1922343344 3151832.844125225)
2,"POLYGON ((-901697.192 3151332.844, -900697.192...",POINT (-901197.1922343344 3151832.844125225)
3,"POLYGON ((-900697.192 3151332.844, -899697.192...",POINT (-900197.1922343344 3151832.844125225)
4,"POLYGON ((-899697.192 3151332.844, -898697.192...",POINT (-899197.1922343344 3151832.844125225)


In [None]:
# Pull the centroid column (label 1) out into its own single-column frame named 'centroid'.
# Despite the name, `fires` holds every grid-cell centroid, not only cells with fires.
fires = grid_centroids[1].to_frame(name="centroid")

In [None]:

# Wrap the centroids in a GeoDataFrame whose active geometry is the centroid points,
# declared in the projected CRS (EPSG:3857) they were computed in.
gdf = gpd.GeoDataFrame(fires, geometry=fires["centroid"], crs="EPSG:3857")

In [None]:
# Reproject the active geometry to EPSG:4326 (longitude/latitude). The separate 'centroid'
# column is not the active geometry and keeps its EPSG:3857 coordinates, as the display shows.
gdf = gdf.to_crs("EPSG:4326")
gdf

Unnamed: 0,centroid,geometry
0,POINT (-903197.1922343344 3151832.844125225),POINT (-8.11356 27.22677)
1,POINT (-902197.1922343344 3151832.844125225),POINT (-8.10458 27.22677)
2,POINT (-901197.1922343344 3151832.844125225),POINT (-8.09559 27.22677)
3,POINT (-900197.1922343344 3151832.844125225),POINT (-8.08661 27.22677)
4,POINT (-899197.1922343344 3151832.844125225),POINT (-8.07763 27.22677)
...,...,...
2853948,POINT (1232802.8077656655 4483832.844125225),POINT (11.07446 37.3202)
2853949,POINT (1233802.8077656655 4483832.844125225),POINT (11.08344 37.3202)
2853950,POINT (1234802.8077656655 4483832.844125225),POINT (11.09242 37.3202)
2853951,POINT (1235802.8077656655 4483832.844125225),POINT (11.10141 37.3202)


In [None]:
# Spatially join the grid cells with the projected fire points; the index of the result
# carries the label of every cell that intersects at least one fire point (a label may
# repeat when several points fall in the same cell).
fire_idx = gpd.sjoin(
    left_df=grid,
    right_df=gdf_m,
    how="inner",
    predicate="intersects",
).index

# Start with every centroid labelled non-fire, then flag the matched cells.
# This relies on gdf and grid sharing the same row labels.
gdf["fire"] = 0
gdf.loc[fire_idx, "fire"] = 1

In [None]:
# Display the centroid frame with its new 'fire' label column.
gdf

Unnamed: 0,centroid,geometry,fire
0,POINT (-903197.1922343344 3151832.844125225),POINT (-8.11356 27.22677),0
1,POINT (-902197.1922343344 3151832.844125225),POINT (-8.10458 27.22677),0
2,POINT (-901197.1922343344 3151832.844125225),POINT (-8.09559 27.22677),0
3,POINT (-900197.1922343344 3151832.844125225),POINT (-8.08661 27.22677),0
4,POINT (-899197.1922343344 3151832.844125225),POINT (-8.07763 27.22677),0
...,...,...,...
2853948,POINT (1232802.8077656655 4483832.844125225),POINT (11.07446 37.3202),0
2853949,POINT (1233802.8077656655 4483832.844125225),POINT (11.08344 37.3202),0
2853950,POINT (1234802.8077656655 4483832.844125225),POINT (11.09242 37.3202),0
2853951,POINT (1235802.8077656655 4483832.844125225),POINT (11.10141 37.3202),0


In [None]:
# Number of grid cells labelled as fire (the label is 0/1, so the sum is a count).
gdf['fire'].sum()

np.int64(3332)

In [None]:
# Expose the centroid coordinates as plain numeric columns: x is the longitude and
# y the latitude of the active (EPSG:4326) geometry.
gdf["longitude"], gdf["latitude"] = gdf.geometry.x, gdf.geometry.y

In [None]:
# Display the frame with the added longitude/latitude columns.
gdf

Unnamed: 0,centroid,geometry,fire,longitude,latitude
0,POINT (-903197.1922343344 3151832.844125225),POINT (-8.11356 27.22677),0,-8.113558,27.226774
1,POINT (-902197.1922343344 3151832.844125225),POINT (-8.10458 27.22677),0,-8.104575,27.226774
2,POINT (-901197.1922343344 3151832.844125225),POINT (-8.09559 27.22677),0,-8.095592,27.226774
3,POINT (-900197.1922343344 3151832.844125225),POINT (-8.08661 27.22677),0,-8.086609,27.226774
4,POINT (-899197.1922343344 3151832.844125225),POINT (-8.07763 27.22677),0,-8.077626,27.226774
...,...,...,...,...,...
2853948,POINT (1232802.8077656655 4483832.844125225),POINT (11.07446 37.3202),0,11.074456,37.320199
2853949,POINT (1233802.8077656655 4483832.844125225),POINT (11.08344 37.3202),0,11.083439,37.320199
2853950,POINT (1234802.8077656655 4483832.844125225),POINT (11.09242 37.3202),0,11.092422,37.320199
2853951,POINT (1235802.8077656655 4483832.844125225),POINT (11.10141 37.3202),0,11.101406,37.320199


In [None]:
# Keep only the tabular columns (fire, longitude, latitude). Rebinding instead of dropping
# in place, with errors='ignore', lets this cell be re-run without raising KeyError once
# the columns are already gone.
gdf = gdf.drop(columns=['centroid', 'geometry'], errors='ignore')

# info() prints its report and returns None, so it is called on its own line rather than
# inside a displayed tuple.
# NOTE(review): this inspects `grid`, not the frame just modified; confirm `gdf.info()` was not intended.
grid.info()
gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2853953 entries, 0 to 2853952
Data columns (total 1 columns):
 #   Column    Dtype   
---  ------    -----   
 0   geometry  geometry
dtypes: geometry(1)
memory usage: 21.8 MB


(   fire  longitude   latitude
 0     0  -8.113558  27.226774
 1     0  -8.104575  27.226774
 2     0  -8.095592  27.226774
 3     0  -8.086609  27.226774
 4     0  -8.077626  27.226774,
 None)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (the same call also appears in an earlier cell).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Save the labelled centroid table to Google Drive.
output_path = '/content/drive/MyDrive/dm_fire_prediction/fire_centroid_approach.csv'
# NOTE(review): the index is written as an extra first column here, whereas the final export
# in this notebook passes index=False; confirm which layout downstream readers expect.
gdf.to_csv(output_path)

In [None]:
# grid.to_csv('fire_centroid_approach3857.csv')

In [None]:
# Select the grid rows whose fire label is 0.
grid_no_fire = gdf.loc[gdf['fire'] == 0]

In [None]:
# Stack the original fire detections (coordinates and label only) on top of the non-fire
# grid rows. Columns are matched by name and the row index is renumbered from 0.
grid_keep_all_fire = pd.concat(
    objs=[gdf_original[['latitude', 'longitude', 'fire']], grid_no_fire],
    axis="index",
    ignore_index=True,
)

In [None]:
# Share of fire rows in the combined dataset, expressed as a percentage.
(grid_keep_all_fire['fire'].sum() / len(grid_keep_all_fire)) * 100

np.float64(0.44520982101852746)

In [None]:
# Save the combined fire / non-fire table to Google Drive, without the row index column.
output_path = '/content/drive/MyDrive/dm_fire_prediction/fire_keep_all_fires_approach.csv'
grid_keep_all_fire.to_csv(output_path, index=False)