In [None]:
import pandas as pd
import numpy as np
import itertools

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression statement in a cell, not only
# the last one. Cells below rely on this to display several results at once
# (e.g. `fires.shape` followed by `fires.head()`).
InteractiveShell.ast_node_interactivity = "all"

*   Defining the Target Variables
*   Define the time-splits for your train, validation and test data.
*   Define the latitude & longitude ranges of your target country, zone, etc. The example is for Greece.
*   Define the floating point precision for your longitude-latitude values.
*   Define after how many fire calls you will consider to be a wild-fire.
*   Define the confidence level (in percent) above which a report is treated as a wild-fire occurrence.

In [None]:
# Year boundaries intended for the train / validation / test split.
# NOTE(review): none of these three constants is referenced by the code visible
# in this notebook; the split cell further down defines its own
# *_START_YEAR / *_END_YEAR values, and they do not agree with these
# (e.g. validation starts at 2019 there). Confirm which set is authoritative.
TRAIN_UNTIL = 2018
VAL_BETWEEN = (2018, 2021)
TEST_ON = 2023

# Bounding box in decimal degrees as (lower, upper); the filter cell treats
# both bounds as exclusive.
LAT_RANGE = (34, 42)
LON_RANGE = (19, 29)

# Number of decimal places coordinates are rounded to (defines the grid cell).
PRECISION = 1
# A grid cell/month counts as a wildfire when it has at least this many records.
MIN_FIRE_RECORDS = 2

# Only detections with confidence strictly greater than this value are kept.
CONFIDENCE_THRESHOLD = 80

# **LGBM Regressor Model**

We zoomed in on Greece and used aggressive aggregation to build a simple baseline prediction model.

Temporal resolution: Monthly

Spatial resolution: 'PRECISION' Decimal degree ~ ( 10^(2-'PRECISION') km grid)

Binary Target: At least 'MIN_FIRE_RECORDS' fire readings

In [None]:
# Load the pre-processed fire detections; `acq_date` is parsed as datetime so
# that the `.dt` accessors used later work.
# NOTE(review): this is an absolute, machine-specific Windows path (the
# temperature CSV below uses a Colab-style path instead). The notebook will not
# run elsewhere until the path is made relative/configurable.
fires = pd.read_csv(r"C:\Users\User\Videos\UTS\ꞮꞮꞮ.Semester\iLab 2\Countries\Greece\Wildfire Prediction\Greece_wildfire_prediction\data\Wildfire data\wildfire_processed_data.csv", parse_dates=['acq_date'])
fires.head()

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence
0,34.81,24.12,2018-05-08,Terra,MODIS,62
1,34.94,26.13,2023-08-26,Aqua,MODIS,60
2,34.94,26.14,2014-08-13,Terra,MODIS,66
3,34.94,26.14,2016-03-08,Terra,MODIS,66
4,34.94,26.14,2023-08-25,Aqua,MODIS,62



*   We are pruning the data based on our desired confidence level.
*   We are only taking the records that fall within our target latitude & longitude ranges.



In [None]:
# Keep only detections whose confidence is strictly above the threshold.
is_confident = fires.confidence > CONFIDENCE_THRESHOLD
fires = fires[is_confident]

# Restrict to the target bounding box; both bounds are exclusive.
lat_lo, lat_hi = LAT_RANGE[0], LAT_RANGE[1]
lon_lo, lon_hi = LON_RANGE[0], LON_RANGE[1]
in_lat_band = (fires.latitude > lat_lo) & (fires.latitude < lat_hi)
in_lon_band = (fires.longitude > lon_lo) & (fires.longitude < lon_hi)
fires = fires[in_lat_band & in_lon_band]

fires.shape

(5909, 6)

We are extracting year and month. Then rounding the coordinate pairs with our defined floating point precision.

In [None]:
# Derive the calendar year/month of each detection and snap coordinates onto
# the PRECISION-decimal grid.
# `fires` is a boolean-filtered slice at this point, so assigning columns onto
# it directly raises SettingWithCopyWarning; `.assign` builds a new frame
# instead. Column order is unchanged: latitude/longitude are replaced in place,
# year/month are appended at the end.
fires = fires.assign(
    year=fires.acq_date.dt.year,
    month=fires.acq_date.dt.month,
    latitude=fires.latitude.round(PRECISION),
    longitude=fires.longitude.round(PRECISION),
)

Grouping all the reports based on year, month and coordinates.

In [None]:
# One row per (grid cell, year, month) with the number of detections in it.
cell_month_keys = ['latitude', 'longitude', 'year', 'month']
fires = fires.groupby(cell_month_keys).size().reset_index(name='fire_cnt')

fires.shape
fires.head()
fires.nunique()

(1595, 5)

Unnamed: 0,latitude,longitude,year,month,fire_cnt
0,35.0,24.8,2013,7,1
1,35.0,24.9,2013,7,9
2,35.0,24.9,2014,5,2
3,35.0,24.9,2020,7,2
4,35.0,25.0,2013,4,2


latitude     66
longitude    81
year         11
month        12
fire_cnt     53
dtype: int64

Creating all the possible coordinate pairs with our desired precision. If we only take the coordinates found in the fire dataset, we would train a model that tends to produce false positives. So we should give all possible coordinates and aim for the model to predict the minority class.

In [None]:
# Build the full (grid cell x year x month) frame and attach observed counts.
#
# Grid axes: every coordinate with PRECISION decimals inside the bounding box
# (upper bound excluded). The previous step `1/(10^PRECISION)` used `^`, which
# is bitwise XOR in Python (10^1 == 11, 10^2 == 8), so the grid was only
# correct by accident for PRECISION == 1 and wrong for any other precision.
# The axis is generated on an integer scale and divided afterwards, which also
# avoids the end-point ambiguity of `np.arange` with a fractional step.
scale = 10 ** PRECISION
lats = (np.arange(LAT_RANGE[0] * scale, LAT_RANGE[1] * scale) / scale).round(PRECISION)
lons = (np.arange(LON_RANGE[0] * scale, LON_RANGE[1] * scale) / scale).round(PRECISION)
# Only years/months that occur in the fire data are enumerated.
years = fires.year.unique()
months = fires.month.unique()

unq_combs = list(itertools.product(lats, lons))
coords = pd.DataFrame(unq_combs, columns=["latitude", "longitude"])

unq_combs = list(itertools.product(years, months))
times = pd.DataFrame(unq_combs, columns=["year", "month"])

# Constant join key: merging on it yields the cross product coords x times.
# The `one` column is kept on purpose; later cells read and then drop it.
coords['one'] = 1
times['one'] = 1

base = pd.merge(coords, times, how='outer', on='one').drop_duplicates()
# Left join keeps every grid cell/month; cells without fires get NaN fire_cnt.
history = base.merge(fires, how='left', on= ['latitude', 'longitude', 'year', 'month']).drop_duplicates()

Let's see how many reports can be obtained for a coordinate throughout the area.

In [None]:
# Grid cells/months without any matching fire record get a count of zero.
history = history.fillna(value=0)
history['fire_cnt'].value_counts().head()

0.0    1054405
1.0        856
2.0        296
3.0        110
4.0         82
Name: fire_cnt, dtype: int64

We mark a grid cell as a wildfire for a given month when it has at least `MIN_FIRE_RECORDS` fire reports. This is our ground-truth value.

In [None]:
# Binary target: 1 when the cell/month has at least MIN_FIRE_RECORDS reports.
history['fire'] = history['fire_cnt'].ge(MIN_FIRE_RECORDS) * 1



*   For each year, we add the count of wild-fires and the count of fire reports for the last year as new features.

*   For each month, we add the count of wild-fires and the count of fire reports for the same month in last year as new features.

In [None]:
# Totals of report counts and wildfire labels per grid cell, at yearly and at
# monthly granularity.
summed_cols = ['fire_cnt', 'fire']
cell_keys = ['latitude', 'longitude']

yearly = history.groupby(cell_keys + ['year'], as_index=False)[summed_cols].sum()
monthly = history.groupby(cell_keys + ['year', 'month'], as_index=False)[summed_cols].sum()

In [None]:
# Shift each yearly total forward by one year so that, when joined on `year`,
# a row receives the totals of the preceding year.
last_year = (
    yearly
    .assign(year=yearly.year + 1)
    .rename(columns={'fire_cnt': 'fire_cnt_last_year', 'fire': 'fire_last_year'})
)
last_year.head()

Unnamed: 0,latitude,longitude,year,fire_cnt_last_year,fire_last_year
0,34.0,19.0,2014,0.0,0
1,34.0,19.0,2015,0.0,0
2,34.0,19.0,2016,0.0,0
3,34.0,19.0,2017,0.0,0
4,34.0,19.0,2018,0.0,0


In [None]:
# Shift each monthly total forward by one year so that, when joined on
# (year, month), a row receives the totals of the same month one year earlier.
last_year_month = (
    monthly
    .assign(year=monthly.year + 1)
    .rename(columns={
        'fire_cnt': 'fire_cnt_last_year_same_month',
        'fire': 'fire_last_year_same_month',
    })
)
last_year_month.head()

Unnamed: 0,latitude,longitude,year,month,fire_cnt_last_year_same_month,fire_last_year_same_month
0,34.0,19.0,2014,1,0.0,0
1,34.0,19.0,2014,2,0.0,0
2,34.0,19.0,2014,3,0.0,0
3,34.0,19.0,2014,4,0.0,0
4,34.0,19.0,2014,5,0.0,0


By shifting the years by one and comparing the two year columns, we calculate the number of fire reports and wild-fires in the previous years for each year.

In [None]:
# Cumulative history per grid cell: for every target year, the total number of
# fire reports and wildfire labels in all strictly earlier years.
past = yearly.copy()
past['one'] = 1
# Pair every (cell, target year) from `history` with every (cell, source year)
# from `yearly`. After the merge, `year_x` is the target year (left side) and
# `year_y` is the source year that owns the `fire_cnt` / `fire` values.
past = history[['latitude', 'longitude', 'year', 'one']].drop_duplicates().merge(
    past, on=['latitude', 'longitude', 'one'])
# Keep only source years before the target year and sum them per target year.
# The previous version filtered `year_x < year_y` and grouped by `year_y`,
# which summed the *current* year's own totals once per earlier year, i.e.
# it leaked the target into the feature instead of describing the past.
past = past[past.year_y < past.year_x]
past = past.groupby(['latitude', 'longitude', 'year_x'])[['fire_cnt', 'fire']].sum().reset_index()
past.columns = ['latitude', 'longitude', 'year', 'fire_cnt_before', 'fire_before']
# The first year has no earlier data, so it is absent here and becomes NaN
# after the left join onto the main frame.
past.head(3)


Unnamed: 0,latitude,longitude,year,fire_cnt_before,fire_before
0,34.0,19.0,2014,0.0,0
1,34.0,19.0,2015,0.0,0
2,34.0,19.0,2016,0.0,0


We combine the historical report and wildfire statistics we extracted with the main dataframe.

In [None]:
# Attach the three groups of historical features to the base frame with left
# joins (rows without history keep NaN), then drop the helper join key.
cell_year = ['latitude', 'longitude', 'year']
X = (
    history
    .merge(past, how='left', on=cell_year)
    .merge(last_year, how='left', on=cell_year)
    .merge(last_year_month, how='left', on=cell_year + ['month'])
    .drop(columns='one')
)

X.head()
X.shape

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
0,34.0,19.0,2013,7,0.0,0,,,,,,
1,34.0,19.0,2013,5,0.0,0,,,,,,
2,34.0,19.0,2013,4,0.0,0,,,,,,
3,34.0,19.0,2013,11,0.0,0,,,,,,
4,34.0,19.0,2013,10,0.0,0,,,,,,


(1056000, 12)

# **Temperature Data**

In [None]:
# Load the temperature table that is later joined on
# (month, year, latitude, longitude).
# NOTE(review): hard-coded Colab-style path, inconsistent with the Windows path
# used for the fire data — make both relative to one configurable data folder.
temp_df = pd.read_csv('/content/sample_data/Greece_temperature.csv')

In [None]:
# Preview the first rows of the temperature table.
temp_df.head(10)

Unnamed: 0,latitude,longitude,month,year,temperature_min,temperature_avg,temperature_max
0,38.6,21.4,1,2014,4.3,7.5,11.0
1,38.6,21.4,1,2014,4.3,7.5,11.0
2,38.6,21.4,1,2014,4.3,7.5,11.0
3,38.6,21.4,1,2014,4.3,7.5,11.0
4,38.6,21.4,1,2014,4.3,7.5,11.0
5,38.6,21.4,1,2014,4.3,7.5,11.0
6,38.6,21.4,1,2014,4.3,7.5,11.0
7,38.6,21.4,1,2014,4.3,7.5,11.0
8,38.6,21.4,1,2014,4.3,7.5,11.0
9,38.6,21.4,1,2014,4.3,7.5,11.0


# **Data Merging.**

We are merging the historical wildfire statistics data with the temperature data.

In [None]:
# Join temperature readings onto the feature frame (inner join: grid cells or
# months without a temperature reading are dropped).
merge_keys = ["month", "year", "latitude", "longitude"]

# `temp_df` contains repeated rows for the same key, which made the join fan
# out and duplicate every matching feature row. Keep one reading per key
# (the first one, should duplicates ever disagree).
temp_unique = temp_df.drop_duplicates(subset=merge_keys)

# Drop temperature columns left over from an earlier run of this cell so that
# re-running it does not produce `_x` / `_y` suffixed duplicates.
temp_value_cols = [col for col in temp_unique.columns if col not in merge_keys]
X = pd.merge(
    X.drop(columns=temp_value_cols, errors="ignore"),
    temp_unique,
    on=merge_keys,
    how="inner",
)

# Display the resulting DataFrame with matching rows
print(X)


         latitude  longitude  year  month  fire_cnt  fire  fire_cnt_before  \
0            35.0       25.7  2013      7       0.0     0              NaN   
1            35.0       25.7  2013      7       0.0     0              NaN   
2            35.0       25.7  2013      7       0.0     0              NaN   
3            35.0       25.7  2013      7       0.0     0              NaN   
4            35.0       25.7  2013      7       0.0     0              NaN   
...           ...        ...   ...    ...       ...   ...              ...   
2694880      40.8       21.4  2023      6       0.0     0              0.0   
2694881      40.8       21.4  2023      6       0.0     0              0.0   
2694882      40.8       21.4  2023      6       0.0     0              0.0   
2694883      40.8       21.4  2023      6       0.0     0              0.0   
2694884      40.8       21.4  2023      6       0.0     0              0.0   

         fire_before  fire_cnt_last_year  fire_last_year  \
0  

In [None]:
# Sanity check: largest value of the cumulative-history feature.
X['fire_cnt_before'].max()

32.0

In [None]:
# Rows without any history (e.g. the first year) have NaN features; treat
# those as zero.
X = X.fillna(value=0)
X

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,temperature_min_x,temperature_avg_x,temperature_max_x,temperature_min_y,temperature_avg_y,temperature_max_y
0,35.0,25.7,2013,7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,26.4,31.6,21.6,26.4,31.6
1,35.0,25.7,2013,7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,26.4,31.6,21.6,26.4,31.6
2,35.0,25.7,2013,7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,26.4,31.6,21.6,26.4,31.6
3,35.0,25.7,2013,7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,26.4,31.6,21.6,26.4,31.6
4,35.0,25.7,2013,7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,21.6,26.4,31.6,21.6,26.4,31.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2694880,40.8,21.4,2023,6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,18.3,23.3,13.0,18.3,23.3
2694881,40.8,21.4,2023,6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,18.3,23.3,13.0,18.3,23.3
2694882,40.8,21.4,2023,6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,18.3,23.3,13.0,18.3,23.3
2694883,40.8,21.4,2023,6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,18.3,23.3,13.0,18.3,23.3




In [None]:
# Select the rows labelled as wildfire (fire == 1).
is_fire = X['fire'] == 1
fire_rows = X.loc[is_fire]

# Print the positive rows as plain text.
print(fire_rows)


         latitude  longitude  year  month  fire_cnt  fire  fire_cnt_before  \
49241        35.0       25.7  2017      8       2.0     1             12.0   
49242        35.0       25.7  2017      8       2.0     1             12.0   
49243        35.0       25.7  2017      8       2.0     1             12.0   
49244        35.0       25.7  2017      8       2.0     1             12.0   
49245        35.0       25.7  2017      8       2.0     1             12.0   
...           ...        ...   ...    ...       ...   ...              ...   
2008545      39.2       22.8  2014      9       2.0     1              3.0   
2008546      39.2       22.8  2014      9       2.0     1              3.0   
2008547      39.2       22.8  2014      9       2.0     1              3.0   
2008548      39.2       22.8  2014      9       2.0     1              3.0   
2008549      39.2       22.8  2014      9       2.0     1              3.0   

         fire_before  fire_cnt_last_year  fire_last_year  \
492

In [None]:
# Save the merged feature table as a CSV file without the index column.
# (No resampling is applied to X in the cells above.)
X.to_csv('final_data.csv', index=False)

In [None]:

# Read the saved table back and report the coordinate extent it covers.
data = pd.read_csv('final_data.csv')

lat_col = data['latitude']
lon_col = data['longitude']

min_latitude, max_latitude = lat_col.min(), lat_col.max()
min_longitude, max_longitude = lon_col.min(), lon_col.max()

print(f"Latitude range: {min_latitude} to {max_latitude}")
print(f"Longitude range: {min_longitude} to {max_longitude}")

In [None]:
# Year boundaries of the chronological split (all bounds inclusive).
# NOTE(review): the configuration cell at the top of the notebook defines
# TRAIN_UNTIL / VAL_BETWEEN / TEST_ON with different values that are not used
# here — confirm which definition is intended.
TRAIN_START_YEAR = 2013
TRAIN_END_YEAR = 2018
VALID_START_YEAR = 2019
VALID_END_YEAR = 2021
TEST_START_YEAR = 2022
TEST_END_YEAR = 2023

# Slice the feature table by year; `Series.between` includes both end points.
train = X[X['year'].between(TRAIN_START_YEAR, TRAIN_END_YEAR)]
valid = X[X['year'].between(VALID_START_YEAR, VALID_END_YEAR)]
test = X[X['year'].between(TEST_START_YEAR, TEST_END_YEAR)]

In [None]:
# Row/column counts of the three splits, followed by the training frame itself
# (both are displayed because the notebook shows every expression's value).
train.shape, valid.shape, test.shape
train