# Stratified Sampling

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [73]:
# Read the cleaned hotspot records; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='data/cleaned_hotspots.csv')

# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,lat,lon,rep_date,sensor,satellite,temp,rh,ws,wd,pcp,...,tfc,hfi,cfb,elev,sfl,cfl,ecozone,year,month,doy
0,54.357,-126.69,2019-09-15 06:20:00,MODIS,Terra,10.825,73,6.929,277,0.254,...,1.24,82.0,0.0,708,8.12067,0.872531,14.0,2019,9,258
1,54.603,-128.656,2019-09-15 19:50:00,MODIS,Terra,14.78,67,8.261,10,1.307,...,0.51,5.0,0.0,176,15.0995,0.875187,13.0,2019,9,258
2,58.945,-127.342,2019-05-24 19:58:00,VIIRS,S-NPP,20.307,43,7.295,309,2.278,...,0.26,1.0,0.0,603,17.9218,0.227139,12.0,2019,5,144
3,58.941,-127.376,2019-05-24 20:05:00,MODIS,Terra,20.079,43,7.275,309,2.298,...,0.23,1.0,0.0,641,8.69555,0.319614,12.0,2019,5,144
4,54.6047,-128.658,2019-09-15 21:00:00,IBAND,S-NPP,14.87,67,8.277,10,1.303,...,0.52,5.0,0.0,156,15.0995,0.875187,13.0,2019,9,258


In [74]:
# Expose the `doy` column under the name `day`. Rebinding the returned frame
# (rather than mutating in place) keeps the cell safe to re-run.
df = df.rename(columns={'doy': 'day'})

In [75]:
# Row count per year, largest first. The years are heavily imbalanced, which
# is why the sampling below is stratified on this column.
print(df['year'].value_counts(sort=True, ascending=False))

2023    680106
2021    229679
2022     47713
2019     14094
2020      8234
Name: year, dtype: int64


In [76]:
# Restrict the data to the years being studied.
years_of_interest = [2019, 2020, 2021, 2022, 2023]
df = df.loc[df['year'].isin(years_of_interest)]

# Stratified split on `year`: the first returned partition (the 75% that is
# NOT held out by test_size=0.25) is kept as the working sample, and the
# remaining 25% is discarded. Stratifying preserves each year's share of rows;
# random_state pins the split so re-runs select the same rows.
sampled_df, _ = train_test_split(
    df,
    test_size=0.25,
    stratify=df['year'],
    random_state=1,
)

# Report the size of the sample.
print(f"Sampled data shape: {sampled_df.shape}")

# Confirm that every year is still present, in roughly the original proportions.
print("Sampled data year distribution:")
print(sampled_df['year'].value_counts())

Sampled data shape: (734869, 29)
Sampled data year distribution:
2023    510079
2021    172259
2022     35785
2019     10570
2020      6176
Name: year, dtype: int64


In [77]:
def categorize_intensity(hfi, low_max=10, moderate_max=100):
    """Bucket a single `hfi` value into an intensity label.

    Parameters
    ----------
    hfi : scalar
        The value to classify.
    low_max : number, default 10
        Inclusive upper bound of the 'Low' bucket.
    moderate_max : number, default 100
        Inclusive upper bound of the 'Moderate' bucket.

    Returns
    -------
    str or None
        'Low' when ``hfi <= low_max``, 'Moderate' when
        ``low_max < hfi <= moderate_max``, 'High' above that, and ``None``
        when ``hfi`` is missing (NaN / None / pd.NA).
    """
    # NaN compares False against every threshold, so without this guard a
    # missing reading would fall through both checks and be labelled 'High'.
    if pd.isna(hfi):
        return None
    if hfi <= low_max:
        return 'Low'
    if hfi <= moderate_max:
        return 'Moderate'
    return 'High'

# Derive the categorical 'Intensity' column by labelling each `hfi` value.
sampled_df['Intensity'] = sampled_df.hfi.apply(categorize_intensity)

# Spot-check the raw value next to its label for the first few rows.
sampled_df.loc[:, ['hfi', 'Intensity']].head()

Unnamed: 0,hfi,Intensity
61900,6080.0,High
432459,4420.0,High
608651,2728.0,High
501627,46.0,Moderate
506466,1486.0,High


#### Remove HFI > 60000

In [78]:
# Keep only rows whose `hfi` is at most 60000. Rows with a missing `hfi` also
# fail this comparison and are dropped.
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
sampled_df = sampled_df[sampled_df['hfi'] <= 60000].copy()

# Verify the filtering step
print(f"Number of rows after filtering 'hfi' > 60000: {sampled_df.shape[0]}")

Number of rows after filtering 'hfi' > 60000: 733476


In [79]:
# Display the filtered sample; pandas truncates the rendering to the leading
# and trailing rows rather than printing the whole frame.
sampled_df

Unnamed: 0,lat,lon,rep_date,sensor,satellite,temp,rh,ws,wd,pcp,...,hfi,cfb,elev,sfl,cfl,ecozone,year,month,day,Intensity
61900,51.598000,-121.574997,2021-08-13 09:45:00,VIIRS-I,S-NPP,31.108999,16,8.527,141,0.000,...,6080.0,0.0,1219,5.795290,1.148687,14.0,2021,8,225,High
432459,49.368469,-116.154350,2023-08-19 09:47:00,VIIRS-I,S-NPP,13.346000,41,7.492,209,1.023,...,4420.0,0.0,1663,5.986526,1.919667,14.0,2023,8,231,High
608651,59.988510,-120.504066,2023-06-01 10:15:00,VIIRS-I,NOAA-20,15.510000,24,15.098,255,0.000,...,2728.0,49.0,558,33.087170,0.233186,4.0,2023,6,152,High
501627,53.562130,-123.966049,2023-08-22 10:30:00,VIIRS-I,S-NPP,20.032000,45,7.662,41,0.000,...,46.0,0.0,911,3.123410,0.041435,14.0,2023,8,234,Moderate
506466,54.869701,-125.683510,2023-08-27 10:36:00,VIIRS-I,S-NPP,24.865999,37,6.750,129,0.000,...,1486.0,0.0,984,5.504624,1.667683,14.0,2023,8,239,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932601,58.411190,-120.454483,2023-09-23 10:28:00,VIIRS-I,S-NPP,14.515000,43,6.138,195,0.000,...,494.0,0.0,407,10.327592,0.330372,4.0,2023,9,266,High
1949,48.824000,-124.820000,2019-10-01 20:09:00,IBAND,JPSS1,13.438000,59,7.758,303,0.000,...,1.0,0.0,243,21.635500,1.693690,13.0,2019,10,274,Low
310496,49.239830,-120.055519,2023-08-20 10:19:00,VIIRS-I,NOAA-20,20.386999,28,7.578,114,0.000,...,10282.0,47.0,985,-1.000000,1.743592,14.0,2023,8,232,High
161408,50.933998,-122.079002,2021-08-12 09:13:00,VIIRS-I,NOAA-20,24.959999,25,9.897,88,0.000,...,9626.0,60.0,1498,4.580694,0.615667,14.0,2021,8,224,High


In [81]:
# Parse `rep_date` from text into datetime64 values. `sampled_df` at this
# point is the result of a boolean-mask filter, so writing a column into it
# with `sampled_df[...] = ...` raises SettingWithCopyWarning; assign() returns
# a new frame instead and leaves the column in its original position.
sampled_df = sampled_df.assign(rep_date=pd.to_datetime(sampled_df['rep_date']))

In [82]:
# Summarise each column's dtype and non-null count, e.g. to confirm that
# `rep_date` is datetime64[ns] after the conversion above.
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 733476 entries, 61900 to 342338
Data columns (total 30 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   lat        733476 non-null  float64       
 1   lon        733476 non-null  float64       
 2   rep_date   733476 non-null  datetime64[ns]
 3   sensor     733476 non-null  object        
 4   satellite  733476 non-null  object        
 5   temp       733476 non-null  float64       
 6   rh         733476 non-null  int64         
 7   ws         733476 non-null  float64       
 8   wd         733476 non-null  int64         
 9   pcp        733476 non-null  float64       
 10  ffmc       733476 non-null  float64       
 11  dmc        733476 non-null  float64       
 12  dc         733476 non-null  float64       
 13  isi        733476 non-null  float64       
 14  bui        733476 non-null  float64       
 15  fwi        733476 non-null  float64       
 16  fuel       73347

In [83]:
# Persist the sample for downstream notebooks. The row index (the original
# row labels carried over from the source frame) is written as an unnamed first column;
# readers should load it with `index_col=0`, otherwise it comes back as a
# column named "Unnamed: 0".
sampled_df.to_csv('data/sampled_hotspots.csv', index=True)