In [2]:
import pandas as pd
import numpy as np

In [3]:
fires = pd.read_csv('data/fires2024.csv')

In [4]:
# states = ['OR', 'ID', 'TX', 'CA']
# fires_top = fires[fires['state_abbr'].isin(states)] 

In [5]:
# ozone = pd.read_csv('ozone_2024.csv')
# pd.set_option('display.max_columns', None)
# ozone_test = ozone.query("`State Name` == 'California'")
# ozone_test

In [6]:
aq_weather = pd.read_csv('air_quality_with_weather_final.csv')

In [7]:
from math import radians, sin, cos, sqrt, atan2

def calculate_distance(lat1, lon1, lat2, lon2):
    '''
    Calculate distance using the Haversine formula -- great-circle distance between two points on a sphere given their longitudes and latitudes
    Returns distance in kilometers
    '''
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))

    # Earth's radius in kilometers
    radius = 6371
    distance = radius * c

    return distance

In [8]:
aq_weather.head()

Unnamed: 0,date,site_id,latitude,longitude,state_name,county_name,city_name,site_name,PM25,CO,O3,NO2,SO2,AQI_PM25,AQI_CO,AQI_O3,AQI_NO2,AQI_SO2,AQI,temperature_2m_mean,temperature_2m_max,temperature_2m_min,relative_humidity_2m_mean,wind_speed_10m_mean,wind_direction_10m_dominant,precipitation_sum,precipitation_hours,et0_fao_evapotranspiration,weather_code
0,2024-01-01,01-073-0023,33.553056,-86.815,Alabama,Jefferson,Birmingham,North Birmingham,11.55,0.3,0.027,24.3,1.8,57.0,3.0,25.0,23,1.0,57.0,6.2,9.9,3.2,63.0,11.6,317.0,0.0,0.0,1.78,Overcast
1,2024-01-01,04-013-9997,33.503833,-112.095767,Arizona,Maricopa,Phoenix,JLG SUPERSITE,85.35,1.3,0.021,37.3,2.3,176.0,15.0,19.0,35,3.0,176.0,10.7,16.5,6.3,66.0,6.3,45.0,0.0,0.0,1.4,Overcast
2,2024-01-01,04-019-1028,32.29515,-110.9823,Arizona,Pima,Tucson,CHILDREN'S PARK NCore,16.3,0.4,0.033,22.2,0.2,67.0,5.0,31.0,21,0.0,67.0,10.6,19.1,4.7,49.0,8.4,117.0,0.0,0.0,2.33,Overcast
3,2024-01-01,05-119-0007,34.756189,-92.281296,Arkansas,Pulaski,North Little Rock,PARR,5.9,0.0,0.026,4.5,0.6,33.0,0.0,24.0,4,0.0,33.0,4.6,11.0,-0.1,60.0,12.2,21.0,0.0,0.0,1.8,Overcast
4,2024-01-01,06-001-0011,37.814781,-122.282347,California,Alameda,Oakland,Oakland West,6.9,1.3,0.022,23.7,0.8,38.0,15.0,20.0,22,0.0,38.0,11.6,15.7,8.3,89.0,5.4,52.0,1.2,8.0,1.12,Light drizzle


In [9]:
# Start with existing aqi_weather dataframe
final_df = aq_weather.copy()

# Standardize dates
fires['acq_date'] = pd.to_datetime(fires['acq_date']).dt.date
final_df['date'] = pd.to_datetime(final_df['date']).dt.date

# Pre-group fires
fires_by_date = fires.groupby('acq_date')

In [10]:
# Vectorized distance function
def calculate_distance_vectorized(lat1, lon1, lat2_array, lon2_array):
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2_array)
    lon2_rad = np.radians(lon2_array)
    
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return 6371 * c

In [11]:
# Initialize fire feature columns with NaN/0
final_df['distance_to_fire_km'] = np.nan
final_df['fire_brightness'] = np.nan
final_df['fire_frp'] = np.nan
final_df['fires_within_50km'] = 0
final_df['fires_within_100km'] = 0
final_df['has_nearby_fire'] = 0

In [12]:
# Loop and fill in fire features
for i, row in enumerate(final_df.itertuples()):
    if i % 100 == 0:
        print(f"Processing {i}/{len(final_df)}...")
    
    try:
        daily_fires = fires_by_date.get_group(row.date)
    except KeyError:
        continue  # Keep NaN/0 values for this row
    
    if len(daily_fires) > 0:
        # Calculate distances
        distances = calculate_distance_vectorized(
            row.latitude,
            row.longitude,
            daily_fires['latitude'].values,
            daily_fires['longitude'].values
        )
        
        nearest_idx = distances.argmin()
        nearest_fire = daily_fires.iloc[nearest_idx]
        
        # Update the row in final_df
        final_df.at[row.Index, 'distance_to_fire_km'] = float(distances.min())
        final_df.at[row.Index, 'fire_brightness'] = float(nearest_fire['brightness'])
        final_df.at[row.Index, 'fire_frp'] = float(nearest_fire['frp'])
        final_df.at[row.Index, 'fires_within_50km'] = int((distances <= 50).sum())
        final_df.at[row.Index, 'fires_within_100km'] = int((distances <= 100).sum())
        final_df.at[row.Index, 'has_nearby_fire'] = 1

print(f"\nDone! Final dataset shape: {final_df.shape}")
print(f"Columns: {final_df.columns.tolist()}")

Processing 0/19802...
Processing 100/19802...
Processing 200/19802...
Processing 300/19802...
Processing 400/19802...
Processing 500/19802...
Processing 600/19802...
Processing 700/19802...
Processing 800/19802...
Processing 900/19802...
Processing 1000/19802...
Processing 1100/19802...
Processing 1200/19802...
Processing 1300/19802...
Processing 1400/19802...
Processing 1500/19802...
Processing 1600/19802...
Processing 1700/19802...
Processing 1800/19802...
Processing 1900/19802...
Processing 2000/19802...
Processing 2100/19802...
Processing 2200/19802...
Processing 2300/19802...
Processing 2400/19802...
Processing 2500/19802...
Processing 2600/19802...
Processing 2700/19802...
Processing 2800/19802...
Processing 2900/19802...
Processing 3000/19802...
Processing 3100/19802...
Processing 3200/19802...
Processing 3300/19802...
Processing 3400/19802...
Processing 3500/19802...
Processing 3600/19802...
Processing 3700/19802...
Processing 3800/19802...
Processing 3900/19802...
Processing 4

In [13]:
final_df

Unnamed: 0,date,site_id,latitude,longitude,state_name,county_name,city_name,site_name,PM25,CO,O3,NO2,SO2,AQI_PM25,AQI_CO,AQI_O3,AQI_NO2,AQI_SO2,AQI,temperature_2m_mean,temperature_2m_max,temperature_2m_min,relative_humidity_2m_mean,wind_speed_10m_mean,wind_direction_10m_dominant,precipitation_sum,precipitation_hours,et0_fao_evapotranspiration,weather_code,distance_to_fire_km,fire_brightness,fire_frp,fires_within_50km,fires_within_100km,has_nearby_fire
0,2024-01-01,01-073-0023,33.553056,-86.815000,Alabama,Jefferson,Birmingham,North Birmingham,11.55,0.3,0.027,24.3,1.8,57.0,3.0,25.0,23,1.0,57.0,6.2,9.9,3.2,63.0,11.6,317.0,0.0,0.0,1.78,Overcast,36.661749,298.82,0.76,3,3,1
1,2024-01-01,04-013-9997,33.503833,-112.095767,Arizona,Maricopa,Phoenix,JLG SUPERSITE,85.35,1.3,0.021,37.3,2.3,176.0,15.0,19.0,35,3.0,176.0,10.7,16.5,6.3,66.0,6.3,45.0,0.0,0.0,1.40,Overcast,114.379578,296.16,8.56,0,0,1
2,2024-01-01,04-019-1028,32.295150,-110.982300,Arizona,Pima,Tucson,CHILDREN'S PARK NCore,16.30,0.4,0.033,22.2,0.2,67.0,5.0,31.0,21,0.0,67.0,10.6,19.1,4.7,49.0,8.4,117.0,0.0,0.0,2.33,Overcast,125.380460,340.20,7.33,0,0,1
3,2024-01-01,05-119-0007,34.756189,-92.281296,Arkansas,Pulaski,North Little Rock,PARR,5.90,0.0,0.026,4.5,0.6,33.0,0.0,24.0,4,0.0,33.0,4.6,11.0,-0.1,60.0,12.2,21.0,0.0,0.0,1.80,Overcast,107.232831,296.57,0.54,0,0,1
4,2024-01-01,06-001-0011,37.814781,-122.282347,California,Alameda,Oakland,Oakland West,6.90,1.3,0.022,23.7,0.8,38.0,15.0,20.0,22,0.0,38.0,11.6,15.7,8.3,89.0,5.4,52.0,1.2,8.0,1.12,Light drizzle,26.582688,295.25,1.46,4,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19797,2024-12-31,49-035-3015,40.777145,-111.945849,Utah,Salt Lake,Salt Lake City,Utah Technical Center,4.50,0.6,0.028,38.6,1.5,25.0,7.0,26.0,36,1.0,36.0,-0.0,2.8,-2.6,73.0,4.4,149.0,0.0,0.0,0.84,Overcast,124.388127,296.16,0.98,0,0,1
19798,2024-12-31,50-021-0002,43.608056,-72.982778,Vermont,Rutland,Rutland,State of Vermont District Court Parking Lot,4.70,0.5,0.032,21.7,0.7,26.0,6.0,30.0,20,0.0,30.0,5.9,8.6,2.9,68.0,10.8,206.0,0.0,0.0,1.10,3,191.116673,297.45,0.56,0,0,1
19799,2024-12-31,51-087-0014,37.556520,-77.400270,Virginia,Henrico,East Highland Park,MathScience Innovation Center,5.60,0.4,0.034,24.6,0.9,31.0,5.0,31.0,23,0.0,31.0,9.7,17.9,3.2,73.0,12.0,185.0,1.8,2.0,1.78,Slight rain,41.607035,299.06,0.59,3,5,1
19800,2024-12-31,53-033-0080,47.568236,-122.308628,Washington,King,Seattle,SEATTLE - BEACON HILL,3.40,0.3,0.017,29.5,0.3,19.0,3.0,16.0,27,0.0,27.0,5.1,7.8,1.9,91.0,7.1,148.0,0.5,4.0,0.43,51,64.945187,298.19,1.03,0,1,1


In [14]:
# 1. TEMPORAL FEATURES
final_df['datetime'] = pd.to_datetime(final_df['date'])
final_df['month'] = final_df['datetime'].dt.month
final_df['day_of_week'] = final_df['datetime'].dt.dayofweek
final_df['is_weekend'] = (final_df['day_of_week'] >= 5).astype(int)
final_df['season'] = final_df['month'].apply(lambda m: 
    'winter' if m in [12,1,2] else 
    'spring' if m in [3,4,5] else 
    'summer' if m in [6,7,8] else 'fall'
)
final_df['wildfire_season'] = ((final_df['month'] >= 6) & (final_df['month'] <= 10)).astype(int)

# 2. FIRE CATEGORIES
def categorize_fire_distance(distance):
    if pd.isna(distance):
        return 'no_fire'
    elif distance < 25:
        return 'very_close'
    elif distance < 50:
        return 'close'
    elif distance < 100:
        return 'moderate'
    else:
        return 'far'

def categorize_fire_intensity(frp):
    if pd.isna(frp) or frp == 0:
        return 'no_fire'
    elif frp < 10:
        return 'low'
    elif frp < 50:
        return 'moderate'
    elif frp < 100:
        return 'high'
    else:
        return 'extreme'

final_df['fire_distance_category'] = final_df['distance_to_fire_km'].apply(categorize_fire_distance)
final_df['fire_intensity'] = final_df['fire_frp'].apply(categorize_fire_intensity)

In [None]:
final_df.to_csv('air_quality_weather_fires.csv')

In [None]:
weather_code_map = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Fog",
    48: "Depositing rime fog",
    51: "Light drizzle",
    53: "Moderate drizzle",
    55: "Dense drizzle",
    56: "Light freezing drizzle",
    57: "Dense freezing drizzle",
    61: "Slight rain",
    63: "Moderate rain",
    65: "Heavy rain",
    66: "Light freezing rain",
    67: "Heavy freezing rain",
    71: "Slight snow fall",
    73: "Moderate snow fall",
    75: "Heavy snow fall",
    77: "Snow grains",
    80: "Slight rain showers",
    81: "Moderate rain showers",
    82: "Violent rain showers",
    85: "Slight snow showers",
    86: "Heavy snow showers",
    95: "Thunderstorm",
    96: "Thunderstorm + slight hail",
    99: "Thunderstorm + heavy hail"
}



# Convert dictionary keys to strings to match your column type
weather_code_map_str = {str(k): v for k, v in weather_code_map.items()}

# Map and fill NaN with original values
final_df['weather_code'] = df['weather_code'].map(weather_code_map_str).fillna(final_df['weather_code'])

final_df.to_csv('air_quality_weather_fires.csv')

In [16]:
final_df

Unnamed: 0,date,site_id,latitude,longitude,state_name,county_name,city_name,site_name,PM25,CO,O3,NO2,SO2,AQI_PM25,AQI_CO,AQI_O3,AQI_NO2,AQI_SO2,AQI,temperature_2m_mean,temperature_2m_max,temperature_2m_min,relative_humidity_2m_mean,wind_speed_10m_mean,wind_direction_10m_dominant,precipitation_sum,precipitation_hours,et0_fao_evapotranspiration,weather_code,distance_to_fire_km,fire_brightness,fire_frp,fires_within_50km,fires_within_100km,has_nearby_fire,datetime,month,day_of_week,is_weekend,season,wildfire_season,fire_distance_category,fire_intensity
0,2024-01-01,01-073-0023,33.553056,-86.815000,Alabama,Jefferson,Birmingham,North Birmingham,11.55,0.3,0.027,24.3,1.8,57.0,3.0,25.0,23,1.0,57.0,6.2,9.9,3.2,63.0,11.6,317.0,0.0,0.0,1.78,Overcast,36.661749,298.82,0.76,3,3,1,2024-01-01,1,0,0,winter,0,close,low
1,2024-01-01,04-013-9997,33.503833,-112.095767,Arizona,Maricopa,Phoenix,JLG SUPERSITE,85.35,1.3,0.021,37.3,2.3,176.0,15.0,19.0,35,3.0,176.0,10.7,16.5,6.3,66.0,6.3,45.0,0.0,0.0,1.40,Overcast,114.379578,296.16,8.56,0,0,1,2024-01-01,1,0,0,winter,0,far,low
2,2024-01-01,04-019-1028,32.295150,-110.982300,Arizona,Pima,Tucson,CHILDREN'S PARK NCore,16.30,0.4,0.033,22.2,0.2,67.0,5.0,31.0,21,0.0,67.0,10.6,19.1,4.7,49.0,8.4,117.0,0.0,0.0,2.33,Overcast,125.380460,340.20,7.33,0,0,1,2024-01-01,1,0,0,winter,0,far,low
3,2024-01-01,05-119-0007,34.756189,-92.281296,Arkansas,Pulaski,North Little Rock,PARR,5.90,0.0,0.026,4.5,0.6,33.0,0.0,24.0,4,0.0,33.0,4.6,11.0,-0.1,60.0,12.2,21.0,0.0,0.0,1.80,Overcast,107.232831,296.57,0.54,0,0,1,2024-01-01,1,0,0,winter,0,far,low
4,2024-01-01,06-001-0011,37.814781,-122.282347,California,Alameda,Oakland,Oakland West,6.90,1.3,0.022,23.7,0.8,38.0,15.0,20.0,22,0.0,38.0,11.6,15.7,8.3,89.0,5.4,52.0,1.2,8.0,1.12,Light drizzle,26.582688,295.25,1.46,4,9,1,2024-01-01,1,0,0,winter,0,close,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19797,2024-12-31,49-035-3015,40.777145,-111.945849,Utah,Salt Lake,Salt Lake City,Utah Technical Center,4.50,0.6,0.028,38.6,1.5,25.0,7.0,26.0,36,1.0,36.0,-0.0,2.8,-2.6,73.0,4.4,149.0,0.0,0.0,0.84,Overcast,124.388127,296.16,0.98,0,0,1,2024-12-31,12,1,0,winter,0,far,low
19798,2024-12-31,50-021-0002,43.608056,-72.982778,Vermont,Rutland,Rutland,State of Vermont District Court Parking Lot,4.70,0.5,0.032,21.7,0.7,26.0,6.0,30.0,20,0.0,30.0,5.9,8.6,2.9,68.0,10.8,206.0,0.0,0.0,1.10,3,191.116673,297.45,0.56,0,0,1,2024-12-31,12,1,0,winter,0,far,low
19799,2024-12-31,51-087-0014,37.556520,-77.400270,Virginia,Henrico,East Highland Park,MathScience Innovation Center,5.60,0.4,0.034,24.6,0.9,31.0,5.0,31.0,23,0.0,31.0,9.7,17.9,3.2,73.0,12.0,185.0,1.8,2.0,1.78,Slight rain,41.607035,299.06,0.59,3,5,1,2024-12-31,12,1,0,winter,0,close,low
19800,2024-12-31,53-033-0080,47.568236,-122.308628,Washington,King,Seattle,SEATTLE - BEACON HILL,3.40,0.3,0.017,29.5,0.3,19.0,3.0,16.0,27,0.0,27.0,5.1,7.8,1.9,91.0,7.1,148.0,0.5,4.0,0.43,51,64.945187,298.19,1.03,0,1,1,2024-12-31,12,1,0,winter,0,moderate,low


In [17]:
# final_data = []

# for idx, row in aq_weather.iterrows():
#     latitude = row['latitude'] # site latitude
#     longitude = row['longitude'] # site longitude
#     date = row['date']
#     site_id = row['site_id']

#     # Find fires on date
#     daily_fires = fires[fires['acq_date'] == date]

#     if len(daily_fires) > 0:
#         # Calculate distances from site to each fire
#         distances = daily_fires.apply(
#             lambda fire: calculate_distance(
#                 latitude, longitude,
#                 fire['latitude'], fire['longitude']
#             )
#         )

#         nearest_idx = distances.idxmin()
#         nearest_fire = daily_fires.loc[nearest_idx]

#         fire_features = {
#             'distance_to_fire_km': distances.min(),
#             'fire_brightness': nearest_fire['brightness'], # mid-infrared brightness temperature (Kelvin): standard fire intensity metric, typically 300-500K for fires
#             'fire_frp': nearest_fire['frp'], # fire radiative power (megawatts): direct measure of energy released
#             'fires_within_50km': (distances <= 50).sum(),
#             'fires_within_100km': (distances <= 100).sum(),
#             'has_nearby_fire': 1
#         }

#     else:
#         fire_features = {
#             'distance_to_fire_km': np.nan,
#             'fire_brightness': np.nan,
#             'fire_frp': np.nan,
#             'fires_within_50km': 0,
#             'fires_within_100km': 0,
#             'has_nearby_fire': 0    
#         }

#     # Combine all features
#     final_row = {
#         'date': date,
#         'site_id': site_id,
#         'latitude': latitude,
#         'longitude': longitude,
#         'state_name': row['state_name']

#         # Air quality (target variables)


#         # Weather at station


#         # Fire features
#         **fire_features
#     }

#     final_data.append(final_row)

# final_df = pd.DataFrame(final_data)