In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

wildfires_df = pd.read_csv('/content/drive/MyDrive/CS249-Winter2024-Project/wildfires.csv', delimiter=',')

# Load wildfire data into a GeoDataFrame
gdf_wildfires = gpd.GeoDataFrame(
    wildfires_df,
    geometry=wildfires_df.apply(lambda row: Point(row['attr_InitialLongitude'], row['attr_InitialLatitude']), axis=1),
    crs="EPSG:4326"
)

# Load the California shapefile
gdf_california = gpd.read_file('/content/drive/MyDrive/CS249-Winter2024-Project/ca_state_boundary/ca_state_boundaries.shp')

# Use spatial join to filter wildfires within California's boundary
california_wildfires_gdf = gpd.sjoin(gdf_wildfires, gdf_california, how="inner", op='intersects')
california_wildfires_gdf['attr_FireDiscoveryDateTime'] = pd.to_datetime(california_wildfires_gdf['attr_FireDiscoveryDateTime'], format='%m/%d/%Y %I:%M:%S %p')

california_wildfires_gdf.iloc[:3]

  wildfires_df = pd.read_csv('/content/drive/MyDrive/CS249-Winter2024-Project/wildfires.csv', delimiter=',')
  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:3857

  california_wildfires_gdf = gpd.sjoin(gdf_wildfires, gdf_california, how="inner", op='intersects')


Unnamed: 0,OBJECTID,poly_SourceOID,poly_IncidentName,poly_FeatureCategory,poly_MapMethod,poly_GISAcres,poly_CreateDate,poly_DateCurrent,poly_PolygonDateTime,poly_IRWINID,...,NAME,LSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area
46,85,,LINDELL RD / EL TORO RD,Wildfire Daily Fire Perimeter,Hand Sketch,0.01,5/15/2022 2:46:47 AM,5/15/2022 8:15:27 PM,5/15/2022 7:46:15 AM,{372AA56F-5803-4095-9226-CB1A471ED42E},...,California,0,G4000,A,403673300000.0,20291770000.0,37.1551773,-119.5434183,42.617368,43.083192
65,106,6100.0,INKOPAH,Wildfire Daily Fire Perimeter,Auto-generated,0.1,5/4/2021 10:51:24 PM,5/4/2021 10:51:24 PM,1/16/2021 3:11:39 AM,{6D7B9AF9-250F-4866-9FDC-4E2C5471B3C2},...,California,0,G4000,A,403673300000.0,20291770000.0,37.1551773,-119.5434183,42.617368,43.083192
76,119,,LAC-194142,Wildfire Daily Fire Perimeter,Hand Sketch,,5/7/2021 4:52:25 AM,5/7/2021 4:52:25 AM,5/7/2021 4:52:24 AM,{16D3EDA1-AAC8-4902-9B67-D81D4856A8EF},...,California,0,G4000,A,403673300000.0,20291770000.0,37.1551773,-119.5434183,42.617368,43.083192


In [3]:
import shutil

# Save dataframe to CSV in google drive
california_wildfires_gdf.to_csv('california-wildfires-only.csv', index=False)
# Move CSV file to Google Drive
shutil.move('california-wildfires-only.csv', '/content/drive/MyDrive/CS249-Winter2024-Project/')

'/content/drive/MyDrive/CS249-Winter2024-Project/california-wildfires-only.csv'

In [5]:
import numpy as np
from scipy.spatial import cKDTree


stations_df = pd.read_csv('/content/drive/MyDrive/CS249-Winter2024-Project/california_stations.csv', delimiter=',')
california_wildfires_gdf['lat'] = pd.to_numeric(california_wildfires_gdf['attr_InitialLatitude'])
california_wildfires_gdf['lon'] = pd.to_numeric(california_wildfires_gdf['attr_InitialLongitude'])
stations_df['lat'] = pd.to_numeric(stations_df['LATITUDE'])
stations_df['lon'] = pd.to_numeric(stations_df['LONGITUDE'])


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def find_closest_station(wildfire_row, stations_df):
    distances = stations_df.apply(
        lambda row: haversine(wildfire_row['lon'], wildfire_row['lat'], row['lon'], row['lat']),
        axis=1)
    return stations_df.loc[distances.idxmin()]

# using 'ID' as the unique identifier for stations
stations_df.reset_index(inplace=True)
stations_df.rename(columns={'index': 'station_index'}, inplace=True)


# Prepare the data
station_points = np.deg2rad(stations_df[['LATITUDE', 'LONGITUDE']].values)
wildfire_points = np.deg2rad(california_wildfires_gdf[['attr_InitialLatitude', 'attr_InitialLongitude']].values)

# Build the KDTree and query it for the closest station to each wildfire
tree = cKDTree(station_points)
distances, indices = tree.query(wildfire_points)

# Store the index (or ID) of the closest station
california_wildfires_gdf['closest_station_index'] = indices

wildfires_df = pd.merge(california_wildfires_gdf, stations_df, left_on='closest_station_index', right_on='station_index', suffixes=('_wildfire', '_station'))

wildfires_df.to_csv('wildfires-with-closest-station.csv', index=False)
# Move CSV file to Google Drive
shutil.move('wildfires-with-closest-station.csv', '/content/drive/MyDrive/CS249-Winter2024-Project/')

'/content/drive/MyDrive/CS249-Winter2024-Project/wildfires-with-closest-station.csv'

In [7]:
file_paths = ['weather_2020_clean.csv', 'weather_2021_clean.csv', 'weather_2022_clean.csv', 'weather_2023_clean.csv', 'weather_2024_clean.csv']

weather_df = [pd.read_csv('/content/drive/MyDrive/CS249-Winter2024-Project/'+file, delimiter=',') for file in file_paths]

combined_weather_df = pd.concat(weather_df, ignore_index=True)
elements_to_keep = ['PRCP', 'TMAX', 'TMIN', 'TAVG', 'AWND', 'RHAV', 'RHMN', 'RHMX']


filtered_weather_df = combined_weather_df[combined_weather_df['ELEMENT'].isin(elements_to_keep)]
filtered_weather_df['YEAR/MONTH/DAY'] = pd.to_datetime(filtered_weather_df['YEAR/MONTH/DAY'], format='%Y%m%d')
filtered_weather_df['Timestamp'] = filtered_weather_df['YEAR/MONTH/DAY'] + pd.Timedelta(hours=12)

filtered_weather_df['DATA VALUE'] = pd.to_numeric(filtered_weather_df['DATA VALUE'], errors='coerce')
filtered_weather_df['ID'] = filtered_weather_df['ID'].astype(str)
filtered_weather_df['ELEMENT'] = filtered_weather_df['ELEMENT'].astype(str)

pivoted_weather_df = filtered_weather_df.pivot_table(index=['ID', 'Timestamp'],
                                    columns='ELEMENT',
                                    values='DATA VALUE',
                                    aggfunc='mean').reset_index()

pivoted_weather_df.to_csv('pivot-table-weather.csv', index=False)
# Move CSV file to Google Drive
shutil.move('pivot-table-weather.csv', '/content/drive/MyDrive/CS249-Winter2024-Project/')

  weather_df = [pd.read_csv('/content/drive/MyDrive/CS249-Winter2024-Project/'+file, delimiter=',') for file in file_paths]
  weather_df = [pd.read_csv('/content/drive/MyDrive/CS249-Winter2024-Project/'+file, delimiter=',') for file in file_paths]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_weather_df['YEAR/MONTH/DAY'] = pd.to_datetime(filtered_weather_df['YEAR/MONTH/DAY'], format='%Y%m%d')


KeyboardInterrupt: 