## **DSC 550 (Master's Thesis) - Prediction of the Cause of California Wildfires**
*This notebook builds a clean dataset for California wildfire prediction; the main focus is to predict the cause of each wildfire.*

In [4]:
import os

import fiona  # <-- direct import
import geopandas as gpd

# Directory that holds the raw data files. It can be overridden with the
# WILDFIRE_DATA_DIR environment variable so the notebook is not tied to one
# machine; the default is the folder the notebook was originally run from.
data_dir = os.environ.get(
    "WILDFIRE_DATA_DIR",
    r"C:\Users\annis\OneDrive\Desktop\California Wildfire\Data Files",
)
file_path = os.path.join(data_dir, "FPA_FOD_20221014.gpkg")

# List all layers in the .gpkg
layers = fiona.listlayers(file_path)
if not layers:
    # Fail with a clear message instead of an IndexError on layers[0].
    raise ValueError(f"No layers found in GeoPackage: {file_path}")

# Load the first layer (the relevant layer is usually first or named like
# 'FPA_FOD_20221014').
gdf = gpd.read_file(file_path, layer=layers[0])

# Preview the data
print(gdf.head())


   FOD_ID      FPA_ID SOURCE_SYSTEM_TYPE SOURCE_SYSTEM NWCG_REPORTING_AGENCY  \
0       1  FS-1418826                FED   FS-FIRESTAT                    FS   
1       2  FS-1418827                FED   FS-FIRESTAT                    FS   
2       3  FS-1418835                FED   FS-FIRESTAT                    FS   
3       4  FS-1418845                FED   FS-FIRESTAT                    FS   
4       5  FS-1418847                FED   FS-FIRESTAT                    FS   

  NWCG_REPORTING_UNIT_ID  NWCG_REPORTING_UNIT_NAME SOURCE_REPORTING_UNIT  \
0                USCAPNF    Plumas National Forest                  0511   
1                USCAENF  Eldorado National Forest                  0503   
2                USCAENF  Eldorado National Forest                  0503   
3                USCAENF  Eldorado National Forest                  0503   
4                USCAENF  Eldorado National Forest                  0503   

  SOURCE_REPORTING_UNIT_NAME LOCAL_FIRE_REPORT_ID  ... FIRE_SI

In [5]:
import torch

# Report whether PyTorch can use CUDA on this machine.
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)

CUDA Available: True


In [6]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
# Small three-element tensor, created first and then moved to the selected device.
sample_values = torch.tensor([1.0, 2.0, 3.0])
tensor = sample_values.to(device)

In [8]:
# Display the first rows of the loaded table (head() defaults to five rows).
gdf.head()

Unnamed: 0,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,LOCAL_FIRE_REPORT_ID,...,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,geometry
0,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,Plumas National Forest,1,...,0.1,A,40.036944,-121.005833,USFS,CA,63,6063,Plumas County,POINT (-121.00582 40.03694)
1,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,13,...,0.25,A,38.933056,-120.404444,USFS,CA,61,6061,Placer County,POINT (-120.40443 38.93305)
2,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,27,...,0.1,A,38.984167,-120.735556,STATE OR PRIVATE,CA,17,6017,El Dorado County,POINT (-120.73554 38.98416)
3,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,43,...,0.1,A,38.559167,-119.913333,USFS,CA,3,6003,Alpine County,POINT (-119.91332 38.55916)
4,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,44,...,0.1,A,38.559167,-119.933056,USFS,CA,3,6003,Alpine County,POINT (-119.93304 38.55916)


In [11]:
# List every column name available in gdf.
gdf.columns

Index(['FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM',
       'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID',
       'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT',
       'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID',
       'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME',
       'ICS_209_PLUS_INCIDENT_JOIN_ID', 'ICS_209_PLUS_COMPLEX_JOIN_ID',
       'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'FIRE_YEAR',
       'DISCOVERY_DATE', 'DISCOVERY_DOY', 'DISCOVERY_TIME',
       'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE',
       'NWCG_CAUSE_AGE_CATEGORY', 'CONT_DATE', 'CONT_DOY', 'CONT_TIME',
       'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE', 'LONGITUDE', 'OWNER_DESCR',
       'STATE', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME', 'geometry'],
      dtype='object')

In [12]:
# (number of rows, number of columns) of gdf.
gdf.shape

(2303566, 38)

In [41]:
import pandas as pd
# Parse DISCOVERY_DATE into pandas datetimes and store the result back on gdf.
parsed_discovery_dates = pd.to_datetime(gdf['DISCOVERY_DATE'])
gdf['DISCOVERY_DATE'] = parsed_discovery_dates

# Display the converted column
gdf['DISCOVERY_DATE']

0         2005-02-02
1         2004-05-12
2         2004-05-31
3         2004-06-28
4         2004-06-28
             ...    
2303561   2020-06-05
2303562   2020-07-11
2303563   2020-08-27
2303564   2020-08-17
2303565   2020-11-20
Name: DISCOVERY_DATE, Length: 2303566, dtype: datetime64[ns]

In [57]:
def _to_hhmm_text(raw_time):
    """Return the value as text, left-padded with zeros to at least four characters.

    Missing values are replaced by the string '0000'.
    """
    if pd.isna(raw_time):
        return '0000'
    return str(raw_time).zfill(4)


# NOTE(review): missing times become '0000', which reads as midnight once
# parsed with '%H%M' — confirm this imputation is intended.
gdf['DISCOVERY_TIME'] = gdf['DISCOVERY_TIME'].apply(_to_hhmm_text)

In [58]:
# Parse the 'HHMM' strings; values that do not match the format are coerced
# to NaT (errors='coerce'). Only the time-of-day part is written back.
parsed_times = pd.to_datetime(gdf['DISCOVERY_TIME'], format='%H%M', errors='coerce')
gdf.loc[:, 'DISCOVERY_TIME'] = parsed_times.dt.time

### California dataset extraction - working with California records only

In [70]:
# Select the rows whose STATE is 'CA' and take an independent copy of them.
is_california = gdf['STATE'] == 'CA'
gdf_ca = gdf[is_california].copy()

In [71]:
# Count rows per STATE value in the California subset.
gdf_ca['STATE'].value_counts()

STATE
CA    251881
Name: count, dtype: int64

In [72]:
# Columns kept for the California subset; every other column is dropped.
ca_columns = [
    'DISCOVERY_DATE',
    'DISCOVERY_TIME',
    'FIRE_YEAR',
    'LATITUDE',
    'LONGITUDE',
    'COUNTY',
    'STATE',
    'FIRE_SIZE',
    'FIRE_SIZE_CLASS',
    'NWCG_CAUSE_CLASSIFICATION',
    'NWCG_GENERAL_CAUSE',
]
gdf_ca = gdf_ca[ca_columns]

In [73]:
# Display the first rows of the reduced California table.
gdf_ca.head()

Unnamed: 0,DISCOVERY_DATE,DISCOVERY_TIME,FIRE_YEAR,LATITUDE,LONGITUDE,COUNTY,STATE,FIRE_SIZE,FIRE_SIZE_CLASS,NWCG_CAUSE_CLASSIFICATION,NWCG_GENERAL_CAUSE
0,2005-02-02,13:00:00,2005,40.036944,-121.005833,63,CA,0.1,A,Human,Power generation/transmission/distribution
1,2004-05-12,08:45:00,2004,38.933056,-120.404444,61,CA,0.25,A,Natural,Natural
2,2004-05-31,19:21:00,2004,38.984167,-120.735556,17,CA,0.1,A,Human,Debris and open burning
3,2004-06-28,16:00:00,2004,38.559167,-119.913333,3,CA,0.1,A,Natural,Natural
4,2004-06-28,16:00:00,2004,38.559167,-119.933056,3,CA,0.1,A,Natural,Natural


In [74]:
# Number of missing values in each column of gdf_ca
null_counts = gdf_ca.isna().sum()

# Show the per-column counts
print(null_counts)

DISCOVERY_DATE                   0
DISCOVERY_TIME                   0
FIRE_YEAR                        0
LATITUDE                         0
LONGITUDE                        0
COUNTY                       94725
STATE                            0
FIRE_SIZE                        0
FIRE_SIZE_CLASS                  0
NWCG_CAUSE_CLASSIFICATION        0
NWCG_GENERAL_CAUSE               0
dtype: int64


In [75]:
# Write gdf_ca to a CSV file without the index column
ca_csv_name = 'gdf_ca_dataset1.csv'
gdf_ca.to_csv(ca_csv_name, index=False)

In [83]:
import geopandas as gpd

# Load the California counties shapefile. The data directory can be overridden
# with the WILDFIRE_DATA_DIR environment variable so the notebook is not tied
# to one machine; the default is the folder the notebook was originally run from.
counties_dir = os.environ.get(
    "WILDFIRE_DATA_DIR",
    r"C:\Users\annis\OneDrive\Desktop\California Wildfire\Data Files",
)
counties_shapefile = gpd.read_file(os.path.join(counties_dir, "CA_Counties.shp"))

In [84]:
from shapely.geometry import Point
import pandas as pd

# Convert to GeoDataFrame.
# Build the point geometries from the longitude/latitude columns in a single
# vectorized call instead of creating one shapely Point per row in a Python loop.
geometry = gpd.points_from_xy(gdf_ca['LONGITUDE'], gdf_ca['LATITUDE'])

# Attach the points to the California records and label them as EPSG:4326.
gdf_points = gpd.GeoDataFrame(gdf_ca, geometry=geometry, crs="EPSG:4326")

In [None]:
# Inspect the attribute columns that come with the county shapefile.
print(counties_shapefile.columns)

In [None]:
# Expose the shapefile's COUNTYFP column under the name COUNTY_NAME.
# NOTE(review): the joined values look like 3-digit county codes rather than
# names — confirm that this column name is intended.
county_column_renames = {'COUNTYFP': 'COUNTY_NAME'}
counties_shapefile = counties_shapefile.rename(columns=county_column_renames)

In [None]:
# Show the CRS of the fire points first, then of the county polygons.
for spatial_frame in (gdf_points, counties_shapefile):
    print(spatial_frame.crs)

In [None]:
# Rebuild the point geometry from the LONGITUDE/LATITUDE columns, label it as
# EPSG:4326 (assuming the points are in lat/lon), then reproject to the CRS of
# the county layer.
lonlat_points = gpd.points_from_xy(gdf_points['LONGITUDE'], gdf_points['LATITUDE'])
points_lonlat = gdf_points.set_geometry(lonlat_points).set_crs("EPSG:4326")
gdf_points = points_lonlat.to_crs(counties_shapefile.crs)


In [92]:
# Left spatial join: every fire point is kept and receives the attributes of
# the county polygon it lies within; points matching no polygon get missing values.
gdf_with_county = gpd.sjoin(
    left_df=gdf_points,
    right_df=counties_shapefile,
    how="left",
    predicate='within',
)

# Preview the coordinates next to the joined county column
preview_columns = ['LATITUDE', 'LONGITUDE', 'COUNTY_NAME']
print(gdf_with_county[preview_columns])


          LATITUDE   LONGITUDE COUNTY_NAME
0        40.036944 -121.005833         063
1        38.933056 -120.404444         017
2        38.984167 -120.735556         017
3        38.559167 -119.913333         003
4        38.559167 -119.933056         003
...            ...         ...         ...
2303542  34.337222 -119.053333         111
2303543  35.307500 -119.964444         079
2303551  37.472222 -121.249444         099
2303556  40.053250 -120.668900         063
2303562  37.148611 -119.503056         039

[251881 rows x 3 columns]


In [93]:
# Display the first rows of the spatially joined table.
gdf_with_county.head()

Unnamed: 0,DISCOVERY_DATE,DISCOVERY_TIME,FIRE_YEAR,LATITUDE,LONGITUDE,COUNTY,STATE,FIRE_SIZE,FIRE_SIZE_CLASS,NWCG_CAUSE_CLASSIFICATION,...,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area
0,2005-02-02,13:00:00,2005,40.036944,-121.005833,63,CA,0.1,A,Human,...,,,,A,6612401000.0,156387636.0,39.9922953,-120.8243709,657319.221495,11550000000.0
1,2004-05-12,08:45:00,2004,38.933056,-120.404444,61,CA,0.25,A,Natural,...,472.0,40900.0,,A,4423299000.0,203342316.0,38.7856116,-120.5342245,467801.45206,7624166000.0
2,2004-05-31,19:21:00,2004,38.984167,-120.735556,17,CA,0.1,A,Human,...,472.0,40900.0,,A,4423299000.0,203342316.0,38.7856116,-120.5342245,467801.45206,7624166000.0
3,2004-06-28,16:00:00,2004,38.559167,-119.913333,3,CA,0.1,A,Natural,...,,,,A,1912293000.0,12557304.0,38.6217831,-119.7983522,275565.412692,3156006000.0
4,2004-06-28,16:00:00,2004,38.559167,-119.933056,3,CA,0.1,A,Natural,...,,,,A,1912293000.0,12557304.0,38.6217831,-119.7983522,275565.412692,3156006000.0


### After cleaning the data

### California County Codes and Names

| County Code | County Name      |
|--------------|------------------|
| 001          | Alameda          |
| 003          | Alpine           |
| 005          | Amador           |
| 007          | Butte            |
| 009          | Calaveras        |
| 011          | Colusa           |
| 013          | Contra Costa     |
| 015          | Del Norte        |
| 017          | El Dorado        |
| 019          | Fresno           |
| 021          | Glenn            |
| 023          | Humboldt         |
| 025          | Imperial         |
| 027          | Inyo             |
| 029          | Kern             |
| 031          | Kings            |
| 033          | Lake             |
| 035          | Lassen           |
| 037          | Los Angeles     |
| 039          | Madera           |
| 041          | Marin            |
| 043          | Mariposa         |
| 045          | Mendocino        |
| 047          | Merced           |
| 049          | Modoc            |
| 051          | Mono             |
| 053          | Monterey         |
| 055          | Napa             |
| 057          | Nevada           |
| 059          | Orange           |
| 061          | Placer           |
| 063          | Plumas           |
| 065          | Riverside        |
| 067          | Sacramento       |
| 069          | San Benito       |
| 071          | San Bernardino   |
| 073          | San Diego        |
| 075          | San Francisco    |
| 077          | San Joaquin      |
| 079          | San Luis Obispo  |
| 081          | San Mateo        |
| 083          | Santa Barbara    |
| 085          | Santa Clara      |
| 087          | Santa Cruz       |
| 089          | Shasta           |
| 091          | Sierra           |
| 093          | Siskiyou         |
| 095          | Solano           |
| 097          | Sonoma           |
| 099          | Stanislaus       |
| 101          | Sutter           |
| 103          | Tehama           |
| 105          | Trinity          |
| 107          | Tulare           |
| 109          | Tuolumne         |
| 111          | Ventura          |
| 113          | Yolo             |
| 115          | Yuba             |



In [100]:
# Remove the original COUNTY column and rename the joined COUNTY_NAME column
# to 'County_Name', in one chained expression.
cal_fire = (
    gdf_with_county
    .drop(columns=['COUNTY'])
    .rename(columns={'COUNTY_NAME': 'County_Name'})
)

# Preview the modified DataFrame
cal_fire.head()

Unnamed: 0,DISCOVERY_DATE,DISCOVERY_TIME,FIRE_YEAR,LATITUDE,LONGITUDE,STATE,FIRE_SIZE,FIRE_SIZE_CLASS,NWCG_CAUSE_CLASSIFICATION,NWCG_GENERAL_CAUSE,...,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area
0,2005-02-02,13:00:00,2005,40.036944,-121.005833,CA,0.1,A,Human,Power generation/transmission/distribution,...,,,,A,6612401000.0,156387636.0,39.9922953,-120.8243709,657319.221495,11550000000.0
1,2004-05-12,08:45:00,2004,38.933056,-120.404444,CA,0.25,A,Natural,Natural,...,472.0,40900.0,,A,4423299000.0,203342316.0,38.7856116,-120.5342245,467801.45206,7624166000.0
2,2004-05-31,19:21:00,2004,38.984167,-120.735556,CA,0.1,A,Human,Debris and open burning,...,472.0,40900.0,,A,4423299000.0,203342316.0,38.7856116,-120.5342245,467801.45206,7624166000.0
3,2004-06-28,16:00:00,2004,38.559167,-119.913333,CA,0.1,A,Natural,Natural,...,,,,A,1912293000.0,12557304.0,38.6217831,-119.7983522,275565.412692,3156006000.0
4,2004-06-28,16:00:00,2004,38.559167,-119.933056,CA,0.1,A,Natural,Natural,...,,,,A,1912293000.0,12557304.0,38.6217831,-119.7983522,275565.412692,3156006000.0


In [101]:
# List the columns of cal_fire after the drop and rename.
cal_fire.columns

Index(['DISCOVERY_DATE', 'DISCOVERY_TIME', 'FIRE_YEAR', 'LATITUDE',
       'LONGITUDE', 'STATE', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
       'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE', 'geometry',
       'index_right', 'STATEFP', 'County_Name', 'COUNTYNS', 'GEOID', 'NAME',
       'NAMELSAD', 'LSAD', 'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'Shape_Leng',
       'Shape_Area'],
      dtype='object')

In [102]:
# Number of missing values in each column of cal_fire
null_counts = cal_fire.isna().sum()

# Show the per-column counts
print(null_counts)

DISCOVERY_DATE                    0
DISCOVERY_TIME                    0
FIRE_YEAR                         0
LATITUDE                          0
LONGITUDE                         0
STATE                             0
FIRE_SIZE                         0
FIRE_SIZE_CLASS                   0
NWCG_CAUSE_CLASSIFICATION         0
NWCG_GENERAL_CAUSE                0
geometry                          0
index_right                     203
STATEFP                         203
County_Name                     203
COUNTYNS                        203
GEOID                           203
NAME                            203
NAMELSAD                        203
LSAD                            203
CLASSFP                         203
MTFCC                           203
CSAFP                        104775
CBSAFP                        33078
METDIVFP                     228984
FUNCSTAT                        203
ALAND                           203
AWATER                          203
INTPTLAT                    

In [106]:
# Rows for which 'County_Name' is missing
county_is_missing = cal_fire['County_Name'].isna()
missing_county = cal_fire[county_is_missing]

# Keep only the coordinates of those rows
coordinate_columns = ['LATITUDE', 'LONGITUDE']
missing_lat_long = missing_county[coordinate_columns]

# Show the coordinates of the rows without a county
print(missing_lat_long)

          LATITUDE   LONGITUDE
274      39.614722 -119.997222
840      38.396944 -119.097778
15356    39.563889 -119.998611
20312    39.385000 -120.003611
20573    39.360278 -120.000000
...            ...         ...
2168197  38.167222 -118.801944
2169273  42.417500 -123.568056
2174152  40.800800 -119.969200
2221769  39.259391 -119.968212
2291697  32.556136 -116.867069

[203 rows x 2 columns]


In [107]:

# Write the coordinates of the rows without a county to CSV (header row included)
missing_csv_name = 'missing_lat_long.csv'
missing_lat_long.to_csv(missing_csv_name, header=True)

### Reason for Dropping Rows with Missing County Information
The rows with missing `County_Name` values are fire locations that did not fall within any county polygon during the spatial join. Judging by the sample coordinates printed above (for example, longitudes of about -120.00 at latitudes near 39-40, and a latitude of 42.4175), many of these points appear to lie on or just beyond California's state boundary. They may also be located in remote, rural, federal, or border regions where the recorded location is imprecise and the county is therefore not clearly identifiable. Furthermore, these missing values could result from data issues, such as incomplete geocoding or errors during data collection.

Since `County_Name` is crucial for accurate analysis and prediction of wildfire patterns, keeping rows with missing county information would introduce noise and reduce the quality of the dataset. Only 203 of the 251,881 California rows are affected, so we drop the rows with a missing `County_Name` to ensure the integrity and accuracy of our wildfire prediction model.

In [108]:
# Keep only the rows that have a value in 'County_Name'.
has_county = cal_fire['County_Name'].notna()
cal_fire_cleaned = cal_fire[has_county]

In [109]:
# Per-column count of missing values after dropping rows without a county.
cal_fire_cleaned.isnull().sum()

DISCOVERY_DATE                    0
DISCOVERY_TIME                    0
FIRE_YEAR                         0
LATITUDE                          0
LONGITUDE                         0
STATE                             0
FIRE_SIZE                         0
FIRE_SIZE_CLASS                   0
NWCG_CAUSE_CLASSIFICATION         0
NWCG_GENERAL_CAUSE                0
geometry                          0
index_right                       0
STATEFP                           0
County_Name                       0
COUNTYNS                          0
GEOID                             0
NAME                              0
NAMELSAD                          0
LSAD                              0
CLASSFP                           0
MTFCC                             0
CSAFP                        104572
CBSAFP                        32875
METDIVFP                     228781
FUNCSTAT                          0
ALAND                             0
AWATER                            0
INTPTLAT                    

In [111]:
# Select relevant columns for prediction of the cause of wildfire
relevant_columns = [
    'DISCOVERY_DATE', 'DISCOVERY_TIME', 'FIRE_YEAR', 'LATITUDE', 'LONGITUDE',
    'STATE', 'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'NWCG_CAUSE_CLASSIFICATION',
    'NWCG_GENERAL_CAUSE', 'County_Name'
]

# Create a new dataframe with only the relevant columns
CAL_FIRE = cal_fire_cleaned[relevant_columns]

# The county column selected above is spelled 'County_Name'. Renaming the
# non-existent key 'COUNTY_NAME' is silently ignored by pandas, so the rename
# must use the exact column name to produce the 'COUNTY' column.
CAL_FIRE = CAL_FIRE.rename(columns={'County_Name': 'COUNTY'})

# Display the first few rows of the new dataframe
CAL_FIRE.head()


Unnamed: 0,DISCOVERY_DATE,DISCOVERY_TIME,FIRE_YEAR,LATITUDE,LONGITUDE,STATE,FIRE_SIZE,FIRE_SIZE_CLASS,NWCG_CAUSE_CLASSIFICATION,NWCG_GENERAL_CAUSE,COUNTY
0,2005-02-02,13:00:00,2005,40.036944,-121.005833,CA,0.1,A,Human,Power generation/transmission/distribution,63
1,2004-05-12,08:45:00,2004,38.933056,-120.404444,CA,0.25,A,Natural,Natural,17
2,2004-05-31,19:21:00,2004,38.984167,-120.735556,CA,0.1,A,Human,Debris and open burning,17
3,2004-06-28,16:00:00,2004,38.559167,-119.913333,CA,0.1,A,Natural,Natural,3
4,2004-06-28,16:00:00,2004,38.559167,-119.933056,CA,0.1,A,Natural,Natural,3


In [115]:
# Number of rows per FIRE_SIZE_CLASS value.
CAL_FIRE['FIRE_SIZE_CLASS'].value_counts()

FIRE_SIZE_CLASS
A    138375
B     96100
C     11786
D      2547
E      1388
F       944
G       538
Name: count, dtype: int64

In [117]:
# Number of rows per NWCG_CAUSE_CLASSIFICATION value.
CAL_FIRE['NWCG_CAUSE_CLASSIFICATION'].value_counts()

NWCG_CAUSE_CLASSIFICATION
Human                                      183055
Missing data/not specified/undetermined     38415
Natural                                     30208
Name: count, dtype: int64

In [118]:
# Number of rows per NWCG_GENERAL_CAUSE value.
CAL_FIRE['NWCG_GENERAL_CAUSE'].value_counts()

NWCG_GENERAL_CAUSE
Missing data/not specified/undetermined       95224
Equipment and vehicle use                     45653
Natural                                       30208
Arson/incendiarism                            24399
Debris and open burning                       20849
Recreation and ceremony                       11420
Misuse of fire by a minor                      9312
Smoking                                        8463
Power generation/transmission/distribution     3677
Other causes                                    784
Railroad operations and maintenance             773
Fireworks                                       642
Firearms and explosives use                     274
Name: count, dtype: int64

In [119]:
# List the columns of the final CAL_FIRE table.
CAL_FIRE.columns

Index(['DISCOVERY_DATE', 'DISCOVERY_TIME', 'FIRE_YEAR', 'LATITUDE',
       'LONGITUDE', 'STATE', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
       'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE', 'COUNTY'],
      dtype='object')