In [24]:
#!pip install pandas
#!pip install kaggle

# zipfile, sys, and importlib are part of the Python standard library, so they need no separate installation

Kaggle API Documentation: https://www.kaggle.com/docs/api

In [25]:
# Modules + Versions

from importlib.metadata import version #Check Module Versions
import sys #Check Python Version

import kaggle #dataset download (API)
import zipfile #unzip dataset 
import pandas as pd

# Report the interpreter and package versions so the environment can be reproduced.
print("Python Version:", sys.version)
# Build each label from the package name itself so the label can never drift away
# from the package actually being queried (the pandas version used to be printed
# under a "Kaggle" label).
for package in ('kaggle', 'pandas'):
    print(f"{package.capitalize()} Module Version: ", version(package))

Python Version: 3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]
Kaggle Module Version:  1.6.17
Kaggle Module Version:  2.1.3


In [26]:
# Download the dataset archive with the Kaggle command-line client
# (the leading `!` hands the line to the system shell).
# The archive is named after the dataset slug; the CLI skips the download when a
# more recently modified local copy already exists unless --force is passed.
# NOTE(review): presumably needs Kaggle API credentials configured on this machine —
# confirm against the Kaggle API documentation.
!kaggle datasets download -d ananthu017/california-wildfire-incidents-20132020

Dataset URL: https://www.kaggle.com/datasets/ananthu017/california-wildfire-incidents-20132020
License(s): CC0-1.0
california-wildfire-incidents-20132020.zip: Skipping, found more recently modified local copy (use --force to force download)


In [27]:
# Unpack every member of the downloaded archive into the local `data` directory.
zip_name = 'california-wildfire-incidents-20132020.zip'
with zipfile.ZipFile(zip_name, mode='r') as archive:
    archive.extractall(path='data')

In [28]:
# Load the extracted incident file into a DataFrame.
fires = pd.read_csv(filepath_or_buffer='data/California_Fire_Incidents.csv')

In [29]:
# Narrow the frame to the four fields used in the rest of the notebook, then preview it.
cols = [
    'Name',
    'AcresBurned',
    'Started',
    'Counties',
]
fires = fires.loc[:, cols]
fires.head()

Unnamed: 0,Name,AcresBurned,Started,Counties
0,Rim Fire,257314.0,2013-08-17T15:25:00Z,Tuolumne
1,Powerhouse Fire,30274.0,2013-05-30T15:28:00Z,Los Angeles
2,Mountain Fire,27531.0,2013-07-15T13:43:00Z,Riverside
3,American Fire,27440.0,2013-08-10T16:30:00Z,Placer
4,Springs Fire,24251.0,2013-05-02T07:01:00Z,Ventura


In [30]:
# Number of distinct values in the Counties column (59 in this dataset).
# NOTE(review): California has 58 counties, so 59 distinct values suggests at least
# one entry that is not a California county — verify before mapping.
fires['Counties'].nunique(dropna=False)

59

In [31]:
# Rename the timestamp and county columns to the shorter names used from here on.
new_cols = {
    'Started': 'Date',
    'Counties': 'County'
}
# Rebind the result rather than mutating with inplace=True: the statement then reads
# as a plain transformation, and `columns=` states the target axis explicitly
# instead of relying on axis=1.
fires = fires.rename(columns=new_cols)

In [32]:
# Inspect column dtypes and non-null counts.
# In this run AcresBurned has 3 missing values (1633 of 1636 rows) and Date is still
# stored as object (text) rather than a datetime dtype.
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1636 entries, 0 to 1635
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         1636 non-null   object 
 1   AcresBurned  1633 non-null   float64
 2   Date         1636 non-null   object 
 3   County       1636 non-null   object 
dtypes: float64(1), object(3)
memory usage: 51.3+ KB


In [33]:
# Drop the rows that contain a missing value: only 3 of 1636 rows are affected,
# all of them missing AcresBurned. Note that dropna() removes rows, not columns,
# and the surviving rows keep their original index labels.
# The result is rebound instead of using inplace=True so the step reads as a
# plain transformation.
fires = fires.dropna()

In [34]:
# Convert Date from timestamp text (e.g. 2013-08-17T15:25:00Z) to a datetime column
# that keeps only the calendar day: everything from the 'T' separator onward
# (time of day and the trailing 'Z') is discarded before parsing.
date_text = fires['Date'].str.split('T').str.get(0)
fires['Date'] = pd.to_datetime(date_text)

In [35]:
# Final preview of the cleaned frame before it is handed off to Tableau.
fires.head(n=5)

Unnamed: 0,Name,AcresBurned,Date,County
0,Rim Fire,257314.0,2013-08-17,Tuolumne
1,Powerhouse Fire,30274.0,2013-05-30,Los Angeles
2,Mountain Fire,27531.0,2013-07-15,Riverside
3,American Fire,27440.0,2013-08-10,Placer
4,Springs Fire,24251.0,2013-05-02,Ventura


In [36]:
# Write the cleaned frame to disk for Tableau.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (index=True is the default, spelled out here); switch to index=False if Tableau
# should not receive that column — confirm nothing downstream relies on it.
fires.to_csv(path_or_buf='data/ca_fires.csv', index=True)

In [37]:
# Spot-check: list every fire whose Date is 2017-10-09.
fires.loc[fires['Date'] == pd.Timestamp('2017-10-09')]

Unnamed: 0,Name,AcresBurned,Date,County
638,Pocket Fire (Central LNU Complex),17357.0,2017-10-09,Sonoma
645,Canyon 2 Fire,9217.0,2017-10-09,Orange
653,LaPorte Fire (Wind Complex),6151.0,2017-10-09,Butte
674,Adobe Fire (Central LNU Complex),1868.0,2017-10-09,Sonoma
679,37 Fire,1660.0,2017-10-09,Sonoma
704,Lobo Fire (Wind Complex),821.0,2017-10-09,Nevada
774,Honey Fire,150.0,2017-10-09,Butte
787,Point Fire,130.0,2017-10-09,Calaveras
833,McCourtney Fire (Wind Complex),76.0,2017-10-09,Nevada
888,Pozo Fire,45.0,2017-10-09,San Luis Obispo
