In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time

# Add Matplotlib inline magic command
%matplotlib inline

In [2]:
# Load each raw CSV into its own DataFrame (paths carry no directory part,
# so the files are looked up in the current working directory)
fire_df, housing_df, electricity_df, environment_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "fire_copy.csv",
        "housing_from_drew's_first_preprocessing.csv",
        "electricity.csv",
        "environment_data_june24.csv",
    )
)

# Cleaning up Fire Data Set

In [3]:
# Taking a look at fire_df: display 5 randomly chosen rows
# (no random_state is passed, so the rows shown differ from run to run)
fire_df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,AcresBurned,ArchiveYear,CalFireIncident,Counties,CountyIds,Latitude,Longitude,MajorIncident,Name,PersonnelInvolved,Started
1115,1115,962.0,2018,True,Butte,4,39.77124,-121.76859,True,Stoney Fire,,2018-07-12T22:45:00Z
330,330,214.0,2015,True,Orange,30,33.7626,-117.7274,False,241 Fire,,2015-07-13T10:47:00Z
1078,1078,13139.0,2018,False,Riverside,33,0.0,0.0,False,Cranston Fire,,2018-07-25T11:41:00Z
536,536,85.0,2016,True,Tuolumne,55,37.927613,-120.528836,True,Tulloch Fire,,2016-05-30T15:10:00Z
484,484,360.0,2016,True,Tehama,52,40.30594,-122.1295,True,Hog Fire,168.0,2016-09-13T23:10:00Z


In [4]:
# Rename fire_df columns by NAME rather than by position, so that a change in
# the CSV's column order can never silently attach a label to the wrong data.
# Source names that are absent are simply left untouched by rename().
fire_df_column_map = {
    "Unnamed: 0": "Unnamed: 0",
    "AcresBurned": "Acres Burned",
    "ArchiveYear": "Year",
    "CalFireIncident": "Cal Fire Incident",
    "Counties": "County",
    "CountyIds": "County IDs",
    "Latitude": "Latitude",
    "Longitude": "Longitude",
    "MajorIncident": "Major Incident",
    "Name": "Name",
    "PersonnelInvolved": "Personnel Involved",
    "Started": "Started",
}
# Same list of target names as before, kept for anything that still reads it
fire_df_new_columns = list(fire_df_column_map.values())

fire_df = fire_df.rename(columns=fire_df_column_map)
fire_df.head()

Unnamed: 0.1,Unnamed: 0,Acres Burned,Year,Cal Fire Incident,County,County IDs,Latitude,Longitude,Major Incident,Name,Personnel Involved,Started
0,0,257314.0,2013,True,Tuolumne,55,37.857,-120.086,False,Rim Fire,,2013-08-17T15:25:00Z
1,1,30274.0,2013,True,Los Angeles,19,34.585595,-118.423176,False,Powerhouse Fire,,2013-05-30T15:28:00Z
2,2,27531.0,2013,True,Riverside,33,33.7095,-116.72885,False,Mountain Fire,,2013-07-15T13:43:00Z
3,3,27440.0,2013,False,Placer,31,39.12,-120.65,False,American Fire,,2013-08-10T16:30:00Z
4,4,24251.0,2013,True,Ventura,56,0.0,0.0,True,Springs Fire,2167.0,2013-05-02T07:01:00Z


In [5]:
# Replace missing (NaN) values in the "Acres Burned" column with 0
fire_df = fire_df.fillna({"Acres Burned": 0})

In [6]:
# Round Latitude and Longitude in fire_df to 2 decimal places.
# Series.round is used instead of string formatting so the columns stay
# numeric (float64) and remain usable for arithmetic straight away.
# NOTE(review): some rows carry 0.0 / 0.0 coordinates, which look like
# missing-value placeholders rather than real locations — confirm before use.
fire_df["Latitude"] = fire_df["Latitude"].round(2)
fire_df["Longitude"] = fire_df["Longitude"].round(2)
fire_df.head()

Unnamed: 0.1,Unnamed: 0,Acres Burned,Year,Cal Fire Incident,County,County IDs,Latitude,Longitude,Major Incident,Name,Personnel Involved,Started
0,0,257314.0,2013,True,Tuolumne,55,37.86,-120.09,False,Rim Fire,,2013-08-17T15:25:00Z
1,1,30274.0,2013,True,Los Angeles,19,34.59,-118.42,False,Powerhouse Fire,,2013-05-30T15:28:00Z
2,2,27531.0,2013,True,Riverside,33,33.71,-116.73,False,Mountain Fire,,2013-07-15T13:43:00Z
3,3,27440.0,2013,False,Placer,31,39.12,-120.65,False,American Fire,,2013-08-10T16:30:00Z
4,4,24251.0,2013,True,Ventura,56,0.0,0.0,True,Springs Fire,2167.0,2013-05-02T07:01:00Z


In [7]:
# Cast fire_df columns in one pass: coordinates -> float,
# "County IDs" / "Acres Burned" -> int, and the two incident flag columns -> str
fire_df = fire_df.astype(
    {
        "Latitude": float,
        "Longitude": float,
        "County IDs": int,
        "Acres Burned": int,
        "Cal Fire Incident": str,
        "Major Incident": str,
    }
)

In [8]:
# Convert the "Started" column in fire_df from object to datetime.
# infer_datetime_format is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and triggers a warning.
fire_df["Started"] = pd.to_datetime(fire_df["Started"])
fire_df.head()

Unnamed: 0.1,Unnamed: 0,Acres Burned,Year,Cal Fire Incident,County,County IDs,Latitude,Longitude,Major Incident,Name,Personnel Involved,Started
0,0,257314,2013,True,Tuolumne,55,37.86,-120.09,False,Rim Fire,,2013-08-17 15:25:00+00:00
1,1,30274,2013,True,Los Angeles,19,34.59,-118.42,False,Powerhouse Fire,,2013-05-30 15:28:00+00:00
2,2,27531,2013,True,Riverside,33,33.71,-116.73,False,Mountain Fire,,2013-07-15 13:43:00+00:00
3,3,27440,2013,False,Placer,31,39.12,-120.65,False,American Fire,,2013-08-10 16:30:00+00:00
4,4,24251,2013,True,Ventura,56,0.0,0.0,True,Springs Fire,2167.0,2013-05-02 07:01:00+00:00


### Since "Started" and "Archive Year" have the same years, should we get rid of 'Archive Year'
### since it's redundant and may overfit the model? Plus it can't be converted into datetime? --> Actually keep Archive Year so it's easier to group with other datasets that just have the year

In [9]:
# Remove the "Unnamed: 0" and "Personnel Involved" columns from fire_df
fire_df = fire_df.drop(columns=["Unnamed: 0", "Personnel Involved"])

In [10]:
# Inspect fire_df dtypes to confirm the casts, the datetime conversion,
# and the column drops from the preceding cells took effect
fire_df.dtypes

Acres Burned                       int64
Year                               int64
Cal Fire Incident                 object
County                            object
County IDs                         int64
Latitude                         float64
Longitude                        float64
Major Incident                    object
Name                              object
Started              datetime64[ns, UTC]
dtype: object

In [11]:
# Put the fire_df columns into their final order
fire_column_order = [
    "Name", "County", "County IDs", "Latitude", "Longitude", "Started",
    "Year", "Acres Burned", "Cal Fire Incident", "Major Incident",
]
fire_df = fire_df.loc[:, fire_column_order]
fire_df.head()

Unnamed: 0,Name,County,County IDs,Latitude,Longitude,Started,Year,Acres Burned,Cal Fire Incident,Major Incident
0,Rim Fire,Tuolumne,55,37.86,-120.09,2013-08-17 15:25:00+00:00,2013,257314,True,False
1,Powerhouse Fire,Los Angeles,19,34.59,-118.42,2013-05-30 15:28:00+00:00,2013,30274,True,False
2,Mountain Fire,Riverside,33,33.71,-116.73,2013-07-15 13:43:00+00:00,2013,27531,True,False
3,American Fire,Placer,31,39.12,-120.65,2013-08-10 16:30:00+00:00,2013,27440,False,False
4,Springs Fire,Ventura,56,0.0,0.0,2013-05-02 07:01:00+00:00,2013,24251,True,True


In [12]:
# fire_df.groupby(["Year"]).sum()

# ---------------------------------------------------------------------

# Cleaning up Housing Data Set

In [13]:
# Taking a look at housing_df: display 5 randomly chosen rows
# (no random_state is passed, so the rows shown differ from run to run)
housing_df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,longitude,latitude,population,households,median_income,median_house_value,ocean_proximity,County
3521,3521,-121.52,38.51,2942.0,1386.0,3.0963,156900.0,INLAND,Sacramento County
11975,11975,-117.02,32.7,2983.0,755.0,4.6803,129200.0,NEAR OCEAN,San Diego County
8750,8750,-118.11,33.81,1334.0,485.0,5.09545,246750.0,<1H OCEAN,Los Angeles County
9568,9568,-117.92,33.73,4105.0,1406.0,4.23285,215800.0,<1H OCEAN,Orange County
6909,6909,-118.65,36.57,570.0,225.0,1.4821,143300.0,INLAND,Tulare County


In [14]:
# Rename housing_df columns by NAME rather than by position, so that a change
# in the CSV's column order can never silently attach a label to the wrong data.
# NOTE(review): the source columns are medians ("median_income",
# "median_house_value") but are relabelled "Average ..." — confirm the wording.
housing_df_column_map = {
    "Unnamed: 0": "Unnamed: 0",
    "longitude": "Longitude",
    "latitude": "Latitude",
    "population": "Population",
    "households": "Households",
    "median_income": "Average Income",
    "median_house_value": "Average House Value",
    "ocean_proximity": "Ocean Proximity",
    "County": "County",
}
# Same list of target names as before, kept for anything that still reads it
housing_df_new_columns = list(housing_df_column_map.values())

housing_df = housing_df.rename(columns=housing_df_column_map)
housing_df.head()

Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,Population,Households,Average Income,Average House Value,Ocean Proximity,County
0,0,-124.35,40.54,806.0,270.0,3.0147,94600.0,NEAR OCEAN,Humboldt County
1,1,-124.3,41.8,1298.0,478.0,1.9797,85800.0,NEAR OCEAN,Del Norte County
2,2,-124.3,41.84,1244.0,456.0,3.0313,103600.0,NEAR OCEAN,Del Norte County
3,3,-124.27,40.69,1194.0,465.0,2.5179,79000.0,NEAR OCEAN,Humboldt County
4,4,-124.26,40.58,907.0,369.0,2.3571,111400.0,NEAR OCEAN,Humboldt County


In [15]:
# Remove the word "County" from every value in the "County" column.
# Split once on "County" and keep only the text before it via .str[0].
# (Assigning the two-column frame produced by expand=True to a single column
# raises "Columns must be same length as key" on current pandas.)
# Trailing whitespace (e.g. "Humboldt ") is left in place here; NaN stays NaN.
housing_df["County"] = housing_df["County"].str.split("County", n=1).str[0]

In [16]:
# Display the "County" column to verify the word "County" was removed
housing_df["County"]

0              Humboldt 
1             Del Norte 
2             Del Norte 
3              Humboldt 
4              Humboldt 
              ...       
12585         Riverside 
12586          Imperial 
12587            La Paz 
12588    San Bernardino 
12589    San Bernardino 
Name: County, Length: 12590, dtype: object

In [17]:
# Show the first rows of housing_df to confirm the updated "County" column
housing_df.head()

Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,Population,Households,Average Income,Average House Value,Ocean Proximity,County
0,0,-124.35,40.54,806.0,270.0,3.0147,94600.0,NEAR OCEAN,Humboldt
1,1,-124.3,41.8,1298.0,478.0,1.9797,85800.0,NEAR OCEAN,Del Norte
2,2,-124.3,41.84,1244.0,456.0,3.0313,103600.0,NEAR OCEAN,Del Norte
3,3,-124.27,40.69,1194.0,465.0,2.5179,79000.0,NEAR OCEAN,Humboldt
4,4,-124.26,40.58,907.0,369.0,2.3571,111400.0,NEAR OCEAN,Humboldt


In [18]:
# Multiply every "Average Income" value by 10,000
# (not idempotent: re-running this cell scales the column again)
housing_df["Average Income"] = housing_df["Average Income"] * 10**4
housing_df.head()

Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,Population,Households,Average Income,Average House Value,Ocean Proximity,County
0,0,-124.35,40.54,806.0,270.0,30147.0,94600.0,NEAR OCEAN,Humboldt
1,1,-124.3,41.8,1298.0,478.0,19797.0,85800.0,NEAR OCEAN,Del Norte
2,2,-124.3,41.84,1244.0,456.0,30313.0,103600.0,NEAR OCEAN,Del Norte
3,3,-124.27,40.69,1194.0,465.0,25179.0,79000.0,NEAR OCEAN,Humboldt
4,4,-124.26,40.58,907.0,369.0,23571.0,111400.0,NEAR OCEAN,Humboldt


In [19]:
# Round Population, Households, Average Income, and Average House Value to
# whole numbers and store them as integers. Rounding numerically (rather than
# formatting to text) means the columns never pass through string dtype.
housing_whole_number_columns = [
    "Population", "Households", "Average Income", "Average House Value",
]
for column in housing_whole_number_columns:
    housing_df[column] = housing_df[column].round().astype(int)
housing_df.head()

Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,Population,Households,Average Income,Average House Value,Ocean Proximity,County
0,0,-124.35,40.54,806,270,30147,94600,NEAR OCEAN,Humboldt
1,1,-124.3,41.8,1298,478,19797,85800,NEAR OCEAN,Del Norte
2,2,-124.3,41.84,1244,456,30313,103600,NEAR OCEAN,Del Norte
3,3,-124.27,40.69,1194,465,25179,79000,NEAR OCEAN,Humboldt
4,4,-124.26,40.58,907,369,23571,111400,NEAR OCEAN,Humboldt


In [20]:
# Make sure Population, Households, Average Income, and Average House Value
# are stored as integers
housing_df = housing_df.astype(
    {
        "Population": int,
        "Households": int,
        "Average Income": int,
        "Average House Value": int,
    }
)

In [21]:
# Remove the "Unnamed: 0" column from housing_df
housing_df = housing_df.drop(columns=["Unnamed: 0"])

In [22]:
# Inspect housing_df dtypes to confirm the integer conversions succeeded
housing_df.dtypes

Longitude              float64
Latitude               float64
Population               int64
Households               int64
Average Income           int64
Average House Value      int64
Ocean Proximity         object
County                  object
dtype: object

In [23]:
# Put the housing_df columns into their final order
housing_column_order = [
    "County", "Latitude", "Longitude", "Ocean Proximity", "Population",
    "Households", "Average Income", "Average House Value",
]
housing_df = housing_df.loc[:, housing_column_order]

housing_df.head()

Unnamed: 0,County,Latitude,Longitude,Ocean Proximity,Population,Households,Average Income,Average House Value
0,Humboldt,40.54,-124.35,NEAR OCEAN,806,270,30147,94600
1,Del Norte,41.8,-124.3,NEAR OCEAN,1298,478,19797,85800
2,Del Norte,41.84,-124.3,NEAR OCEAN,1244,456,30313,103600
3,Humboldt,40.69,-124.27,NEAR OCEAN,1194,465,25179,79000
4,Humboldt,40.58,-124.26,NEAR OCEAN,907,369,23571,111400


In [24]:
# Strip trailing whitespace from the County names, then list the distinct values
housing_df = housing_df.assign(County=housing_df["County"].str.rstrip())

list(housing_df["County"].unique())

['Humboldt',
 'Del Norte',
 nan,
 'Mendocino',
 'Trinity',
 'Sonoma',
 'Siskiyou',
 'Lake',
 'Marin',
 'Shasta',
 'Glenn',
 'Tehama',
 'Napa',
 'San Mateo',
 'San Francisco',
 'Colusa',
 'Contra Costa',
 'Alameda',
 'Solano',
 'Santa Cruz',
 'Yolo',
 'Santa Clara',
 'Butte',
 'Monterey',
 'Sutter',
 'Sacramento',
 'San Benito',
 'Yuba',
 'San Joaquin',
 'Placer',
 'Stanislaus',
 'Nevada',
 'Plumas',
 'Modoc',
 'San Luis Obispo',
 'Lassen',
 'El Dorado',
 'Merced',
 'Amador',
 'Sierra',
 'Calaveras',
 'Fresno',
 'Santa Barbara',
 'Tuolumne',
 'Madera',
 'Mariposa',
 'Kings',
 'Alpine',
 'Douglas',
 'Kern',
 'Tulare',
 'Mono',
 'Ventura',
 'Los Angeles',
 'Inyo',
 'Orange',
 'San Bernardino',
 'Riverside',
 'San Diego',
 'Imperial',
 'La Paz']

In [25]:
# Show the first rows of housing_df after the County clean-up
housing_df.head()

Unnamed: 0,County,Latitude,Longitude,Ocean Proximity,Population,Households,Average Income,Average House Value
0,Humboldt,40.54,-124.35,NEAR OCEAN,806,270,30147,94600
1,Del Norte,41.8,-124.3,NEAR OCEAN,1298,478,19797,85800
2,Del Norte,41.84,-124.3,NEAR OCEAN,1244,456,30313,103600
3,Humboldt,40.69,-124.27,NEAR OCEAN,1194,465,25179,79000
4,Humboldt,40.58,-124.26,NEAR OCEAN,907,369,23571,111400


# ---------------------------------------------------------------------

# Cleaning up Electricity Data Set

In [26]:
# Taking a look at electricity_df: display 5 randomly chosen rows
# (no random_state is passed, so the rows shown differ from run to run)
electricity_df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,County,2019,2018,2017,2016,2015,2014,2013
21,65,MARIPOSA,109.731056,112.753964,111.427108,107.670394,106.605056,109.720938,113.195728
20,62,MARIN,1355.342745,1328.249462,1383.982879,1342.378889,1354.45392,1355.224757,1396.080706
54,164,TUOLUMNE,452.64424,453.499795,460.439815,447.462212,441.686314,440.248371,460.722738
46,140,SISKIYOU,495.641195,495.135053,508.367638,492.687584,494.485866,485.465765,500.398866
42,128,SANTA CLARA,16664.460569,16703.552664,17016.72764,16814.634749,16794.808199,16661.945532,16572.119283


In [27]:
# Inspect the dtypes of electricity_df as loaded from the CSV
electricity_df.dtypes

Unnamed: 0      int64
County         object
2019          float64
2018          float64
2017          float64
2016          float64
2015          float64
2014          float64
2013          float64
dtype: object

In [28]:
# Convert the County names from uppercase to title case (e.g. "MARIN" -> "Marin")
electricity_df = electricity_df.assign(County=electricity_df["County"].str.title())
electricity_df["County"]

0             Alameda
1              Alpine
2              Amador
3               Butte
4           Calaveras
5              Colusa
6        Contra Costa
7           Del Norte
8           El Dorado
9              Fresno
10              Glenn
11           Humboldt
12           Imperial
13               Inyo
14               Kern
15              Kings
16               Lake
17             Lassen
18        Los Angeles
19             Madera
20              Marin
21           Mariposa
22          Mendocino
23             Merced
24              Modoc
25               Mono
26           Monterey
27               Napa
28             Nevada
29             Orange
30             Placer
31             Plumas
32          Riverside
33         Sacramento
34         San Benito
35     San Bernardino
36          San Diego
37      San Francisco
38        San Joaquin
39    San Luis Obispo
40          San Mateo
41      Santa Barbara
42        Santa Clara
43         Santa Cruz
44             Shasta
45        

In [29]:
# Show the first rows of electricity_df to confirm the County names are now title case
electricity_df.head()

Unnamed: 0.1,Unnamed: 0,County,2019,2018,2017,2016,2015,2014,2013
0,2,Alameda,10684.085867,10391.361826,11079.450563,10791.224841,10235.384987,10299.877787,10618.661255
1,5,Alpine,18.906214,18.704208,18.976912,17.415654,16.2099,15.983364,18.176468
2,8,Amador,317.885054,304.092677,313.312866,309.0941,285.308438,289.501492,310.344276
3,11,Butte,1396.246344,1475.788821,1529.818607,1482.07376,1492.09863,1489.484147,1502.980505
4,14,Calaveras,330.55907,332.353511,347.931437,316.143601,311.620104,319.671411,326.656878


In [30]:
# Round every year column in electricity_df to 2 decimal places.
# DataFrame.round keeps the values as float64 instead of turning them into
# formatted strings, so the columns stay numeric throughout.
electricity_year_columns = ["2019", "2018", "2017", "2016", "2015", "2014", "2013"]
electricity_df = electricity_df.round(dict.fromkeys(electricity_year_columns, 2))

In [31]:
# Make sure every year column in electricity_df is stored as float
electricity_df = electricity_df.astype(
    {
        year_column: float
        for year_column in ("2019", "2018", "2017", "2016", "2015", "2014", "2013")
    }
)

In [32]:
# Inspect electricity_df dtypes to confirm the year columns are floats
electricity_df.dtypes

Unnamed: 0      int64
County         object
2019          float64
2018          float64
2017          float64
2016          float64
2015          float64
2014          float64
2013          float64
dtype: object

In [33]:
# Remove the "Unnamed: 0" column from electricity_df
electricity_df = electricity_df.drop(columns=["Unnamed: 0"])

In [34]:
# Order the electricity_df columns as County followed by the years ascending
electricity_column_order = ["County"] + [str(year) for year in range(2013, 2020)]
electricity_df = electricity_df.loc[:, electricity_column_order]
electricity_df.head()

Unnamed: 0,County,2013,2014,2015,2016,2017,2018,2019
0,Alameda,10618.66,10299.88,10235.38,10791.22,11079.45,10391.36,10684.09
1,Alpine,18.18,15.98,16.21,17.42,18.98,18.7,18.91
2,Amador,310.34,289.5,285.31,309.09,313.31,304.09,317.89
3,Butte,1502.98,1489.48,1492.1,1482.07,1529.82,1475.79,1396.25
4,Calaveras,326.66,319.67,311.62,316.14,347.93,332.35,330.56


In [35]:
# Reshape from wide (one column per year) to long: one row per County/Year pair
# NOTE(review): the value column is labelled "MWh", but a commented-out cell
# further down describes these figures as millions of kWh — confirm the unit.
electricity_df = pd.melt(
    electricity_df,
    id_vars=["County"],
    var_name="Year",
    value_name="MWh",
)
electricity_df

Unnamed: 0,County,Year,MWh
0,Alameda,2013,10618.66
1,Alpine,2013,18.18
2,Amador,2013,310.34
3,Butte,2013,1502.98
4,Calaveras,2013,326.66
...,...,...,...
401,Tulare,2019,4162.20
402,Tuolumne,2019,452.64
403,Ventura,2019,5344.04
404,Yolo,2019,1720.75


In [36]:
# Cast the "Year" column (year labels produced by melt) to integer
electricity_df = electricity_df.astype({"Year": int})

In [37]:
# Inspect electricity_df dtypes after the reshape and the Year cast
electricity_df.dtypes

County     object
Year        int64
MWh       float64
dtype: object

In [38]:
# # Add MWh to each of the column names since values are in millions of kWh
# electricity_df_new_columns = ["County", "2013 (MWh)", "2014 (MWh)", "2015 (MWh)",
#                              "2016 (MWh)", "2017 (MWh)", "2018 (MWh)", "2019 (MWh)"]
# electricity_df.columns = electricity_df_new_columns
# electricity_df.head()

# ---------------------------------------------------------------------

# Cleaning up Environment Data Set

In [39]:
# Taking a look at environment_df: display 5 randomly chosen rows
# (no random_state is passed, so the rows shown differ from run to run)
environment_df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,County,Year,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F)
37,37,Los Angeles,2018,633.77,101.31,467.38764,11.007303,113.9,15.7,63.842722,100.0,0.0,53.071246,44.402094,3.952886,94.840832,64.699081
40,40,Marin,2018,96.12,28.95,422.826923,11.44272,99.0,22.7,56.184203,99.0,10.0,73.324176,47.370467,4.172115,100.141896,62.169505
85,85,San Mateo,2018,40.09,19.7,390.969444,12.026944,82.3,27.1,54.433056,100.0,12.0,81.980556,48.893611,3.918056,94.027778,57.706667
107,107,Sonoma,2019,178.0,152.51,402.288006,11.429777,102.1,22.3,55.893863,100.0,5.0,74.087866,47.126848,3.293654,79.046304,61.399372
21,21,Fresno,2019,465.43,77.31,464.255503,10.797149,108.7,22.8,63.23367,100.0,1.0,55.799711,45.394118,4.524035,108.580873,64.739228


In [40]:
# Remove the "Unnamed: 0" column from environment_df
environment_df = environment_df.drop(columns=["Unnamed: 0"])

In [41]:
# Round the environment_df measurements to the number of decimals listed below.
# DataFrame.round with a per-column mapping keeps every column float64 instead
# of converting the values to formatted strings.
environment_decimals = {
    "Sol Rad (Ly/day)": 0,
    "Avg Vap Pres (mBars)": 1,
    "Avg Air Temp (F)": 1,
    "Max Rel Hum (%)": 0,
    "Min Rel Hum (%)": 0,
    "Avg Rel Hum (%)": 0,
    "Dew Point (F)": 1,
    "Avg Wind Speed (mph)": 1,
    "Wind Run (miles)": 1,
    "Avg Soil Temp (F)": 1,
}
environment_df = environment_df.round(environment_decimals)

environment_df.head()

Unnamed: 0,County,Year,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F)
0,Alameda,2018,132.6,40.31,438,11.5,99.3,19.9,57.5,100,9,70,47.3,4.0,94.8,61.8
1,Alameda,2019,141.87,57.87,430,11.8,101.6,26.8,57.9,100,9,71,48.0,4.0,95.6,60.7
2,Alameda,2020,116.04,24.83,494,12.0,108.8,28.3,59.1,100,8,69,48.5,4.1,99.2,62.5
3,Alpine,2019,33.83,5.56,461,5.8,92.0,4.6,50.4,99,9,46,28.6,7.1,170.2,51.3
4,Alpine,2020,47.24,7.6,494,5.7,95.5,2.6,51.3,99,7,44,28.7,7.6,183.1,50.8


### Should Wind Run (miles) be cumulative like it is now (all the miles of wind run added up for each year for each county) or average? Asking because it's a huge number.

In [42]:
# Make sure every formatted measurement column is stored as float
environment_float_columns = (
    "Sol Rad (Ly/day)",
    "Avg Vap Pres (mBars)",
    "Avg Air Temp (F)",
    "Max Rel Hum (%)",
    "Min Rel Hum (%)",
    "Avg Rel Hum (%)",
    "Dew Point (F)",
    "Avg Wind Speed (mph)",
    "Wind Run (miles)",
    "Avg Soil Temp (F)",
)
for measurement in environment_float_columns:
    environment_df[measurement] = environment_df[measurement].astype(float)

In [43]:
# Inspect environment_df dtypes to confirm the measurement columns are floats
environment_df.dtypes

County                   object
Year                      int64
ETo (in)                float64
Precip (in)             float64
Sol Rad (Ly/day)        float64
Avg Vap Pres (mBars)    float64
Max Air Temp (F)        float64
Min Air Temp (F)        float64
Avg Air Temp (F)        float64
Max Rel Hum (%)         float64
Min Rel Hum (%)         float64
Avg Rel Hum (%)         float64
Dew Point (F)           float64
Avg Wind Speed (mph)    float64
Wind Run (miles)        float64
Avg Soil Temp (F)       float64
dtype: object

In [44]:
# Display environment_df (pandas truncates the middle rows of long frames)
environment_df

Unnamed: 0,County,Year,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F)
0,Alameda,2018,132.60,40.31,438.0,11.5,99.3,19.9,57.5,100.0,9.0,70.0,47.3,4.0,94.8,61.8
1,Alameda,2019,141.87,57.87,430.0,11.8,101.6,26.8,57.9,100.0,9.0,71.0,48.0,4.0,95.6,60.7
2,Alameda,2020,116.04,24.83,494.0,12.0,108.8,28.3,59.1,100.0,8.0,69.0,48.5,4.1,99.2,62.5
3,Alpine,2019,33.83,5.56,461.0,5.8,92.0,4.6,50.4,99.0,9.0,46.0,28.6,7.1,170.2,51.3
4,Alpine,2020,47.24,7.60,494.0,5.7,95.5,2.6,51.3,99.0,7.0,44.0,28.7,7.6,183.1,50.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,Yolo,2019,103.37,52.17,424.0,11.3,106.7,28.0,61.1,100.0,0.0,62.0,46.5,5.0,121.0,61.0
126,Yolo,2020,93.16,7.80,485.0,11.7,107.6,31.5,64.0,100.0,0.0,58.0,47.5,5.1,123.0,63.7
127,Yuba,2018,53.66,29.46,414.0,10.9,102.6,25.1,63.0,100.0,6.0,57.0,45.5,4.1,98.0,63.5
128,Yuba,2019,50.31,38.85,389.0,10.8,102.8,28.5,62.1,99.0,7.0,57.0,45.2,4.5,107.6,62.5


# ---------------------------------------------------------------------

# Matching all the Data Sets to Have the Same # of Counties

In [45]:
# Count the distinct County values in fire_df (NaN, if present, counts as one)
fire_df["County"].nunique(dropna=False)

59

In [46]:
# List every distinct County in fire_df to spot entries that are not CA counties
list(fire_df["County"].unique())

['Tuolumne',
 'Los Angeles',
 'Riverside',
 'Placer',
 'Ventura',
 'Fresno',
 'Siskiyou',
 'Humboldt',
 'Tehama',
 'Shasta',
 'San Diego',
 'Kern',
 'Sonoma',
 'Contra Costa',
 'Butte',
 'Tulare',
 'Santa Barbara',
 'Mariposa',
 'Monterey',
 'El Dorado',
 'San Bernardino',
 'Plumas',
 'Modoc',
 'San Luis Obispo',
 'Madera',
 'Inyo',
 'Napa',
 'San Benito',
 'San Joaquin',
 'Lake',
 'Alameda',
 'Glenn',
 'Yolo',
 'Sacramento',
 'Stanislaus',
 'Solano',
 'Merced',
 'Mendocino',
 'Lassen',
 'Amador',
 'Yuba',
 'Nevada',
 'Santa Clara',
 'Calaveras',
 'San Mateo',
 'Orange',
 'Colusa',
 'Trinity',
 'Del Norte',
 'Mono',
 'Alpine',
 'Sutter',
 'Kings',
 'Sierra',
 'Santa Cruz',
 'Marin',
 'Mexico',
 'State of Oregon',
 'State of Nevada']

In [47]:
# Remove the fire_df rows whose County is 'Mexico', 'State of Oregon',
# or 'State of Nevada' (row labels of the remaining rows are left unchanged)
non_ca_fire_counties = ["Mexico", "State of Oregon", "State of Nevada"]
non_ca_fire_rows = fire_df.index[fire_df["County"].isin(non_ca_fire_counties)]
fire_df = fire_df.drop(index=non_ca_fire_rows)

fire_df.head()

Unnamed: 0,Name,County,County IDs,Latitude,Longitude,Started,Year,Acres Burned,Cal Fire Incident,Major Incident
0,Rim Fire,Tuolumne,55,37.86,-120.09,2013-08-17 15:25:00+00:00,2013,257314,True,False
1,Powerhouse Fire,Los Angeles,19,34.59,-118.42,2013-05-30 15:28:00+00:00,2013,30274,True,False
2,Mountain Fire,Riverside,33,33.71,-116.73,2013-07-15 13:43:00+00:00,2013,27531,True,False
3,American Fire,Placer,31,39.12,-120.65,2013-08-10 16:30:00+00:00,2013,27440,False,False
4,Springs Fire,Ventura,56,0.0,0.0,2013-05-02 07:01:00+00:00,2013,24251,True,True


In [48]:
# Alphabetical list of the remaining counties, to confirm that "Mexico",
# "State of Oregon", and "State of Nevada" are gone
sorted(set(fire_df["County"]))

['Alameda',
 'Alpine',
 'Amador',
 'Butte',
 'Calaveras',
 'Colusa',
 'Contra Costa',
 'Del Norte',
 'El Dorado',
 'Fresno',
 'Glenn',
 'Humboldt',
 'Inyo',
 'Kern',
 'Kings',
 'Lake',
 'Lassen',
 'Los Angeles',
 'Madera',
 'Marin',
 'Mariposa',
 'Mendocino',
 'Merced',
 'Modoc',
 'Mono',
 'Monterey',
 'Napa',
 'Nevada',
 'Orange',
 'Placer',
 'Plumas',
 'Riverside',
 'Sacramento',
 'San Benito',
 'San Bernardino',
 'San Diego',
 'San Joaquin',
 'San Luis Obispo',
 'San Mateo',
 'Santa Barbara',
 'Santa Clara',
 'Santa Cruz',
 'Shasta',
 'Sierra',
 'Siskiyou',
 'Solano',
 'Sonoma',
 'Stanislaus',
 'Sutter',
 'Tehama',
 'Trinity',
 'Tulare',
 'Tuolumne',
 'Ventura',
 'Yolo',
 'Yuba']

In [49]:
# Count the distinct County values left in fire_df (NaN, if present, counts as one)
fire_df["County"].nunique(dropna=False)

56

In [50]:
# Count the distinct County values in housing_df (NaN counts as one value)
housing_df["County"].nunique(dropna=False)

61

In [51]:
# List every distinct County in housing_df to spot entries that are not CA counties
list(housing_df["County"].unique())

['Humboldt',
 'Del Norte',
 nan,
 'Mendocino',
 'Trinity',
 'Sonoma',
 'Siskiyou',
 'Lake',
 'Marin',
 'Shasta',
 'Glenn',
 'Tehama',
 'Napa',
 'San Mateo',
 'San Francisco',
 'Colusa',
 'Contra Costa',
 'Alameda',
 'Solano',
 'Santa Cruz',
 'Yolo',
 'Santa Clara',
 'Butte',
 'Monterey',
 'Sutter',
 'Sacramento',
 'San Benito',
 'Yuba',
 'San Joaquin',
 'Placer',
 'Stanislaus',
 'Nevada',
 'Plumas',
 'Modoc',
 'San Luis Obispo',
 'Lassen',
 'El Dorado',
 'Merced',
 'Amador',
 'Sierra',
 'Calaveras',
 'Fresno',
 'Santa Barbara',
 'Tuolumne',
 'Madera',
 'Mariposa',
 'Kings',
 'Alpine',
 'Douglas',
 'Kern',
 'Tulare',
 'Mono',
 'Ventura',
 'Los Angeles',
 'Inyo',
 'Orange',
 'San Bernardino',
 'Riverside',
 'San Diego',
 'Imperial',
 'La Paz']

In [52]:
# Count the null values per column to check whether "County" is the only
# housing_df column that contains any
housing_df.isna().sum()

County                 40
Latitude                0
Longitude               0
Ocean Proximity         0
Population              0
Households              0
Average Income          0
Average House Value     0
dtype: int64

In [53]:
# Keep only rows with a known California county: drop 'La Paz' and 'Douglas'
# plus the rows whose County is NaN.
# dropna is restricted to the County column so that a missing value in any
# other column can never silently discard an otherwise valid row.
non_ca_housing_rows = housing_df.index[housing_df["County"].isin(["La Paz", "Douglas"])]
housing_df = housing_df.drop(index=non_ca_housing_rows).dropna(subset=["County"])

housing_df.head()

Unnamed: 0,County,Latitude,Longitude,Ocean Proximity,Population,Households,Average Income,Average House Value
0,Humboldt,40.54,-124.35,NEAR OCEAN,806,270,30147,94600
1,Del Norte,41.8,-124.3,NEAR OCEAN,1298,478,19797,85800
2,Del Norte,41.84,-124.3,NEAR OCEAN,1244,456,30313,103600
3,Humboldt,40.69,-124.27,NEAR OCEAN,1194,465,25179,79000
4,Humboldt,40.58,-124.26,NEAR OCEAN,907,369,23571,111400


In [54]:
# List the distinct counties left in housing_df to confirm only CA counties remain
list(housing_df["County"].unique())

['Humboldt',
 'Del Norte',
 'Mendocino',
 'Trinity',
 'Sonoma',
 'Siskiyou',
 'Lake',
 'Marin',
 'Shasta',
 'Glenn',
 'Tehama',
 'Napa',
 'San Mateo',
 'San Francisco',
 'Colusa',
 'Contra Costa',
 'Alameda',
 'Solano',
 'Santa Cruz',
 'Yolo',
 'Santa Clara',
 'Butte',
 'Monterey',
 'Sutter',
 'Sacramento',
 'San Benito',
 'Yuba',
 'San Joaquin',
 'Placer',
 'Stanislaus',
 'Nevada',
 'Plumas',
 'Modoc',
 'San Luis Obispo',
 'Lassen',
 'El Dorado',
 'Merced',
 'Amador',
 'Sierra',
 'Calaveras',
 'Fresno',
 'Santa Barbara',
 'Tuolumne',
 'Madera',
 'Mariposa',
 'Kings',
 'Alpine',
 'Kern',
 'Tulare',
 'Mono',
 'Ventura',
 'Los Angeles',
 'Inyo',
 'Orange',
 'San Bernardino',
 'Riverside',
 'San Diego',
 'Imperial']

In [55]:
# Count the distinct County values left in housing_df (NaN, if present, counts as one)
housing_df["County"].nunique(dropna=False)

58

In [56]:
# Build housing_df2 for the Machine Learning Model: an independent copy of
# housing_df without the Imperial and San Francisco rows (housing_df is untouched)
excluded_housing_rows = housing_df["County"].isin(["Imperial", "San Francisco"])
housing_df2 = housing_df.loc[~excluded_housing_rows].copy()

housing_df2.head()

Unnamed: 0,County,Latitude,Longitude,Ocean Proximity,Population,Households,Average Income,Average House Value
0,Humboldt,40.54,-124.35,NEAR OCEAN,806,270,30147,94600
1,Del Norte,41.8,-124.3,NEAR OCEAN,1298,478,19797,85800
2,Del Norte,41.84,-124.3,NEAR OCEAN,1244,456,30313,103600
3,Humboldt,40.69,-124.27,NEAR OCEAN,1194,465,25179,79000
4,Humboldt,40.58,-124.26,NEAR OCEAN,907,369,23571,111400


In [57]:
# Count the distinct County values in housing_df2 (NaN, if present, counts as one)
housing_df2["County"].nunique(dropna=False)

56

In [58]:
# Count the distinct County values in electricity_df (NaN, if present, counts as one)
electricity_df["County"].nunique(dropna=False)

58

In [59]:
# Build electricity_df2 for the Machine Learning Model: an independent copy of
# electricity_df without the Imperial and San Francisco rows (electricity_df is untouched)
excluded_electricity_rows = electricity_df["County"].isin(["Imperial", "San Francisco"])
electricity_df2 = electricity_df.loc[~excluded_electricity_rows].copy()

electricity_df2.head()

Unnamed: 0,County,Year,MWh
0,Alameda,2013,10618.66
1,Alpine,2013,18.18
2,Amador,2013,310.34
3,Butte,2013,1502.98
4,Calaveras,2013,326.66


In [60]:
# Confirm how many distinct counties remain in electricity_df2
electricity_df2["County"].nunique(dropna=False)

56

In [61]:
# Show every distinct county name in electricity_df to verify they are CA counties
list(electricity_df["County"].unique())

['Alameda',
 'Alpine',
 'Amador',
 'Butte',
 'Calaveras',
 'Colusa',
 'Contra Costa',
 'Del Norte',
 'El Dorado',
 'Fresno',
 'Glenn',
 'Humboldt',
 'Imperial',
 'Inyo',
 'Kern',
 'Kings',
 'Lake',
 'Lassen',
 'Los Angeles',
 'Madera',
 'Marin',
 'Mariposa',
 'Mendocino',
 'Merced',
 'Modoc',
 'Mono',
 'Monterey',
 'Napa',
 'Nevada',
 'Orange',
 'Placer',
 'Plumas',
 'Riverside',
 'Sacramento',
 'San Benito',
 'San Bernardino',
 'San Diego',
 'San Francisco',
 'San Joaquin',
 'San Luis Obispo',
 'San Mateo',
 'Santa Barbara',
 'Santa Clara',
 'Santa Cruz',
 'Shasta',
 'Sierra',
 'Siskiyou',
 'Solano',
 'Sonoma',
 'Stanislaus',
 'Sutter',
 'Tehama',
 'Trinity',
 'Tulare',
 'Tuolumne',
 'Ventura',
 'Yolo',
 'Yuba']

In [62]:
# Count how many distinct counties appear in environment_df
environment_df["County"].nunique(dropna=False)

44

In [63]:
# Show every distinct county name in environment_df to spot which counties are missing
list(environment_df["County"].unique())

['Alameda',
 'Alpine',
 'Amador',
 'Butte',
 'Colusa',
 'Contra Costa',
 'El Dorado',
 'Fresno',
 'Humboldt',
 'Imperial',
 'Inyo',
 'Kern',
 'Kings',
 'Los Angeles',
 'Marin',
 'Mendocino',
 'Merced',
 'Modoc',
 'Monterey',
 'Napa',
 'Orange',
 'Placer',
 'Riverside',
 'Sacramento',
 'San Benito',
 'San Bernardino',
 'San Diego',
 'San Joaquin',
 'San Luis Obispo',
 'San Mateo',
 'Santa Barbara',
 'Santa Clara',
 'Santa Cruz',
 'Shasta',
 'Siskiyou',
 'Solano',
 'Sonoma',
 'Stanislaus',
 'Sutter',
 'Tehama',
 'Tulare',
 'Ventura',
 'Yolo',
 'Yuba']

In [64]:
# Preview the first rows of environment_df (one row per County and Year)
environment_df.head()

Unnamed: 0,County,Year,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F)
0,Alameda,2018,132.6,40.31,438.0,11.5,99.3,19.9,57.5,100.0,9.0,70.0,47.3,4.0,94.8,61.8
1,Alameda,2019,141.87,57.87,430.0,11.8,101.6,26.8,57.9,100.0,9.0,71.0,48.0,4.0,95.6,60.7
2,Alameda,2020,116.04,24.83,494.0,12.0,108.8,28.3,59.1,100.0,8.0,69.0,48.5,4.1,99.2,62.5
3,Alpine,2019,33.83,5.56,461.0,5.8,92.0,4.6,50.4,99.0,9.0,46.0,28.6,7.1,170.2,51.3
4,Alpine,2020,47.24,7.6,494.0,5.7,95.5,2.6,51.3,99.0,7.0,44.0,28.7,7.6,183.1,50.8


In [65]:
# Check the data type of each column in fire_df
fire_df.dtypes

Name                              object
County                            object
County IDs                         int64
Latitude                         float64
Longitude                        float64
Started              datetime64[ns, UTC]
Year                               int64
Acres Burned                       int64
Cal Fire Incident                 object
Major Incident                    object
dtype: object

In [66]:
# Check the data type of each column in environment_df
environment_df.dtypes

County                   object
Year                      int64
ETo (in)                float64
Precip (in)             float64
Sol Rad (Ly/day)        float64
Avg Vap Pres (mBars)    float64
Max Air Temp (F)        float64
Min Air Temp (F)        float64
Avg Air Temp (F)        float64
Max Rel Hum (%)         float64
Min Rel Hum (%)         float64
Avg Rel Hum (%)         float64
Dew Point (F)           float64
Avg Wind Speed (mph)    float64
Wind Run (miles)        float64
Avg Soil Temp (F)       float64
dtype: object

In [67]:
# Check the data type of each column in electricity_df2
electricity_df2.dtypes

County     object
Year        int64
MWh       float64
dtype: object

# ---------------------------------------------------------------------

# Creating Extra Data Sets for Visualization

In [68]:
# Load the original (unprocessed) Kaggle fire data set for the visualization work
og_fire_csv = "../Data/California_Fire_Incidents.csv"
og_fire_df = pd.read_csv(og_fire_csv)
og_fire_df.head()

Unnamed: 0,AcresBurned,Active,AdminUnit,AirTankers,ArchiveYear,CalFireIncident,CanonicalUrl,ConditionStatement,ControlStatement,Counties,...,SearchKeywords,Started,Status,StructuresDamaged,StructuresDestroyed,StructuresEvacuated,StructuresThreatened,UniqueId,Updated,WaterTenders
0,257314.0,False,Stanislaus National Forest/Yosemite National Park,,2013,True,/incidents/2013/8/17/rim-fire/,,,Tuolumne,...,"Rim Fire, Stanislaus National Forest, Yosemite...",2013-08-17T15:25:00Z,Finalized,,,,,5fb18d4d-213f-4d83-a179-daaf11939e78,2013-09-06T18:30:00Z,
1,30274.0,False,USFS Angeles National Forest/Los Angeles Count...,,2013,True,/incidents/2013/5/30/powerhouse-fire/,,,Los Angeles,...,"Powerhouse Fire, May 2013, June 2013, Angeles ...",2013-05-30T15:28:00Z,Finalized,,,,,bf37805e-1cc2-4208-9972-753e47874c87,2013-06-08T18:30:00Z,
2,27531.0,False,CAL FIRE Riverside Unit / San Bernardino Natio...,,2013,True,/incidents/2013/7/15/mountain-fire/,,,Riverside,...,"Mountain Fire, July 2013, Highway 243, Highway...",2013-07-15T13:43:00Z,Finalized,,,,,a3149fec-4d48-427c-8b2c-59e8b79d59db,2013-07-30T18:00:00Z,
3,27440.0,False,Tahoe National Forest,,2013,False,/incidents/2013/8/10/american-fire/,,,Placer,...,"American Fire, August 2013, Deadwood Ridge, Fo...",2013-08-10T16:30:00Z,Finalized,,,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625,2013-08-30T08:00:00Z,
4,24251.0,False,Ventura County Fire/CAL FIRE,,2013,True,/incidents/2013/5/2/springs-fire/,Acreage has been reduced based upon more accur...,,Ventura,...,"Springs Fire, May 2013, Highway 101, Camarillo...",2013-05-02T07:01:00Z,Finalized,6.0,10.0,,,46731fb8-3350-4920-bdf7-910ac0eb715c,2013-05-11T06:30:00Z,11.0


In [69]:
# The Counties column of og_fire_df contains a few entries that are not counties
# ('Mexico', 'State of Oregon', 'State of Nevada'); remove those rows in place
unwanted_county1_og_fire_df = og_fire_df.index[og_fire_df["Counties"] == "Mexico"]
unwanted_county2_og_fire_df = og_fire_df.index[og_fire_df["Counties"] == "State of Oregon"]
unwanted_county3_og_fire_df = og_fire_df.index[og_fire_df["Counties"] == "State of Nevada"]

# All three label sets are collected before any row is removed, then dropped one by one
for unwanted_rows in (
    unwanted_county1_og_fire_df,
    unwanted_county2_og_fire_df,
    unwanted_county3_og_fire_df,
):
    og_fire_df.drop(unwanted_rows, inplace=True)

og_fire_df.head()

Unnamed: 0,AcresBurned,Active,AdminUnit,AirTankers,ArchiveYear,CalFireIncident,CanonicalUrl,ConditionStatement,ControlStatement,Counties,...,SearchKeywords,Started,Status,StructuresDamaged,StructuresDestroyed,StructuresEvacuated,StructuresThreatened,UniqueId,Updated,WaterTenders
0,257314.0,False,Stanislaus National Forest/Yosemite National Park,,2013,True,/incidents/2013/8/17/rim-fire/,,,Tuolumne,...,"Rim Fire, Stanislaus National Forest, Yosemite...",2013-08-17T15:25:00Z,Finalized,,,,,5fb18d4d-213f-4d83-a179-daaf11939e78,2013-09-06T18:30:00Z,
1,30274.0,False,USFS Angeles National Forest/Los Angeles Count...,,2013,True,/incidents/2013/5/30/powerhouse-fire/,,,Los Angeles,...,"Powerhouse Fire, May 2013, June 2013, Angeles ...",2013-05-30T15:28:00Z,Finalized,,,,,bf37805e-1cc2-4208-9972-753e47874c87,2013-06-08T18:30:00Z,
2,27531.0,False,CAL FIRE Riverside Unit / San Bernardino Natio...,,2013,True,/incidents/2013/7/15/mountain-fire/,,,Riverside,...,"Mountain Fire, July 2013, Highway 243, Highway...",2013-07-15T13:43:00Z,Finalized,,,,,a3149fec-4d48-427c-8b2c-59e8b79d59db,2013-07-30T18:00:00Z,
3,27440.0,False,Tahoe National Forest,,2013,False,/incidents/2013/8/10/american-fire/,,,Placer,...,"American Fire, August 2013, Deadwood Ridge, Fo...",2013-08-10T16:30:00Z,Finalized,,,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625,2013-08-30T08:00:00Z,
4,24251.0,False,Ventura County Fire/CAL FIRE,,2013,True,/incidents/2013/5/2/springs-fire/,Acreage has been reduced based upon more accur...,,Ventura,...,"Springs Fire, May 2013, Highway 101, Camarillo...",2013-05-02T07:01:00Z,Finalized,6.0,10.0,,,46731fb8-3350-4920-bdf7-910ac0eb715c,2013-05-11T06:30:00Z,11.0


# ---------------------------------------------------------------------

# Export Housing CSV, Electricity CSV, and Fire CSV
## Once the Environment Data has all counties **except** Imperial and San Francisco, we can merge ALL datasets and export them as a CSV

In [70]:
# # Export all the CSVs
# housing_df2.to_csv("../Data_Sets_for_Visualization/housing_visualization.csv")
# electricity_df2.to_csv("../Data_Sets_for_Visualization/electricity_visualization.csv")
# fire_df.to_csv("../Data_Sets_for_Visualization/fire_visualization.csv")

# ---------------------------------------------------------------------

# Correcting Incorrect Fire Coordinates

In [71]:
# Pull the Latitude column out of fire_df and keep it under the name lat
lat = fire_df.loc[:, "Latitude"]
lat

0       37.86
1       34.59
2       33.71
3       39.12
4        0.00
        ...  
1631    33.83
1632    39.41
1633    38.73
1634    33.35
1635    33.45
Name: Latitude, Length: 1632, dtype: float64

In [72]:
# Turn lat into a plain Python list of latitude values
lat = [latitude for latitude in lat]
lat

[37.86,
 34.59,
 33.71,
 39.12,
 0.0,
 37.28,
 33.86,
 41.32,
 41.03,
 40.04,
 40.5,
 32.95,
 40.19,
 34.79,
 34.79,
 38.82,
 34.29,
 37.91,
 33.12,
 39.45,
 33.34,
 36.21,
 34.55,
 37.58,
 33.62,
 33.04,
 38.25,
 0.0,
 35.71,
 34.88,
 34.49,
 34.9,
 38.77,
 34.07,
 40.01,
 33.89,
 41.24,
 35.16,
 37.97,
 37.02,
 34.3,
 36.6,
 38.63,
 37.16,
 0.0,
 39.44,
 38.05,
 39.43,
 0.0,
 39.9,
 37.12,
 34.15,
 32.71,
 34.35,
 37.78,
 38.73,
 0.0,
 39.51,
 0.0,
 34.2,
 40.01,
 40.54,
 39.96,
 38.38,
 34.61,
 38.67,
 38.64,
 38.64,
 38.35,
 38.18,
 0.0,
 33.99,
 37.44,
 36.72,
 0.0,
 34.42,
 36.26,
 38.64,
 38.09,
 37.04,
 33.96,
 38.88,
 38.68,
 34.29,
 36.77,
 36.15,
 0.0,
 37.92,
 32.56,
 38.93,
 41.73,
 33.64,
 40.89,
 38.28,
 35.44,
 39.66,
 37.19,
 33.96,
 35.63,
 39.04,
 39.73,
 0.0,
 0.0,
 34.32,
 37.17,
 34.18,
 38.44,
 34.0,
 39.21,
 0.0,
 39.01,
 41.57,
 34.38,
 38.84,
 33.91,
 39.33,
 35.46,
 39.04,
 41.92,
 37.14,
 37.74,
 35.51,
 38.06,
 37.03,
 38.98,
 0.0,
 40.37,
 33.97,
 37.67,
 

In [73]:
# Pull the Longitude column out of fire_df and keep it under the name lng
lng = fire_df.loc[:, "Longitude"]
lng

0      -120.09
1      -118.42
2      -116.73
3      -120.65
4         0.00
         ...  
1631   -117.50
1632   -121.00
1633   -121.73
1634   -117.40
1635   -116.06
Name: Longitude, Length: 1632, dtype: float64

In [74]:
# Turn lng into a plain Python list of longitude values
lng = [longitude for longitude in lng]
lng

[-120.09,
 -118.42,
 -116.73,
 -120.65,
 0.0,
 -119.32,
 -116.9,
 -123.18,
 -123.49,
 -121.85,
 -122.54,
 -116.47,
 -121.6,
 -118.94,
 -118.94,
 -122.85,
 -116.94,
 -121.88,
 -116.53,
 -121.38,
 -117.31,
 -118.44,
 -119.82,
 -119.91,
 -117.4,
 -116.52,
 -120.03,
 0.0,
 -118.59,
 -118.92,
 -118.61,
 -118.93,
 -120.3,
 -117.05,
 -120.76,
 -116.86,
 -121.03,
 -120.05,
 -121.91,
 -119.78,
 -117.6,
 -118.02,
 -122.29,
 -120.94,
 0.0,
 -121.5,
 -120.9,
 -121.58,
 0.0,
 -121.74,
 -119.64,
 -117.91,
 -116.75,
 -117.62,
 -121.74,
 -121.38,
 0.0,
 -122.56,
 0.0,
 -117.42,
 -122.08,
 -121.38,
 -122.33,
 -122.31,
 -120.07,
 -122.09,
 -121.08,
 -121.08,
 -122.65,
 -122.51,
 0.0,
 -117.16,
 -121.2,
 -119.22,
 0.0,
 -118.59,
 -118.46,
 -122.66,
 -122.14,
 -121.01,
 -117.19,
 -122.57,
 -120.84,
 -117.45,
 -119.63,
 -120.9,
 0.0,
 -120.33,
 -116.9,
 -122.97,
 -120.38,
 -116.71,
 -120.96,
 -120.97,
 -120.84,
 -121.5,
 -119.79,
 -116.65,
 -120.83,
 -123.15,
 -121.71,
 0.0,
 0.0,
 -117.35,
 -119.76,
 -117

In [75]:
# Pair each latitude with the longitude at the same position.
# zip() returns a lazy iterator (displayed below as a zip object, not the pairs);
# it yields its pairs only once, when something iterates over it.
lat_lng = zip(lat, lng)
lat_lng

<zip at 0x7fd9ac0c4e40>

In [76]:
# Adding latitudes and longitudes to a list of (latitude, longitude) tuples.
# Zip lat and lng afresh here instead of draining lat_lng: a zip object can be
# iterated only once, so list(lat_lng) would silently return [] if this cell
# were run a second time without re-running the cell that creates lat_lng.
coordinates = list(zip(lat, lng))

In [77]:
from citipy import citipy

In [78]:
# Create a list for holding the cities (unique names, in first-seen order).
cities = []
# Companion set of the names already recorded, so the uniqueness check is a
# hash lookup instead of a scan through the growing cities list.
seen_cities = set()
# Identify the nearest city for each latitude and longitude combination.
for latitude, longitude in coordinates:
    city = citipy.nearest_city(latitude, longitude).city_name

    # If the city is unique, then we will add it to the cities list.
    if city not in seen_cities:
        seen_cities.add(city)
        cities.append(city)
# Print the city count to confirm sufficient count.
len(cities)

228

In [79]:
# Looking at cities list (the unique nearest-city names found for the fire coordinates)
cities

['merced',
 'quartz hill',
 'east hemet',
 'placerville',
 'takoradi',
 'clovis',
 'banning',
 'ashland',
 'arcata',
 'chico',
 'redding',
 'alpine',
 'magalia',
 'fillmore',
 'healdsburg',
 'yucaipa',
 'clayton',
 'ramona',
 'oroville',
 'fallbrook',
 'porterville',
 'goleta',
 'chowchilla',
 'lake elsinore',
 'south lake tahoe',
 'oildale',
 'arvin',
 'santa clarita',
 'susanville',
 'santa maria',
 'rancho cucamonga',
 'ridgecrest',
 'napa',
 'los banos',
 'oakdale',
 'paradise',
 'azusa',
 'livermore',
 'north highlands',
 'clearlake',
 'rialto',
 'red bluff',
 'isla vista',
 'woodland',
 'cameron park',
 'rohnert park',
 'novato',
 'redlands',
 'patterson',
 'orange cove',
 'windsor',
 'benicia',
 'moreno valley',
 'crestline',
 'king city',
 'tijuana',
 'ukiah',
 'galt',
 'morro bay',
 'madera',
 'desert hot springs',
 'atascadero',
 'fontana',
 'linda',
 'altamont',
 'santa paula',
 'elko',
 'beaumont',
 'grass valley',
 'morgan hill',
 'tracy',
 'south san francisco',
 'east fo

In [81]:
pip install reverse_geocoder

Collecting reverse_geocoder
  Downloading reverse_geocoder-1.5.1.tar.gz (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 1.8 MB/s eta 0:00:01
Building wheels for collected packages: reverse-geocoder
  Building wheel for reverse-geocoder (setup.py) ... [?25ldone
[?25h  Created wheel for reverse-geocoder: filename=reverse_geocoder-1.5.1-py3-none-any.whl size=2268089 sha256=1b4beb32b1ae61c5d735a1ca0acb551d8ebea11a5856c64b5fb2fae6fa2e8329
  Stored in directory: /Users/miabroad/Library/Caches/pip/wheels/65/42/5e/223fcd5dc869ff98d5ee6b19e236f82828e4e3c039328ebe20
Successfully built reverse-geocoder
Installing collected packages: reverse-geocoder
Successfully installed reverse-geocoder-1.5.1
You should consider upgrading via the '/Users/miabroad/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [82]:
# Import dependencies
import reverse_geocoder as rg
import scipy

In [90]:
# Reverse-geocode every (latitude, longitude) pair in coordinates to the nearest known place
results = rg.search(coordinates, mode=2)  # mode=2 is the default mode
results

[{'lat': '37.9627',
  'lon': '-120.2413',
  'name': 'Tuolumne City',
  'admin1': 'California',
  'admin2': 'Tuolumne County',
  'cc': 'US',
  4: 'California'},
 {'lat': '34.62165',
  'lon': '-118.41397',
  'name': 'Green Valley',
  'admin1': 'California',
  'admin2': 'Los Angeles County',
  'cc': 'US',
  4: 'California'},
 {'lat': '33.74002',
  'lon': '-116.71891',
  'name': 'Idyllwild',
  'admin1': 'California',
  'admin2': 'Riverside County',
  'cc': 'US',
  4: 'California'},
 {'lat': '39.02018',
  'lon': '-120.81799',
  'name': 'Foresthill',
  'admin1': 'California',
  'admin2': 'Placer County',
  'cc': 'US',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '37.08078',
  'lon': '-119.48541',
  'name': 'Auberry',
  'admin1': 'California',
  'admin2': 'Fresno County',
  'cc': 'US',
  4: 'California'},
 {'lat': '33.92557',
  'lon': '-116.87641',
  'name': 'Banning',
  'ad

In [109]:
# Number of reverse-geocoding results (one result per entry in coordinates is expected)
len(results)

1632

In [103]:
# Collect every reverse-geocoded result whose admin1 field is not 'California'
not_in_ca_cities = [place for place in results if place['admin1'] != 'California']

In [107]:
# Inspect the results that resolved to a place outside California
not_in_ca_cities

[{'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name': 'Takoradi',
  'admin1': 'Western',
  'admin2': '',
  'cc': 'GH',
  4: 'California'},
 {'lat': '4.88447',
  'lon': '-1.75536',
  'name

In [108]:
# Count how many results resolved to a place outside California
len(not_in_ca_cities)

202

In [100]:
# Take the first reverse-geocoding result to inspect the structure of a single entry
test_dict = results[0]
test_dict

{'lat': '37.9627',
 'lon': '-120.2413',
 'name': 'Tuolumne City',
 'admin1': 'California',
 'admin2': 'Tuolumne County',
 'cc': 'US',
 4: 'California'}

In [92]:
# Look up the value stored under the integer key 4 of test_dict.
# NOTE(review): the key 4 sits next to the string keys ('lat', 'name', ...) in the
# displayed dict; the code that adds it is not visible in this part of the
# notebook - confirm where it comes from.
test_dict[4]

'California'