In [1]:
import csv
import pandas as pd
import numpy as np
import operator

In [18]:
# Load the air-temperature table; the 'Date' column is parsed to datetime on read.
df = pd.read_csv(
    '../data/final/air_temperature.csv',
    parse_dates=['Date'],
)
# Discard the first column of the file, whatever its name is
# (presumably a saved row index -- TODO confirm against the CSV).
df = df.drop(columns=df.columns[0])

In [22]:
# Build the cleaned temperature table in a single pass:
#   1. rename Date/County to the lower-case names used by the other tables,
#   2. derive maxTemp/minTemp as (value / 10 * 1.8) + 32
#      (presumably tenths of a degree Celsius -> Fahrenheit; confirm with the data source),
#   3. drop the raw TMAX/TMIN columns, which are no longer needed,
#   4. keep only rows dated within [2003-01-01, 2020-01-01).
df = (
    df.rename(columns={'Date': 'date', 'County': 'county'})
    .assign(
        maxTemp=lambda t: (t['TMAX'] / 10 * 1.8) + 32,
        minTemp=lambda t: (t['TMIN'] / 10 * 1.8) + 32,
    )
    .drop(columns=['TMAX', 'TMIN'])
    .loc[
        lambda t: (t['date'] >= np.datetime64('2003-01-01'))
        & (t['date'] < np.datetime64('2020-01-01'))
    ]
)
temp = df
temp

Unnamed: 0,county,date,maxTemp,minTemp
0,Alameda,2003-01-01,55.760000,39.200000
1,Alameda,2003-01-02,61.178000,47.408000
2,Alameda,2003-01-03,61.610000,44.816000
3,Alameda,2003-01-04,60.292727,46.743636
4,Alameda,2003-01-05,67.721818,50.196364
...,...,...,...,...
377254,Yuba,2019-12-27,53.720000,33.320000
377255,Yuba,2019-12-28,55.940000,36.320000
377256,Yuba,2019-12-29,51.320000,36.320000
377257,Yuba,2019-12-30,59.000000,36.020000


In [23]:
# Load the air-quality table; 'date' is parsed to datetime on read.
df = pd.read_csv(
    '../data/final/airquality.csv',
    parse_dates=['date'],
)

In [25]:
# Rename the 'measure' column to 'airQuality' and keep the result under a
# dedicated name for the join further down.
df = df.rename(columns={'measure': 'airQuality'})
airquality = df
airquality

Unnamed: 0,date,county,airQuality
0,2003-01-01,Alameda,17.433333
1,2003-01-01,Alpine,27.666667
2,2003-01-01,Amador,24.666667
3,2003-01-01,Butte,15.944444
4,2003-01-01,Calaveras,30.555556
...,...,...,...
360117,2019-12-31,Tulare,10.700000
360118,2019-12-31,Tuolumne,20.178056
360119,2019-12-31,Ventura,1.509722
360120,2019-12-31,Yolo,13.786957


In [41]:
# Load the cleaned CAL FIRE table; 'Started' is parsed to datetime on read.
df = pd.read_csv(
    '../data/final/calfire_clean.csv',
    parse_dates=['Started'],
)

In [44]:
# Standardize column names so they match the other tables.
df = df.rename(columns={'Started': 'date', 'Counties': 'county', 'AcresBurned': 'acresBurned'})
# Keep only fires whose start date falls within [2003-01-01, 2020-01-01).
df = df[(df['date'] >= np.datetime64('2003-01-01')) & (df['date'] < np.datetime64('2020-01-01'))]
# Aggregate by county and date: total acres burned per (county, date) pair.
# Summing the single column directly yields flat 'county', 'date',
# 'acresBurned' columns, so there is no two-level header to rename and
# flatten afterwards (the previous flattening went through the deprecated
# Index.ravel()).
df = df.groupby(['county', 'date'])['acresBurned'].sum().reset_index()
fires = df
fires

Unnamed: 0,county,date,acresBurned
0,Alameda,2006-07-11,6400.0
1,Alameda,2009-08-13,12500.0
2,Alameda,2010-06-18,475.0
3,Alameda,2011-06-14,175.0
4,Alameda,2011-07-14,400.0
...,...,...,...
2056,Yuba,2017-10-08,9989.0
2057,Yuba,2019-06-08,70.0
2058,Yuba,2019-06-24,80.0
2059,Yuba,2019-06-25,80.0


In [53]:
# Write the fire rows with a non-zero burned area to a file.
# NOTE: the filter is `!= 0`, so rows whose acresBurned is NaN (if any) are
# kept; only exact zeros are removed.
fires_marks = fires[fires['acresBurned'] != 0]
fires_marks.to_csv('../data/final/fires_marks.csv')
# Summary statistics come last so the notebook actually renders them; as a
# mid-cell statement the result was computed and silently thrown away.
fires_marks.describe()

In [54]:
# Load the rainfall table; 'date' is parsed to datetime on read.
df = pd.read_csv(
    '../data/final/water.csv',
    parse_dates=['date'],
)

In [55]:
# Rename 'daily_rain_inches' to 'dailyRain' and keep the result under a
# dedicated name for the join further down.
df = df.rename(columns={'daily_rain_inches': 'dailyRain'})
rain = df
rain

In [57]:
# Join data: left-join air quality, fires and rainfall onto the temperature
# rows by (date, county), so every temperature row is kept even when another
# table has no matching entry (the missing values become NaN).
merged = (
    temp
    .merge(airquality, how='left', on=['date', 'county'])
    .merge(fires, how='left', on=['date', 'county'])
    .merge(rain, how='left', on=['date', 'county'])
)
merged

Unnamed: 0,county,date,maxTemp,minTemp,airQuality,acresBurned
0,Alameda,2003-01-01,55.760000,39.200000,17.433333,
1,Alameda,2003-01-02,61.178000,47.408000,17.433333,
2,Alameda,2003-01-03,61.610000,44.816000,17.433333,
3,Alameda,2003-01-04,60.292727,46.743636,14.788889,
4,Alameda,2003-01-05,67.721818,50.196364,12.144444,
...,...,...,...,...,...,...
359432,Yuba,2019-12-27,53.720000,33.320000,9.439130,
359433,Yuba,2019-12-28,55.940000,36.320000,17.576667,
359434,Yuba,2019-12-29,51.320000,36.320000,19.130000,
359435,Yuba,2019-12-30,59.000000,36.020000,8.877778,


In [None]:
# Write the joined table to csv.
# `df_final` was never defined in this notebook, so the cell raised NameError;
# `merged` (built in the join cell) is the only assembled frame, so that is
# what gets written. NOTE(review): confirm no further post-processing step was
# intended between the join and this export.
merged.to_csv('../data/final/final_data.csv')