In [65]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point # Used in conversion of pandas dataframe to a geodataframe
from geopandas import GeoDataFrame # Needed to convert pandas dataframe to geodataframe
from pandas.io.json import json_normalize
import re # Used for regular expressions
import requests # # Used to pull down Tiger shapefiles from Census site
import zipfile # Used to zip contents of request from Census site
import io # Used in zipping process of files
import fiona
fiona.Env(); # Used to avoid errors when writing dataframe to geojson

### Script to combine all of the WNV JSON files into a single JSON

In [66]:
import json
import glob

# Empty list for JSON data
json_data = []

# Loops through JSON files in data dir and extract the contents of each and append to the empty list declared above
# Glob would would grab files randomly, using sorted to have glob pickup file sequentially. 
for file in sorted(glob.glob('./data/wnv/*.json')):
    
    # Open and read file in binary mode to keep data as is and not convert anything (e.g. new line, end of line) 
    with open(file, 'rb') as infile:
        
        # Append the currently open file to the json_data list
        json_data.append(json.load(infile))

# Output JSON data to new file 
with open('./data/wnv/output/wnv2003_2018.json', 'w') as outfile:
    json.dump(json_data, outfile)


### Read in the newly created JSON file

In [67]:
json_data = pd.read_json('./data/wnv/output/wnv2003_2018.json')

In [68]:
json_data.head()

Unnamed: 0,dead_bird,pool,sentinel,surveillance
0,[],"[[-115.55991333277777, 32.79452379068916, El C...",[],"[[-0.0037874454854030003, 0.004151121647450000..."
1,[],"[[-115.70874635314847, 32.760950501453685, See...","[[-115.70466298389093, 32.76097984589758, Seel...","[[0.00040120886695000003, -0.00237453044225900..."
2,[],"[[-115.70778129001854, 32.76645731969914, Seel...","[[-115.70380745397101, 32.76212619660325, Seel...","[[0.000582078219765, 0.002604673449621, Pico R..."
3,"[[-117.06524547408917, 32.57587171177409, San ...","[[-115.70536399338327, 33.06352430738503, West...","[[-115.7063219308171, 32.765271649553355, Seel...","[[0.0042305860051320004, 0.001971813442978, Un..."
4,"[[-117.08130333018387, 32.64154680042579, Chul...","[[-117.03840702222823, 32.59563784834163, Chul...","[[-115.7048189736365, 32.75950529599969, Seele...","[[-0.004922821113662001, -0.004294370979541, C..."


### Read in the JSON without assigning to a dataframe instead.  Seems like it may be easier to work with it this way

In [69]:
with open('./data/wnv/output/wnv2003_2018.json') as f:
    d = json.load(f)


### Assign json objects to year variables

In [70]:
year_03 = d[0]
year_04 = d[1]
year_05 = d[2]
year_06 = d[3]
year_07 = d[4]
year_08 = d[5]
year_09 = d[6]
year_10 = d[7]
year_11 = d[8]
year_12 = d[9]
year_13 = d[10]
year_14 = d[11]
year_15 = d[12]
year_16 = d[13]
year_17 = d[14]
year_18 = d[15]

### Assign each specimen type (except surveillance) to a variable to create a dataframe if it has data

In [71]:
year_03_pool = json_normalize(year_03, record_path='pool')

year_04_sentinel = json_normalize(year_04, record_path='sentinel')
year_04_pool = json_normalize(year_04, record_path='pool')

year_05_sentinel = json_normalize(year_05, record_path='sentinel')
year_05_pool = json_normalize(year_05, record_path='pool')

year_06_sentinel = json_normalize(year_06, record_path='sentinel')
year_06_dead_bird = json_normalize(year_06, record_path='dead_bird')
year_06_pool = json_normalize(year_06, record_path='pool')

year_07_sentinel = json_normalize(year_07, record_path='sentinel')
year_07_dead_bird = json_normalize(year_07, record_path='dead_bird')
year_07_pool = json_normalize(year_07, record_path='pool')

year_08_sentinel = json_normalize(year_08, record_path='sentinel')
year_08_dead_bird = json_normalize(year_08, record_path='dead_bird')
year_08_pool = json_normalize(year_08, record_path='pool')

year_09_sentinel = json_normalize(year_09, record_path='sentinel')
year_09_dead_bird = json_normalize(year_09, record_path='dead_bird')
year_09_pool = json_normalize(year_09, record_path='pool')

year_10_sentinel = json_normalize(year_10, record_path='sentinel')
year_10_dead_bird = json_normalize(year_10, record_path='dead_bird')
year_10_pool = json_normalize(year_10, record_path='pool')

year_11_sentinel = json_normalize(year_11, record_path='sentinel')
year_11_dead_bird = json_normalize(year_11, record_path='dead_bird')
year_11_pool = json_normalize(year_11, record_path='pool')

year_12_sentinel = json_normalize(year_12, record_path='sentinel')
year_12_dead_bird = json_normalize(year_12, record_path='dead_bird')
year_12_pool = json_normalize(year_12, record_path='pool')

year_13_sentinel = json_normalize(year_13, record_path='sentinel')
year_13_dead_bird = json_normalize(year_13, record_path='dead_bird')
year_13_pool = json_normalize(year_13, record_path='pool')

year_14_sentinel = json_normalize(year_14, record_path='sentinel')
year_14_dead_bird = json_normalize(year_14, record_path='dead_bird')
year_14_pool = json_normalize(year_14, record_path='pool')

year_15_sentinel = json_normalize(year_15, record_path='sentinel')
year_15_dead_bird = json_normalize(year_15, record_path='dead_bird')
year_15_pool = json_normalize(year_15, record_path='pool')

year_16_sentinel = json_normalize(year_16, record_path='sentinel')
year_16_dead_bird = json_normalize(year_16, record_path='dead_bird')
year_16_pool = json_normalize(year_16, record_path='pool')

year_17_sentinel = json_normalize(year_17, record_path='sentinel')
year_17_dead_bird = json_normalize(year_17, record_path='dead_bird')
year_17_pool = json_normalize(year_17, record_path='pool')

year_18_sentinel = json_normalize(year_18, record_path='sentinel')
year_18_dead_bird = json_normalize(year_18, record_path='dead_bird')
year_18_pool = json_normalize(year_18, record_path='pool')

### Function to cleanup the resulting dataframes a bit

In [72]:
# Pass the function a dataframe to work with
def df_cleanup(dataframes):
    for pdf in dataframes:
        
        # Rename columns
        pdf.rename(columns={0: 'lon', 1: 'lat', 2 : 'city', 3 : 'collections', 4 : 'virus', 5 : 'date'}, inplace=True)
        
        # Ensure all city names are capitalized
        pdf['city'] = pdf['city'].str.title()
        
        # Round coordinates to 6 decimal places
        pdf['lat'] = pdf['lat'].round(6)
        pdf['lon'] = pdf['lon'].round(6)
        
        # Add a spectype column which will be populated later
        pdf['spectype'] = ''

### List of our dataframes which will be passed to the df_cleanup function

In [73]:
json_dfs = [year_03_pool, 
            year_04_sentinel, 
            year_04_pool, 
            year_05_sentinel, 
            year_05_pool, 
            year_06_sentinel, 
            year_06_dead_bird, 
            year_06_pool, 
            year_07_sentinel, 
            year_07_dead_bird, 
            year_07_pool, 
            year_08_sentinel, 
            year_08_dead_bird, 
            year_08_pool, 
            year_09_sentinel, 
            year_09_dead_bird, 
            year_09_pool, 
            year_10_sentinel,
            year_10_dead_bird, 
            year_10_pool, 
            year_11_sentinel,
            year_11_dead_bird, 
            year_11_pool, 
            year_12_sentinel, 
            year_12_dead_bird, 
            year_12_pool, 
            year_13_sentinel, 
            year_13_dead_bird, 
            year_13_pool, 
            year_14_sentinel, 
            year_14_dead_bird, 
            year_14_pool, 
            year_15_sentinel, 
            year_15_dead_bird, 
            year_15_pool, 
            year_16_sentinel, 
            year_16_dead_bird, 
            year_16_pool, 
            year_17_sentinel, 
            year_17_dead_bird, 
            year_17_pool, 
            year_18_sentinel, 
            year_18_dead_bird, 
            year_18_pool
           ] 

### Pass our list of dataframes to *df_cleanup* to tidy things up a little bit

In [74]:
df_cleanup(json_dfs)

### Populate the *spectype* column in the dataframes

In [75]:
year_03_pool['spectype'] = 'Mosquito Pool' 

year_04_sentinel['spectype'] = 'Sentinel Chicken' 
year_04_pool['spectype'] = 'Mosquito Pool' 

year_05_sentinel['spectype'] = 'Sentinel Chicken'
year_05_pool['spectype'] = 'Mosquito Pool' 

year_06_sentinel['spectype'] = 'Sentinel Chicken'
year_06_dead_bird['spectype'] = 'Dead Bird'
year_06_pool['spectype'] = 'Mosquito Pool' 

year_07_sentinel['spectype'] = 'Sentinel Chicken'
year_07_dead_bird['spectype'] = 'Dead Bird'
year_07_pool['spectype'] = 'Mosquito Pool' 

year_08_sentinel['spectype'] = 'Sentinel Chicken'
year_08_dead_bird['spectype'] = 'Dead Bird'
year_08_pool['spectype'] = 'Mosquito Pool' 

year_09_sentinel['spectype'] = 'Sentinel Chicken'
year_09_dead_bird['spectype'] = 'Dead Bird'
year_09_pool['spectype'] = 'Mosquito Pool' 

year_10_sentinel['spectype'] = 'Sentinel Chicken'
year_10_dead_bird['spectype'] = 'Dead Bird'
year_10_pool['spectype'] = 'Mosquito Pool' 

year_11_sentinel['spectype'] = 'Sentinel Chicken'
year_11_dead_bird['spectype'] = 'Dead Bird'
year_11_pool['spectype'] = 'Mosquito Pool' 

year_12_sentinel['spectype'] = 'Sentinel Chicken'
year_12_dead_bird['spectype'] = 'Dead Bird'
year_12_pool['spectype'] = 'Mosquito Pool' 

year_13_sentinel['spectype'] = 'Sentinel Chicken'
year_13_dead_bird['spectype'] = 'Dead Bird'
year_13_pool['spectype'] = 'Mosquito Pool' 

year_14_sentinel['spectype'] = 'Sentinel Chicken'
year_14_dead_bird['spectype'] = 'Dead Bird'
year_14_pool['spectype'] = 'Mosquito Pool' 

year_15_sentinel['spectype'] = 'Sentinel Chicken'
year_15_dead_bird['spectype'] = 'Dead Bird'
year_15_pool['spectype'] = 'Mosquito Pool' 

year_16_sentinel['spectype'] = 'Sentinel Chicken'
year_16_dead_bird['spectype'] = 'Dead Bird'
year_16_pool['spectype'] = 'Mosquito Pool' 

year_17_sentinel['spectype'] = 'Sentinel Chicken'
year_17_dead_bird['spectype'] = 'Dead Bird'
year_17_pool['spectype'] = 'Mosquito Pool' 

year_18_sentinel['spectype'] = 'Sentinel Chicken'
year_18_dead_bird['spectype'] = 'Dead Bird'
year_18_pool['spectype'] = 'Mosquito Pool' 

In [76]:
year_18_pool.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746 entries, 0 to 745
Data columns (total 7 columns):
lon            746 non-null float64
lat            746 non-null float64
city           693 non-null object
collections    746 non-null int64
virus          746 non-null object
date           746 non-null object
spectype       746 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 40.9+ KB


### Merge all our data frames and using `concat` since they all have the same columns

In [77]:
merged_dfs = pd.concat(json_dfs)

### Work with the date field to strip out the square brackets and remove the intergers in between the data strings.
### Thank you to Professor Cooper for helping in tweaking some initial code I had to best work with this field

In [78]:
# Take intial look
merged_dfs['date'].head()

0                                     [[9-16-2003, 1]]
1                                     [[7-16-2003, 1]]
2                                     [[9-16-2003, 1]]
3                      [[8-19-2003, 2], [9-2-2003, 2]]
4    [[8-4-2003, 1], [8-19-2003, 2], [9-2-2003, 3],...
Name: date, dtype: object

In [79]:
# Empty list which will eventually hold our date strings/groups with integers and square brackets stripped out
date_items = []

# Loop through the date field of the dataframe
for date_group in merged_dfs['date']:
    
    # Empty list which will hold dates pulled out from the date column
    extracted_dates_list = []
    # Loop through the group of dates pulled from the date column (some may only be a single list)
    # Pull out the first entry in the list and append it to the extracted dates list
    for date in date_group:
        extracted_dates_list.append(date[0])
    
    # Separate the extracted dates with a comma and a space if there is more than one
    cs = ', '
    cs = cs.join(extracted_dates_list)
    # Add our comma separated dates to the date_items list
    date_items.append(cs)  

In [80]:
# Replace the current date column in the dataframe with out newly created date_items list
merged_dfs['date'] = date_items

In [81]:
# Take a look after stripping out brackets
merged_dfs['date'].head()

0                                   9-16-2003
1                                   7-16-2003
2                                   9-16-2003
3                         8-19-2003, 9-2-2003
4    8-4-2003, 8-19-2003, 9-2-2003, 9-16-2003
Name: date, dtype: object

### Subset the dataframe since there are 322 null values in the city column

In [82]:
wnv_city_populated = merged_dfs.loc[merged_dfs['city'].notnull()]
wnv_null_city = merged_dfs.loc[merged_dfs['city'].isnull()]

In [83]:
wnv_null_city.head()

Unnamed: 0,lon,lat,city,collections,virus,date,spectype
16,-117.352159,33.672344,,7,WNV,"8-25-2004, 10-6-2004, 10-21-2004, 11-3-2004, 1...",Sentinel Chicken
33,-117.400191,33.888835,,3,WNV,"7-14-2004, 8-11-2004",Sentinel Chicken
46,-117.854757,34.027711,,3,WNV,"7-21-2004, 9-8-2004",Sentinel Chicken
47,-117.335613,34.026161,,6,WNV,"7-14-2004, 7-28-2004, 8-25-2004",Sentinel Chicken
49,-118.025735,34.037443,,5,WNV,"7-12-2004, 9-8-2004",Sentinel Chicken


### Reverse geocode with geopy using the lat/lon columns of the wnv_null_city dataframe

In [84]:
from geopy.geocoders import MapBox
geolocator = MapBox('pk.eyJ1IjoiYmF6aW5pNjI3IiwiYSI6ImVmZTNiM2VlZTlhMjk0NzE3MjU5YmEzZWVkYjUwNjAzIn0.idDK-FJKJn3Q7StDtBOX-g',user_agent="wnv-geocode")

# Create an address column to populate the results of the reverse gecoding with
wnv_null_city['address'] = wnv_null_city.apply(
    lambda row: geolocator.reverse((row['lat'], row['lon']), timeout=20), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [85]:
wnv_null_city.tail(100)

Unnamed: 0,lon,lat,city,collections,virus,date,spectype,address
792,-119.841071,36.815869,,1,WNV,7-27-2017,Mosquito Pool,"(2654 West Barstow Avenue, Fresno, California ..."
795,-119.829290,36.822385,,2,WNV,7-6-2017,Mosquito Pool,"(2144 West Rue Saint Michel, Fresno, Californi..."
805,-120.450826,36.851335,,1,WNV,7-20-2017,Mosquito Pool,"(1691 O Street, Firebaugh, California 93622, U..."
806,-120.442964,36.857758,,1,WNV,7-27-2017,Mosquito Pool,"(5650 Avenue 7 1/2, Firebaugh, California 9362..."
807,-120.442164,36.856835,,9,WNV,"7-18-2017, 8-1-2017, 8-22-2017, 9-6-2017",Mosquito Pool,"(7401 River Drive, Firebaugh, California 93622..."
809,-120.447686,36.855079,,1,WNV,9-14-2017,Mosquito Pool,"(Hazel M Bailey Primary School, 1691 Q St, Fir..."
812,-120.450766,36.860876,,1,WNV,7-27-2017,Mosquito Pool,"(1749 Thatcher Drive, Firebaugh, California 93..."
814,-120.454121,36.865229,,1,WNV,8-11-2017,Mosquito Pool,"(1890 7th St, Firebaugh, California 93622, Uni..."
815,-120.457265,36.862217,,1,WNV,8-18-2017,Mosquito Pool,"(874 Q Street, Firebaugh, California 93622, Un..."
816,-120.453477,36.863635,,1,WNV,8-24-2017,Mosquito Pool,"(Avenue 8 1/2, Firebaugh, California 93622, Un..."


### There are points mixed in from Utah and Arizona (and possibly others) so extract the state for right now so we can drop rows if they're not in CA

In [86]:
# Empty list to hold extracted city
states = []

# Address column to a list 
addies = wnv_null_city['address'].tolist()

# Loop through the addresses, pull out the city and append to cities list 
for addy in addies:
    
    # Convert Shapely object to a string
    addy_as_string = str(addy)
   
    # Regex looking for something like City,State zip anywhere in our address string
    state = re.search(",\s([A-z][a-z]+)\s\d{5}", addy_as_string)
        
    # If we have a match append the city to our cities list 
    if state:
        states.append(state.group(1))

print(states)

['California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'Arizona', 'California', 'Califor

### Create a state column in the data frame and add the data from the states list to it

In [38]:
wnv_null_city['state'] = states

ValueError: Length of values does not match length of index

In [87]:
# Check the length of our states array and check how many entries we have in our dataframe
print(len(states))
wnv_null_city.info()

320
<class 'pandas.core.frame.DataFrame'>
Int64Index: 322 entries, 16 to 745
Data columns (total 8 columns):
lon            322 non-null float64
lat            322 non-null float64
city           0 non-null object
collections    322 non-null int64
virus          322 non-null object
date           322 non-null object
spectype       322 non-null object
address        320 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 22.6+ KB


In [88]:
# Two entries appear to have null addresses
wnv_null_city.loc[wnv_null_city['address'].isnull()]

Unnamed: 0,lon,lat,city,collections,virus,date,spectype,address
0,0.002318,-0.001781,,91,WNV,"7-6-2009, 7-13-2009, 7-20-2009, 7-21-2009, 7-2...",Mosquito Pool,
0,0.004765,-0.003379,,1,WNV,4-3-2013,Sentinel Chicken,


### The above points appear to be bad entries (or at least have no bearing on CA) judging from the coordinates
### I also took a look at the original map source of the data and the entry for 4/3/13 is listed when the data range is narrowed but is not visible

![badDataPoint](./data/wnv/badDataPoint040313.png)

### Drop these bad points from the dataframe

In [89]:
wnv_null_city_dropped_pts = wnv_null_city.loc[wnv_null_city['address'].notnull()];
wnv_null_city_dropped_pts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 16 to 745
Data columns (total 8 columns):
lon            320 non-null float64
lat            320 non-null float64
city           0 non-null object
collections    320 non-null int64
virus          320 non-null object
date           320 non-null object
spectype       320 non-null object
address        320 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 22.5+ KB


### Add our states list to this new dataframe

In [90]:
wnv_null_city_dropped_pts['state'] = states 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Now drop any rows where the state value isn't California

In [91]:
wnv_null_city_only_ca = wnv_null_city_dropped_pts.loc[wnv_null_city_dropped_pts['state'] == 'California']

In [92]:
wnv_null_city_only_ca.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 16 to 627
Data columns (total 9 columns):
lon            240 non-null float64
lat            240 non-null float64
city           0 non-null object
collections    240 non-null int64
virus          240 non-null object
date           240 non-null object
spectype       240 non-null object
address        240 non-null object
state          240 non-null object
dtypes: float64(2), int64(1), object(6)
memory usage: 18.8+ KB


### City can now be extracted from the address column in this dataframe

In [93]:
# Empty list to hold extracted city
cities = []

# Address column to a list 
addies = wnv_null_city_only_ca['address'].tolist()

# Loop through the addresses, pull out the city and append to cities list 
for addy in addies:
    
    # Convert Shapely object to a string
    addy_as_string = str(addy)
   
    # Regex looking for something like City,California anywhere in our address string
    city_state = re.search("([A-Z][a-z]+)+,\s[C][a]+", addy_as_string)
        
    # If we have a match append the city to our cities list 
    if city_state:
        cities.append(city_state.group(1))

#print(cities)

### The null city column in our dataframe can now be replaced with our cities list

In [94]:
# Check the length of our list versue the number of items in the dataframe
print(len(cities))
wnv_null_city_only_ca.info()

240
<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 16 to 627
Data columns (total 9 columns):
lon            240 non-null float64
lat            240 non-null float64
city           0 non-null object
collections    240 non-null int64
virus          240 non-null object
date           240 non-null object
spectype       240 non-null object
address        240 non-null object
state          240 non-null object
dtypes: float64(2), int64(1), object(6)
memory usage: 18.8+ KB


In [95]:
# Matches so we can drop our original city column
wnv_null_city_only_ca['city'] = cities

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [96]:
wnv_null_city_only_ca.head()

Unnamed: 0,lon,lat,city,collections,virus,date,spectype,address,state
16,-117.352159,33.672344,Elsinore,7,WNV,"8-25-2004, 10-6-2004, 10-21-2004, 11-3-2004, 1...",Sentinel Chicken,"(17887 Lakeshore Drive, Lake Elsinore, Califor...",California
33,-117.400191,33.888835,Riverside,3,WNV,"7-14-2004, 8-11-2004",Sentinel Chicken,"(1420 Croyance Drive, Riverside, California 92...",California
46,-117.854757,34.027711,Walnut,3,WNV,"7-21-2004, 9-8-2004",Sentinel Chicken,"(20500 Simon Court, Walnut, California 91789, ...",California
47,-117.335613,34.026161,Colton,6,WNV,"7-14-2004, 7-28-2004, 8-25-2004",Sentinel Chicken,"(2650 W La Cadena Dr, Colton, California 92324...",California
49,-118.025735,34.037443,Whittier,5,WNV,"7-12-2004, 9-8-2004",Sentinel Chicken,"(San Gabriel River Freeway, Whittier, Californ...",California


### Drop the address and state columns

In [97]:
wnv_null_city_only_ca.drop(columns=['address', 'state'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


### *wnv_null_city_only_ca* and *wnv_city_populated* can be merged

In [98]:
wnv_merged = pd.concat([wnv_city_populated,wnv_null_city_only_ca], sort=True)

### With all the point in the null cities data frame that weren't in CA, check and make sure all our points are within CA
### Use [Lauren Oldja's](https://medium.com/@loldja/reading-shapefile-zips-from-a-url-in-python-3-93ea8d727856) tutorial on pulling in Tiger shapefile for CA from the Census site

In [99]:
url = 'https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_state_500k.zip'

# Use requests to pull down data form the address
r = requests.get(url)

# Use zip to bundle the files
z = zipfile.ZipFile(io.BytesIO(r.content))

# Extract files to wnv data dir
z.extractall(path='./data/wnv/cb_2017_us_state_500k/') 

### Pull in the shape file and pull out the row for CA

In [100]:
tiger_state = gpd.read_file('./data/wnv/cb_2017_us_state_500k/cb_2017_us_state_500k.shp')
tiger_ca = tiger_state.loc[tiger_state['STATEFP'] == '06']
tiger_ca.head()

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry
29,6,1779778,0400000US06,6,CA,California,0,403483182192,20484637928,"(POLYGON ((-118.604415 33.478552, -118.598783 ..."


### Reproject *tiger_ca* and *wnv_merged* to [CA Albers](http://spatialreference.org/ref/epsg/nad83-california-albers/) to be used for a point-in-polygon analysis

In [101]:
# wnv_merged needs to be converted to a GeoDataframe first
geometry = [Point(xy) for xy in zip(wnv_merged.lon, wnv_merged.lat)]
wnv_merged = wnv_merged.drop(['lon', 'lat'], axis=1)
crs = {'init': 'epsg:4326'}
wnv_merged_gdf = GeoDataFrame(wnv_merged, crs=crs, geometry=geometry)

# Now we can reproject to CA Albers
tiger_ca_albers = tiger_ca.to_crs('+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ')
wnv_merged_albers = wnv_merged_gdf.to_crs('+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ')

### Create a polygon of the *tiger_ca_albers* dataframe which will then be used to check what points intersect the polygon

In [102]:
ca_poly = tiger_ca_albers.geometry.unary_union

# Determine what points are in the polygon
wnv_points_in_ca = wnv_merged_albers[wnv_merged_albers.geometry.intersects(ca_poly)]

In [103]:
print(wnv_points_in_ca.info()) 

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 27620 entries, 0 to 627
Data columns (total 6 columns):
city           27620 non-null object
collections    27620 non-null int64
date           27620 non-null object
spectype       27620 non-null object
virus          27620 non-null object
geometry       27620 non-null object
dtypes: int64(1), object(5)
memory usage: 1.5+ MB
None


### Reproject to WGS84

In [104]:
wnv_points_in_ca.to_crs(epsg=4326, inplace=True)

Unnamed: 0,city,collections,date,spectype,virus,geometry
0,El Centro,1,9-16-2003,Mosquito Pool,WNV,POINT (-115.559913 32.79452400000002)
1,El Centro,1,7-16-2003,Mosquito Pool,WNV,POINT (-115.58498 32.79907899999996)
2,Niland,1,9-16-2003,Mosquito Pool,WNV,POINT (-115.41571 32.79871899999999)
3,Niland,4,"8-19-2003, 9-2-2003",Mosquito Pool,WNV,POINT (-115.6107 33.18109999999997)
4,Niland,8,"8-4-2003, 8-19-2003, 9-2-2003, 9-16-2003",Mosquito Pool,WNV,POINT (-115.575977 33.27748299999995)
5,Mecca,1,8-27-2003,Mosquito Pool,WNV,POINT (-115.929864 33.51479799999999)
6,Mecca,1,9-11-2003,Mosquito Pool,WNV,POINT (-115.926451 33.51748600000001)
7,Mecca,1,9-23-2003,Mosquito Pool,WNV,POINT (-116.084089 33.51548899999996)
8,Mecca,2,"8-26-2003, 9-9-2003",Mosquito Pool,WNV,POINT (-116.098347 33.52886999999998)
9,Mecca,1,9-9-2003,Mosquito Pool,WNV,POINT (-116.07588 33.52970400000002)


In [105]:
wnv_points_in_ca.to_file('./data/wnv/output/wnv2003_2018_cleaned.geojson', driver="GeoJSON")

  with fiona.drivers():
