#### Motivation

Is there a trend between population growth and stranding cases from 1985-2014?

In [1]:
import pandas as pd
import numpy as np
import math
import altair as alt
alt.data_transformers.disable_max_rows()
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import pickle
import folium

In [2]:
# Relative path prefix that is prepended to every CSV filename read in this notebook.
FOLDERPATH = '../data/'

In [3]:
# Read the yearly stranding counts per coastal region.
# stranded_geo_orig stays exactly as loaded; stranded_geo is a working copy that
# later cells modify.
stranded_geo_orig = pd.read_csv(
    FOLDERPATH + 'stranded_sea_otters/Table1_StrandedSeaOtter_byGeographicAreas_1985_2017.csv'
)
stranded_geo = stranded_geo_orig.copy()
stranded_geo.head()

Unnamed: 0,Year,Total_Strandings,North_of_Pigeon_Pt,Pigeon_Pt_to_Capitola,Capitola_to_Seaside,Seaside_to_Pt_Lobos,Pt_Lobos_to_Pt_Sur,Pt_Sur_to_Dolan_Rock,Dolan_Rock_to_C_San_Martin,C_San_Martin_to_San_Simeon_Pt,San_Simeon_Pt_to_Cayucos,Cayucos_to_Hazard_Canyon,Hazard_Canyon_to_Pismo_Beach,Pismo_Beach_to_Pt_Sal,Pt_Sal_to_Pt_Conception,SE_of_Pt_Conception
0,1985,72,0,1,14,11,5,4,0,13,6,13,1,1,1,2
1,1986,81,3,2,32,18,3,1,0,9,4,9,0,0,0,0
2,1987,94,5,0,27,19,2,1,0,11,11,8,4,1,1,4
3,1988,88,6,2,10,21,4,3,3,7,7,18,4,2,0,1
4,1989,99,2,6,23,13,3,0,1,5,11,26,5,1,3,0


The columns in the dataframe are:
1. Year: from 1985 to 2017.
2. Total Strandings: total stranding cases across all the regions in that year.
3. The remaining columns are the different regions along the coastline of California.

This dataframe reports location data by region, whereas the census/population dataset gives a location for each individual habitat. In order to make a fair analysis, we have to group the habitats in the census dataframe into the same regions used by the stranded_geo dataframe.

To do that, we first look up the latitude-longitude coordinates of each region end point on Google Maps, store them in a dataframe, and then use them to group the habitats.

**Map coordinates of the region end points:**

1. Pigeon Point: (37.18434548901108, -122.39470954049565)
2. Capitola(Beach): (36.972049429039096, -121.95152041838831)
3. Seaside(Monterey State Beach): (36.615470635247696, -121.85781056508648)
4. Point Lobos: (36.5236214892572, -121.95278200355799)
5. Point Sur(Lighthouse): (36.3069690273742, -121.90086508992727)
6. Dolan Rock: (36.08561998793649, -121.61823511757963)
7. C San Martin(Cape): (35.88982498346323, -121.46281907163161)
8. San Simeon Point: (35.63684751230898, -121.19492013310929)
9. Cayucos(Beach): (35.450113738013975, -120.90547693639098)
10. Hazard Canyon: (35.290814980161336, -120.88370600256532)
11. Pismo Beach: (35.14265430661362, -120.64387229555241)
12. Point Sal: (34.90401826260435, -120.67040841806637)
13. Point Conception: (34.449016480303904, -120.47158983231594)

In [4]:
# Raw text listing the region end points, one per line, in the form
# "<ordinal>. <name>: (<latitude>, <longitude>)". It is parsed into rows in the next cell.
coords_data = '''
        1. Pigeon Point: (37.18434548901108, -122.39470954049565)
        2. Capitola(Beach): (36.972049429039096, -121.95152041838831)
        3. Seaside(Monterey State Beach): (36.615470635247696, -121.85781056508648)
        4. Point Lobos: (36.5236214892572, -121.95278200355799)
        5. Point Sur(Lighthouse): (36.3069690273742, -121.90086508992727)
        6. Dolan Rock: (36.08561998793649, -121.61823511757963)
        7. C San Martin(Cape): (35.88982498346323, -121.46281907163161)
        8. San Simeon Point: (35.63684751230898, -121.19492013310929)
        9. Cayucos(Beach): (35.450113738013975, -120.90547693639098)
        10. Hazard Canyon: (35.290814980161336, -120.88370600256532)
        11. Pismo Beach: (35.14265430661362, -120.64387229555241)
        12. Point Sal: (34.90401826260435, -120.67040841806637)
        13. Point Conception: (34.449016480303904, -120.47158983231594)
        '''

In [5]:
# Turn every "<ordinal>. <name>: (<lat>, <lon>)" line of coords_data into [name, lat, lon].
# Whitespace-only lines are skipped explicitly, so the result does not depend on how
# the closing quotes of coords_data happen to be indented.
coords_split = [line.strip() for line in coords_data.splitlines() if line.strip()]

coords = []
for entry in coords_split:
    # Split on the last colon only: everything before it is "<ordinal>. <name>".
    label, point = entry.rsplit(':', 1)
    name = " ".join(label.split()[1:])   # drop the leading ordinal ("1.", "2.", ...)
    # "(lat, lon)" -> two floats; float() tolerates the surrounding spaces.
    lat, lon = (float(value) for value in point.strip().strip('()').split(','))
    coords.append([name, lat, lon])

In [6]:
# One row per region end point, in the order the points were parsed.
location_coords = pd.DataFrame(coords, columns=['Name', 'Latitude', 'Longitude'])
location_coords.head()

Unnamed: 0,Name,Latitude,Longitude
0,Pigeon Point,37.184345,-122.39471
1,Capitola(Beach),36.972049,-121.95152
2,Seaside(Monterey State Beach),36.615471,-121.857811
3,Point Lobos,36.523621,-121.952782
4,Point Sur(Lighthouse),36.306969,-121.900865


In [27]:
# location_coords.to_csv(FOLDERPATH+'california_coastline_region.csv',index=False)

In [7]:
# Centre of the initial map view (same start location).
start_loc = [36.18907, -120.765664]
base = folium.Map(
    location=start_loc,
    width='50%',
    height='70%',
    zoom_start=7,
    min_zoom=5,
    max_zoom=14,
)

# Add one marker per region end point; the tooltip carries the landmark name.
for _, landmark in location_coords.iterrows():
    marker = folium.Marker(
        location=[landmark['Latitude'], landmark['Longitude']],
        icon=folium.Icon(),
        tooltip=landmark['Name'],
    )
    marker.add_to(base)

base

From census data, find otter count.

In [8]:
census = pd.read_csv(FOLDERPATH + 'annual_census_1985-2014.csv')
org_data = census.copy()   # untouched copy of the census table as loaded

# Independent otters per row: dens_sm * AREA, rounded down to a whole number
# (presumably density times habitat area - confirm against the dataset description).
independent = (census['dens_sm'] * census['AREA']).apply(math.floor)  #should be round up or down??
census['independent_otters'] = independent

# Pups per row: independent otters * pupratio, again rounded down.
pups = (census['independent_otters'] * census['pupratio']).apply(math.floor)  #should be round up or down??
census['pup_otters'] = pups

census['total_otters'] = census['independent_otters'] + census['pup_otters']

# Keep only the columns needed for the regional comparison.
attributes = ['HAB_ID', 'Year', 'total_otters']
census = census[attributes]
census.head()

Unnamed: 0,HAB_ID,Year,total_otters
0,960n,1985.0,0
1,958n,1985.0,0
2,956o,1985.0,0
3,961n,1985.0,0
4,959n,1985.0,0


Load the habitat coordinates dataframe and drop rows with NaN values (1975 habitats are considered: only the ones surveyed in 2014).

In [9]:
# Load the per-habitat coordinates and discard every row that has a missing value.
habitat_coordinates = pd.read_csv(FOLDERPATH + 'habitat_coordinates.csv').dropna()
habitat_coordinates.head()

Unnamed: 0,HAB_ID,Latitude,Longitude,Otter_Count
0,1000n,34.808753,-120.634792,0
1,1000o,34.807524,-120.681441,0
2,1001n,34.805066,-120.635534,0
3,1001o,34.805171,-120.681579,0
4,1002n,34.801391,-120.636842,0


In [10]:
def binning(df, bins):
    '''
    Return a list of boolean masks, one per latitude band of the dataframe.

    The boundaries in bins are expected to be ordered from the highest latitude to
    the lowest. They split the latitude axis into len(bins)+1 bands:

      masks[0]   rows with Latitude >= bins[0]
      masks[i]   rows with bins[i] <= Latitude < bins[i-1]   (0 < i < len(bins))
      masks[-1]  rows with Latitude < bins[-1]

    If the boundaries are not in decreasing order, the in-between bands come out empty.

    bins: latitudes of the boundaries; any sequence works (list, tuple, numpy array,
          pandas Series)
    df: dataframe to bin; must have a Latitude column

    return masks: list of boolean Series, each with one element per row of df
    raises ValueError: if bins is empty
    '''
    # Materialise the boundaries once so that plain lists and arrays work as well as
    # a pandas Series (indexing is then purely positional).
    edges = list(bins)
    if not edges:
        raise ValueError('bins must contain at least one latitude')

    latitude = df.Latitude

    # Everything at or above the first boundary.
    masks = [latitude >= edges[0]]

    # One band between each pair of consecutive boundaries (upper bound exclusive).
    masks.extend(
        (latitude >= low) & (latitude < high)
        for high, low in zip(edges[:-1], edges[1:])
    )

    # Everything below the last boundary.
    masks.append(latitude < edges[-1])

    return masks

The habitat coordinates dataframe has the latitude and longitude of all 1975 habitats, as well as the otter count in each habitat. As noted earlier, we have to group the habitats according to the regions in the stranded_geo dataframe.

In [11]:
# Region names are taken from the pristine stranded_geo_orig rather than stranded_geo:
# later cells drop 'Total_Strandings' from stranded_geo and add a 'Type' column, so
# slicing its columns would yield the wrong names if this cell were re-run afterwards.
names = stranded_geo_orig.columns[2:]   # skip 'Year' and 'Total_Strandings'

# Region boundaries (latitudes of the end points) and one habitat mask per region.
bins = location_coords.Latitude
masks = binning(habitat_coordinates, bins)
assert len(masks) == len(names), 'expected exactly one latitude band per region column'

# Number of habitats whose latitude falls inside each region.
region_hab_dict = {name: mask.sum() for name, mask in zip(names, masks)}

region_hab = pd.DataFrame(pd.Series(region_hab_dict)).reset_index()
region_hab.columns = ['Region','Habitats']
region_hab.head()

Unnamed: 0,Region,Habitats
0,North_of_Pigeon_Pt,0
1,Pigeon_Pt_to_Capitola,127
2,Capitola_to_Seaside,344
3,Seaside_to_Pt_Lobos,82
4,Pt_Lobos_to_Pt_Sur,118


In [12]:
# Bar chart of the number of habitats per region, sorted by descending bar height.
alt.Chart(region_hab).mark_bar().encode(
    x=alt.X('Region', sort='-y'),
    y=alt.Y('Habitats'),
)

Merge census dataframe with habitat coordinate dataframe to get **latitude-longitude**, **year** and **otter count** information in one dataframe.

In [13]:
# Inner join on HAB_ID: only habitats present in both tables are kept.
merged_df = census.merge(habitat_coordinates, on='HAB_ID', how='inner')
merged_df.head()

Unnamed: 0,HAB_ID,Year,total_otters,Latitude,Longitude,Otter_Count
0,960n,1985.0,0,34.952703,-120.675118,3
1,960n,1991.0,0,34.952703,-120.675118,3
2,960n,1992.0,0,34.952703,-120.675118,3
3,960n,1993.0,0,34.952703,-120.675118,3
4,960n,1994.0,0,34.952703,-120.675118,3


Group the habitats as per the regions in the stranded_geo dataframe.

In [14]:
# Region names come from the pristine stranded_geo_orig: stranded_geo itself is altered
# by later cells (column dropped, 'Type' added), which would shift the names if this
# cell were re-run after them.
names = stranded_geo_orig.columns[2:]   # skip 'Year' and 'Total_Strandings'

# One mask per region over the merged census rows (bins are the region boundary latitudes).
masks = binning(merged_df, bins)
assert len(masks) == len(names), 'expected exactly one latitude band per region column'

# For each region, total otters per year summed over the habitats inside that region.
# Regions without any habitat produce an empty series and therefore a NaN column.
census_trend_dict = {
    name: merged_df[mask].groupby('Year').total_otters.sum()
    for name, mask in zip(names, masks)
}

census_trend_df = pd.DataFrame(census_trend_dict).reset_index()
census_trend_df.head()

Unnamed: 0,Year,North_of_Pigeon_Pt,Pigeon_Pt_to_Capitola,Capitola_to_Seaside,Seaside_to_Pt_Lobos,Pt_Lobos_to_Pt_Sur,Pt_Sur_to_Dolan_Rock,Dolan_Rock_to_C_San_Martin,C_San_Martin_to_San_Simeon_Pt,San_Simeon_Pt_to_Cayucos,Cayucos_to_Hazard_Canyon,Hazard_Canyon_to_Pismo_Beach,Pismo_Beach_to_Pt_Sal,Pt_Sal_to_Pt_Conception,SE_of_Pt_Conception
0,1985.0,,0,73,134,126,125,122,160,126,80,4,0,,
1,1986.0,,0,78,146,141,133,144,189,129,97,24,0,,
2,1987.0,,3,74,191,182,142,147,232,119,95,30,10,,
3,1988.0,,3,74,195,189,144,168,258,139,93,54,10,,
4,1989.0,,9,83,212,211,178,166,263,139,92,71,14,,


Since census data is only available through 2014, we also need to clip the stranded_geo data at 2014 to make a fair analysis.

In [15]:
# Keep only the years up to and including 2014 and drop the per-year total, leaving
# 'Year' plus one column per region.
# Starting from stranded_geo_orig keeps this cell re-runnable. The non-inplace drop
# returns a new frame, so stranded_geo is an independent dataframe rather than a
# filtered slice of stranded_geo_orig that gets modified in place.
stranded_geo = (
    stranded_geo_orig
    .loc[stranded_geo_orig.Year <= 2014]
    .drop(columns=['Total_Strandings'])
)

Add tags to each data type (census/strandings).

In [16]:
# Tag every row with its data source so both tables can share one long-format frame.
# A scalar is broadcast to the whole column, and assign() returns a new frame instead
# of writing into the existing one, so nothing is set on a slice of another dataframe.
census_trend_df = census_trend_df.assign(Type='Census')
stranded_geo = stranded_geo.assign(Type='Strandings')

In [17]:
# Stack the census rows on top of the stranding rows, renumbering the index from 0.
region_trend_df = pd.concat(
    [census_trend_df, stranded_geo],
    ignore_index=True,
)
region_trend_df.head()

Unnamed: 0,Year,North_of_Pigeon_Pt,Pigeon_Pt_to_Capitola,Capitola_to_Seaside,Seaside_to_Pt_Lobos,Pt_Lobos_to_Pt_Sur,Pt_Sur_to_Dolan_Rock,Dolan_Rock_to_C_San_Martin,C_San_Martin_to_San_Simeon_Pt,San_Simeon_Pt_to_Cayucos,Cayucos_to_Hazard_Canyon,Hazard_Canyon_to_Pismo_Beach,Pismo_Beach_to_Pt_Sal,Pt_Sal_to_Pt_Conception,SE_of_Pt_Conception,Type
0,1985.0,,0,73,134,126,125,122,160,126,80,4,0,,,Census
1,1986.0,,0,78,146,141,133,144,189,129,97,24,0,,,Census
2,1987.0,,3,74,191,182,142,147,232,119,95,30,10,,,Census
3,1988.0,,3,74,195,189,144,168,258,139,93,54,10,,,Census
4,1989.0,,9,83,212,211,178,166,263,139,92,71,14,,,Census


In [18]:
# Reshape to long format: one row per (Year, Type, Location) with its count.
# value_vars is left out on purpose: melt then unpivots every column that is not an
# id column, so the result does not depend on where 'Year' and 'Type' sit in the frame.
melted_df = pd.melt(
    region_trend_df,
    id_vars=['Year', 'Type'],
    var_name='Location',
    value_name='Otter_Count',
)
melted_df.head()

Unnamed: 0,Year,Type,Location,Otter_Count
0,1985.0,Census,North_of_Pigeon_Pt,
1,1986.0,Census,North_of_Pigeon_Pt,
2,1987.0,Census,North_of_Pigeon_Pt,
3,1988.0,Census,North_of_Pigeon_Pt,
4,1989.0,Census,North_of_Pigeon_Pt,


In [19]:
# One small line chart per location (five per row): count against year, one line per Type.
alt.Chart(melted_df).mark_line().encode(
    x=alt.X('Year'),
    y=alt.Y('Otter_Count'),
    color=alt.Color('Type'),
).properties(
    width=150,
    height=100,
).facet(
    facet='Location',
    columns=5,
)

In [20]:
# Sum the counts over all years for every (Location, Type) pair and flatten the
# group keys back into ordinary columns.
grouped_counts = melted_df.groupby(['Location', 'Type'])['Otter_Count'].sum()
count_data_geo = grouped_counts.reset_index()
count_data_geo.head()

Unnamed: 0,Location,Type,Otter_Count
0,C_San_Martin_to_San_Simeon_Pt,Census,8216.0
1,C_San_Martin_to_San_Simeon_Pt,Strandings,248.0
2,Capitola_to_Seaside,Census,7204.0
3,Capitola_to_Seaside,Strandings,1521.0
4,Cayucos_to_Hazard_Canyon,Census,3944.0


In [25]:
# Bar chart of the summed counts per location, coloured by Type (slightly transparent bars).
alt.Chart(count_data_geo).mark_bar(opacity=0.8).encode(
    x=alt.X('Location', sort='-color'),
    y=alt.Y('Otter_Count'),
    color=alt.Color('Type'),
)