In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns


In [None]:
# Read the dataset from final_data.csv and preview its first two rows.
calcofi_data = pd.read_csv(filepath_or_buffer='final_data.csv')
calcofi_data.head(n=2)

In [None]:
# Display the column labels of the loaded frame.
calcofi_data.columns

In [None]:
# Remove the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV saves -- confirm against how final_data.csv was written).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
calcofi_data = calcofi_data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
calcofi_data.head(2)

In [None]:
# Display the last two rows of the frame.
calcofi_data.tail(2)

In [None]:
# Display the (rows, columns) dimensions of the frame.
calcofi_data.shape

In [None]:
# Keep only rows whose oxygen, phosphate and nitrate readings are all >= 0.
oxygen_ok = calcofi_data['O2ml_L'] >= 0
phosphate_ok = calcofi_data['PO4uM'] >= 0
nitrate_ok = calcofi_data['NO3uM'] >= 0
calcofi_data = calcofi_data.loc[oxygen_ok & phosphate_ok & nitrate_ok]


In [None]:
# Number of distinct station identifiers in the filtered data.
len(pd.unique(calcofi_data['Sta_ID']))

#### OK, so we have a pretty comprehensive dataset in front of us that spans 43 years with data taken at ~1000 different geographical sampling stations along the California coast.

 During EDA for imputation, we noticed that there was some evidence for depth segmentation in the data. Particularly, we noticed that there was segmentation of dissolved oxygen and mineral levels in the water column. We also noticed that there were regions in the water column where the dissolved oxygen content was quite low -- in some cases, actually hitting zero. From the standpoint of supporting fish populations, this seems like it would be something of concern. 

 A few questions are:

 1. Are there other factors within the dataset that play roles in creating these depleted oxygen (hypoxic) regions (I'm thinking about correlations here)?
 2. Are there different classes of hypoxic regions? By this we mean ones that occur nearer to or farther from the coast, at different ranges of depths, etc.
 3. Of particular concern for coastal fisheries would be low oxygen levels in shallow waters. Can we identify some of these and the conditions which create them? 
 4. This is a depth column. Things that happen at the top near the surface are likely to affect things that happen deeper in the column. Is this hypothesis backed in any way by the data? One more precise set of questions is: how do surface temperature, salinity, chlorophyll, and phaeopigment concentrations at the surface affect oxygenation?
 5. We have time data. How persistent are low oxygen regions? Are they like shocks, cyclical, or are they persistent zones?

 These are obviously a lot of questions. We probably won't answer all of these in the notebook. But we can certainly use the data to start addressing them. 



## Variable Correlation / Visualization

In [None]:
# Global seaborn appearance: tick-marked axes with larger "talk"-sized text.
sns.set_style(style="ticks")
sns.set_context(context="talk")


Linear regression plot of Oxygen on both Phosphates and Nitrates.

In [None]:
# Regress O2 on phosphate (top panel) and nitrate (bottom panel).
# One seeded subsample is shared by both panels so the figure is reproducible
# and both fits describe the same observations; capping n avoids a ValueError
# when fewer than 50000 rows are available.
nutrient_sample = calcofi_data.sample(n=min(50000, len(calcofi_data)), random_state=42)

fig, axes = plt.subplots(2, 1, figsize=(6, 8))

sns.regplot(ax=axes[0], x='PO4uM', y='O2ml_L', data=nutrient_sample,
            scatter_kws={"s": 10}, line_kws={'color': 'red'})
# Raw strings: '\m' is not a valid Python escape sequence.
axes[0].set_xlabel(r'$PO_4$ ($\mu $mol/L)')
axes[0].set_ylabel(' $O_2$ (mL/L)')

sns.regplot(ax=axes[1], x='NO3uM', y='O2ml_L', data=nutrient_sample,
            scatter_kws={"s": 10}, line_kws={'color': 'red'})
axes[1].set_xlabel(r'$NO_3$ ($\mu $mol/L)')
axes[1].set_ylabel('$O_2$ (mL/L)')

plt.tight_layout()
plt.savefig('O2vsnutrients.png')
plt.show()


Yes, until saturation, it's pretty linear. This is an interesting point. Now let's take a look at Phosphates vs Depth and Nitrates vs Depth. We'll have O2 be our color bar.

In [None]:
# Phosphate vs. depth (<= 4000 m), coloured by dissolved O2.
# The subsample is seeded for reproducibility and capped at the number of
# available rows so .sample() cannot raise on a small frame.
calcofi_data_r = calcofi_data[calcofi_data['Depthm'] <= 4000]
calcofi_data_r = calcofi_data_r.sample(n=min(50000, len(calcofi_data_r)), random_state=42)
x = calcofi_data_r['Depthm']
y = calcofi_data_r['PO4uM']

plt.scatter(x, y, c=calcofi_data_r['O2ml_L'], cmap='viridis')
# Raw string: '\m' is not a valid Python escape sequence.
plt.ylabel(r'$PO_4$ ($\mu$ mol/L)')
plt.xlabel(' Depth (m)')
plt.clim(0, 8)  # fix the colour scale to 0-8 mL/L
plt.colorbar(label='$O_2$ (mL/L)')
plt.tight_layout()
plt.minorticks_on()
plt.savefig('PO4withdepth.png')
plt.show()

In [None]:
# Nitrate vs. depth (<= 4000 m), coloured by dissolved O2.
# The subsample is seeded for reproducibility and capped at the number of
# available rows so .sample() cannot raise on a small frame.
calcofi_data_r = calcofi_data[calcofi_data['Depthm'] <= 4000]
calcofi_data_r = calcofi_data_r.sample(n=min(50000, len(calcofi_data_r)), random_state=42)
x = calcofi_data_r['Depthm']
y = calcofi_data_r['NO3uM']

plt.scatter(x, y, c=calcofi_data_r['O2ml_L'], cmap='viridis')
# Raw string: '\m' is not a valid Python escape sequence.
plt.ylabel(r'$NO_3$ ($\mu$ mol/L)')
plt.xlabel(' Depth (m)')
plt.clim(0, 8)  # fix the colour scale to 0-8 mL/L
plt.colorbar(label='$O_2$ (mL/L)')
plt.tight_layout()
plt.minorticks_on()
plt.savefig('NO3withdepth.png')
plt.show()

In [None]:
# O2 against temperature (top) and potential density sigma-theta (bottom).
# A single seeded subsample feeds both panels so they are reproducible and
# comparable; n is capped so .sample() cannot raise on a small frame.
ts_sample = calcofi_data.sample(n=min(50000, len(calcofi_data)), random_state=42)

fig, axes = plt.subplots(2, 1, figsize=(6, 8))

sns.scatterplot(ax=axes[0], x='T_degC', y='O2ml_L', data=ts_sample, color='g')
axes[0].set_xlabel('T (Celsius)')  # was 'T (celsius' -- missing closing parenthesis
axes[0].set_ylabel(' $O_2$ (mL/L)')

sns.scatterplot(ax=axes[1], x='STheta', y='O2ml_L', data=ts_sample, color='orange')
# Raw string: '\s' and '\T' are not valid Python escape sequences.
axes[1].set_xlabel(r'$\sigma_\Theta$ ($kg/m^3$)')
axes[1].set_ylabel(' $O_2$ (mL/L)')

plt.tight_layout()
plt.savefig('O2vsTStheta.png')

plt.show()

In [None]:
# O2 vs. temperature for depths <= 4000 m, coloured by depth.
# The subsample is seeded for reproducibility and capped at the number of
# available rows so .sample() cannot raise on a small frame.
calcofi_data_r = calcofi_data[calcofi_data['Depthm'] <= 4000]
calcofi_data_r = calcofi_data_r.sample(n=min(50000, len(calcofi_data_r)), random_state=42)
x = calcofi_data_r['T_degC']
y = calcofi_data_r['O2ml_L']

plt.scatter(x, y, c=calcofi_data_r.Depthm, cmap='viridis')
plt.xlabel('T (Celsius)')
plt.ylabel(' $O_2$ (mL/L)')
plt.clim(0, 4000)  # colour scale spans the full plotted depth range
# Label uses "(m)" to match the colourbars of the other depth-coloured plots.
plt.colorbar(label='Depth (m)')
plt.tight_layout()
plt.savefig('OvsTwithdepth.png')
plt.show()

Zoom in to the middle depths to see the color scale more clearly.


In [None]:
# O2 vs. temperature restricted to depths < 800 m, coloured by depth.
# The subsample is seeded for reproducibility and capped at the number of
# available rows so .sample() cannot raise on a small frame.
calcofi_data_r = calcofi_data[calcofi_data['Depthm'] < 800]
calcofi_data_r = calcofi_data_r.sample(n=min(50000, len(calcofi_data_r)), random_state=42)
x = calcofi_data_r['T_degC']
y = calcofi_data_r['O2ml_L']

plt.scatter(x, y, c=calcofi_data_r.Depthm, cmap='viridis')
plt.xlabel('T (Celsius)')
plt.ylabel(' $O_2$ (mL/L)')
plt.clim(0, 800)  # colour scale matches the 800 m depth cut-off
plt.colorbar(label='Depth (m)')
plt.tight_layout()
plt.savefig('OvsTdepthfine.png')
plt.show()


In [None]:
# O2 vs. sigma-theta for depths < 3000 m, coloured by depth.
# The subsample is seeded for reproducibility and capped at the number of
# available rows so .sample() cannot raise on a small frame.
calcofi_data_r = calcofi_data[calcofi_data['Depthm'] < 3000]
calcofi_data_r = calcofi_data_r.sample(n=min(50000, len(calcofi_data_r)), random_state=42)
x = calcofi_data_r['STheta']
y = calcofi_data_r['O2ml_L']

plt.scatter(x, y, c=calcofi_data_r.Depthm, cmap='viridis')
# Raw string: '\s' and '\T' are not valid Python escape sequences.
plt.xlabel(r'$\sigma_\Theta$ ($kg/m^3$)')
plt.ylabel(' $O_2$ (mL/L)')
plt.clim(0, 3000)  # colour scale matches the 3000 m depth cut-off
plt.colorbar(label='Depth (m)')
plt.tight_layout()
plt.savefig('OvsDensity.png')
plt.show()

In [None]:
# O2 vs. sigma-theta restricted to depths < 800 m, coloured by depth.
# The subsample is seeded for reproducibility and capped at the number of
# available rows so .sample() cannot raise on a small frame.
calcofi_data_r = calcofi_data[calcofi_data['Depthm'] < 800]
calcofi_data_r = calcofi_data_r.sample(n=min(50000, len(calcofi_data_r)), random_state=42)
x = calcofi_data_r['STheta']
y = calcofi_data_r['O2ml_L']

plt.scatter(x, y, c=calcofi_data_r.Depthm, cmap='viridis')
# Raw string: '\s' and '\T' are not valid Python escape sequences.
plt.xlabel(r'$\sigma_\Theta$ ($kg/m^3$)')
plt.ylabel(' $O_2$ (mL/L)')
plt.clim(0, 800)  # colour scale matches the 800 m depth cut-off
plt.colorbar(label='Depth (m)')
plt.tight_layout()
plt.savefig('OvsDensityFine.png')
plt.show()

The takeaway here is that the dissolved oxygen level decreases linearly with increasing nutrient levels (phosphate/nitrates). It seems, in particular, that oxygen concentration keeps decreasing with increasing phosphate levels until it hits 0 and stays that way for further phosphate concentration increases.

There is also clear indication of depth segmentation. There are three different regimes in the O2 vs T and O2 vs water density curves. A quick conditioning of the color bar on Depth reveals that these different regimes correspond to different depth ranges.

Let's take a closer look at the low (< 1.4 mL/L) dissolved O2 levels. In EDA we cursorily saw that depth might be important here.

In [None]:
# Subset of observations with dissolved O2 below 1.4 mL/L (the hypoxic threshold
# used throughout this notebook). Taking an explicit copy makes this an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
lowO2df = calcofi_data[calcofi_data['O2ml_L'] < 1.4].copy()

In [None]:
# Histogram of sample depths within the low-O2 subset, shown for 0-1200 m.
lowO2df.hist(column='Depthm', bins=70)
plt.xlim(0, 1200)
plt.title('Depth Count Frequency in Hypoxic Regime ')
plt.xlabel('Ocean Depth (m)')
plt.ylabel('Count')
# Lay out only after every label is set so the y-label is included in the fit
# (previously tight_layout() ran before the y-label existed).
plt.tight_layout()
plt.savefig('Hypoxiccountvsdepth.png')

In [None]:
# Scatter of O2 against depth for the full dataset, with the 1.4 mL/L threshold
# and the 200 m / 1000 m depth boundaries marked in red.
sns.scatterplot(x='Depthm', y='O2ml_L', data=calcofi_data)
plt.axhline(y=1.4, c='r')
plt.axvline(x=200, c='red')
plt.axvline(x=1000, c='red')
# Shade the box between 200-1000 m and below the 1.4 mL/L threshold.
plt.fill_between(x=[200, 1000], y1=[1.4, 1.4], y2=[-0.1, -0.1], alpha=0.5)
# "(m)" rather than "[m]" to match the depth-axis labels of the other figures.
plt.xlabel('Ocean Depth (m)')
plt.ylabel('$O_2$ (mL/L)')
plt.tight_layout()
plt.savefig('scatterO2vsdepth.png')

Both visualizations have some weaknesses: 
- The count frequency histogram is distorted by the fact that there are far fewer data points in the high depth regime. One good thing though is that it does call to attention the hypoxic regions in waters of depth < 200 m.

- The scatter plot visually overrepresents data that is actually sparse. Most of the data is huddled in the 0-200 m regime, but that's not immediately obvious.

#### One good thing though is that both visualizations point to a region of ~200-1000 m where the oxygen concentration reaches its minimum; this is the part of the water column where most of the hypoxic water is concentrated.

We can get a corrected version of our depth count frequency by normalizing each bin in the hypoxic subset by the total number of points in the entire dataset in a given depth bin. This provides a measure of how hypoxic the water is in a given depth (tracing over all other variables).


In [None]:
# Build depth-bin edges: fine bins near the surface, coarser bins with depth.
depthmin = 0
depthmax = calcofi_data['Depthm'].max()
ranges = list(range(depthmin, 200, 20))   # 20 m steps from the surface up to 200 m
mid_ranges = list(range(200, 1000, 50))   # 50 m steps from 200 m up to 1000 m
# 400 m steps from 1000 m downward. The stop is pushed one step past the deepest
# sample so the final edge is >= depthmax; with stop=depthmax the last edge fell
# short of it and pd.cut left the deepest observations unbinned (NaN).
# int(np.ceil(...)) also lets range() accept a floating-point maximum.
deep_ranges = list(range(1000, int(np.ceil(depthmax)) + 400, 400))



In [None]:
# Merge the three edge lists into one ascending list of unique bin edges.
# Unlike list.extend(), this is idempotent: re-running the cell cannot append
# the same edges twice (duplicate/non-monotonic edges make pd.cut raise).
ranges = sorted(set(ranges) | set(mid_ranges) | set(deep_ranges))

In [None]:
# Show the combined list of depth-bin edges.
print(ranges)

In [None]:
# Label every observation with its depth bin, then count observations per bin
# for the full dataset and for the low-O2 subset.
calcofi_data['DepthRange'] = pd.cut(calcofi_data['Depthm'], bins=ranges, include_lowest=True)
# observed=False keeps empty bins in the result so both count series cover every
# bin (one entry per interval), independent of the pandas default for
# categorical groupers. Selecting 'Depthm' before count() avoids counting every column.
totalcountsperbin = calcofi_data.groupby('DepthRange', observed=False)['Depthm'].count()

# assign() returns a new frame instead of writing a column into what may be a
# slice of calcofi_data, which avoids SettingWithCopyWarning.
lowO2df = lowO2df.assign(
    DepthRange=pd.cut(lowO2df['Depthm'], bins=ranges, include_lowest=True)
)
anoxiccountsperbin = lowO2df.groupby('DepthRange', observed=False)['Depthm'].count()


In [None]:
# "Hypoxicity" = hypoxic observations in a bin divided by all observations in
# that same bin. The bin start points serve as x-values when this is plotted
# against depth.
hypoxicityperbin = anoxiccountsperbin.div(totalcountsperbin)

print(hypoxicityperbin.head(30))

In [None]:
# Hypoxic fraction per depth bin, plotted at each bin's starting depth, with
# the 200 m and 1000 m boundaries marked in red.
bin_starts = ranges[:-1]
plt.plot(bin_starts, hypoxicityperbin)
plt.xlim(0, 3000)
for boundary_depth in (200, 1000):
    plt.axvline(x=boundary_depth, c='r')
plt.title('$P_{hypox}$ vs. Depth')
plt.ylabel('$n^{hyp}_{bin}/n^{tot}_{bin}$')
plt.xlabel('Depth (m)')
plt.minorticks_on()
plt.tight_layout()
plt.savefig('hypoxicityvsdepth.png')

The turn on and roll off of our derived hypoxicity quantity is at 200 m and 1000 m respectively. The region from 200-1000m seems like a large hypoxic layer in the water column. As this data is essentially time averaged over 40 years this seems like it is a pretty persistent low oxygen layer in the ocean off the coast.

The slow turn-on could be due to an actual static gradient in the hypoxicity, but it's more likely a time-averaging effect (fluctuation of the top depth of this zone over time).

Undoubtedly, this is something that we could figure out systematically with this very dataset.

Where are these mid-depth hypoxic zones located?


In [None]:
# Low-O2 observations strictly between 200 m and 1400 m depth.
deeper_than_200 = lowO2df['Depthm'] > 200
shallower_than_1400 = lowO2df['Depthm'] < 1400
middepthlowO2df = lowO2df[deeper_than_200 & shallower_than_1400]

In [None]:
# Map the mid-depth low-O2 observations over USGS satellite imagery.
# (plotly.express is imported as `px` in the notebook's top import cell.)
fig = px.scatter_mapbox(
    middepthlowO2df,
    lat='Lat_Dec',
    lon='Lon_Dec',
    color_discrete_sequence=["yellow"],
    zoom=4,
    height=500,
)
# The earlier "open-street-map" style call was immediately overridden by
# "white-bg", so the style is set once here together with the raster layer
# and the zero margins.
fig.update_layout(
    mapbox_style="white-bg",
    mapbox_layers=[
        {
            "below": 'traces',
            "sourcetype": "raster",
            "sourceattribution": "United States Geological Survey",
            "source": [
                "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
            ]
        }
    ],
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()


The majority of these stations are located off the coast of Southern California down into Baja California. An interesting point is that a majority of these points extend pretty far off coast and are in the open ocean (of course excluding the points inside the Gulf of California).

Our analysis then shows that there is a layer of hypoxic water at intermediate depths that starts close to the coast and extends into the open ocean.

Can we see any features in the data that tell us something about this depth zone and/or might be of interest in explaining the rapid decrease in O2 levels?


In [None]:
# Display the column labels again (the frame now also carries 'DepthRange').
calcofi_data.columns

In [None]:
# Pick one cast that reaches below 1400 m and plot its O2 and sigma-theta
# profiles against depth on twin y-axes.
deep_cast_counts = calcofi_data[calcofi_data['Depthm'] > 1400].Cst_Cnt.unique()

# Index 36 is an arbitrary example cast; it raises IndexError if fewer than
# 37 casts reach below 1400 m.
single_example_scan = calcofi_data[calcofi_data['Cst_Cnt'] == deep_cast_counts[36]]

# Left axis (green): dissolved oxygen. Explicit Axes objects are used instead
# of pyplot's implicit "current axes" so each call's target is unambiguous.
ax1 = sns.lineplot(x='Depthm', y='O2ml_L', data=single_example_scan, color="g")
ax1.set_xlabel('Ocean Depth (m)')
ax1.set_ylabel('$O_2$ (mL/L)', color='green')
ax1.tick_params(axis='y', colors='green')
ax1.minorticks_on()

# Right axis (blue): potential density, sharing the depth axis.
ax2 = ax1.twinx()
sns.lineplot(x='Depthm', y='STheta', data=single_example_scan, color="b", ax=ax2)
# Raw string: '\s' and '\T' are not valid Python escape sequences.
ax2.set_ylabel(r'$\sigma_{\Theta}$ ($kg/m^3$)', color='blue')
ax2.tick_params(axis='y', colors='blue')
ax2.spines['right'].set_color('blue')
ax2.spines['left'].set_color('green')
ax2.minorticks_on()

ax2.set_xlim(0, 1500)
plt.tight_layout()
plt.savefig('O2sigmavsdepth.png')

In [None]:

# Same example cast: O2 (left axis, green) and temperature (right axis, blue)
# against depth, limited to the top 1500 m.
o2_axis = sns.lineplot(data=single_example_scan, x='Depthm', y='O2ml_L', color="g")
o2_axis.set_xlabel('Ocean Depth (m)')
o2_axis.set_ylabel('$O_2$ (mL/L)', color='green')
o2_axis.tick_params(axis='y', colors='green')
o2_axis.minorticks_on()

# Second y-axis sharing the depth axis.
ax2 = o2_axis.twinx()
sns.lineplot(data=single_example_scan, x='Depthm', y='T_degC', color="b", ax=ax2)
ax2.set_ylabel('T (C)', color='blue')
ax2.tick_params(axis='y', colors='blue')
for side, shade in (('right', 'blue'), ('left', 'green')):
    ax2.spines[side].set_color(shade)
ax2.minorticks_on()

ax2.set_xlim(0, 1500)
plt.tight_layout()
plt.savefig('O2Tvsdepth.png')

In [None]:
# Low-O2 observations taken shallower than 200 m.
is_shallow = lowO2df['Depthm'] < 200
lowdepthlowO2df = lowO2df.loc[is_shallow]

Look for hypoxia at shallow depths and investigate the years in which it occurred.

In [None]:
# Map the shallow (< 200 m) low-O2 observations over USGS satellite imagery.
fig = px.scatter_mapbox(
    lowdepthlowO2df,
    lat='Lat_Dec',
    lon='Lon_Dec',
    color_discrete_sequence=["yellow"],
    zoom=4,
    height=500,
)
# The earlier "open-street-map" style call was immediately overridden by
# "white-bg", so the style is set once here together with the raster layer
# and the zero margins.
fig.update_layout(
    mapbox_style="white-bg",
    mapbox_layers=[
        {
            "below": 'traces',
            "sourcetype": "raster",
            "sourceattribution": "United States Geological Survey",
            "source": [
                "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
            ]
        }
    ],
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

fig.show()

In [None]:
# Year of each shallow low-O2 observation, histogrammed from 1980 onward.
lowdepthhypox = pd.to_datetime(lowdepthlowO2df['DateTime']).dt.year
# ">= 1980" so the data match the "since 1980" title; the previous "> 1980"
# silently dropped observations from 1980 itself.
lowdepthhypox[lowdepthhypox >= 1980].hist(bins=20)
plt.title('Low Depth Hypoxia Events since 1980')
plt.xlabel('Year')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('lowdepthhypoxia.png')
