# dbCamHD

This notebook loads (or creates) a metadata database for the CamHD data and plots some statistics.

## Setup environment
Run this cell every time you open this notebook.

In [None]:
%matplotlib inline
import pycamhd as camhd
import numpy as np
import pandas as pd
import ooiod
import requests
import dask
from dask import delayed

## Starting from local JSON file
Here we load the dbcamhd.json file that contains the results from the cells near the bottom of this notebook.

In [None]:
# Load the previously saved metadata table; the file holds one JSON record per line.
dbcamhd = pd.read_json('dbcamhd.json', orient="records", lines=True)

In [None]:
# Preview the final rows of the loaded table.
dbcamhd.tail()

### Calculate the total number of frames
There are currently almost a half-billion video frames.

In [None]:
# Sum the frame_count column across all rows to get the total number of frames.
dbcamhd['frame_count'].sum()

### Plot histogram of MOV sizes

In [None]:
import holoviews as hv
hv.extension('bokeh')
from bokeh.plotting import figure, show

# Scale blob sizes by 1024 three times (bytes to GB, per the axis label below),
# then count them in evenly spaced bins between 0 and 20.
sizes_gb = dbcamhd['blob_size']/1024/1024/1024
size_bins = np.linspace(0,20,100)
frequencies, edges = np.histogram(sizes_gb, bins=size_bins)

# Draw one rectangle per bin, spanning the bin's edges and rising to its count.
p = figure(title="MOV Size Distribution")
p.quad(
    left=edges[:-1],
    right=edges[1:],
    bottom=0,
    top=frequencies,
    fill_color="blue",
    line_color="black",
)
p.xaxis.axis_label = 'Filesize (GB)'
p.yaxis.axis_label = 'N'
show(p)

### Plot histogram of frame counts

In [None]:
# Count frame totals in evenly spaced bins between 0 and 50000.
frame_bins = np.linspace(0,50000,100)
frequencies, edges = np.histogram(dbcamhd['frame_count'], bins=frame_bins)

# Draw one rectangle per bin, spanning the bin's edges and rising to its count.
p = figure(title="MOV Frame Count Distribution")
p.quad(
    left=edges[:-1],
    right=edges[1:],
    bottom=0,
    top=frequencies,
    fill_color="blue",
    line_color="black",
)
p.xaxis.axis_label = 'Frame Count'
p.yaxis.axis_label = 'N'
show(p)

### Plot subset of frame_counts by date

In [None]:
# Only rows whose timestamp (seconds since the Unix epoch) exceeds this value are plotted.
min_t = 1400000000
is_recent = dbcamhd.timestamp > min_t

# Select the plotted subset once and reuse the same mask for both axes.
dates = pd.to_datetime(dbcamhd.loc[is_recent, 'timestamp'], unit='s')
frame_count = dbcamhd.loc[is_recent, 'frame_count']

# Scatter frame count against date; the y-axis is clipped to 0-30000.
p = figure(x_axis_type='datetime', y_range=(0, 30000))
p.circle(dates, frame_count, size=1)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Frame Count'
show(p)

## Starting from scratch
Example code showing how the database above was built. Since dbcamhd.json has been built, it is not normally necessary to run these cells. They can take a very long time without a large Dask worker cluster.

### Start Dask cluster using Jupyterlab extension
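Use the JupyterLab Dask extension buttons to start a cluster and connect a `client` to it before running the next cell, which expects `client` to already exist.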

In [None]:
# NOTE(review): `client` is not defined anywhere in this notebook, so this cell raises
# NameError on a fresh kernel unless it has been created beforehand
# (presumably through the Jupyterlab Dask extension — confirm).
client

### Get a list of files from the server

In [None]:
%%time
# Fetch the blob listing via ooiod.
# NOTE(review): 'ooiopendata' and 'camhd' presumably name the storage account and container — confirm.
blob_urls = ooiod.blobs.list_blobs('ooiopendata', 'camhd')

In [None]:
# Peek at the first five entries of the listing.
blob_urls[0:5]

### Get file sizes

In [None]:
def get_blob_size(blob_url):
    """Return the size the server reports for one blob, without downloading it.

    Sends an HTTP HEAD request to ``blob_url`` and reads the
    ``Content-Length`` response header.

    Parameters
    ----------
    blob_url : str
        URL of the blob to query.

    Returns
    -------
    str or None
        The ``Content-Length`` header value exactly as returned by the
        server (it is not converted to a number here), or ``None`` when the
        header is absent or the lookup fails.
    """
    try:
        header = requests.head(blob_url)
        return header.headers.get('Content-Length')
    except Exception:
        # Best-effort lookup: one unreachable blob must not abort a bulk scan.
        # Catching Exception (instead of a bare except) still lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can be cancelled.
        return None

In [None]:
# Wrap each size lookup in a lazy Dask task, one per blob URL.
blob_sizes_delayed = [delayed(get_blob_size)(blob_url) for blob_url in blob_urls]

In [None]:
%%time
# Execute all of the delayed size lookups and collect their results.
blob_sizes = dask.compute(*blob_sizes_delayed)

### Create a pandas DataFrame from these lists

In [None]:
# Assemble the metadata table with one row per blob: its URL and its reported size.
blob_columns = {
    'blob_url': blob_urls,
    'blob_size': blob_sizes,
}
dbcamhd = pd.DataFrame(blob_columns)
dbcamhd.tail()

### Get file timestamps and frame counts

In [None]:
def get_file_info_delayed(blob_url):
    """Read the timestamp and frame count of one MOV file.

    Fetches the file's moov atom through pycamhd and derives both values
    from it.

    Parameters
    ----------
    blob_url : str
        URL of the MOV file to inspect.

    Returns
    -------
    tuple
        ``(timestamp, frame_count)``. If any of the pycamhd calls fails, both
        entries are ``False`` (the notebook's sentinel for "could not read").
    """
    try:
        moov_atom = camhd.get_moov_atom(blob_url)
        timestamp = camhd.get_timestamp(blob_url, moov_atom)
        frame_count = camhd.get_frame_count(blob_url, moov_atom)
    except Exception:
        # Best-effort: one unreadable file must not abort a bulk scan.
        # Catching Exception (instead of a bare except) still lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can be cancelled.
        timestamp = False
        frame_count = False
    return (timestamp, frame_count)

In [None]:
# Wrap each file-info lookup in a lazy Dask task, one per blob URL.
delayed_file_info = [delayed(get_file_info_delayed)(blob_url) for blob_url in blob_urls]

In [None]:
%%time
# Execute all of the delayed file-info lookups and collect their (timestamp, frame_count) results.
file_info = dask.compute(*delayed_file_info)

### Add these to the dbcamhd dataframe

In [None]:
# Each file_info entry is a (timestamp, frame_count) pair; split the pairs into
# two columns aligned with the table's existing index.
timestamps = [info[0] for info in file_info]
frame_counts = [info[1] for info in file_info]
dbcamhd['timestamp'] = pd.Series(timestamps, index=dbcamhd.index)
dbcamhd['frame_count'] = pd.Series(frame_counts, index=dbcamhd.index)
dbcamhd.tail()

### Save dataframe to JSON file

In [None]:
# Write the table to dbcamhd.json, one JSON record per line (the format read back at the top of this notebook).
dbcamhd.to_json('dbcamhd.json', orient="records", lines=True)

### Add deployment numbers to database

See the [asset management](https://github.com/ooi-integration/asset-management/blob/master/deployment/RS03ASHS_Deploy.csv) page for deployment information.

In [None]:
import pandas as pd

In [None]:
# Reload the saved table from dbcamhd.json before adding the deployment column.
dbcamhd = pd.read_json('dbcamhd.json', orient="records", lines=True)
# Display the blob URL of the row labelled 0.
dbcamhd.blob_url[0]

In [None]:
# Preview the final rows before the deployment column is added.
dbcamhd.tail()

In [None]:
# Interpret the timestamps as seconds since the Unix epoch so they can be
# compared against the deployment cut-over dates.
dt = pd.to_datetime(dbcamhd.timestamp, unit='s')

# Seed the new column from timestamp*0 so it shares the table's index.
dbcamhd['deployment'] = dbcamhd.timestamp*0

# Every row earlier than the first cut-over is labelled deployment 2.
dbcamhd.loc[dt < '2016-07-26 21:18:00', 'deployment'] = 2

# Cut-overs are applied in chronological order, so each later one overwrites
# the label written by the previous one for rows at or after its start.
deployment_starts = [
    ('2016-07-26 21:18:00', 3),
    ('2017-08-14 06:00:00', 4),
    ('2018-07-04 00:00:00', 5),
    ('2019-06-16 22:02:00', 6),
]
for start, number in deployment_starts:
    dbcamhd.loc[dt >= start, 'deployment'] = number

In [None]:
# Preview the final rows to check the new deployment column.
dbcamhd.tail()

In [None]:
# Write the table, now including the deployment column, back to dbcamhd.json (same path as the file loaded above).
dbcamhd.to_json('dbcamhd.json', orient="records", lines=True)

## References

https://github.com/tjcrone/pycamhd<br>
https://rawdata.oceanobservatories.org/files/RS03ASHS/PN03B/06-CAMHDA301/<br>
https://pandas.pydata.org/