# Xtractopy - `combine_tracks` function

*Andrew Chin, 11/21/2021*

Running the base function multiple times may be ok with 1-20 animals, but if you have a lot to run, it would be more time efficient to combine all the satellite data together into one `pandas` df and running the `xtractopy` function on it. Hence, the `combine_tracks` function here.

This function will take the following input:

1. the folder path that contains multiple .csv files

and output:
1. a pandas df with a new column, "tagID", which contains the corresponding csv file names

The output can be plugged directly into the base `xtractopy` function as the 'tagdata' argument.


In [1]:
# necessary packages
import datetime as dt
import xarray as xr
import numpy as np
import pandas as pd
from typing import Dict, Union
import fsspec
import matplotlib.pyplot as plt
from datetime import datetime 
import glob
import os
import re

In [2]:
def combine_tracks(filepath):
    '''
    filepath: file path to folder where .csv data are stored
    '''
    path=filepath
    files = glob.glob(os.path.join(path,'*.csv'))
    data = []
    for filename in files:
        concat_df = pd.read_csv(filename)
        tag_id = filename.split("/")[1].split(".csv")[0]
        concat_df['tagID'] = tag_id
        data.append(concat_df)
        concat_df = pd.concat(data)
    return(concat_df)

In [3]:
test = combine_tracks('shark_track_data')
print(test)

           lon        lat    datetime              tagID
0   -78.984392  27.187935  2014-11-15  track_shark144020
1   -78.952825  27.172386  2014-11-16  track_shark144020
2   -78.924598  27.156750  2014-11-17  track_shark144020
3   -78.912462  27.146158  2014-11-18  track_shark144020
4   -78.929166  27.145738  2014-11-19  track_shark144020
..         ...        ...         ...                ...
313 -77.064386  33.905234  2015-05-21  track_shark137736
314 -77.029221  33.983800  2015-05-22  track_shark137736
315 -77.052281  34.066212  2015-05-23  track_shark137736
316 -77.114133  34.151152  2015-05-24  track_shark137736
317 -77.195281  34.237302  2015-05-25  track_shark137736

[690 rows x 4 columns]


In [4]:
# bring in data for SST
file_location = 's3://mur-sst/zarr'
ikey = fsspec.get_mapper(file_location, anon=True)
ds_sst = xr.open_zarr(ikey,consolidated=True)
ds_sst

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,245.78 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.19 TiB 245.78 MiB Shape (6443, 17999, 36000) (6443, 100, 100) Count 64801 Tasks 64800 Chunks Type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,245.78 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,245.78 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.19 TiB 245.78 MiB Shape (6443, 17999, 36000) (6443, 100, 100) Count 64801 Tasks 64800 Chunks Type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,245.78 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.80 TiB,61.45 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 3.80 TiB 61.45 MiB Shape (6443, 17999, 36000) (6443, 100, 100) Count 64801 Tasks 64800 Chunks Type int8 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,3.80 TiB,61.45 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,245.78 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.19 TiB 245.78 MiB Shape (6443, 17999, 36000) (6443, 100, 100) Count 64801 Tasks 64800 Chunks Type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,245.78 MiB
Shape,"(6443, 17999, 36000)","(6443, 100, 100)"
Count,64801 Tasks,64800 Chunks
Type,float32,numpy.ndarray


In [5]:
# Subset of Gulf Stream 
max_lon_glf = -70
min_lon_glf = -82

In [6]:
def subset_area(env_data,
                max_lon,
                min_lon):
    subset_lon = (env_data.lon >= min_lon) & (env_data.lon <= max_lon)
    subset_env_data = env_data.where(subset_lon, drop=True)
    return subset_env_data

In [7]:
gulf_stream_sst = subset_area(ds_sst, max_lon_glf, min_lon_glf)
gulf_stream_sst

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


Unnamed: 0,Array,Chunk
Bytes,518.85 GiB,245.78 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,69508 Tasks,2340 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 518.85 GiB 245.78 MiB Shape (6443, 17999, 1201) (6443, 100, 100) Count 69508 Tasks 2340 Chunks Type float32 numpy.ndarray",1201  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,518.85 GiB,245.78 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,69508 Tasks,2340 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,518.85 GiB,245.78 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,69508 Tasks,2340 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 518.85 GiB 245.78 MiB Shape (6443, 17999, 1201) (6443, 100, 100) Count 69508 Tasks 2340 Chunks Type float32 numpy.ndarray",1201  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,518.85 GiB,245.78 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,69508 Tasks,2340 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.01 TiB,491.56 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,71848 Tasks,2340 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.01 TiB 491.56 MiB Shape (6443, 17999, 1201) (6443, 100, 100) Count 71848 Tasks 2340 Chunks Type float64 numpy.ndarray",1201  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,1.01 TiB,491.56 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,71848 Tasks,2340 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,518.85 GiB,245.78 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,69508 Tasks,2340 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 518.85 GiB 245.78 MiB Shape (6443, 17999, 1201) (6443, 100, 100) Count 69508 Tasks 2340 Chunks Type float32 numpy.ndarray",1201  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,518.85 GiB,245.78 MiB
Shape,"(6443, 17999, 1201)","(6443, 100, 100)"
Count,69508 Tasks,2340 Chunks
Type,float32,numpy.ndarray


In [9]:

def xtractopy(envdata,
              tagdata: pd.DataFrame,
             filename: [str]):
    """
    envdata: environmental data in an DataArray format
    tagdata: tag data in a pandas format
    filename: the name of the file .csv output file, as a "string"
    """
    def fuction_dataset_point(**kwargs) -> Dict[str, Union[float, int]]:
        pass

    def extract(function_dataset_point, 
                df: tagdata, 
                map_coordinates: Dict[str, str], 
                rename_variables: Dict[str, str]
               ) -> pd.DataFrame:
        """
        function_dataset_point: environmental data in a point format, to be transformed
        map_coordinates: key is name of column in dataframe, value is the name of the coordinate in dataset
        rename_variables: TBD
        """
    
        def get_row(row) -> Dict[str, Union[float, int]]:
            extract_coordinates = {}
        
            for key, val in map_coordinates.items():
                extract_coordinates[val] = row[key]
        
            result = function_dataset_point(**extract_coordinates)
        
            # rename variables here and transform result TBD
            return result
    
        return df.apply(
            lambda row: get_row(row), axis=1, result_type="expand"
        )


    def envdata_point(lat, lon, time) -> Dict[str, Union[float, int]]:
        ds = envdata.sel(lat=lat, lon=lon, time=time, method="nearest")

        results = {}
    
        for var in ds.variables:
            if var not in ds.coords:
                results[var] = ds[var].values
    
        return results

    combined_dat = pd.concat([tagdata, # hopefully this includes "tagID"
                        extract(envdata_point,
                                tagdata, 
                                {"lat": "lat", "lon": "lon", "datetime": "time"}, 
                                {}
                               )
                       ], axis=1)
    combined_dat.to_csv("".join([filename, ".csv"])) # need to figure out how to paste the title into the csv file
    return combined_dat


## TEST THE FUNCTION

In [None]:
# test
xtractopy(gulf_stream_sst, test, "concat_sst")