In [232]:
import scipy as sp
import numpy as np
import pandas as pd
from mpmath import besseli
import utm
from statistics import mode
import torch
import os, re, glob, pyproj, math, datetime
from sys import platform

## Goto wind estimation

Originating in the [2016 paper](https://www.science.org/doi/10.1126/sciadv.1700097), this method was originally written for R. Here, we will translate it to work in Python, and try to improve run speeds as well. Comparisons will be made across methods to ensure differences are not being produced.

In [236]:
samplingInterval = 60
timeWindow = 51
cutlength = 45
cutv = 4.1667
constv = 34.7/3.6

def Likelihoodww(data1,data2,cv): # calculate log-likelihood of the model
    def f(par):
        a = par[1]
        b = cv/sp.gamma(1+1/a)
        mx = par[2]
        my = par[3]
        wx = par[4]
        wy = par[5]
        L = 0
        for i in range(len(data1)):
            rr = np.sqrt((data1[i]*np.cos(data2[i]) - wx)**2 + (data1[i]*np.sin(data2[i]) - wy)**2)
            rx = (data1[i]*np.cos(data2[i])-wx)/rr
            ry = (data1[i]*np.sin(data2[i])-wy)/rr
            lp = (a-2)*math.log(rr) - (rr/b)**a + mx*rx + my*ry + math.log(a) - math.log(b) + (1-a)*math.log(b) - math.log(besseli(np.sqrt(mx**2 + my**2),0,))
            L = L+lp
        return L

def Weibull_sd(a,b): # standard deviation of Weibull distribution
    return b*np.sqrt(sp.gamma(1+2/a) - sp.gamma(1+1/a)*sp.gamma(1+1/a))

def Weibull_mean(a,b): # mean of Weibull distribution
    return b*sp.gamma(1+1/a)

def Von_Mises_sd(kappa): # standard deviation of von Mises distribution
    return 1/np.sqrt(kappa)

def readAxyGPS(filename): # read in AxyTrek GPS data (txt files)
    df = pd.read_csv(filename, sep = "\t", header = None, usecols = [0,1,2,3],
    names = ['Date','Time','lat','lon'])
    df['DT'] = pd.to_datetime(df['Date'] + " " + df['Time'],format="%d/%m/%Y %H:%M:%S")
    return df

def readBIPAxy(filename): # read in AxyTrek data as formatted by BIP system
    df = pd.read_csv(filename, sep = ",", header = 0, usecols = [0,1,2], names = ['DT','lat','lon']).dropna()
    df['DT'] = pd.to_datetime(df['DT'].str[0:-6], format = "%Y-%m-%d %H:%M:%S")
    return df

def nearest(items, pivot): # find the nearest time position
    return min(items, key=lambda x: abs(x - pivot))

def timeRescale(dat,tdiff): # calculated indeces for rescaling time (DT) for regular sampling every tdiff mins
    return dat.iloc[np.arange(0,len(dat),step=np.timedelta64(tdiff,'m')/np.timedelta64(mode(np.diff(dat['DT'])),'s')).astype(int),:].reset_index()

def spTrav(DT,lat,lon,threshold=0): # speed from time (DT), lat, and lon
    geod = pyproj.Geod(ellps='WGS84')
    _, _, distance = geod.inv(lon.iloc[:-1],lat[:-1],lon[1:],lat[1:])
    speed = (distance*10**-3)/np.array(np.diff(DT)/np.timedelta64(3600,'s'))
    if threshold != 0:
        while np.nanmax(speed) > threshold:
            lat = lat[speed < threshold]
            lon = lon[speed < threshold]
            _, _, distance = geod.inv(lon[:-1],lat[:-1],lon[1:],lat[1:])
            speed = (distance)/np.array(np.diff(DT)/np.timedelta64(1,'s'))
    return np.append(np.nan,distance), np.append(np.nan,speed)

def XYfromUTM(lat,lon): # extract UTM data for lat,lon
    return utm.from_latlon(lat,lon)

def prePare(filename, convertToMin: bool = True): # prepare BIP data as per required for Goto original method. Adds columns 'dt' (elapsed time from fix to previous time point in seconds), 'dist' (distance travelled from previous point in m), 'track_speed' (in m/sec), 'track_direction' in rad
    df = readBIPAxy(filename)
    if convertToMin:
        df = timeRescale(df,1)
    df['dt'] = np.append(np.nan,(np.diff(df['DT']) / np.timedelta64(1,'s')).astype(int))
    X,Y,_,_ = XYfromUTM(np.array(df['lat']),np.array(df['lon']))
    vg_x_obs = np.diff(X)
    vg_y_obs = np.diff(Y)
    df['dist'], df['track_speed'] = spTrav(df['DT'],df['lat'],df['lon'])
    df['track_direction'] = np.append(np.nan,[math.atan2(vg_y_obs[x],vg_x_obs[x]) for x in range(len(vg_x_obs))])
    return df

In [6]:

# if platform == "darwin":
#     fileloc = "/Volumes/GoogleDrive-112399531131798335686/My Drive/PhD/Data/2018Shearwater/AxyTrek/"
# else:
#     fileloc = "I:/My Drive/PD/Data/2018Shearwater/AxyTrek/"
# # list all files
# files = glob.glob(fileloc + "**/*.txt")
# tags = np.unique([re.search('(AxyTrek[\\\\|/][0-9\-]+)[\\\\|/]',f).group(1) for f in files])
# dat = readAxyGPS(files[1]) # read in
# minDat = timeRescale(dat,1).reset_index() # convert to 1 min fs
# dt = (np.diff(minDat['DT']) / np.timedelta64(1,'s')).astype(int)

In [238]:
# reread data as sample from BIP system
if platform == "darwin":
    filename = "/Volumes/GoogleDrive-102199952889875375671/My Drive/PD/Data/TestingData/SampleAxyTrek.csv"
else:
    filename = "I:/My Drive/PD/Data/TestingData/SampleAxyTrek.csv"
df = prePare(filename)

### Estimation system

Once the data are read in, the initial portion of the program deals with idnetifying suitable windows to run the estimation model. These windows are required to by 51 minutes (approximately) in length, and within those 51 minutes, have over 45 samples, assuming a sampling frequency of 1 fix per minute. We assume there to be some error in the sampling interval. In the original study, this was taken as 5 seconds (i.e. we can expect samples to be 60 $\pm$ 5 seconds).

Starting from the first possible startpoint (half the window size in samples), the model then runs through the following processes:

1. Define window size.
   1. Find position of data which is 25.5 minutes after initial point.
   2. Repeat but for before initial point.
   3. Assign these positions as start and end of the window.
2. Create a new vector of track speed and direction where the speed is above the threshold of 4.1667 m/s, the sample is within 65 s of the previous sample, and direction is not equal to 100.

At this stage, the model starts to run through a variety of 'initial headings', set as each integer between -3 and 3. For each initial heading, the following processes are run:

1. Create variable `inita` as a random variable generated from a normal distribution with mean 12.5 and standard deviation 5. `inita` must be greater than 5.
2. Calculate the mean heading from all headings within the window that passed the above requirements.
3. The sum of the initial heading and the mean heading is determined, and using these data, the following variables are estimated:
   1. `kappa`: the concentration parameter for a von Mises distribution
   2. `mux` and `muy`: the x and y components of `kappa`
   3. `wx` and `wy`: the x and y components of wind (the track vector - the heading)
4. The `inita`, `mux`, `muy`, `wx`, and `wy` variables are then optimised using log-likelihood and track speed and direction data alongside a constant assumed mean air speed (34.7 m/s).
5. The standard deviation of the heading vector perpendicular to the mean direction `yoko` and the standard deviation of the heading vector along the mean direction `tate` are calculated.

If convergence is not reached but `tate` can be calculated, the process is repeated until convergence is reached.

In [282]:
time_window = 51
fs = (np.timedelta64(60,'s')/np.timedelta64(mode(np.diff(df['DT'])),'s')).astype(int)
cutlength = round(45/51 * (fs * 51))
expSamp = round(51 * fs)
cutv = 4.1667
constv = 34.7/3.6
error_of_sampling_interval = 5 * fs
cutt = (60 * fs) + error_of_sampling_interval
winwidth = (time_window * fs) - 1

Run through each datetime value from 1 to end - cutlength. First test if `datetime[i]:datetime[i+51]` is over 51 mins. If yes, then find if `datetime[i]:datetime[i+cutlength]` is over 51 mins. If no, set up a while loop to identify at what index the value is over 51 minutes.

In [283]:
# generate a function to create windows capable of running the estimation method. Requirements are 51 minutes of data, with 75% of expected samples
def windowFit(DT,start,end,end2,cutt):
    if (DT[end] - DT[start]) > np.timedelta64(cutt,'m'):
        if (DT[end2 - 1] - DT[start]) < np.timedelta64(cutt,'m'):
            while (DT[end2] - DT[start]) < np.timedelta64(cutt,'m'):
                end2 = end2 + 1
            return range(start,end2)
        
def findWindows(DT,cutt):
    # start from minimum possible point
    return list(filter(None,[windowFit(DT,x,x+expSamp,x+cutlength,51) for x in range(0,len(DT) - expSamp)]))

## Process:

1. First identify a suitable window.
   1. Determine the start and end such that they are greater than `windwidthsec` apart.
   2. Extract only data where the speed is greater than `cutv`, time difference is less than `cutt`, and the heading is not equal to 100 (when direction cannot be calculated i.e. track speed = 0).
2. If the resultant window is greater in length than `cutlength`, proceed to maximum likelihood modelling.

In [284]:
# remove low speed values and where time differences are too long
dfRm = df[(df['track_speed'] > cutv) & (df['dt'] < cutt)].reset_index()
# find suitable windows
windows = findWindows(dfRm['DT'],51)
windows[1]

range(185, 230)

In R, the function `A1inv` returns a value `k` from input argument `r` such that

$A1inv(k) = A1(r)$

where 

$A1(r) = \frac{I_1(\kappa)}{I_0(\kappa)}$

where $I_1$ and $I_0$ are the first and zeroth order Bessel functions, respectively.

However, Python does not have such a function, but the `circular` package has a [GitHub page](https://github.com/cran/circular) documenting the functions, including `A1inv` and so we can now simply implement this within Python as a new function.

In [313]:
def A1inv(x):
    if ((0 <= x) * (x < 0.53)):
        return 2 * x + x**3 + (5 * x**5)/6
    else:
        if (x < 0.85):
            return -0.4 + 1.39 * x + 0.43/(1-x)
        else:
            return 1/(x**3 - 4 * x**2 + 3 * x)

The initial method ran through a loop groung through every potential fitting time window. Instead, I think it would be best to optimise the process by first identifying usable windows then running through all those. This would remove the processing time for all none-fitting windows.

Suitable windows are ones which pass the following requirements:
1. Window length is greater or equal to minimum window size in seconds
2. Windows contain sufficient number of data points (75% of expected samples)
3. Data in window must be collected at speeds greater than the minimum ground speed (4.1667 m/s)

In [314]:
# from tokenize import Double

def findWindow(dt,center,windwidthsec):
    entr = 0
    passesE = False
    for qf in range(len(dt) - center):
        entr = entr + dt[center + qf]
        if entr > windwidthsec:
            passesE = True
            break
    entr = 0
    passesS = False
    for qb in range(center-1):
        entr = entr + dt[center-qb]
        if entr > windwidthsec:
            passesS = True
            break
    return passesE * passesS * (passesE - passesS > 44)

def trackVectors(id_hd,r,d,index):
    rr = []
    dd = []
    iindex = []
    for k in range(len(r)):
        if r[k] > cutv:
            rr = np.append(rr,r)
            dd = np.append(dd,d)
            iindex = np.append(iindex,index[k])
    inithd_first = id_hd/(3*pi/2)
    inita = 0
    while inita < 5:
        inita = np.abs(np.random.normal(12.5,5))
    meangd = np.arctan2(np.sum(np.sin(d)),np.sum(np.cos(d)))
    inithd = meangd + inithd_first
    initkappa = A1inv(np.mean(np.cos(d - meangd)))
    initmux = initkappa * np.cos(inithd)
    initmuy = initkappa * np.sin(inithd)
    initwx = np.mean(r) * np.cos(meangd) - constv * np.cos(inithd)
    initwy = np.mean(r) * np.sin(meangd) - constv * np.sin(inithd)

    # answ = sp.optimize.minimize(Likelihoodww(r,d,constv), np.array([inita,initmux,initmuy,initwx,initwy]),method='nelder-mead')

    # yoko = Von_Mises_sd(np.sqrt())

In [315]:
rr = rrow[windows[0]]
dd = rrow[windows[0]]
index = windows[0]
max_like = "NaN"
hd_try = 3

# CHANGE THIS TO FOR LOOP
id_hd = -hd_try

inithd_first = id_hd/(3*np.pi/2)
inita = 0
while inita < 5:
    inita = np.abs(np.random.normal(12.5,5))
meangd = np.arctan2(np.sum(np.sin(d)),np.sum(np.cos(d)))
inithd = meangd + inithd_first
initkappa = A1inv(np.mean(np.cos(d - meangd)))
initmux = initkappa * np.cos(inithd)
initmuy = initkappa * np.sin(inithd)
initwx = np.mean(r) * np.cos(meangd) - constv * np.cos(inithd)
initwy = np.mean(r) * np.sin(meangd) - constv * np.sin(inithd)

To speed up calculation, create two versions, one which generates winds every hour or so. Wind should not change so frequently and this may be optimal for more users. Also very useful for trial users.