In [1]:
import numpy as np
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.core.Datasets import Datasets

from TestUtils.test_utils import TestUtils
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
# Load the Starkey trajectory dataset through PTRAIL's Datasets helper.
starkey_dataset = Datasets.load_starkey()
# Derive the 'Distance' column (last column in the preview rendered below);
# everything downstream in this notebook works on `ready_dataset`.
ready_dataset = KinematicFeatures.create_distance_column(starkey_dataset)
# Bare last expression so the notebook renders the dataset summary and preview.
ready_dataset

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt,Distance
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,0,379662,5010734,95,13:13:00,02:39:00,1.47,
880109D01,1995-04-15 12:16:15,45.250521,-118.530438,230069775,20:16:15,19950415,19950415,12:16:15,409,0,379895,5011927,95,13:09:00,02:41:00,1.59,1224.551334
880109D01,1995-04-15 21:39:38,45.247943,-118.541455,230103578,05:39:38,19950416,19950415,21:39:38,409,0,379039,5011656,95,13:07:00,02:43:00,1.34,908.878736
880109D01,1995-04-16 03:32:14,45.247429,-118.539530,230124734,11:32:14,19950416,19950416,03:32:14,409,0,379188,5011581,95,13:07:00,02:43:00,1.50,161.204428
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,0,378938,5011567,95,13:07:00,02:43:00,1.34,241.258531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OSUX93191,1996-08-15 06:51:06,45.220642,-118.543392,272213466,14:51:06,19960815,19960815,06:51:06,390,2,378821,5008634,96,12:56:00,03:04:00,1.60,892.331554
OSUX93191,1996-08-15 08:45:15,45.219785,-118.546807,272220315,16:45:15,19960815,19960815,08:45:15,390,2,378568,5008518,96,12:56:00,03:04:00,1.39,283.975120
OSUX93191,1996-08-15 10:36:54,45.219801,-118.545661,272227014,18:36:54,19960815,19960815,10:36:54,390,2,378645,5008543,96,12:56:00,03:04:00,1.43,89.767305
OSUX93191,1996-08-15 12:31:22,45.220268,-118.551024,272233882,20:31:22,19960815,19960815,12:31:22,390,2,378232,5008600,96,12:56:00,03:04:00,1.53,423.187635


In [3]:
def find_original_and_augmentation_pairs_and_calculate_differences(augmented_dataset, selected):
    """
        Measure how far augmented trajectories lie from their originals in
        feature space.

        For every id in ``selected``, the rows whose index label starts with
        ``"<id>aug"`` are treated as augmentations of the row whose index label
        equals ``<id>``. The Euclidean distance between the feature vector of
        each augmentation and that of its original is collected, and the mean
        and standard deviation over all collected distances are returned.

        Parameters
        ----------
            augmented_dataset: pandas.DataFrame
                One row of numeric features per trajectory, indexed by a
                string trajectory id. Must contain both the original rows
                and their augmentations.
            selected: iterable
                Ids of the original trajectories that were augmented. It is
                traversed exactly once, so one-shot iterables are fine.

        Returns
        -------
            tuple:
                ``(mean, std)`` of the distances, each rounded to 4 decimals.
                ``(nan, nan)`` when no original/augmentation pair was found.
    """
    index = augmented_dataset.index
    distances = []

    for traj_id in selected:
        # Literal prefix match: the id is NOT interpreted as a regex, so ids
        # containing characters such as '+', '.', or '(' match correctly.
        is_augmentation = index.str.startswith("{}aug".format(traj_id))

        original_features = augmented_dataset.loc[index == traj_id].to_numpy(dtype=float)
        aug_features = augmented_dataset.loc[is_augmentation].to_numpy(dtype=float)

        # Without an original (or without augmentations) there is no pair to
        # measure. Skipping avoids recording a meaningless 0.0 distance, which
        # is what the norm of an empty difference would evaluate to.
        if original_features.shape[0] == 0 or aug_features.shape[0] == 0:
            continue

        # Shape (n_aug, n_original_rows, n_features). The Frobenius norm over
        # the last two axes yields one distance per augmentation, which for the
        # usual single original row is the plain Euclidean distance.
        differences = original_features[np.newaxis, :, :] - aug_features[:, np.newaxis, :]
        distances.extend(np.linalg.norm(differences, axis=(1, 2)))

    if not distances:
        return float("nan"), float("nan")

    return round(np.mean(distances), 4), round(np.std(distances), 4)

In [4]:
# --- Experiment configuration -------------------------------------------------
# Draw 20 seeds from the project's seed generator; each seed drives one full
# sweep over every (shake percentage, circle method) combination.
seed_generator = Utilities.generate_pi_seed(20)
seed_vals = [next(seed_generator) for _ in range(20)]
shake_percentages = [0.2, 0.4, 0.6]
circle_methods = ['on', 'in']

# The header is generated with the SAME nesting order as the loops below
# (shake percentage outermost, then circle method, then mean/std), so every
# label is unique and lines up with the value stored in that position of a row:
#   on_20%_dist,on_20%_std,in_20%_dist,in_20%_std,on_40%_dist,...
column_names = [
    "{}_{:.0%}_{}".format(method, shake, stat)
    for shake in shake_percentages
    for method in circle_methods
    for stat in ("dist", "std")
]

# First entry is the comma-separated header line; one list of numbers per seed
# is appended after it.
results = [",".join(column_names)]

for seed in seed_vals:
    row = []
    for shake in shake_percentages:
        for method in circle_methods:
            # NOTE(review): the split and the selection depend only on `seed`, so
            # they look hoistable to the seed loop. They are kept here because it
            # is not visible whether the augmentation step mutates `train`.
            train, test_x, test_y = TestUtils.get_test_train_data(dataset=ready_dataset, seed_val=seed,
                                                                  class_col='Species', k=0.8)

            selected = Selection.select_randomly(train, seed, k=0.3)
            train_x, train_y = TestUtils.augment_trajectories_using_random_strategy(dataset=train,
                                                                                    percent_to_shake=shake,
                                                                                    ids_to_augment=selected,
                                                                                    circle=method,
                                                                                    n_augmentations=20,
                                                                                    class_col="Species")

            # Mean and spread of the feature-space distance between each selected
            # trajectory and its augmentations for this configuration.
            mean, std = find_original_and_augmentation_pairs_and_calculate_differences(train_x, selected)
            row.append(mean)
            row.append(std)
    # Progress output: one completed row per seed.
    print(row)
    results.append(row)


[19888.0202, 12885.1702, 2962.3119, 1804.8729, 25287.2707, 13660.7476, 3995.0658, 2307.7126, 29202.1452, 14376.7183, 4617.7559, 2492.8163]
[20334.2163, 12285.2713, 3361.982, 1797.526, 26644.1466, 13647.3283, 4393.2158, 2201.3673, 29905.7768, 14107.1622, 4967.9293, 2386.488]


KeyboardInterrupt: 