In [1]:
import numpy as np
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.core.Datasets import Datasets
from ptrail.preprocessing.statistics import Statistics

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from TestUtils.test_utils import TestUtils
from src.selection.select import Selection
from src.utils.general_utils import Utilities
import datetime

In [2]:
# Load the fishes CSV and parse the 'DateTime' column into UTC-aware timestamps
# in a single chained expression.
df = (
    pd.read_csv('../datasets/fishes.csv', engine='python')
    .assign(
        DateTime=lambda raw: pd.to_datetime(
            raw['DateTime'],
            format='%Y-%m-%d %H:%M:%S.%f',
            utc=True,
        )
    )
)

# Wrap the frame in a PTRAILDataFrame, telling it which columns hold the
# trajectory id, the timestamp and the coordinates.
fishes_dataset = PTRAILDataFrame(
    data_set=df,
    traj_id='traj_id',
    datetime='DateTime',
    latitude='lat',
    longitude='lon',
)

# Derive the dataset used by the experiments below; the rendered output of this
# cell shows a 'Distance' column alongside lat/lon/Species.
ready_dataset = KinematicFeatures.create_distance_column(fishes_dataset)
ready_dataset

  data[const.DateTime] = data[const.DateTime].astype('datetime64[ns]')


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Species,Distance
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1058,2013-01-04 05:05:00,-5.256800,71.657683,1,
1058,2013-01-05 18:27:00,-5.256800,71.657683,1,0.000000
1058,2013-01-05 18:29:00,-5.256800,71.657683,1,0.000000
1058,2013-01-05 18:33:00,-5.256800,71.657683,1,0.000000
1058,2013-01-06 04:52:00,-5.256800,71.657683,1,0.000000
...,...,...,...,...,...
t4961,2011-12-11 04:00:00,48.561960,-53.889830,0,0.000000
t4961,2011-12-11 05:00:00,48.562427,-53.890549,0,74.170844
t4961,2011-12-11 06:00:00,48.564111,-53.890225,0,188.663850
t4961,2011-12-11 07:00:00,48.564644,-53.889088,0,102.582900


In [None]:
# --- Experiment configuration -------------------------------------------------
seed_generator = Utilities.generate_pi_seed(1)
seed_vals = [next(seed_generator) for _ in range(1)]
shake_percentages = [0.2, 0.4, 0.6]
circle_methods = ['on', 'in']
ml_models = [ExtraTreesClassifier(), GradientBoostingClassifier(), RandomForestClassifier()]
scaler = MinMaxScaler((0, 1))

# Every (shake, method) combination, in the exact order the results are produced.
# The headers below are derived from this same list so that column labels can
# never drift out of sync with the values appended in the loop.
experiment_grid = [(shake, method) for shake in shake_percentages for method in circle_methods]

# First row of each result table is its header, e.g. "on_20%_dist", "on_20%_std", "in_20%_dist", ...
distance_results = [["seed"] + [f"{method}_{shake:.0%}_{stat}"
                                for shake, method in experiment_grid
                                for stat in ("dist", "std")]]

# e.g. "seed", "model", "baseline", "on_20%_f1", "in_20%_f1", "on_40%_f1", ...
model_results = [["seed", "model", "baseline"] + [f"{method}_{shake:.0%}_f1"
                                                  for shake, method in experiment_grid]]

# --- Run the experiment once per seed -----------------------------------------
for seed in seed_vals:
    # Intermediate list for storing the distance values of this seed.
    distance_row = [seed]

    # Set apart 20% data for testing that augmentation process will never see.
    train, test_x, test_y = TestUtils.get_test_train_data(dataset=ready_dataset, seed_val=seed,
                                                          class_col='Species', k=0.8)

    # One row per model; presumably each starts as [seed, model, baseline] to
    # line up with the model_results header -- TODO confirm against TestUtils.
    model_row = TestUtils.create_model_row(seed, ml_models, "Species", train, test_x, test_y)

    for shake, method in experiment_grid:
        # Randomly select 30% of trajectories to be augmented.
        selected = Selection.select_randomly(train, seed, k=0.3)

        # Augment the trajectories.
        train_x, train_y = TestUtils.augment_trajectories_using_random_strategy(dataset=train,
                                                                                percent_to_shake=shake,
                                                                                ids_to_augment=selected,
                                                                                circle=method,
                                                                                n_augmentations=1,
                                                                                class_col="Species")
        mean, std = TestUtils.find_original_and_augmentation_pairs_and_calculate_differences(train_x, selected)
        distance_row.extend([mean, std])

        # Fit the scaler on the training features only and reuse those learned
        # min/max values for the test features. Re-fitting on the test split
        # would scale it differently from the data the models were trained on.
        # Scaling does not depend on the model, so it is done once per combination.
        train_x_scaled = scaler.fit_transform(train_x)
        test_x_scaled = scaler.transform(test_x)

        for i, model in enumerate(ml_models):
            f1_score = TestUtils.train_model_and_evaluate(model, train_x_scaled, train_y,
                                                          test_x_scaled, test_y, seed)
            model_row[i].append(f1_score)

    model_results.extend(model_row)
    distance_results.append(distance_row)

    print(model_row)