In [1]:
import numpy as np
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.core.Datasets import Datasets
from ptrail.preprocessing.statistics import Statistics

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from src.utils.test_utils import TestUtils
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
starkey_dataset = Datasets.load_starkey()
ready_dataset = KinematicFeatures.create_distance_column(starkey_dataset)
ready_dataset

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt,Distance
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,0,379662,5010734,95,13:13:00,02:39:00,1.47,
880109D01,1995-04-15 12:16:15,45.250521,-118.530438,230069775,20:16:15,19950415,19950415,12:16:15,409,0,379895,5011927,95,13:09:00,02:41:00,1.59,1224.551334
880109D01,1995-04-15 21:39:38,45.247943,-118.541455,230103578,05:39:38,19950416,19950415,21:39:38,409,0,379039,5011656,95,13:07:00,02:43:00,1.34,908.878736
880109D01,1995-04-16 03:32:14,45.247429,-118.539530,230124734,11:32:14,19950416,19950416,03:32:14,409,0,379188,5011581,95,13:07:00,02:43:00,1.50,161.204428
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,0,378938,5011567,95,13:07:00,02:43:00,1.34,241.258531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OSUX93191,1996-08-15 06:51:06,45.220642,-118.543392,272213466,14:51:06,19960815,19960815,06:51:06,390,2,378821,5008634,96,12:56:00,03:04:00,1.60,892.331554
OSUX93191,1996-08-15 08:45:15,45.219785,-118.546807,272220315,16:45:15,19960815,19960815,08:45:15,390,2,378568,5008518,96,12:56:00,03:04:00,1.39,283.975120
OSUX93191,1996-08-15 10:36:54,45.219801,-118.545661,272227014,18:36:54,19960815,19960815,10:36:54,390,2,378645,5008543,96,12:56:00,03:04:00,1.43,89.767305
OSUX93191,1996-08-15 12:31:22,45.220268,-118.551024,272233882,20:31:22,19960815,19960815,12:31:22,390,2,378232,5008600,96,12:56:00,03:04:00,1.53,423.187635


In [3]:
seed_generator = Utilities.generate_pi_seed(20)
seed_vals = [next(seed_generator) for i in range(20)]
shake_percentages = [0.2, 0.4, 0.6]
circle_methods = ['on', 'in']
ml_models = [ExtraTreesClassifier(), GradientBoostingClassifier(), RandomForestClassifier()]
scaler = MinMaxScaler((0, 1))

distance_results = [["seed", "on_20%_dist", "on_20%_std", "on_40%_dist", "on_40%_std", "on_60%_dist", "on_60%_std",
                    "in_20%_dist","in_20%_std","in_40%_dist","in_40%_std","in_60%_dist","in_60%_std"]]

model_results = [["seed", "model", "baseline", "in_20%_f1", "in_40%_f1", "in_60%_f1", "on_20%_f1", "on_40%_f1", "on_60%_f1"]]

for seed in seed_vals:
    # Intermediate lists for storing distance and model score values.
    distance_row = [seed]

    # Set apart 20% data for testing that augmentation process will never see.
    train, test_x, test_y = TestUtils.get_test_train_data(dataset=ready_dataset, seed_val=seed,
                                                          class_col='Species', k=0.8)

    model_row = TestUtils.create_model_row(seed, ml_models, "Species", train, test_x, test_y)
    for shake in shake_percentages:
        for method in circle_methods:
            # Randomly select 30% of trajectories to be augmented.
            selected = Selection.select_randomly(train, seed, k=0.3)

            # Augment the trajectories.
            train_x, train_y = TestUtils.augment_trajectories_using_random_strategy(dataset=train,
                                                                                    percent_to_shake=shake,
                                                                                    ids_to_augment=selected,
                                                                                    circle=method,
                                                                                    n_augmentations=20,
                                                                                    class_col="Species")
            mean, std = TestUtils.find_original_and_augmentation_pairs_and_calculate_differences(train_x, selected)
            distance_row.append(mean)
            distance_row.append(std)

            for i in range(len(ml_models)):
                f1_score = TestUtils.train_model_and_evaluate(ml_models[i], scaler.fit_transform(train_x), train_y,
                                                              scaler.fit_transform(test_x), test_y, seed)
                model_row[i].append(f1_score)

    model_results.extend(model_row)
    distance_results.append(distance_row)

    print(model_row)

[[1415, 'ExtraTreesClassifier', 0.9005, 0.7641, 0.8381, 0.4812, 0.7985, 0.2784, 0.7534], [1415, 'GradientBoostingClassifier', 0.8555, 0.7076, 0.8557, 0.4953, 0.8574, 0.5394, 0.8347], [1415, 'RandomForestClassifier', 0.8779, 0.6846, 0.8813, 0.4084, 0.8163, 0.2893, 0.6839]]
[[9265, 'ExtraTreesClassifier', 1.0, 0.891, 0.9405, 0.9184, 0.9413, 0.8763, 0.9801], [9265, 'GradientBoostingClassifier', 0.9231, 0.9016, 0.9431, 0.9046, 0.9236, 0.9015, 0.9431], [9265, 'RandomForestClassifier', 0.9801, 0.8998, 0.94, 0.8558, 0.9606, 0.9015, 0.9412]]
[[3589, 'ExtraTreesClassifier', 0.8791, 0.8823, 0.8601, 0.8413, 0.8799, 0.8093, 0.8601], [3589, 'GradientBoostingClassifier', 0.7771, 0.8191, 0.8794, 0.7588, 0.8363, 0.7965, 0.7959], [3589, 'RandomForestClassifier', 0.8433, 0.8257, 0.8823, 0.8212, 0.7541, 0.7787, 0.8589]]
[[7932, 'ExtraTreesClassifier', 0.8381, 0.635, 0.7472, 0.3431, 0.73, 0.2377, 0.5481], [7932, 'GradientBoostingClassifier', 0.7835, 0.6576, 0.8062, 0.4925, 0.7672, 0.5436, 0.7471], [7932, 

In [4]:
import csv

file_path = "./starkey_distances.csv"
with open(file_path, mode="w") as file:
    writer = csv.writer(file)
    for item in distance_results:
        writer.writerow(item)
    print(f"File successfully written to: {file_path}")

file_path = "./starkey_f1_score.csv"
with open(file_path, mode="w") as file:
    writer = csv.writer(file)
    for item in model_results:
        writer.writerow(item)
    print(f"File successfully written to: {file_path}")

File successfully written to: ./starkey_distances.csv
File successfully written to: ./starkey_f1_score.csv
