In [1]:
import random

import pandas as pd
from ptrail.core.Datasets import Datasets
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
def augment_trajectories(dataset, ids_to_augment, circle, class_col, random_, n_passes=3):
    """
        Augment the selected trajectories and return model-ready features.

        The chosen augmentation routine is applied ``n_passes`` times in a
        row, each pass receiving the output of the previous one. Kinematic
        statistics are then generated for the augmented data and pivoted.

        Parameters
        ----------
        dataset:
            The trajectory data to augment.
        ids_to_augment:
            The trajectory ids handed to the augmentation routine.
        circle:
            Forwarded as ``circle`` to the random-point augmentation.
            Not used by drop-augmentation.
        class_col: str
            Name of the target column. It is passed to the statistics
            helpers and separated from the features in the return value.
        random_: bool
            If truthy, augment with randomly generated points; otherwise
            augment by dropping points.
        n_passes: int
            Number of consecutive augmentation passes. Defaults to 3,
            which is the number of passes this function always performed
            before the parameter existed.

        Returns
        -------
            tuple:
                The pivoted statistics without ``class_col`` and the
                ``class_col`` column of the pivoted statistics.

        Raises
        ------
            ValueError:
                If ``n_passes`` is negative.
    """
    if n_passes < 0:
        raise ValueError(f"n_passes must be non-negative, got {n_passes}")

    if random_:
        print("Performing Random Augmentation")
        for _ in range(n_passes):
            dataset = Augmentation.augment_trajectories_with_randomly_generated_points(
                dataset,
                ids_to_augment=ids_to_augment,
                circle=circle,
            )
    else:
        print("Performing Drop-Augmentation")
        for _ in range(n_passes):
            dataset = Augmentation.augment_trajectories_by_dropping_points(dataset, ids_to_augment)

    # NOTE(review): the test split in this notebook passes an explicit third
    # positional argument (False) to generate_kinematic_stats, whereas this
    # call relies on the library default -- confirm the two agree.
    stats = Statistics.generate_kinematic_stats(dataset, class_col)
    pivoted = Statistics.pivot_stats_df(dataframe=stats, target_col_name=class_col)
    return pivoted.drop(columns=[class_col]), pivoted[class_col]


In [3]:
# Load the Starkey dataset shipped with PTRAIL.
starkey_data = Datasets.load_starkey()

# Add the distance column, then flatten the index so that 'traj_id' can be
# accessed as a regular column by the cells that follow.
starkey_with_distance = KinematicFeatures.create_distance_column(starkey_data)
ready_dataset = starkey_with_distance.reset_index()

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [4]:
def get_test_train_data(seed_val, k=0.8):
    """
        Split the notebook-level ``ready_dataset`` into a training set and
        a pivoted testing set. The split is done by trajectory id, so all
        the points of a trajectory end up on the same side.

        Note that this function reads the global ``ready_dataset``, uses
        the hard-coded 'Species' column as the target, and re-seeds the
        global ``random`` module state with ``seed_val``.

        Parameters
        ----------
        seed_val: int
            The seed value to use to control the randomness
            while selecting the train-test split.
        k: float
            The fraction (e.g. 0.8) of the trajectory ids to be
            used as training data. The training size is
            ``int(number_of_ids * k)``.

        Returns
        -------
            tuple:
                training:
                    The rows of ``ready_dataset`` that belong to the
                    training trajectories (not pivoted).
                testing_x:
                    The pivoted kinematic statistics of the testing
                    trajectories without the 'Species' column.
                testing_y:
                    The 'Species' column of the pivoted testing statistics.
    """
    # Get all the Trajectory Ids and set the random state.
    traj_ids = list(ready_dataset['traj_id'].unique())
    random.seed(seed_val)

    # Select the ids to be used as the training set.
    train_size = int(len(traj_ids) * k)
    train_traj_ids = random.sample(traj_ids, train_size)

    # Everything that was not sampled becomes a testing id. A set is used for
    # the membership check so that this is linear in the number of ids.
    train_id_lookup = set(train_traj_ids)
    test_traj_ids = [id_ for id_ in traj_ids if id_ not in train_id_lookup]

    # Split the data into training and testing sets.
    training = ready_dataset.loc[ready_dataset.traj_id.isin(train_traj_ids)]
    testing = ready_dataset.loc[ready_dataset.traj_id.isin(test_traj_ids)]

    # Only the testing set is converted to pivoted statistics here; the
    # training set is returned as-is.
    test_stats = Statistics.generate_kinematic_stats(testing, 'Species', False)
    pivoted_test = Statistics.pivot_stats_df(test_stats, 'Species')

    return training, pivoted_test.drop(columns=['Species']), pivoted_test['Species']

In [5]:
# Collect 20 seed values from the Utilities seed generator.
seed_generator = Utilities.generate_pi_seed(20)
seed_vals = [next(seed_generator) for _ in range(20)]

# Split by trajectory using the first seed; 80% of the ids go to training.
training, test_x, test_y = get_test_train_data(seed_val=seed_vals[0], k=0.8)

# Choose the trajectories to augment (same seed) and build the training
# features with the random-point augmentation.
selected = Selection.select_randomly(training, seed_vals[0])
train_x, train_y = augment_trajectories(
    dataset=training,
    ids_to_augment=selected,
    circle='in',
    class_col='Species',
    random_=True,
)

# Train a decision tree on the augmented data and score it on the test split.
model = DecisionTreeClassifier()
model.fit(train_x, train_y)
predicted = model.predict(test_x)

f1 = f1_score(test_y, predicted, average='weighted')
accuracy = accuracy_score(test_y, predicted)

print(f"F1 Score: {f1}, Accuracy: {accuracy}")

Performing Random Augmentation
F1 Score: 0.9011800849372007, Accuracy: 0.9019607843137255


In [6]:
# Collect 20 seed values from the Utilities seed generator.
seed_generator = Utilities.generate_pi_seed(20)
seed_vals = [next(seed_generator) for _ in range(20)]

# Split by trajectory using the first seed; 80% of the ids go to training.
training, test_x, test_y = get_test_train_data(seed_val=seed_vals[0], k=0.8)

# Choose the trajectories to augment (same seed) and build the training
# features with the drop-points augmentation. 'circle' is still passed, but
# augment_trajectories only forwards it in the random-point branch.
selected = Selection.select_randomly(training, seed_vals[0])
train_x, train_y = augment_trajectories(
    dataset=training,
    ids_to_augment=selected,
    circle='in',
    class_col='Species',
    random_=False,
)

# Train a decision tree on the augmented data and score it on the test split.
model = DecisionTreeClassifier()
model.fit(train_x, train_y)
predicted = model.predict(test_x)

f1 = f1_score(test_y, predicted, average='weighted')
accuracy = accuracy_score(test_y, predicted)

print(f"F1 Score: {f1}, Accuracy: {accuracy}")

Performing Drop-Augmentation
F1 Score: 0.8573583279465632, Accuracy: 0.8627450980392157
