In [1]:
import random

from ptrail.core.Datasets import Datasets
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection

In [2]:
# Load the dataset and create the distance column in it which is necessary for Augmentation.
starkey_data = Datasets.load_starkey()
ready_dataset = KinematicFeatures.create_distance_column(starkey_data).reset_index()

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [3]:
traj_ids = list(ready_dataset['traj_id'].unique())
train_size = int(len(traj_ids) * 0.75)

train_traj_ids = random.sample(traj_ids, train_size)
test_traj_ids = []
for t in traj_ids:
    if t not in train_traj_ids:
        test_traj_ids.append(t)

training, testing = ready_dataset.loc[ready_dataset.traj_id.isin(train_traj_ids)], \
    ready_dataset.loc[ready_dataset.traj_id.isin(test_traj_ids)]

# Get the original train and test data ready.
pivoted_train =Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(training, 'Species'),
    target_col_name='Species'
)

pivoted_test = Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(testing, 'Species'),
    target_col_name='Species'
)

In [4]:
def augment_trajectories(dataset, to_augment):
    print(f"Trajectories before augmentation: {len(dataset.traj_id.unique())}")
    dataset = Augmentation.augment_trajectories_with_randomly_generated_points(dataset,
                                                                           ids_to_augment=to_augment,
                                                                           circle='in')

    for i in range(1, 5):
        dataset = Augmentation.augment_trajectories_with_randomly_generated_points(dataset,
                                                                                   ids_to_augment=to_augment,
                                                                                   circle='in')


    for i in range(0, 5):
        dataset = Augmentation.augment_trajectories_with_randomly_generated_points(dataset,
                                                                                   ids_to_augment=to_augment,
                                                                                   circle='on')

    print(f"Trajectories after augmentation: {len(dataset.traj_id.unique())}")
    return Statistics.pivot_stats_df(dataframe=Statistics.generate_kinematic_stats(dataset, 'Species'), target_col_name='Species')


In [5]:
%%time

# Random selection
selected = Selection.select_randomly(training, k=0.2)
random_augmented = augment_trajectories(dataset=training, to_augment=selected)

Trajectories before augmentation: 189
Trajectories after augmentation: 559
CPU times: user 2min 18s, sys: 2.52 s, total: 2min 21s
Wall time: 2min 25s


In [6]:
%%time

# Proportional selection
proportionally_selected = Selection.select_trajectories_proportionally(training, classification_col='Species', k=0.2)
prop_augmented = augment_trajectories(dataset=training, to_augment=proportionally_selected)

Trajectories before augmentation: 189
Trajectories after augmentation: 579
CPU times: user 2min 20s, sys: 2.75 s, total: 2min 23s
Wall time: 2min 27s


In [7]:
%%time

# Fewest selection
fewest_selection = Selection.select_with_fewest_points(training, k=0.2)
fewest_augmented = augment_trajectories(dataset=training, to_augment=fewest_selection)

Trajectories before augmentation: 189
Trajectories after augmentation: 569
CPU times: user 1min 10s, sys: 1.48 s, total: 1min 12s
Wall time: 1min 14s


In [8]:
%%time

# Representative selection
rep_selection = Selection.select_representative_trajectories(training, 'Species', closeness_cutoff=0.7, tolerance=10)
rep_augmented = augment_trajectories(dataset=training, to_augment=rep_selection)

Trajectories before augmentation: 189
Trajectories after augmentation: 549
CPU times: user 2min 38s, sys: 4.4 s, total: 2min 43s
Wall time: 2min 49s


In [9]:
training_names = ["Original", "Random Augmented", "Proportionally Augmented", "Fewest Augmented", "Representative Augmented"]
train_sets = [pivoted_train, random_augmented, prop_augmented, fewest_augmented, rep_augmented]

for name, train in zip(training_names, train_sets):
    model = RandomForestClassifier(n_estimators=256)
    x_train = train.drop(columns=['Species'])
    y_train = train['Species']
    model.fit(X=x_train, y=y_train)

    x_test = pivoted_test.drop(columns=['Species'])
    y_test = pivoted_test['Species']
    pred_vals = model.predict(X=x_test)
    score = f1_score(y_true=y_test, y_pred=pred_vals, average='weighted')
    print(f"({name}) F1 Score: {score:.4f}")

(Original) F1 Score: 0.9369
(Random Augmented) F1 Score: 0.9519
(Proportionally Augmented) F1 Score: 0.9221
(Fewest Augmented) F1 Score: 0.9517
(Representative Augmented) F1 Score: 0.9221
