In [1]:
import random

import pandas as pd
from ptrail.core.Datasets import Datasets
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
# Accumulator for the experiment results: every entry is one ", "-separated row,
# and the first entry is the header naming the columns.
final_results = ["seed, strategy, model, accuracy, f1_score"]

# Take the first value yielded by the pi-seed iterator; this single seed is
# reused by the split, the selection strategies and the result rows below.
# NOTE(review): the argument 1 is presumably the number of seeds to generate — confirm in Utilities.
seed = next(Utilities.generate_pi_seed(1))

def augment_trajectories(dataset, to_augment, circle, class_col, n_rounds=10):
    """
    Augment the selected trajectories several times and return pivoted kinematic statistics.

    Every round passes the result of the previous round back into
    ``Augmentation.augment_trajectories_with_randomly_generated_points``, so the
    augmentation is applied cumulatively ``n_rounds`` times.

    Parameters
    ----------
        dataset:
            The trajectory data handed to the first augmentation round.
        to_augment:
            Forwarded as ``ids_to_augment`` to the augmentation routine.
        circle:
            Forwarded as ``circle`` to the augmentation routine.
        class_col:
            Name of the class column; given to ``Statistics.generate_kinematic_stats``
            and used as ``target_col_name`` when pivoting.
        n_rounds: int, default 10
            Number of augmentation rounds. The default matches the previously
            hard-coded behaviour (one initial call followed by nine more).
            ``0`` skips augmentation and only computes the statistics.

    Returns
    -------
        The value returned by ``Statistics.pivot_stats_df`` for the augmented data.

    Raises
    ------
        ValueError:
            If ``n_rounds`` is negative.
    """
    if n_rounds < 0:
        raise ValueError(f"n_rounds must be non-negative, got {n_rounds}")

    # A single loop replaces the former "one call + range(1, 10)" duplication.
    for _ in range(n_rounds):
        dataset = Augmentation.augment_trajectories_with_randomly_generated_points(
            dataset,
            ids_to_augment=to_augment,
            circle=circle,
        )

    kinematic_stats = Statistics.generate_kinematic_stats(dataset, class_col)
    return Statistics.pivot_stats_df(dataframe=kinematic_stats, target_col_name=class_col)


In [3]:
# Load the Starkey dataset, add the distance column (needed by the augmentation
# step), and flatten the index so that 'traj_id' is addressable as a column.
starkey_data = Datasets.load_starkey()
starkey_with_distance = KinematicFeatures.create_distance_column(starkey_data)
ready_dataset = starkey_with_distance.reset_index()

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [4]:
# Split by trajectory id (not by point): 80% of the ids go to training.
traj_ids = list(ready_dataset['traj_id'].unique())
train_size = int(len(traj_ids) * 0.8)

# Seed the stdlib RNG so the sampled training ids are repeatable for this seed.
random.seed(seed)
train_traj_ids = random.sample(traj_ids, train_size)
# Everything that was not sampled becomes the test set, in original id order.
test_traj_ids = [traj_id for traj_id in traj_ids if traj_id not in train_traj_ids]

train_mask = ready_dataset['traj_id'].isin(train_traj_ids)
test_mask = ready_dataset['traj_id'].isin(test_traj_ids)
training = ready_dataset.loc[train_mask]
testing = ready_dataset.loc[test_mask]

# Turn the raw points of each split into pivoted kinematic statistics,
# with 'Species' as the target column.
train_stats = Statistics.generate_kinematic_stats(training, 'Species')
pivoted_train = Statistics.pivot_stats_df(dataframe=train_stats, target_col_name='Species')

test_stats = Statistics.generate_kinematic_stats(testing, 'Species')
pivoted_test = Statistics.pivot_stats_df(dataframe=test_stats, target_col_name='Species')

# The test features/labels are shared by the baseline and every augmented run.
y_test = pivoted_test['Species']
x_test = pivoted_test.drop(columns=['Species'])

In [5]:
%%time
# Baseline run: train on the un-augmented training statistics and score on the
# held-out test statistics.
b_trainX = pivoted_train.drop(columns=['Species'])
b_trainY = pivoted_train['Species']

# Seed the estimator as well: the recorded row is keyed by `seed`, so the model
# must not draw from an unseeded RNG or the row cannot be reproduced.
model = GradientBoostingClassifier(n_estimators=512, random_state=int(seed))
model.fit(b_trainX, b_trainY)

predicted = model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=predicted)
f1 = f1_score(y_true=y_test, y_pred=predicted, average='weighted')

# NOTE(review): the row is labelled "XGBoost" although the estimator is
# sklearn's GradientBoostingClassifier; the label is left unchanged so the
# rows stay comparable with previously exported results — confirm intent.
final_results.append(f"{seed}, Base, XGBoost, {acc}, {f1}")
final_results

CPU times: user 3.64 s, sys: 8.11 ms, total: 3.64 s
Wall time: 3.64 s


['seed, strategy, model, accuracy, f1_score',
 '1415, Base, XGBoost, 0.9607843137254902, 0.9607843137254902']

In [6]:
%%time

# Build the four candidate sets of trajectories to augment, all drawn from the
# training split only. The calls are kept in this order on purpose.

# 1) Random selection.
random_selected = Selection.select_randomly(
    training,
    seed=seed,
    k=0.2,
)

# 2) Proportional selection, driven by the 'Species' column.
proportional_selected = Selection.select_trajectories_proportionally(
    training,
    classification_col='Species',
    seed=seed,
    k=0.2,
)

# 3) Selection of the trajectories with the fewest points.
fewest_selected = Selection.select_with_fewest_points(
    training,
    k=0.2,
)

# 4) Representative selection.
rep_selected = Selection.select_representative_trajectories(
    training,
    'Species',
    closeness_cutoff=0.7,
    tolerance=10,
)

CPU times: user 34.8 s, sys: 2.85 s, total: 37.6 s
Wall time: 41.5 s


In [7]:
%%time

# Single source of truth for the experiment grid: each selection strategy label
# maps to the trajectories it selected. Deriving the parallel lists from this
# mapping prevents label drift (the hand-written list previously contained
# 'representation' next to 'representative').
selection_by_strategy = {
    'random': random_selected,
    'proportional': proportional_selected,
    'fewest': fewest_selected,
    'representative': rep_selected,
}
circle_options = ['on', 'in']

# The original list names are kept; every strategy is paired with each circle
# option, in the order 'on' then 'in'.
select_strategies = [name for name in selection_by_strategy for _ in circle_options]
circle_strategies = circle_options * len(selection_by_strategy)
selected_traj = [selection_by_strategy[name] for name in select_strategies]


for select, circle_strategy, to_augment in zip(select_strategies, circle_strategies, selected_traj):
    # Create the model. It is seeded so that the row recorded for `seed` can be
    # reproduced on the model side.
    model = GradientBoostingClassifier(n_estimators=512, random_state=int(seed))

    # Augment the trajectories and create the training set.
    train = augment_trajectories(dataset=training, to_augment=to_augment, circle=circle_strategy, class_col='Species')
    x_train = train.drop(columns=['Species'])
    y_train = train['Species']

    # Fit the model and predict on the shared, un-augmented test set.
    model.fit(X=x_train, y=y_train)
    pred_vals = model.predict(X=x_test)

    # Calculate the accuracy and f1 score.
    acc = accuracy_score(y_true=y_test, y_pred=pred_vals)
    score = f1_score(y_true=y_test, y_pred=pred_vals, average='weighted')

    # Build the row once so the printed line and the stored line cannot diverge.
    # NOTE(review): "XGBoost" does not match the sklearn estimator used here;
    # the label is left as-is for comparability with earlier exports.
    result_row = f"{seed}, {select}_{circle_strategy}, XGBoost, {acc}, {score}"
    print(result_row)
    final_results.append(result_row)

# ['1415, Base, XGBoost, 0.9607843137254902, 0.9607843137254902']

1415, random_on, XGBoost, 0.9607843137254902, 0.9606681382230311
1415, random_in, XGBoost, 0.9803921568627451, 0.980125383486728
1415, proportional_on, XGBoost, 0.9803921568627451, 0.980125383486728
1415, proportional_in, XGBoost, 0.9607843137254902, 0.9607855185084787
1415, fewest_on, XGBoost, 0.9803921568627451, 0.980125383486728
1415, fewest_in, XGBoost, 0.9803921568627451, 0.980125383486728
1415, representation_on, XGBoost, 0.9607843137254902, 0.9606681382230311
1415, representative_in, XGBoost, 0.9607843137254902, 0.95932069797616
CPU times: user 25min 58s, sys: 30 s, total: 26min 28s
Wall time: 27min 7s


In [8]:
import csv

# Destination of the exported results, relative to the current working directory.
file_path = "experiment_results.csv"

# Every entry in final_results is a ", "-separated row (header first); split
# each one back into cells and write them all out in one go.
with open(file_path, mode="w", newline="") as results_file:
    csv.writer(results_file).writerows(row.split(", ") for row in final_results)