In [1]:
import random

from ptrail.core.Datasets import Datasets
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
# Load the Starkey dataset through ptrail's Datasets helper.
starkey_data = Datasets.load_starkey()

# Run KinematicFeatures.create_distance_column over the loaded data, then
# reset the index: later cells read 'traj_id' as an ordinary column of
# `ready_dataset`.
starkey_with_distance = KinematicFeatures.create_distance_column(starkey_data)
ready_dataset = starkey_with_distance.reset_index()

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [3]:
# --- Trajectory-level train/test split --------------------------------------
# The split is made over trajectory IDs rather than over rows, so all rows of
# a given trajectory end up in the same partition.
traj_ids = list(ready_dataset['traj_id'].unique())
train_size = int(len(traj_ids) * 0.8)  # 80% of the trajectories go to training

# Take the first value produced by the pi-seed generator as the seed.
# (Binding the generator and its value to the same name, as before, made
# `seed` mean two different things within one cell.)
seed = next(Utilities.generate_pi_seed(1))

# Seed the stdlib RNG so that the sampled training IDs are reproducible.
random.seed(seed)
train_traj_ids = random.sample(traj_ids, train_size)

# Everything not sampled for training is held out for testing. Membership is
# checked against a set so the complement is linear in the number of IDs;
# iterating over `traj_ids` keeps the original ordering of the test IDs.
train_id_lookup = set(train_traj_ids)
test_traj_ids = [traj_id for traj_id in traj_ids if traj_id not in train_id_lookup]

# Sanity check: the two ID lists must partition the full ID list.
assert len(train_traj_ids) + len(test_traj_ids) == len(traj_ids), \
    "train/test trajectory IDs do not partition the dataset"

training = ready_dataset.loc[ready_dataset['traj_id'].isin(train_traj_ids)]
testing = ready_dataset.loc[ready_dataset['traj_id'].isin(test_traj_ids)]

# Get the original train and test data ready: generate kinematic statistics
# with 'Species' as the target column, then pivot them into the table that
# the classifiers consume.
pivoted_train = Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(training, 'Species'),
    target_col_name='Species'
)

pivoted_test = Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(testing, 'Species'),
    target_col_name='Species'
)

# Test features / labels are shared by every model evaluated below.
x_test = pivoted_test.drop(columns=['Species'])
y_test = pivoted_test['Species']

In [4]:
%%time
b_trainX = pivoted_train.drop(columns='Species')
b_trainY = pivoted_train['Species']

model = GradientBoostingClassifier(n_estimators=512)
model.fit(b_trainX, b_trainY)

predicted = model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=predicted)
f1 = f1_score(y_true=y_test, y_pred=predicted, average='weighted')
f"{seed}, Base, XGBoost, {acc}, {f1}"

CPU times: user 3.22 s, sys: 3.99 ms, total: 3.23 s
Wall time: 3.22 s


'1415, Base, XGBoost, 0.9607843137254902, 0.9607843137254902'

In [5]:
# Balance the training trajectories by augmentation (keyed on 'Species'),
# then build the pivoted kinematic-statistics table from the balanced data.
# The intermediate gets its own name so that `balanced` only ever refers to
# the final pivoted table.
balanced_trajectories = Augmentation.balance_dataset_with_augmentation(
    dataset=training,
    classification_col='Species'
)
balanced = Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(balanced_trajectories, 'Species'),
    target_col_name='Species'
)

# Create the training set from the balanced statistics.
x_train = balanced.drop(columns=['Species'])
y_train = balanced['Species']

# Same estimator configuration as the baseline cell. `random_state` is set so
# that repeated runs fit the same model; int(...) is used because the type
# yielded by the seed helper is not visible from this notebook.
model = GradientBoostingClassifier(n_estimators=512, random_state=int(seed))

# Fit the model and predict.
model.fit(X=x_train, y=y_train)
pred_vals = model.predict(X=x_test)

# Calculate the accuracy and f1 score.
acc = accuracy_score(y_true=y_test, y_pred=pred_vals)
score = f1_score(y_true=y_test, y_pred=pred_vals, average='weighted')

# NOTE(review): the label reads "XGBoost", but the estimator is scikit-learn's
# GradientBoostingClassifier. The string is left unchanged in case result
# records elsewhere depend on it -- confirm and rename if not.
f"{seed}, Balanced, XGBoost, {acc}, {score}"

'1415, Balanced, XGBoost, 0.9803921568627451, 0.980125383486728'