In [1]:
import random

from ptrail.core.Datasets import Datasets
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
seed = Utilities.generate_pi_seed(1)
seed = next(seed)

def augment_trajectories(dataset, to_augment, circle, class_col):
    print(f"Trajectories before augmentation: {len(dataset.traj_id.unique())}")
    dataset = Augmentation.augment_trajectories_with_randomly_generated_points(dataset,
                                                                               ids_to_augment=to_augment,
                                                                               circle=circle,
                                                                               seed=seed)

    for i in range(1, 10):
        dataset = Augmentation.augment_trajectories_with_randomly_generated_points(dataset,
                                                                                   ids_to_augment=to_augment,
                                                                                   circle=circle,
                                                                                   seed=seed)


    print(f"Trajectories after augmentation: {len(dataset.traj_id.unique())}")
    return Statistics.pivot_stats_df(dataframe=Statistics.generate_kinematic_stats(dataset, class_col), target_col_name=class_col)


In [3]:
# Load the dataset and create the distance column in it which is necessary for Augmentation.
final_results = []
starkey_data = Datasets.load_starkey()
ready_dataset = KinematicFeatures.create_distance_column(starkey_data).reset_index()

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [4]:
traj_ids = list(ready_dataset['traj_id'].unique())
train_size = int(len(traj_ids) * 0.8)

random.seed(seed)
train_traj_ids = random.sample(traj_ids, train_size)
test_traj_ids = []
for t in traj_ids:
    if t not in train_traj_ids:
        test_traj_ids.append(t)

training = ready_dataset.loc[ready_dataset.traj_id.isin(train_traj_ids)]
testing = ready_dataset.loc[ready_dataset.traj_id.isin(test_traj_ids)]

# Get the original train and test data ready.
pivoted_train =Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(training, 'Species'),
    target_col_name='Species'
)

pivoted_test = Statistics.pivot_stats_df(
    dataframe=Statistics.generate_kinematic_stats(testing, 'Species'),
    target_col_name='Species'
)

x_test = pivoted_test.drop(columns=['Species'])
y_test = pivoted_test['Species']

In [5]:
%%time
b_trainX = pivoted_train.drop(columns='Species')
b_trainY = pivoted_train['Species']

model = RandomForestClassifier(n_estimators=256, random_state=seed)
model.fit(b_trainX, b_trainY)

predicted = model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=predicted)
f1 = f1_score(y_true=y_test, y_pred=predicted, average='weighted')
final_results.append(f"{seed}, Base, RandomForest, {acc}, {f1}")
final_results

CPU times: user 485 ms, sys: 71 µs, total: 486 ms
Wall time: 486 ms


['1415, Base, RandomForest, 0.9803921568627451, 0.980125383486728']

In [6]:
%%time

# Random selection.
random_selected = Selection.select_randomly(training, seed=seed, k=0.2)

# Proportional selection.
proportional_selected = Selection.select_trajectories_proportionally(training, classification_col='Species', seed=seed, k=0.2)

# Fewest selection.
fewest_selected = Selection.select_with_fewest_points(training, k=0.2)

# Representative Selection
rep_selected = Selection.select_representative_trajectories(training, 'Species', closeness_cutoff=0.7, tolerance=10)

CPU times: user 31.2 s, sys: 2.56 s, total: 33.7 s
Wall time: 37.1 s


In [7]:
select_strategies = [
    'random', 'random', 'proportional', 'proportional',
    'fewest', 'fewest', 'representation', 'representative'
]
augment_strategies = ['on', 'in', 'on', 'in', 'on', 'in', 'on', 'in']
selected_traj = [
    random_selected, random_selected, proportional_selected, proportional_selected,
    fewest_selected, fewest_selected, rep_selected, rep_selected
]


for select, augment, traj in zip(select_strategies, augment_strategies, selected_traj):
    # Create the model.
    model = RandomForestClassifier(n_estimators=256, random_state=seed)

    # Augment the trajectories and create the training set.
    train = augment_trajectories(dataset=training, to_augment=traj, circle=augment, class_col='Species')
    x_train = train.drop(columns=['Species'])
    y_train = train['Species']

    # Fit the model and predict.
    model.fit(X=x_train, y=y_train)
    pred_vals = model.predict(X=x_test)

    # Calculate the accuracy and f1 score.
    acc = accuracy_score(y_true=y_test, y_pred=pred_vals)
    score = f1_score(y_true=y_test, y_pred=pred_vals, average='weighted')
    final_results.append(f"{seed}, {select}_{augment}, RandomForest, {acc}, {f1}")


Trajectories before augmentation: 202
Trajectories after augmentation: 242


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
final_results