In [1]:
import random

import matplotlib.pyplot as plt
import pandas as pd
from ptrail.core.Datasets import Datasets
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
# Identifiers of the training-set preparation strategies compared in this notebook.
# select_ids() dispatches on these values and the experiment loop uses them as row labels.

# No selection: select_ids() returns an empty list for this strategy.
BASE = 'base'
# Dispatched to Selection.select_randomly.
RANDOM_SELECTED = 'random_selected'
# Dispatched to Selection.select_trajectories_proportionally (on the 'Species' column).
PROPORTIONAL_SELECTED = 'proportional_selected'
# Dispatched to Selection.select_with_fewest_points.
FEWEST_SELECTED = 'fewest_selected'
# Dispatched to Selection.select_representative_trajectories.
REPRESENTATIVE_SELECTED = 'representative_selected'
# Dispatched to Augmentation.balance_dataset_with_augmentation with its default third argument.
BALANCED_ON = 'balanced_on'
# Dispatched to Augmentation.balance_dataset_with_augmentation with 'in' as third argument.
BALANCED_IN = 'balanced_in'

In [3]:
def augment_trajectories(dataset, ids_to_augment, circle, class_col, random_, n_rounds=3):
    """
        Augment the selected trajectories several times in a row and
        return the pivoted kinematic statistics of the resulting dataset.

        Parameters
        ----------
        dataset:
            The trajectory data to augment.
        ids_to_augment:
            The trajectory ids that the augmentation should be applied to.
        circle:
            Forwarded to the random-point augmentation as ``circle``.
            It is ignored when ``random_`` is falsy.
        class_col: str
            Name of the class/target column used when generating and
            pivoting the kinematic statistics.
        random_: bool
            If truthy, augment with randomly generated points, otherwise
            augment by dropping points.
        n_rounds: int
            How many consecutive augmentation passes to run. The default
            of 3 matches the previously hard-coded behaviour.

        Returns
        -------
            tuple:
                The pivoted statistics without ``class_col`` (features) and
                the ``class_col`` column (labels), after dropping NaN rows.
    """
    if random_:
        print("Performing Random Augmentation")

    # Each pass feeds the output of the previous pass back into the augmentation.
    for _ in range(n_rounds):
        if random_:
            dataset = Augmentation.augment_trajectories_with_randomly_generated_points(
                dataset,
                ids_to_augment=ids_to_augment,
                circle=circle,
            )
        else:
            dataset = Augmentation.augment_trajectories_by_dropping_points(dataset, ids_to_augment)

    pivoted = Statistics.pivot_stats_df(dataframe=Statistics.generate_kinematic_stats(dataset, class_col),
                                        target_col_name=class_col).dropna()
    return pivoted.drop(columns=[class_col]), pivoted[class_col]


In [4]:
# Load the Starkey dataset shipped with PTRAIL.
starkey_data = Datasets.load_starkey()

# Run the distance-column step and reset the index; later cells read
# 'traj_id' as a regular column of ready_dataset.
ready_dataset = (
    KinematicFeatures
    .create_distance_column(starkey_data)
    .reset_index()
)

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [5]:
def get_test_train_data(seed_val, k=0.8):
    """
        Given the seed value and a proportion, split the module-level
        ``ready_dataset`` into a training and a testing set by trajectory id.

        Note that this re-seeds the global ``random`` module with
        ``seed_val`` as a side effect.

        Parameters
        ----------
        seed_val: int
            The seed value to use to control the randomness
            while selecting the train-test split.
        k: float
            The fraction (e.g. 0.8, not 80) of trajectory ids to be
            used as training data.

        Returns
        -------
            tuple:
                training (the raw training rows with NaN rows dropped),
                testing_x (pivoted kinematic stats of the test
                trajectories without the 'Species' column) and
                testing_y (the 'Species' column of those stats).
    """
    # Get all the Trajectory Ids and set the random state.
    traj_ids = list(ready_dataset['traj_id'].unique())
    random.seed(seed_val)

    # Select the ids to be used as training set and then calculate the subsequent testing ids.
    train_size = int(len(traj_ids) * k)
    train_traj_ids = random.sample(traj_ids, train_size)
    # Set membership keeps the complement computation linear; the order of traj_ids is preserved.
    train_id_lookup = set(train_traj_ids)
    test_traj_ids = [id_ for id_ in traj_ids if id_ not in train_id_lookup]

    # Split the data into training and testing sets.
    training = ready_dataset.loc[ready_dataset.traj_id.isin(train_traj_ids)]
    testing = ready_dataset.loc[ready_dataset.traj_id.isin(test_traj_ids)]

    # Only the test set is converted to per-trajectory statistics here; the
    # training rows are returned raw so the caller can still augment them.
    pivoted_test = Statistics.pivot_stats_df(Statistics.generate_kinematic_stats(testing, 'Species', False),
                                             'Species').dropna()

    return training.dropna(), pivoted_test.drop(columns=['Species']), pivoted_test['Species']

In [6]:
def select_ids(dataset, select_strategy, seed_val):
    """
        Apply the given selection strategy to the dataset.

        Parameters
        ----------
        dataset:
            The training trajectory data to select from (or to balance).
        select_strategy: str
            One of the strategy constants defined at the top of the
            notebook (BASE, RANDOM_SELECTED, ..., BALANCED_IN).
        seed_val: int
            Seed forwarded to the selection helpers that accept one.

        Returns
        -------
            list or tuple:
                For BASE an empty list. For the ``*_SELECTED`` strategies
                whatever the corresponding ``Selection`` helper returns
                (the caller passes it on as the ids to augment). For the
                two balanced strategies a ``(train_x, train_y)`` tuple
                of pivoted kinematic stats, because balancing already
                performs the augmentation.

        Raises
        ------
            ValueError:
                If ``select_strategy`` is not one of the known strategies.
    """
    if select_strategy == BASE:
        return []
    if select_strategy == RANDOM_SELECTED:
        return Selection.select_randomly(dataset, seed_val)
    if select_strategy == PROPORTIONAL_SELECTED:
        return Selection.select_trajectories_proportionally(dataset, 'Species', seed_val)
    if select_strategy == FEWEST_SELECTED:
        return Selection.select_with_fewest_points(dataset)
    if select_strategy == REPRESENTATIVE_SELECTED:
        return Selection.select_representative_trajectories(dataset, 'Species', closeness_cutoff=0.7, tolerance=10)
    if select_strategy in (BALANCED_ON, BALANCED_IN):
        # BALANCED_ON relies on the helper's default third argument, so it is
        # deliberately not passed; BALANCED_IN passes 'in' explicitly.
        extra_args = ('in',) if select_strategy == BALANCED_IN else ()
        balanced = Augmentation.balance_dataset_with_augmentation(dataset, 'Species', *extra_args)
        balanced = Statistics.pivot_stats_df(dataframe=Statistics.generate_kinematic_stats(balanced, 'Species'),
                                             target_col_name='Species').dropna()
        return balanced.drop(columns=['Species']), balanced['Species']

    # Fail loudly instead of implicitly returning None for a mistyped strategy.
    raise ValueError(f"Unknown selection strategy: {select_strategy!r}")


In [7]:
# Seeds that drive the train/test split, the selection helpers and the models.
seed_generator = Utilities.generate_pi_seed(20)
seed_vals = [next(seed_generator) for _ in range(20)]

# Result rows are kept as ", "-separated strings; the first entry is the header.
final_results = ["seed, strategy, model, accuracy, f1_score"]

# All our selection strategies.
select_strategies = [
    BASE,
    RANDOM_SELECTED,
    PROPORTIONAL_SELECTED,
    FEWEST_SELECTED,
    REPRESENTATIVE_SELECTED,
    BALANCED_ON,
    BALANCED_IN
]
models = [GradientBoostingClassifier(), DecisionTreeClassifier(), SVC()]

for seed in seed_vals:
    for strategy in select_strategies:
        # The split and the training-set preparation do not depend on the model,
        # so they are done once per (seed, strategy) instead of once per model.
        # The split is intentionally NOT hoisted to the seed level: it re-seeds
        # `random`, which keeps every strategy starting from the same RNG state.
        training, test_x, test_y = get_test_train_data(seed, k=0.8)

        if strategy == BASE:
            # Baseline: plain kinematic stats of the un-augmented training data.
            base_stats = Statistics.pivot_stats_df(dataframe=Statistics.generate_kinematic_stats(training, 'Species'),
                                                   target_col_name='Species').dropna()
            train_x, train_y = base_stats.drop(columns=['Species']), base_stats['Species']
        elif strategy in (BALANCED_ON, BALANCED_IN):
            # Balancing strategies already return the final (train_x, train_y) pair.
            train_x, train_y = select_ids(dataset=training, select_strategy=strategy, seed_val=seed)
        else:
            # Selection strategies return ids, which are then augmented by dropping points.
            selected = select_ids(dataset=training, select_strategy=strategy, seed_val=seed)
            train_x, train_y = augment_trajectories(training, selected, '', 'Species', False)

        if train_x is None or train_y is None:
            continue

        for model in models:
            # Fit the model and predict.
            model.set_params(random_state=seed)
            model.fit(X=train_x, y=train_y)
            pred_vals = model.predict(X=test_x)

            # Calculate the accuracy and f1 score.
            acc = accuracy_score(y_true=test_y, y_pred=pred_vals)
            score = f1_score(y_true=test_y, y_pred=pred_vals, average='weighted')

            result_row = f"{seed}, {strategy}_drop, {model.__class__.__name__}, {acc}, {score}"
            print(result_row)
            final_results.append(result_row)


1415, base_drop, GradientBoostingClassifier, 0.9411764705882353, 0.94213898743228
1415, base_drop, DecisionTreeClassifier, 0.9215686274509803, 0.9203891803400677
1415, base_drop, SVC, 0.7450980392156863, 0.7311415794387932
1415, random_selected_drop, GradientBoostingClassifier, 0.9411764705882353, 0.9400729988965284
1415, random_selected_drop, DecisionTreeClassifier, 0.8823529411764706, 0.8812215082111277
1415, random_selected_drop, SVC, 0.7647058823529411, 0.749062443568132
1415, proportional_selected_drop, GradientBoostingClassifier, 0.9215686274509803, 0.9203891803400677
1415, proportional_selected_drop, DecisionTreeClassifier, 0.9019607843137255, 0.9011800849372007
1415, proportional_selected_drop, SVC, 0.7450980392156863, 0.7228962463259572
1415, fewest_selected_drop, GradientBoostingClassifier, 0.9411764705882353, 0.9399759903961584
1415, fewest_selected_drop, DecisionTreeClassifier, 0.9411764705882353, 0.9422466488908238
1415, fewest_selected_drop, SVC, 0.8235294117647058, 0.823

In [8]:
import csv

# Destination of the exported results.
file_path = "drop_results_starkey.csv"

# Each entry of final_results is a ", "-separated line (header first);
# split every entry into its fields and emit one CSV row per entry.
with open(file_path, mode="w", newline="") as out_file:
    csv.writer(out_file).writerows(row.split(", ") for row in final_results)