In [3]:
import sys
sys.path.append('../')

In [4]:
import math
from random import *

import pandas as pd
import progressbar
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection

In [5]:
# def trajectoryAugumentationProcedure(trajs, seed, class_name, selection, augment):
#     myRandom = Random(seed)
#
#     # Select the trajectories and remove duplicates from original dataset.
#     if selection == 'random':
#         splits = Selection.select_randomly(trajs, myRandom, .2)
#     elif selection == 'proportional':
#         splits = Selection.select_trajectories_proportionally(trajs, myRandom, .2)
#     elif selection == 'class':
#         splits = Selection.select_fewest_class(trajs, class_name, myRandom)
#
#     paramTestingDataSet = Filters.remove_duplicates(dataframe=trajs)
#
#     trainDataParm = paramTestingDataSet.loc[paramTestingDataSet.traj_id.isin(splits["train"]) == True].dropna()
#     testDataParm = paramTestingDataSet.loc[paramTestingDataSet.traj_id.isin(splits["test"]) == True].dropna()
#     testData = PTRAILDataFrame(data_set=testDataParm,
#                                latitude='lat',
#                                longitude='lon',
#                                datetime='DateTime',
#                                traj_id='traj_id')
#
#     statsTestParm = Statistics.generate_kinematic_stats(dataframe=testData, target_col_name=class_name)
#     pivotedStatsTestParm = Statistics.pivot_stats_df(dataframe=statsTestParm, target_col_name=class_name)
#     pivotedStatsTestParm = pivotedStatsTestParm.loc[:,~pivotedStatsTestParm.columns.duplicated()]
#     testParmX = pivotedStatsTestParm.drop(columns=class_name)
#     testParmY = pivotedStatsTestParm[class_name].to_numpy()
#     noiseTraj = trainDataParm.traj_id.unique()
#
#     sampledTraj = myRandom.choices(sorted(noiseTraj), k=math.floor(len(noiseTraj)))
#     for traj in sampledTraj:
#         trajToChange = trainDataParm.loc[trainDataParm.traj_id == traj]
#         #Trajectory must be changed
#         if augment == 'on':
#             trajChanged = Augmentation.augment_trajectories_with_randomly_generated_points(trajToChange, myRandom, 'on')
#         elif augment == 'in':
#             trajChanged = Augmentation.augment_trajectories_with_randomly_generated_points(trajToChange, myRandom, 'in')
#
#         trainDataParm = pd.concat([trainDataParm, trajChanged], ignore_index = True)
#
#     trainDataNoise = PTRAILDataFrame(data_set=trainDataParm,
#                                             datetime='DateTime',
#                                             traj_id='traj_id',
#                                             latitude='lat',
#                                             longitude='lon')
#
#     statsTrainNoiseParm = Statistics.generate_kinematic_stats(dataframe=trainDataNoise, target_col_name=class_name)
#     pivotedStatsTrainNoiseParm = Statistics.pivot_stats_df(dataframe=statsTrainNoiseParm, target_col_name=class_name)
#     pivotedStatsTrainNoise = pivotedStatsTrainNoiseParm.loc[:, ~pivotedStatsTrainNoiseParm.columns.duplicated()]
#     pivotedStatsTrainNoise=pivotedStatsTrainNoise.dropna()
#
#     trainParmX = pivotedStatsTrainNoise.drop(columns=class_name)
#     trainParmY = pivotedStatsTrainNoise[class_name].to_numpy()
#
#     # Why is this interpolated?
#     testParmX = testParmX.interpolate()
#     return [trainParmX, trainParmY, testParmX, testParmY]

In [6]:
def augment_and_create_train_test_splits(trajs, seed, class_name, selection, augment, augment_percent):
    """
    Split the trajectories into train/test sets, augment part of the training
    set and return the kinematic-statistics feature matrices for both sets.

    Parameters
    ----------
        trajs:
            Trajectory dataframe. It must expose 'DateTime', 'traj_id', 'lat'
            and 'lon' as columns, together with the `class_name` column.
        seed:
            Seed for the `Random` instance that drives the split selection,
            the choice of trajectories to augment and the augmentation itself.
        class_name: str
            Name of the target (label) column.
        selection: str
            Train/test selection strategy: 'random', 'proportional' or 'fewest'.
        augment: str
            Augmentation mode forwarded to
            `Augmentation.augment_trajectories_with_randomly_generated_points`:
            'on' or 'in'.
        augment_percent: float
            Fraction of the training trajectory ids to draw for augmentation.
            The number of draws is floor(augment_percent * number of ids).

    Returns
    -------
        list:
            [X_train, Y_train, X_test, Y_test], each converted with `to_numpy()`.
            The X arrays are the pivoted statistics without the target column.

    Raises
    ------
        ValueError:
            If `selection` or `augment` is not one of the supported values.
    """
    # Validate both mode arguments before doing any work. Otherwise an unknown
    # value would leave `splits` / `augmented_traj` unassigned and surface later
    # as an unrelated error (or silently reuse a previous iteration's result).
    valid_selections = ('random', 'proportional', 'fewest')
    valid_augments = ('on', 'in')
    if selection not in valid_selections:
        raise ValueError(f"Unknown selection strategy {selection!r}; expected one of {valid_selections}.")
    if augment not in valid_augments:
        raise ValueError(f"Unknown augmentation mode {augment!r}; expected one of {valid_augments}.")

    myRandom = Random(seed)

    # Select the trajectories and remove duplicates from original dataset.
    if selection == 'random':
        splits = Selection.select_randomly(trajs, myRandom, .2)
    elif selection == 'proportional':
        splits = Selection.select_trajectories_proportionally(trajs, myRandom, .2)
    else:  # 'fewest' -- guaranteed by the validation above.
        splits = Selection.select_fewest_class(trajs, class_name, myRandom)
    unique_data = trajs.drop_duplicates(subset=['DateTime', 'traj_id', 'lat', 'lon'],
                                        keep='first')

    # Split the original data into training and testing sets as per the splits created above.
    unique_training_data = unique_data.loc[unique_data['traj_id'].isin(splits['train'])].dropna()
    unique_testing_data = unique_data.loc[unique_data['traj_id'].isin(splits['test'])].dropna()

    # Randomly select the trajectories to be augmented and then
    # perform the augmentation procedure.
    # NOTE(review): choices() draws with replacement, so the same trajectory can
    # be selected (and augmented) more than once -- confirm this is intended.
    traj_ids = unique_training_data['traj_id'].unique()
    selected_trajectories = myRandom.choices(sorted(traj_ids), k=math.floor(augment_percent * len(traj_ids)))
    for traj in selected_trajectories:
        # The lookup reads from the frame as extended by earlier iterations,
        # which is why the concatenation stays inside the loop.
        current_traj = unique_training_data.loc[unique_training_data.traj_id == traj]
        augmented_traj = Augmentation.augment_trajectories_with_randomly_generated_points(current_traj,
                                                                                          myRandom,
                                                                                          augment)
        unique_training_data = pd.concat([unique_training_data, augmented_traj])

    # Create the training X and Y values.
    X_train = Statistics.pivot_stats_df(Statistics.generate_kinematic_stats(unique_training_data, class_name),
                                        class_name)
    Y_train = X_train[class_name].to_numpy()

    # Create the testing X and Y values.
    X_test = Statistics.pivot_stats_df(Statistics.generate_kinematic_stats(unique_testing_data, class_name),
                                       class_name)
    Y_test = X_test[class_name].to_numpy()

    # Drop the target column and return the training and testing splits.
    return [X_train.drop(columns=[class_name]).to_numpy(), Y_train,
            X_test.drop(columns=[class_name]).to_numpy(), Y_test]


In [9]:
# Read './starkey.csv' and wrap it in a PTRAILDataFrame, naming the columns that
# hold the trajectory id, timestamp, latitude and longitude.
dataset = PTRAILDataFrame(pd.read_csv('./starkey.csv'), traj_id='traj_id',
                          datetime='DateTime', latitude='lat', longitude='lon')
# Presumably adds a per-point distance column to the frame -- TODO confirm
# against the ptrail documentation.
ready_dataset = KinematicFeatures.create_distance_column(dataset)

# Reset the index in place. The split function in this notebook reads 'traj_id'
# and 'DateTime' as ordinary columns, so this presumably moves them out of the
# index -- verify against how PTRAILDataFrame indexes its data.
ready_dataset.reset_index(inplace=True)
# Display the distinct trajectory ids as the cell output.
# NOTE(review): this second reset_index() works on a temporary copy and looks
# redundant after the in-place reset above -- confirm before simplifying.
ready_dataset.reset_index().traj_id.unique()

array(['880109D01', '880119D02', '880120D02', '890130D09', '890221E02',
       '890222E01', '890224E04', '890317E01', '890317E23', '890317E25',
       '890324E17', '890328E12', '890328E21', '890328E34', '890413E05',
       '890418E01', '890418E04', '890418E13', '890418E15', '890424E06',
       '890424E08', '890504E01', '900205E01', '900205E11', '900205E14',
       '900219E15', '900313D02', '900626E01', '910130D01', '910214D01',
       '910301D01', '910312E05', '910312E09', '910312E12', '910312E13',
       '910313E07', '910313E11', '910313E18', '910313E19', '910313E26',
       '910313E37', '910315E04', '910315E08', '910315E14', '910315E15',
       '910315E17', '910315E20', '910315E22', '910319E02', '910319E11',
       '920122D01', '920125E01', '920225D01', '920226D01', '920303D05',
       '920304D03', '920309D02', '920318D02', '921123E22', '921130E09',
       '921130E30', '921215E02', '921216E02', '921216E04', '921216E08',
       '921228E02', '921228E04', '921228E06', '921228E09', '9212

In [None]:
%%time

# Full experiment grid (swap in for the reduced grid below to run everything).
# The selection names must match the ones the split function dispatches on
# ('random', 'proportional', 'fewest').
# seed_vals = [14159, 26535, 89793]
# selection_vals = ['random', 'fewest', 'proportional']
# augment_vals = ['on', 'in']

# Reduced grid currently in use.
seed_vals = [14159]
selection_vals = ['random', 'fewest', 'proportional']
augment_vals = ['on']
augment_percents = [0.2]

# The grid is fixed before the loop starts, so the number of runs is known and
# the progress bar can show real progress instead of an unknown-length spinner.
total_runs = len(seed_vals) * len(selection_vals) * len(augment_vals) * len(augment_percents)

i = 0
lst = []
bar = progressbar.ProgressBar(max_value=total_runs)
bar.start()
for s in seed_vals:
    for sel in selection_vals:
        for aug in augment_vals:
            for percent in augment_percents:
                # Create the model.
                model = RandomForestClassifier(random_state=s)

                # Get the augmented data as [X_train, Y_train, X_test, Y_test].
                data = augment_and_create_train_test_splits(ready_dataset, s, 'Species', sel, aug, percent)

                # Fit the model and perform testing.
                model.fit(data[0], data[1])
                test_predict = model.predict(data[2])
                performance_val = f1_score(data[3], test_predict, average='weighted')

                # Record the run as "seed, selection, augment, percent, weighted F1".
                lst.append(f"{s}, {sel}, {aug}, {percent}, {performance_val}")

                # Report progress only once the run has actually completed.
                i += 1
                bar.update(i)
bar.finish()

print(lst)