In [1]:
#import sys
#sys.path.append('/home/nicholasjesperson/Documents/School/Comp4780/Data_Augmentation/Data Augmentation/paper/AugmenTRAJ')

In [2]:
import math
from random import *

import pandas as pd
import progressbar
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.statistics import Statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from src.augmentation.augment import Augmentation
from src.selection.select import Selection

In [3]:
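# NOTE: legacy, commented-out version of augment_and_create_train_test_splits (defined in the next cell).
# It is kept for reference only and is not runnable as-is: it uses `Filters`, which the import cell does not import.
# def trajectoryAugumentationProcedure(trajs, seed, class_name, selection, augment):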
#     myRandom = Random(seed)
#
#     # Select the trajectories and remove duplicates from original dataset.
#     if selection == 'random':
#         splits = Selection.select_randomly(trajs, myRandom, .2)
#     elif selection == 'proportional':
#         splits = Selection.select_trajectories_proportionally(trajs, myRandom, .2)
#     elif selection == 'class':
#         splits = Selection.select_fewest_class(trajs, class_name, myRandom)
#
#     paramTestingDataSet = Filters.remove_duplicates(dataframe=trajs)
#
#     trainDataParm = paramTestingDataSet.loc[paramTestingDataSet.traj_id.isin(splits["train"]) == True].dropna()
#     testDataParm = paramTestingDataSet.loc[paramTestingDataSet.traj_id.isin(splits["test"]) == True].dropna()
#     testData = PTRAILDataFrame(data_set=testDataParm,
#                                latitude='lat',
#                                longitude='lon',
#                                datetime='DateTime',
#                                traj_id='traj_id')
#
#     statsTestParm = Statistics.generate_kinematic_stats(dataframe=testData, target_col_name=class_name)
#     pivotedStatsTestParm = Statistics.pivot_stats_df(dataframe=statsTestParm, target_col_name=class_name)
#     pivotedStatsTestParm = pivotedStatsTestParm.loc[:,~pivotedStatsTestParm.columns.duplicated()]
#     testParmX = pivotedStatsTestParm.drop(columns=class_name)
#     testParmY = pivotedStatsTestParm[class_name].to_numpy()
#     noiseTraj = trainDataParm.traj_id.unique()
#
#     sampledTraj = myRandom.choices(sorted(noiseTraj), k=math.floor(len(noiseTraj)))
#     for traj in sampledTraj:
#         trajToChange = trainDataParm.loc[trainDataParm.traj_id == traj]
#         #Trajectory must be changed
#         if augment == 'on':
#             trajChanged = Augmentation.augment_trajectories_with_randomly_generated_points(trajToChange, myRandom, 'on')
#         elif augment == 'in':
#             trajChanged = Augmentation.augment_trajectories_with_randomly_generated_points(trajToChange, myRandom, 'in')
#
#         trainDataParm = pd.concat([trainDataParm, trajChanged], ignore_index = True)
#
#     trainDataNoise = PTRAILDataFrame(data_set=trainDataParm,
#                                             datetime='DateTime',
#                                             traj_id='traj_id',
#                                             latitude='lat',
#                                             longitude='lon')
#
#     statsTrainNoiseParm = Statistics.generate_kinematic_stats(dataframe=trainDataNoise, target_col_name=class_name)
#     pivotedStatsTrainNoiseParm = Statistics.pivot_stats_df(dataframe=statsTrainNoiseParm, target_col_name=class_name)
#     pivotedStatsTrainNoise = pivotedStatsTrainNoiseParm.loc[:, ~pivotedStatsTrainNoiseParm.columns.duplicated()]
#     pivotedStatsTrainNoise=pivotedStatsTrainNoise.dropna()
#
#     trainParmX = pivotedStatsTrainNoise.drop(columns=class_name)
#     trainParmY = pivotedStatsTrainNoise[class_name].to_numpy()
#
#     # Why is this interpolated?
#     testParmX = testParmX.interpolate()
#     return [trainParmX, trainParmY, testParmX, testParmY]

In [4]:
def augment_and_create_train_test_splits(trajs, seed, class_name, selection, augment, augment_percent):
    """
    Split the trajectories into train/test sets, augment part of the training
    set and build the feature matrices used for model training and testing.

    Parameters
    ----------
        trajs:
            pandas-style frame of trajectory points. It must expose the
            columns 'DateTime', 'traj_id', 'lat' and 'lon' (used for
            de-duplication and splitting) as well as the target column.
        seed:
            Seed for the private ``random.Random`` instance that drives the
            split, the choice of trajectories to augment and the augmentation.
        class_name: str
            Name of the target (label) column.
        selection: str
            Train/test selection strategy: 'random', 'proportional' or 'fewest'.
        augment: str
            Augmentation mode, 'on' or 'in'. It is forwarded unchanged to
            ``Augmentation.augment_trajectories_with_randomly_generated_points``.
        augment_percent: float
            Multiplier applied to the number of training trajectory ids;
            ``floor(augment_percent * n_ids)`` ids are drawn for augmentation.

    Returns
    -------
        list:
            ``[X_train, Y_train, X_test, Y_test]``, each a NumPy array.

    Raises
    ------
        ValueError:
            If ``selection`` or ``augment`` is not one of the supported values.
    """
    # Fail fast on bad options. Without these checks an unknown value leaves
    # `splits` / `augmented_traj` unbound and the function dies much later
    # with an unrelated-looking UnboundLocalError.
    valid_selections = ('random', 'proportional', 'fewest')
    valid_augments = ('on', 'in')
    if selection not in valid_selections:
        raise ValueError(f"Unknown selection {selection!r}; expected one of {valid_selections}.")
    if augment not in valid_augments:
        raise ValueError(f"Unknown augment {augment!r}; expected one of {valid_augments}.")

    myRandom = Random(seed)

    # Select the train/test trajectory ids.
    # NOTE(review): the .2 is handed straight to the selector -- presumably the
    # test-set fraction; confirm against Selection.
    if selection == 'random':
        splits = Selection.select_randomly(trajs, myRandom, .2)
    elif selection == 'proportional':
        splits = Selection.select_trajectories_proportionally(trajs, myRandom, .2)
    else:  # 'fewest' (guaranteed by the validation above)
        splits = Selection.select_fewest_class(trajs, class_name, myRandom)

    # Remove duplicated points from the original dataset.
    unique_data = trajs.drop_duplicates(subset=['DateTime', 'traj_id', 'lat', 'lon'],
                                        keep='first')

    # Split the original data into training and testing sets as per the splits created above.
    unique_training_data = unique_data.loc[unique_data['traj_id'].isin(splits['train'])].dropna()
    unique_testing_data = unique_data.loc[unique_data['traj_id'].isin(splits['test'])].dropna()

    # Randomly pick the training trajectories to augment. choices() samples
    # WITH replacement, so the same id may be drawn (and augmented) twice.
    traj_ids = unique_training_data['traj_id'].unique()
    n_to_augment = math.floor(augment_percent * len(traj_ids))
    selected_trajectories = myRandom.choices(sorted(traj_ids), k=n_to_augment)

    for traj in selected_trajectories:
        # The lookup deliberately runs against the frame as grown so far, and
        # the concat stays inside the loop: whether augmented rows reuse the
        # source traj_id is not visible here, so batching the concat could
        # change which rows a repeated id picks up.
        current_traj = unique_training_data.loc[unique_training_data['traj_id'] == traj]
        augmented_traj = Augmentation.augment_trajectories_with_randomly_generated_points(
            current_traj, myRandom, augment)
        unique_training_data = pd.concat([unique_training_data, augmented_traj])

    # Create the training X and Y values.
    X_train = Statistics.pivot_stats_df(Statistics.generate_kinematic_stats(unique_training_data, class_name),
                                        class_name)
    Y_train = X_train[class_name].to_numpy()

    # Create the testing X and Y values.
    X_test = Statistics.pivot_stats_df(Statistics.generate_kinematic_stats(unique_testing_data, class_name),
                                       class_name)
    Y_test = X_test[class_name].to_numpy()

    # Drop the target column and return the training and testing splits.
    return [X_train.drop(columns=[class_name]).to_numpy(), Y_train,
            X_test.drop(columns=[class_name]).to_numpy(), Y_test]


In [5]:
# Read the Starkey CSV and wrap it in a PTRAIL trajectory frame, telling
# PTRAIL which columns hold the id, timestamp and coordinates.
dataset = PTRAILDataFrame(
    data_set=pd.read_csv('./starkey.csv'),
    latitude='lat',
    longitude='lon',
    datetime='DateTime',
    traj_id='traj_id',
)

# Compute the per-point distance feature (shows up as the 'Distance' column).
ready_dataset = KinematicFeatures.create_distance_column(dataset)

# Flatten the index in place so that 'traj_id' and 'DateTime' are addressable
# as ordinary columns by the split/augmentation function.
ready_dataset.reset_index(inplace=True)
ready_dataset

Unnamed: 0,traj_id,DateTime,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt,Distance
0,880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,0,379662,5010734,95,13:13:00,02:39:00,1.47,
1,880109D01,1995-04-15 12:16:15,45.250521,-118.530438,230069775,20:16:15,19950415,19950415,12:16:15,409,0,379895,5011927,95,13:09:00,02:41:00,1.59,1224.551334
2,880109D01,1995-04-15 21:39:38,45.247943,-118.541455,230103578,05:39:38,19950416,19950415,21:39:38,409,0,379039,5011656,95,13:07:00,02:43:00,1.34,908.878736
3,880109D01,1995-04-16 03:32:14,45.247429,-118.539530,230124734,11:32:14,19950416,19950416,03:32:14,409,0,379188,5011581,95,13:07:00,02:43:00,1.50,161.204428
4,880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,0,378938,5011567,95,13:07:00,02:43:00,1.34,241.258531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287131,OSUX93191,1996-08-15 06:51:06,45.220642,-118.543392,272213466,14:51:06,19960815,19960815,06:51:06,390,2,378821,5008634,96,12:56:00,03:04:00,1.60,892.331554
287132,OSUX93191,1996-08-15 08:45:15,45.219785,-118.546807,272220315,16:45:15,19960815,19960815,08:45:15,390,2,378568,5008518,96,12:56:00,03:04:00,1.39,283.975120
287133,OSUX93191,1996-08-15 10:36:54,45.219801,-118.545661,272227014,18:36:54,19960815,19960815,10:36:54,390,2,378645,5008543,96,12:56:00,03:04:00,1.43,89.767305
287134,OSUX93191,1996-08-15 12:31:22,45.220268,-118.551024,272233882,20:31:22,19960815,19960815,12:31:22,390,2,378232,5008600,96,12:56:00,03:04:00,1.53,423.187635


In [6]:
%%time

# Full experiment grid (currently disabled). The selection names must match
# the ones augment_and_create_train_test_splits accepts, i.e. 'fewest' and
# not the older 'class'.
# seed_vals = [14159, 26535, 89793]
# selection_vals = ['random', 'fewest', 'proportional']
# augment_vals = ['on', 'in']

# Reduced grid used for this run.
seed_vals = [14159]
selection_vals = ['random', 'fewest', 'proportional']
augment_vals = ['on']
augment_percents = [0.33]

# The number of runs is known up front, so give the progress bar a real
# maximum instead of UnknownLength; it can then show actual progress.
total_runs = len(seed_vals) * len(selection_vals) * len(augment_vals) * len(augment_percents)

i = 0
lst = []
bar = progressbar.ProgressBar(max_value=total_runs)
bar.update(i)  # draw the bar at 0 before the first (slow) run starts
for s in seed_vals:
    for sel in selection_vals:
        for aug in augment_vals:
            for percent in augment_percents:
                # Create the model.
                model = RandomForestClassifier(random_state=s)

                # Get the augmented data: [X_train, Y_train, X_test, Y_test].
                data = augment_and_create_train_test_splits(ready_dataset, s, 'Species', sel, aug, percent)
                X_train, Y_train, X_test, Y_test = data

                # Fit the model and perform testing.
                model.fit(X_train, Y_train)
                test_predict = model.predict(X_test)
                performance_val = f1_score(Y_test, test_predict, average='weighted')

                # Record the run as "seed, selection, augment, percent, f1".
                lst.append(f"{s}, {sel}, {aug}, {percent}, {performance_val}")

                # Count the run only once it has completed, so the bar
                # reflects finished work rather than started work.
                i += 1
                bar.update(i)
bar.finish()

print(lst)

\ |                        #                          | 2 Elapsed Time: 0:02:15

['14159, random, on, 0.33, 0.9578571428571427', '14159, fewest, on, 0.33, 0.9374600414878596', '14159, proportional, on, 0.33, 0.8120335621662852']
CPU times: user 3min 17s, sys: 7.39 s, total: 3min 25s
Wall time: 3min 36s
