In [1]:
#import sys
#sys.path.append('/home/nicholasjesperson/Documents/School/Comp4780/Data_Augmentation/Data Augmentation/paper/AugmenTRAJ')

In [2]:
from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.alter import Alter

import math
import pandas as pd
from random import *
import progressbar

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.preprocessing.filters import Filters
from ptrail.core.Datasets import Datasets
from ptrail.preprocessing.statistics import Statistics
from ptrail.features.kinematic_features import KinematicFeatures

In [3]:
def trajectoryAugumentationProcedure(trajs, seed, class_name, selection, augment):
    myRandom = Random(seed)

    # Select the trajectories and remove duplicates from original dataset.
    if selection == 'random':
        splits = Selection.select_randomly(trajs, myRandom, .2)
    elif selection == 'proportional':
        splits = Selection.select_trajectories_proportionally(trajs, myRandom, .2)
    elif selection == 'class':
        splits = Selection.select_fewest_class(trajs, class_name, myRandom)
    # else:
    #     splits = Selection.select_representative_trajectories(trajs, class_name)
    paramTestingDataSet = Filters.remove_duplicates(dataframe=trajs)

    trainDataParm = paramTestingDataSet.loc[paramTestingDataSet.traj_id.isin(splits["train"]) == True].dropna()
    testDataParm = paramTestingDataSet.loc[paramTestingDataSet.traj_id.isin(splits["test"]) == True].dropna()
    testData = PTRAILDataFrame(data_set=testDataParm,
                               latitude='lat',
                               longitude='lon',
                               datetime='DateTime',
                               traj_id='traj_id')

    statsTestParm = Statistics.generate_kinematic_stats(dataframe=testData, target_col_name=class_name)
    pivotedStatsTestParm = Statistics.pivot_stats_df(dataframe=statsTestParm, target_col_name=class_name)
    pivotedStatsTestParm = pivotedStatsTestParm.loc[:,~pivotedStatsTestParm.columns.duplicated()]
    testParmX = pivotedStatsTestParm.drop(columns=class_name)
    testParmY = pivotedStatsTestParm[class_name].to_numpy()
    noiseTraj = trainDataParm[class_name].unique()

    sampledTraj = myRandom.choices(sorted(noiseTraj), k=math.floor(len(noiseTraj)))
    for traj in sampledTraj:
        trajToChange = trainDataParm.loc[trainDataParm.traj_id == traj]

        #Trajectory must be changed
        if augment == 'on':
            trajChanged = Augmentation.augment_trajectories_with_randomly_generated_points(trajToChange, myRandom, 'on')
        elif augment == 'in':
            trajChanged = Augmentation.augment_trajectories_with_randomly_generated_points(trajToChange, myRandom, 'in')

        trainDataParm = pd.concat([trainDataParm, trajChanged], ignore_index = True)

    trainDataNoise = PTRAILDataFrame(data_set=trainDataParm,
                                            datetime='DateTime',
                                            traj_id='traj_id',
                                            latitude='lat',
                                            longitude='lon')
    
    statsTrainNoiseParm = Statistics.generate_kinematic_stats(dataframe=trainDataNoise, target_col_name=class_name)
    pivotedStatsTrainNoiseParm = Statistics.pivot_stats_df(dataframe=statsTrainNoiseParm, target_col_name=class_name)
    pivotedStatsTrainNoise = pivotedStatsTrainNoiseParm.loc[:, ~pivotedStatsTrainNoiseParm.columns.duplicated()]
    pivotedStatsTrainNoise=pivotedStatsTrainNoise.dropna()
    
    trainParmX = pivotedStatsTrainNoise.drop(columns=class_name)
    trainParmY = pivotedStatsTrainNoise[class_name].to_numpy()

    testParmX = testParmX.interpolate()
    return [trainParmX, trainParmY, testParmX, testParmY]

In [4]:
dataset = Datasets.load_starkey()
ready_dataset = dataset.reset_index()

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [5]:
%%time

# seed_vals = [14159, 26535, 89793]
# selection_vals = ['random', 'class', 'proportional']
# augment_vals = ['on', 'in']

seed_vals = [14159]
selection_vals = ['random', 'class', 'proportional']
augment_vals = ['on']

i = 0
lst = []
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
for s in seed_vals:
    for sel in selection_vals:
        for aug in augment_vals:
            i += 1
            bar.update(i)
            # Create the model.
            model = RandomForestClassifier(random_state=s)

            # Get the augmented data.
            data = trajectoryAugumentationProcedure(ready_dataset, s, 'Species', sel, aug)

            # Fit the model and perform testing.
            model.fit(data[0], data[1])
            test_predict = model.predict(data[2])
            performance_val = f1_score(data[3], test_predict, average='weighted')
            # Add the value to dict.
            lst.append(f"{s},{sel},{aug},{performance_val}")

print(lst)

\ |                                   #               | 3 Elapsed Time: 0:01:35

['14159,random,on,0.9346558704453442', '14159,class,on,0.9377645795160169', '14159,proportional,on,0.7392282593275642']
CPU times: user 2min 8s, sys: 4.67 s, total: 2min 13s
Wall time: 2min 19s
