In [2]:
from src.augmentation.augment import Augmentation
from src.selection.select import Selection
from src.utils.alter import Alter

import math
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from random import *
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.preprocessing.filters import Filters
from ptrail.core.Datasets import Datasets
from ptrail.preprocessing.statistics import Statistics

In [3]:
def trajectoryAugumentationProcedure(trajs, seed, n, k, pradius, model):
    """
    Augment part of the training split, fit ``model`` on it and score the test split.

    Steps, in order:
      1. Build a seeded random generator from ``seed * (n * k * pradius)``.
      2. Ask ``Selection.select_randomly`` for the "train"/"test" trajectory ids.
      3. Compute pivoted kinematic statistics for the (un-augmented) test split.
      4. Pick ``floor(n * number_of_training_trajectories)`` training trajectory
         ids *with replacement*, augment each one and append the result to the
         training points.
      5. Compute pivoted kinematic statistics for the augmented training split,
         fit ``model`` and compute the weighted F1 score on the test split.

    Parameters
    ----------
    trajs:
        Trajectory dataframe; must expose ``traj_id``, ``DateTime``, ``lat``,
        ``lon`` and ``Species`` (those names are referenced below).
    seed:
        Base seed; combined with ``n``, ``k`` and ``pradius`` to seed the generator.
    n:
        Fraction of the training trajectories to augment.
    k:
        Forwarded to the augmentation helper (per the caller's note: percentage
        of points to be changed -- confirm against ``Augmentation``).
    pradius:
        Forwarded to the augmentation helper (per the caller's note: the radius
        -- confirm against ``Augmentation``).
    model:
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    str
        One CSV line ``"n,k,pradius,fscore\\n"``.
    """
    rng = Random(seed * (n * k * pradius))

    # Split first: the generator is shared, so the order in which it is
    # consumed (split -> sampling -> augmentation) determines the results.
    partition = Selection.select_randomly(trajs, rng)
    deduped = Filters.remove_duplicates(dataframe=trajs)

    in_train = deduped.traj_id.isin(partition["train"])
    in_test = deduped.traj_id.isin(partition["test"])
    train_points = deduped.loc[in_train].dropna()
    test_points = deduped.loc[in_test].dropna()

    # ---- Test split: features and labels ----
    test_frame = PTRAILDataFrame(data_set=test_points,
                                 latitude='lat',
                                 longitude='lon',
                                 datetime='DateTime',
                                 traj_id='traj_id')
    test_stats = Statistics.generate_kinematic_stats(dataframe=test_frame,
                                                     target_col_name='Species')
    test_pivot = Statistics.pivot_stats_df(dataframe=test_stats,
                                           target_col_name='Species')
    # Keep only the first occurrence of any repeated column name.
    test_pivot = test_pivot.loc[:, ~test_pivot.columns.duplicated()]

    testParmX = test_pivot.drop(columns='Species')
    testParmY = test_pivot['Species'].to_numpy()

    # ---- Training split: augmentation ----
    train_ids = train_points['traj_id'].unique()
    # random.choices samples WITH replacement, so the same trajectory can be
    # picked (and therefore augmented) more than once.
    how_many = math.floor(n * len(train_ids))
    picked_ids = rng.choices(sorted(train_ids), k=how_many)
    for picked in picked_ids:
        # The lookup runs against the growing frame, i.e. it also sees rows
        # appended by earlier iterations.
        source_traj = train_points.loc[train_points.traj_id == picked]
        # NOTE(review): the meaning of the literal 100 is defined by the
        # augmentation helper, which is not visible here -- confirm.
        augmented = Augmentation.augment_trajectories_with_randomly_generated_points(
            source_traj, pradius, k, 100, rng)
        train_points = pd.concat([train_points, augmented], ignore_index=True)

    # Restrict the augmented frame to this fixed set of columns.
    kept_columns = ["traj_id", "DateTime", "lat", "lon", "StarkeyTime", "GMDate",
                    "GMTime", "LocDate", "LocTime", "RadNum", "Species", "UTME",
                    "UTMN", "Year", "Grensunr", "Grensuns", "Obswt"]
    train_filtered = train_points.filter(kept_columns)

    train_frame = PTRAILDataFrame(data_set=train_filtered,
                                  latitude='lat',
                                  longitude='lon',
                                  datetime='DateTime',
                                  traj_id='traj_id')

    # ---- Training split: features and labels ----
    train_stats = Statistics.generate_kinematic_stats(dataframe=train_frame,
                                                      target_col_name='Species')
    train_pivot = Statistics.pivot_stats_df(dataframe=train_stats,
                                            target_col_name='Species')
    train_pivot = train_pivot.loc[:, ~train_pivot.columns.duplicated()]
    train_pivot = train_pivot.dropna()

    trainParmX = train_pivot.drop(columns='Species')
    trainParmY = train_pivot['Species'].to_numpy()

    # ---- Fit and evaluate ----
    model.fit(trainParmX, trainParmY)
    predicted = model.predict(testParmX)
    fscore = f1_score(testParmY, predicted, average='weighted')
    print(f"Current run: k={k}, pradius={pradius}, n={n}, fscore={fscore}, seed={seed}")
    return f"{n},{k},{pradius},{fscore}\n"

In [4]:
# Import the dataset and drop duplicate points once, up front.
dataset = Datasets.load_starkey()
ready_dataset = Filters.remove_duplicates(dataframe=dataset)

# Grid of experiment parameters.
seeds = [14159,26535,89793,23846,26433,83279,50288,41971,69399,37510]
n_vals = [.2, .3, .4, .5]           # fraction of training trajectories to augment
k_vals = [.1, .2, .3, .4]           # percentage of points to be changed
rad_vals = [.001, .005, .01, .02]   # radius passed to the augmentation

# Every (n, k, radius) combination is repeated once per seed (10 seeds).
# Original note: 80% training / 20% testing -- the split itself happens inside
# the procedure.
#
# The `with` block guarantees the results file is closed even when the run is
# interrupted (e.g. KeyboardInterrupt), and each CSV line returned by the
# procedure is written and flushed immediately so partial runs keep their results.
with open("results.txt", "w") as text_file:
    for n in n_vals:
        for k in k_vals:
            for rad in rad_vals:
                for s in seeds:
                    # n = percentage of trajs to be augmented, k = percentage
                    # of points to be changed, and radius
                    result_line = trajectoryAugumentationProcedure(
                        ready_dataset, s, n, k, rad,
                        RandomForestClassifier(random_state=s))
                    text_file.write(result_line)
                    text_file.flush()
                    print(result_line)

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------
printing select  253
253 183
252 167
251 166
250 88
249 242
248 55
247 118
246 240
245 203
244 163
243 160
242 214
241 223
240 55
239 15
238 69
237 84
236 221
235 108
234 172
233 56
232 4
231 46
230 28
229 214
228 26
227 69
226 122
225 98
224 151
223 61
222 201
221 117
220 84
219 48
218 47
217 112
216 15
215 57
214 22
213 144
212 3
211 61
210 98
209 30
208 135
207 44
206 34
205 129
204 130
Current run: k=0.1, pradius=0.001, n=0.2, fscore=0.9155404383975813, seed=14159
0.2,0.1,0.001,0.9155404383975813

printing select  253
253 108
252 14
251 83
250 76

Current run: k=0.1, pradius=0.005, n=0.2, fscore=0.9362848297213623, seed=83279
0.2,0.1,0.005,0.9362848297213623

printing select  253
253 206
252 83
251 156
250 110
249 221
248 206
247 241
246 215
245 201
244 232
243 215
242 4
241 144
240 173
239 63
238 124
237 176
236 179
235 112
234 169
233 163
232 156
231 166
230 99
229 177
228 210
227 8
226 119
225 223
224 18
223 96
222 143
221 144
220 133
219 170
218 186
217 203
216 122
215 202
214 173
213 48
212 12
211 206
210 146
209 144
208 157
207 156
206 124
205 98
204 190
Current run: k=0.1, pradius=0.005, n=0.2, fscore=0.96, seed=50288
0.2,0.1,0.005,0.96

printing select  253
253 124
252 158
251 2
250 113
249 79
248 6
247 210
246 19
245 226
244 237
243 29
242 102
241 151
240 148
239 94
238 53
237 38
236 198
235 234
234 185
233 192
232 213
231 177
230 219
229 1
228 69
227 14
226 103
225 61
224 49
223 189
222 7
221 210
220 205
219 89
218 151
217 178
216 186
215 26
214 92
213 167
212 139
211 121
210 48
209 78
208 177
207 10
206 174
205 17
204

KeyboardInterrupt: 