In [1]:
import numpy as np
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.core.Datasets import Datasets
from ptrail.preprocessing.statistics import Statistics

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from src.utils.test_utils import TestUtils
from src.selection.select import Selection
from src.utils.general_utils import Utilities

In [2]:
gl_dataset = PTRAILDataFrame(data_set=pd.read_csv('../TestUtils/geolife.csv'),
                             traj_id='traj_id',
                             datetime='DateTime',
                             latitude='lat',
                             longitude='lon')
ready_dataset = KinematicFeatures.create_distance_column(gl_dataset)
ready_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,mode_of_transport,Distance
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,2008-03-31 16:00:08,41.741415,86.186028,1,
10,2008-03-31 16:01:07,41.737063,86.179470,1,728.185829
10,2008-03-31 16:02:07,41.734105,86.172823,1,642.172796
10,2008-03-31 16:03:06,41.739110,86.166563,1,761.267192
10,2008-03-31 16:04:05,41.744368,86.159987,1,799.694199
...,...,...,...,...,...
98,2007-06-02 12:07:19,39.935300,116.468267,1,14.666196
98,2007-06-02 12:07:58,39.935450,116.468333,1,17.621166
98,2007-06-02 12:08:20,39.935400,116.468517,1,16.590457
98,2007-06-02 12:09:40,39.934633,116.468983,1,94.077625


In [3]:
seed_generator = Utilities.generate_pi_seed(20)
seed_vals = [next(seed_generator) for i in range(20)]
shake_percentages = [0.2, 0.4, 0.6]
circle_methods = ['on', 'in']
ml_models = [ExtraTreesClassifier(), GradientBoostingClassifier(), RandomForestClassifier()]
scaler = MinMaxScaler((0, 1))

distance_results = [["seed", "on_20%_dist", "on_20%_std", "on_40%_dist", "on_40%_std", "on_60%_dist", "on_60%_std",
                    "in_20%_dist","in_20%_std","in_40%_dist","in_40%_std","in_60%_dist","in_60%_std"]]

model_results = [["seed", "model", "baseline", "in_20%_f1", "in_40%_f1", "in_60%_f1", "on_20%_f1", "on_40%_f1", "on_60%_f1"]]

for seed in seed_vals:
    # Intermediate lists for storing distance and model score values.
    distance_row = [seed]

    # Set apart 20% data for testing that augmentation process will never see.
    train, test_x, test_y = TestUtils.get_test_train_data(dataset=ready_dataset, seed_val=seed,
                                                          class_col='mode_of_transport', k=0.8)

    model_row = TestUtils.create_model_row(seed, ml_models, "mode_of_transport", train, test_x, test_y)
    for shake in shake_percentages:
        for method in circle_methods:
            # Randomly select 30% of trajectories to be augmented.
            selected = Selection.select_randomly(train, seed, k=0.3)

            # Augment the trajectories.
            train_x, train_y = TestUtils.augment_trajectories_using_random_strategy(dataset=train,
                                                                                    percent_to_shake=shake,
                                                                                    ids_to_augment=selected,
                                                                                    circle=method,
                                                                                    n_augmentations=20,
                                                                                    class_col="mode_of_transport")
            mean, std = TestUtils.find_original_and_augmentation_pairs_and_calculate_differences(train_x, selected)
            distance_row.append(mean)
            distance_row.append(std)

            for i in range(len(ml_models)):
                f1_score = TestUtils.train_model_and_evaluate(ml_models[i], scaler.fit_transform(train_x), train_y,
                                                              scaler.fit_transform(test_x), test_y, seed)
                model_row[i].append(f1_score)

    model_results.extend(model_row)
    distance_results.append(distance_row)

    print(model_row)

[[1415, 'ExtraTreesClassifier', 0.5, 0.6429, 0.6429, 0.6429, 0.6429, 0.6429, 0.7667], [1415, 'GradientBoostingClassifier', 0.6455, 0.5333, 0.5333, 0.5333, 0.5333, 0.5333, 0.5333], [1415, 'RandomForestClassifier', 0.7667, 0.7667, 0.6429, 0.7667, 0.7667, 0.7667, 0.7667]]
[[9265, 'ExtraTreesClassifier', 0.6864, 0.6864, 0.6864, 0.6864, 0.6864, 0.6864, 0.6864], [9265, 'GradientBoostingClassifier', 0.6864, 0.5667, 0.5667, 0.5667, 0.6864, 0.6864, 0.6864], [9265, 'RandomForestClassifier', 0.6864, 0.6731, 0.6731, 0.6731, 0.6731, 0.6731, 0.6731]]
[[3589, 'ExtraTreesClassifier', 0.5, 0.5, 0.3452, 0.5, 0.5, 0.3452, 0.0833], [3589, 'GradientBoostingClassifier', 0.3849, 0.631, 0.631, 0.631, 0.75, 0.15, 0.15], [3589, 'RandomForestClassifier', 0.5, 0.631, 0.5, 0.5, 0.631, 0.631, 0.5]]
[[7932, 'ExtraTreesClassifier', 0.6455, 0.6429, 0.7667, 0.6429, 0.5, 0.6429, 0.5], [7932, 'GradientBoostingClassifier', 0.6455, 0.7667, 0.7667, 0.7667, 0.7667, 0.6455, 0.6455], [7932, 'RandomForestClassifier', 0.7667, 0.

In [4]:
import csv

file_path = "./geolife_distances.csv"
with open(file_path, mode="w") as file:
    writer = csv.writer(file)
    for item in distance_results:
        writer.writerow(item)
    print(f"File successfully written to: {file_path}")

file_path = "./geolife_f1_score.csv"
with open(file_path, mode="w") as file:
    writer = csv.writer(file)
    for item in model_results:
        writer.writerow(item)
    print(f"File successfully written to: {file_path}")

File successfully written to: ./geolife_distances.csv
File successfully written to: ./geolife_f1_score.csv
