In [1]:
import pandas as pd
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.core.TrajectoryDF import PTRAILDataFrame

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from src.utils.general_utils import Utilities
from TestUtils.test_utils import TestUtils
from TestUtils.Keys import *

In [2]:
# Read the raw Geolife sample from disk first, then wrap it in a PTRAIL
# trajectory frame by telling PTRAIL which CSV columns hold the trajectory
# id, the timestamp and the coordinates.
geolife_raw = pd.read_csv('./TestUtils/geolife.csv')
gl_dataset = PTRAILDataFrame(
    data_set=geolife_raw,
    traj_id='traj_id',
    datetime='DateTime',
    latitude='lat',
    longitude='lon',
)

# Add the point-to-point 'Distance' column that the experiments below use.
# NOTE(review): the first point of a trajectory shows an empty Distance in the
# rendered output; units are presumably metres -- confirm against PTRAIL docs.
ready_dataset = KinematicFeatures.create_distance_column(gl_dataset)
ready_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,mode_of_transport,Distance
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,2008-03-31 16:00:08,41.741415,86.186028,1,
10,2008-03-31 16:01:07,41.737063,86.179470,1,728.185829
10,2008-03-31 16:02:07,41.734105,86.172823,1,642.172796
10,2008-03-31 16:03:06,41.739110,86.166563,1,761.267192
10,2008-03-31 16:04:05,41.744368,86.159987,1,799.694199
...,...,...,...,...,...
98,2007-06-02 12:07:19,39.935300,116.468267,1,14.666196
98,2007-06-02 12:07:58,39.935450,116.468333,1,17.621166
98,2007-06-02 12:08:20,39.935400,116.468517,1,16.590457
98,2007-06-02 12:09:40,39.934633,116.468983,1,94.077625


In [3]:
# Benchmark every (seed, selection strategy, classifier) combination and
# collect one comma-separated result row per successful run.

# Number of seeds to draw; used both to size the generator and to consume it.
NUM_SEEDS = 20

# Get the seed values that we are going to use.
seed_generator = Utilities.generate_pi_seed(NUM_SEEDS)
seed_vals = [next(seed_generator) for _ in range(NUM_SEEDS)]

# The first entry is the CSV header; result rows are appended in the loop.
final_results = ["seed, strategy, model, accuracy, f1_score"]

# All our selection strategies.
select_strategies = [
    BASE, BALANCED_IN, BALANCED_ON, BALANCED_DROP, BALANCED_STRETCH
]

# The estimator instances are reused across runs; each run re-seeds and
# re-fits them before predicting.
models = [GradientBoostingClassifier(), DecisionTreeClassifier(), RandomForestClassifier()]

for seed in seed_vals:
    iter_map = TestUtils.get_iterable_map(ready_dataset, seed, 'mode_of_transport')
    for select_strategy in select_strategies:
        for model in models:
            # NOTE(review): this call does not depend on `model`; if the helper
            # is deterministic it could be hoisted out of the model loop --
            # confirm before moving it.
            train_x, train_y = TestUtils.select_correct_test_train_split(iter_map, select_strategy,
                                                                         BASE, 'mode_of_transport', 3)
            # Skip combinations for which the helper returned no training data.
            if train_x is None or train_y is None:
                continue

            # Fit the model and predict, seeding the estimator through the
            # documented parameter API rather than poking the attribute.
            model.set_params(random_state=seed)
            model.fit(X=train_x, y=train_y)
            pred_vals = model.predict(X=iter_map[TEST_X])

            # Calculate the accuracy and f1 score.
            acc = accuracy_score(y_true=iter_map[TEST_Y], y_pred=pred_vals)
            score = f1_score(y_true=iter_map[TEST_Y], y_pred=pred_vals, average='weighted')

            # Any strategy key without 'balanced' in it is reported as 'base'.
            strategy_label = select_strategy if 'balanced' in select_strategy else 'base'

            # Build the row once so the printed line and the stored line
            # can never drift apart.
            result_row = f"{seed}, {strategy_label}, {model.__class__.__name__}, {acc}, {score}"
            print(result_row)
            final_results.append(result_row)


1415, base, GradientBoostingClassifier, 0.875, 0.8818181818181818
1415, base, DecisionTreeClassifier, 0.75, 0.7666666666666667
1415, base, RandomForestClassifier, 0.75, 0.7666666666666667
1415, balanced_in, GradientBoostingClassifier, 0.625, 0.6428571428571429
1415, balanced_in, DecisionTreeClassifier, 0.875, 0.8589743589743589
1415, balanced_in, RandomForestClassifier, 0.875, 0.8818181818181818
1415, balanced_on, GradientBoostingClassifier, 0.375, 0.40476190476190477
1415, balanced_on, DecisionTreeClassifier, 0.75, 0.7666666666666667
1415, balanced_on, RandomForestClassifier, 0.875, 0.8818181818181818
1415, balanced_drop, GradientBoostingClassifier, 0.75, 0.7666666666666667
1415, balanced_drop, DecisionTreeClassifier, 0.875, 0.8589743589743589
1415, balanced_drop, RandomForestClassifier, 0.5, 0.5
1415, balanced_stretch, GradientBoostingClassifier, 0.5, 0.5
1415, balanced_stretch, DecisionTreeClassifier, 0.5, 0.5
1415, balanced_stretch, RandomForestClassifier, 0.625, 0.6428571428571429

In [4]:
# Persist the collected rows (header first) to a CSV file next to the notebook.
results_csv_path = './balance_geolife.csv'
TestUtils.write_csv_file(results_csv_path, final_results)

File successfully written to: ./balance_geolife.csv
