In [1]:
import random

import pandas as pd

from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.preprocessing.filters import Filters
from ptrail.preprocessing.interpolation import Interpolation
from ptrail.features.temporal_features import TemporalFeatures

"""
    Plan for this notebook: first, classify the species in the
    Starkey dataset using only the original dataset, and plot the
    results with matplotlib to see what they look like.

    Next, use PTRAIL to generate features on the Starkey dataset
    and repeat the same classification, to see how much the
    generated features improve the classification.
"""

# Read the raw Starkey CSV, then wrap it in a PTRAILDataFrame by telling
# PTRAIL which CSV columns hold the latitude, longitude, timestamp and
# trajectory id.
pdf = pd.read_csv('./starkey_new.csv')
starkey_columns = dict(latitude='lat',
                       longitude='lon',
                       datetime='DateTime',
                       traj_id='Id')
starkey = PTRAILDataFrame(data_set=pdf, **starkey_columns)

# Printing the frame emits the "Dataset Facts" summary shown below the cell.
print(starkey)

------------------------ Dataset Facts ------------------------------

Number of unique Trajectories in the data: 253
Number of points in the data: 287136
Dataset time range: 1196 days 22:51:45
Datatype of the DataFrame: <class 'ptrail.core.TrajectoryDF.PTRAILDataFrame'>
Dataset Bounding Box: (45.18896978643169, -118.61020848239596, 45.314545642992, -118.50455596234036)

---------------------------------------------------------------------


In [2]:
%%time

# Full PTRAIL clean-up and feature-generation pipeline. Each stage feeds
# the next one:
#   starkey -> filt_starkey -> ip_starkey -> feat_starkey

# Stage 1: add the 'Distance' column that the outlier filter works on.
starkey = KinematicFeatures.create_distance_column(dataframe=starkey)

# Stage 2: drop outliers with the Hampel filter applied to 'Distance'.
filt_starkey = Filters.hampel_outlier_detection(
    dataframe=starkey,
    column_name='Distance',
)

# Stage 3: linearly interpolate the filtered trajectories.
# NOTE(review): time_jump is presumably in seconds (i.e. two hours) --
# confirm against the PTRAIL documentation.
ip_starkey = Interpolation.interpolate_position(
    dataframe=filt_starkey,
    time_jump=2 * 60 * 60,
    ip_type='linear',
)

# Stage 4: kinematic features first, then temporal features on top of
# them, both computed on the interpolated data.
feat_starkey = TemporalFeatures.generate_temporal_features(
    dataframe=KinematicFeatures.generate_kinematic_features(dataframe=ip_starkey)
)



CPU times: user 8.16 s, sys: 1.33 s, total: 9.5 s
Wall time: 1min 26s


In [14]:
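# Next, we create the train and test datasets. The train dataset holds
# one third (~33%) of the unique trajectories in the dataset and the test
# dataset holds the rest. This cell only sets up the imports and the
# time-conversion helper needed for that.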
import random
import progressbar
import datetime as dt

def dtt2timestamp(dtt):
    """
        Convert the time-of-day part of ``dtt`` into seconds since midnight.

        Parameters
        ----------
            dtt:
                Any object exposing ``hour``, ``minute``, ``second`` and
                ``microsecond`` attributes. Any date component is ignored.

        Returns
        -------
            float:
                Seconds elapsed since midnight, with the microseconds as
                the fractional part.
    """
    whole_seconds = dtt.hour * 3600 + dtt.minute * 60 + dtt.second
    # Adding the microsecond fraction always makes the result a float.
    return whole_seconds + dtt.microsecond * 10**(-6)

In [15]:
# Split the unique trajectory ids into a random training third
# (`train_33`) and a test remainder (`test_66`).
#
# The previous version drew indices with random.randint(0, len(total)),
# whose upper bound is inclusive, so it could pick index == len(total)
# and crash with an IndexError. Sampling without replacement from
# range(len(total)) only yields valid, unique positions.

# Fixed seed so the split (and everything downstream) is reproducible.
SPLIT_SEED = 42
# Private generator: leaves the global `random` state untouched.
split_rng = random.Random(SPLIT_SEED)

total = feat_starkey.traj_id.unique().tolist()

# `taken` holds the positions in `total` that were drawn for training.
taken = split_rng.sample(range(len(total)), len(total) // 3)
train_33 = [total[index] for index in taken]

# Every id not drawn for training goes to the test split, keeping the
# order of `total`. A set makes each membership check constant time.
train_ids = set(train_33)
test_66 = [traj for traj in total if traj not in train_ids]

print(len(test_66))
print(len(train_33))

169
84


In [20]:
# Build the labelled test frame: one chunk per test trajectory, each with
# a numeric 'Species' label and numerically encoded Date/Time columns.
test_chunks = []

# Move the index back into columns once. The result does not change
# between iterations, so recomputing it twice per trajectory (as the
# loop used to do) was pure overhead.
flat_starkey = feat_starkey.reset_index()

bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
for i, traj in enumerate(test_66):
    # Explicit copy, so the column assignments below write to an
    # independent frame instead of a slice of `flat_starkey`.
    small = flat_starkey.loc[flat_starkey.traj_id == traj].copy()

    # The species label is derived from a letter in the trajectory id.
    # NOTE(review): presumably 'D' = deer, 'E' = elk and everything else
    # = cattle -- confirm against the dataset description.
    if 'D' in traj:
        small['Species'] = 0
    elif 'E' in traj:
        small['Species'] = 1
    else:
        small['Species'] = 2

    # Encode Date as an ordinal and Time as seconds since midnight, and
    # drop the two columns that are not passed on to the classifier.
    small['Date'] = small['Date'].map(dt.datetime.toordinal)
    small['Time'] = small['Time'].apply(dtt2timestamp)
    small = small.drop(columns=['Day_Of_Week', 'Time_Of_Day'])

    test_chunks.append(small)
    bar.update(i)

# Rows containing any missing value are discarded before the chunks are
# wrapped back into a PTRAILDataFrame.
test_df = PTRAILDataFrame(data_set=pd.concat(test_chunks).dropna(),
                          latitude='lat',
                          longitude='lon',
                          datetime='DateTime',
                          traj_id='Id')

/ |  #                                              | 168 Elapsed Time: 0:00:29

In [21]:
# Build the labelled training frame: one chunk per training trajectory,
# each with a numeric 'Species' label and numerically encoded Date/Time
# columns.
train_chunks = []

# Move the index back into columns once. The result does not change
# between iterations, so recomputing it twice per trajectory (as the
# loop used to do) was pure overhead.
flat_starkey = feat_starkey.reset_index()

bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
for i, traj in enumerate(train_33):
    # Explicit copy, so the column assignments below write to an
    # independent frame instead of a slice of `flat_starkey`.
    small = flat_starkey.loc[flat_starkey.traj_id == traj].copy()

    # The species label is derived from a letter in the trajectory id.
    # NOTE(review): presumably 'D' = deer, 'E' = elk and everything else
    # = cattle -- confirm against the dataset description.
    if 'D' in traj:
        small['Species'] = 0
    elif 'E' in traj:
        small['Species'] = 1
    else:
        small['Species'] = 2

    # Encode Date as an ordinal and Time as seconds since midnight, and
    # drop the two columns that are not passed on to the classifier.
    small['Date'] = small['Date'].map(dt.datetime.toordinal)
    small['Time'] = small['Time'].apply(dtt2timestamp)
    small = small.drop(columns=['Day_Of_Week', 'Time_Of_Day'])

    train_chunks.append(small)
    bar.update(i)

# Rows containing any missing value are discarded before the chunks are
# wrapped back into a PTRAILDataFrame.
train_df = PTRAILDataFrame(data_set=pd.concat(train_chunks).dropna(),
                           latitude='lat',
                           longitude='lon',
                           datetime='DateTime',
                           traj_id='Id')

| |                                       #          | 83 Elapsed Time: 0:00:13

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate the predictors from the 'Species' label, then fit a random
# forest with default hyper-parameters on the training split.
train_features = train_df.drop(columns=['Species'])
train_labels = train_df['Species']

rf_model = RandomForestClassifier()
# Kept as the last expression so the fitted estimator is displayed.
rf_model.fit(train_features, train_labels)

RandomForestClassifier()

In [25]:
# Predict the Species for both splits with the fitted forest. The
# 'Species' label column is removed from the inputs before predicting.
train_inputs = train_df.drop(columns=['Species'])
rf_predict_train = rf_model.predict(train_inputs)
print(f"Target on train data: {rf_predict_train}")

test_inputs = test_df.drop(columns=['Species'])
rf_predict_test = rf_model.predict(test_inputs)
print(f"Target on test data: {rf_predict_test}")

Target on train data: [0 0 0 ... 2 2 2]
Target on test data: [0 0 0 ... 2 2 2]


In [26]:
# Compare the predictions against the true 'Species' labels of each
# split and report both accuracies as percentages.
rf_train_accuracy = accuracy_score(y_true=train_df['Species'],
                                   y_pred=rf_predict_train)
rf_test_accuracy = accuracy_score(y_true=test_df['Species'],
                                  y_pred=rf_predict_test)

print(f"The Training accuracy of RF is: {rf_train_accuracy*100}%")
print(f"The Testing accuracy of RF is: {rf_test_accuracy*100}%")

The Training accuracy of RF is: 100.0%
The Testing accuracy of RF is: 77.1181729647943%
