In [1]:
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame

"""
    First, to begin with the following actions ar performed:
        1. Read the database from the csv file using pandas.
        2. Convert the pandas dataframe to PTRAILDataFrame DataFrame
           in order to represent the data with PTRAIL.

    It is to be duly noted that the Starkey Project database provided
    originally has been modified here to meet with the needs of
    PTRAIL Library.
"""
pdf = pd.read_csv('https://raw.githubusercontent.com/YakshHaranwala/PTRAIL/main/examples/data/starkey.csv')
starkey = PTRAILDataFrame(data_set=pdf,
                          latitude='lat',
                          longitude='lon',
                          datetime='DateTime',
                          traj_id='Id')
print("Size of the Dataset: {}".format(starkey.shape))
starkey.head(5)

Size of the Dataset: (287136, 15)


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,D,379662,5010734,95,13:13:00,02:39:00,1.47
880109D01,1995-04-15 12:16:15,45.250521,-118.530438,230069775,20:16:15,19950415,19950415,12:16:15,409,D,379895,5011927,95,13:09:00,02:41:00,1.59
880109D01,1995-04-15 21:39:38,45.247943,-118.541455,230103578,05:39:38,19950416,19950415,21:39:38,409,D,379039,5011656,95,13:07:00,02:43:00,1.34
880109D01,1995-04-16 03:32:14,45.247429,-118.53953,230124734,11:32:14,19950416,19950416,03:32:14,409,D,379188,5011581,95,13:07:00,02:43:00,1.5
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,D,378938,5011567,95,13:07:00,02:43:00,1.34


In [2]:

"""
Our course of action

1. First, generate a set of spatial and temporal features on the
   dataset.
2. Then, based on the above generated features, clean the dataset
   to reduce noise as well as make the trajectories smoother.
3. Then, once the noise has been cleared, we will further make
   the trajectory smoother, we will interpolate the trajectories.
   Now, an important point to consider for interpolation is that
   only 4 fundamental columns are returned.
4. Then, train various models based on the original starkey dataset.
5. Based on that trained model, we will predict the species of the
   animal for the interpolated and cleaned dataset.
"""

'\nOur course of action\n\n1. First, generate a set of spatial and temporal features on the\n   dataset.\n2. Then, based on the above generated features, clean the dataset\n   to reduce noise as well as make the trajectories smoother.\n3. Then, once the noise has been cleared, we will further make\n   the trajectory smoother, we will interpolate the trajectories.\n   Now, an important point to consider for interpolation is that\n   only 4 fundamental columns are returned.\n4. Then, train various models based on the original starkey dataset.\n5. Based on that trained model, we will predict the species of the\n   animal for the interpolated and cleaned dataset.\n'

In [3]:
%%time

# Step - 1: Feature generation.
# The following features are being generated:
#   1. Distance between consecutive points of the trajectory.
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.features.temporal_features import TemporalFeatures

starkey_feat = KinematicFeatures.create_distance_column(starkey)
starkey_feat = TemporalFeatures.create_time_of_day_column(starkey_feat)

CPU times: user 1.49 s, sys: 321 ms, total: 1.81 s
Wall time: 2.3 s


In [4]:
%%time

# Step - 2: Filtering.

# 2.1: Hampel filter based on distance between consecutive points.
from ptrail.preprocessing.filters import Filters

hamp_filt_starkey = Filters.hampel_outlier_detection(dataframe=starkey_feat,
                                                     column_name='Distance')
print(f"Length of original Dataframe: {len(starkey_feat)}")
print(f"Length of DataFrame after filtering: {len(hamp_filt_starkey)}")



Length of original Dataframe: 287136
Length of DataFrame after filtering: 262207
CPU times: user 859 ms, sys: 213 ms, total: 1.07 s
Wall time: 26.8 s


In [5]:
# 2.2: Remove duplicate points from the trajectories.

dupl_filt_starkey = Filters.remove_duplicates(dataframe=hamp_filt_starkey)
dupl_filt_starkey = dupl_filt_starkey.drop_duplicates(subset=['DateTime', 'traj_id'])
print(f"Length of original Dataframe: {len(hamp_filt_starkey)}")
print(f"Length of DataFrame after filtering: {len(dupl_filt_starkey)}")

Length of original Dataframe: 262207
Length of DataFrame after filtering: 262206


In [6]:
# 2.3 Remove trajectories with few points.

few_filt_starkey = Filters.remove_trajectories_with_less_points(dataframe=dupl_filt_starkey,
                                                                num_min_points=5)
print(f"Length of original Dataframe: {len(dupl_filt_starkey)}")
print(f"Length of DataFrame after filtering: {len(few_filt_starkey)}")

Length of original Dataframe: 262206
Length of DataFrame after filtering: 262206


In [7]:
a = few_filt_starkey.reset_index()['DateTime'].diff().dt.total_seconds()
(a > 3600*4).value_counts()

False    223302
True      38904
Name: DateTime, dtype: int64

In [8]:
%%time

# Step -3: Interpolate Trajectory.
# We will use cubic interpolation here.
import datetime as dt

def dtt2timestamp(dtt):
    ts = (dtt.hour * 60 + dtt.minute) * 60 + dtt.second
    #if you want microseconds as well
    ts += dtt.microsecond * 10**(-6)
    return ts

from ptrail.preprocessing.interpolation import Interpolation

ip_starkey = Interpolation.interpolate_position(dataframe=few_filt_starkey,
                                                time_jump=3600*4,
                                                ip_type='cubic')

# Add Date and Time column and convert to ordinal values
# to make them eligible for the model fitting.
ip_starkey = TemporalFeatures.create_time_column(ip_starkey)
ip_starkey = TemporalFeatures.create_date_column(ip_starkey)
ip_starkey['Date'] = ip_starkey['Date'].map(dt.datetime.toordinal)
ip_starkey['Time'] = ip_starkey['Time'].apply(dtt2timestamp)


CPU times: user 1.58 s, sys: 244 ms, total: 1.82 s
Wall time: 23.9 s


In [9]:
# Modifying the data of the Species column to indicate the
# results as follows:
#   D (Deer) : 0
#   E (Elk): 1
#   C (Cattle): 2

# First, doing the above mentioned conversion for the test
# dataset.
int_test = []
for i in range(len(few_filt_starkey['Species'])):
    if few_filt_starkey['Species'].iloc[i] == 'D':
        int_test.append(0)
    elif few_filt_starkey['Species'].iloc[i] == 'E':
        int_test.append(1)
    else:
        int_test.append(2)
few_filt_starkey['Species'] = int_test
few_filt_starkey = few_filt_starkey.drop(columns='index')
few_filt_starkey.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt,Distance,Time_Of_Day
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,0,379662,5010734,95,13:13:00,02:39:00,1.47,,Noon
880109D01,1995-04-16 03:32:14,45.247429,-118.53953,230124734,11:32:14,19950416,19950416,03:32:14,409,0,379188,5011581,95,13:07:00,02:43:00,1.5,161.204428,Late Night
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,0,378938,5011567,95,13:07:00,02:43:00,1.34,241.258531,Late Night
880109D01,1995-04-16 05:30:40,45.246359,-118.538736,230131840,13:30:40,19950416,19950416,05:30:40,409,0,379258,5011468,95,13:07:00,02:43:00,1.46,312.474615,Early Morning
880109D01,1995-04-16 06:03:39,45.246655,-118.536833,230133819,14:03:39,19950416,19950416,06:03:39,409,0,379404,5011489,95,13:07:00,02:43:00,1.65,152.598529,Early Morning


In [10]:
# Now, fixing the Species column for the interpolated starkey dataset.

# Create a list of all unique ids.
ids_ = list(ip_starkey.reset_index()['traj_id'].value_counts().keys())

df_chunks = []
# Create a small chunk for each ID, then for the same ID in the
# original dataset, grab the species and then again assign that
# to the interpolated ID.
for i in range(len(ids_)):
    small = ip_starkey.reset_index().loc[ip_starkey.reset_index()['traj_id'] == ids_[i]]
    spec = few_filt_starkey.reset_index().loc[few_filt_starkey.reset_index()['traj_id'] == ids_[i], 'Species'].iloc[0]
    small['Species'] = spec
    df_chunks.append(small)

# Now, convert the dataframe with interpolated species to
# PTRAILDataFrame.
ip_starkey = PTRAILDataFrame(data_set=pd.concat(df_chunks),
                           latitude='lat',
                           longitude='lon',
                           traj_id='traj_id',
                           datetime='DateTime')
ip_starkey.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Time,Date,Species
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,49206.0,728396,0
880109D01,1995-04-13 17:40:06,45.163212,-117.32047,63606.0,728396,0
880109D01,1995-04-16 03:32:14,45.247429,-118.53953,12734.0,728399,0
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,14908.0,728399,0
880109D01,1995-04-16 05:30:40,45.246359,-118.538736,19840.0,728399,0


In [11]:
# Step-5: Getting the test and train datasets ready.
small_train = few_filt_starkey.reset_index()[['DateTime', 'traj_id', 'lat', 'lon', 'Species']]

np_train_starkey = PTRAILDataFrame(data_set=small_train,
                                datetime='DateTime',
                                traj_id='traj_id',
                                latitude='lat',
                                longitude='lon',
                                rest_of_columns=['Species'])
np_train_starkey = TemporalFeatures.create_date_column(np_train_starkey)
np_train_starkey = TemporalFeatures.create_time_column(np_train_starkey)

np_train_starkey['Date'] = np_train_starkey['Date'].map(dt.datetime.toordinal)
np_train_starkey['Time'] = np_train_starkey['Time'].apply(dtt2timestamp)

In [12]:
# Splitting train into x and y.

np_train_x = np_train_starkey.drop(columns=['Species'])
np_train_y = np_train_starkey.reset_index()['Species']

In [13]:
# Splitting test into x and y.

np_test_x = ip_starkey.drop(columns=['Species'])
np_test_y = ip_starkey.reset_index()['Species']

In [14]:
np_test_x

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Time,Date
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,49206.0,728396
880109D01,1995-04-13 17:40:06,45.163212,-117.320470,63606.0,728396
880109D01,1995-04-16 03:32:14,45.247429,-118.539530,12734.0,728399
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,14908.0,728399
880109D01,1995-04-16 05:30:40,45.246359,-118.538736,19840.0,728399
...,...,...,...,...,...
OSUX93191,1996-08-15 06:51:06,45.220642,-118.543392,24666.0,728886
OSUX93191,1996-08-15 08:45:15,45.219785,-118.546807,31515.0,728886
OSUX93191,1996-08-15 10:36:54,45.219801,-118.545661,38214.0,728886
OSUX93191,1996-08-15 12:31:22,45.220268,-118.551024,45082.0,728886


In [15]:
%%time

# Model 1: RandomForestClassifier.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier()
rf_model.fit(np_train_x, np_train_y)

CPU times: user 1min 2s, sys: 135 ms, total: 1min 2s
Wall time: 1min 2s


RandomForestClassifier()

In [16]:
%%time

# Now, lets predict the Species on the train and test dataset.
rf_predict_train = rf_model.predict(np_train_x)
print(f"Target on train data: {rf_predict_train}")

rf_predict_test = rf_model.predict(np_test_x)
print(f"Target on test data: {rf_predict_test}")

Target on train data: [0 0 0 ... 2 2 2]
Target on test data: [0 1 1 ... 1 1 1]
CPU times: user 7.21 s, sys: 3.98 ms, total: 7.21 s
Wall time: 7.21 s


In [17]:
# Finally, lets check the accuracy of the RF module.
rf_train_accuracy = accuracy_score(np_train_y, rf_predict_train)
print(f"The Training accuracy of RF is: {rf_train_accuracy*100}%")

rf_test_accuracy = accuracy_score(np_test_y, rf_predict_test)
print(f"The Testing accuracy of RF is: {rf_test_accuracy*100}%")

The Training accuracy of RF is: 100.0%
The Testing accuracy of RF is: 56.979065295155095%


In [18]:
%%time

# Model - 2: DecisionTreeClassifier model.
from sklearn.tree import DecisionTreeClassifier

dtc_model = DecisionTreeClassifier()
dtc_model.fit(np_train_x, np_train_y)

print(f"The depth of decision tree is: {dtc_model.get_depth()}")

The depth of decision tree is: 39
CPU times: user 1.76 s, sys: 38 µs, total: 1.76 s
Wall time: 1.75 s


In [19]:
%%time

# Now predicting the values of the training dataset.
dtc_predict_train = dtc_model.predict(np_train_x)
print(f"Target on train data: {dtc_predict_train}")

# Now, predicting the values of the test dataset.
dtc_predict_test = dtc_model.predict(np_test_x)
print(f"Target on the test data: {dtc_predict_test}")

Target on train data: [0 0 0 ... 2 2 2]
Target on the test data: [1 1 1 ... 1 1 1]
CPU times: user 63.2 ms, sys: 0 ns, total: 63.2 ms
Wall time: 60.7 ms


In [20]:
# Finally, lets check the accuracy of the model on both sets.
dtc_train_accuracy = accuracy_score(np_train_y, dtc_predict_train)
print(f"The Training accuracy is: {dtc_train_accuracy*100}%")

dtc_test_accuracy = accuracy_score(np_test_y, dtc_predict_test)
print(f"The Testing accuracy is: {dtc_test_accuracy*100}%")

The Training accuracy is: 100.0%
The Testing accuracy is: 56.62865892340548%


In [21]:
%%time
from sklearn.naive_bayes import GaussianNB

# Model - 3: Gaussian Naive Bayes.

nb_model = GaussianNB()
nb_model.fit(np_train_x, np_train_y)

CPU times: user 46.5 ms, sys: 30 µs, total: 46.5 ms
Wall time: 45.1 ms


GaussianNB()

In [22]:
%%time

# Now, lets predict the Species on the train and test dataset.
nb_predict_train = nb_model.predict(np_train_x)
print(f"Target on train data: {nb_predict_train}")

nb_predict_test = nb_model.predict(np_test_x)
print(f"Target on test data: {nb_predict_test}")

Target on train data: [1 1 1 ... 1 1 1]
Target on test data: [2 2 2 ... 2 2 2]
CPU times: user 64.4 ms, sys: 3.99 ms, total: 68.4 ms
Wall time: 64.8 ms


In [23]:
# Finally, lets check the accuracy of the Bayes model.
nb_train_accuracy = accuracy_score(np_train_y, nb_predict_train)
print(f"The Training accuracy of NB is: {nb_train_accuracy*100}%")

nb_test_accuracy = accuracy_score(np_test_y, nb_predict_test)
print(f"The Testing accuracy of NB is: {nb_test_accuracy*100}%")

The Training accuracy of NB is: 57.88120790523481%
The Testing accuracy of NB is: 17.244643432454605%


In [24]:
%%time

# Model - 4: K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=75)
knn_model.fit(np_train_x, np_train_y)

CPU times: user 579 ms, sys: 33 µs, total: 579 ms
Wall time: 580 ms


KNeighborsClassifier(n_neighbors=75)

In [25]:
%%time

# Now, lets predict the Species on the train and test dataset.
knn_predict_train = knn_model.predict(np_train_x)
print(f"Target on train data: {knn_predict_train}")

knn_predict_test = knn_model.predict(np_test_x)
print(f"Target on test data: {knn_predict_test}")

Target on train data: [1 1 0 ... 1 1 1]
Target on test data: [1 1 1 ... 1 1 1]
CPU times: user 27.1 s, sys: 1.1 s, total: 28.2 s
Wall time: 28.2 s


In [26]:
# Finally, lets check the accuracy of the KNN module.
knn_train_accuracy = accuracy_score(np_train_y, knn_predict_train)
print(f"The Training accuracy of KNN is: {knn_train_accuracy*100}%")

knn_test_accuracy = accuracy_score(np_test_y, knn_predict_test)
print(f"The Testing accuracy of KNN is: {knn_test_accuracy*100}%")

The Training accuracy of KNN is: 58.66189179500088%
The Testing accuracy of KNN is: 57.417488433268346%


In [27]:
%%time

# Model - 5: K Means Clustering
from sklearn.cluster import KMeans

km_model = KMeans(n_clusters=3)
km_model.fit(np_train_x, np_train_y)

CPU times: user 11.3 s, sys: 277 ms, total: 11.6 s
Wall time: 1.19 s


KMeans(n_clusters=3)

In [28]:
%%time

# Now, lets predict the Species on the train and test dataset.
km_predict_train = km_model.predict(np_train_x)
print(f"Target on train data: {km_predict_train}")

km_predict_test = km_model.predict(np_test_x)
print(f"Target on test data: {km_predict_test}")

Target on train data: [2 0 0 ... 2 2 2]
Target on test data: [1 1 1 ... 1 1 1]
CPU times: user 616 ms, sys: 15.7 ms, total: 631 ms
Wall time: 67.1 ms


In [29]:
# Finally, lets check the accuracy of the KM module.
km_train_accuracy = accuracy_score(np_train_y, km_predict_train)
print(f"The Training accuracy of KM is: {km_train_accuracy*100}%")

km_test_accuracy = accuracy_score(np_test_y, km_predict_test)
print(f"The Testing accuracy of KM is: {km_test_accuracy*100}%")

The Training accuracy of KM is: 32.91419723423567%
The Testing accuracy of KM is: 57.417488433268346%


In [30]:
%%time

# Now, lets try another classification model which is the
# Support Vector Machine and check its accuracy.
# from sklearn.svm import SVC
#
# svc_model = SVC()
# svc_model.fit(np_train_x, np_train_y)
#
#

CPU times: user 25 µs, sys: 0 ns, total: 25 µs
Wall time: 9.54 µs


In [31]:
# %%time
#
# # Now, lets predict the Species on the train and test dataset.
# svc_predict_train = svc_model.predict(np_train_x)
# print(f"Target on train data: {svc_predict_train}")
#
# svc_predict_test = svc_model.predict(np_test_x)
# print(f"Target on test data: {svc_predict_test}")
#
#

In [32]:
# # Finally, lets check the accuracy of the SVC module.
# svc_train_accuracy = accuracy_score(np_train_y, svc_predict_train)
# print(f"The Training accuracy og SVC is: {svc_train_accuracy*100}%")
#
# svc_test_accuracy = accuracy_score(np_test_y, svc_predict_test)
# print(f"The Training accuracy og SVC is: {svc_test_accuracy*100}%")


starkey['Species'].value_counts()

E    166885
D     70582
C     49669
Name: Species, dtype: int64