In [1]:
import pandas as pd
from Nummobility.core.TrajectoryDF import NumPandasTraj

"""
    First, to begin with the following actions ar performed:
        1. Read the database from the csv file using pandas.
        2. Convert the pandas dataframe to NumPandasTraj DataFrame
           in order to represent the data with NumMobility.

    It is to be duly noted that the Starkey Project database provided
    originally has been modified here to meet with the needs of
    NumMobility Library.
"""
pdf = pd.read_csv('./data/starkey.csv')
starkey = NumPandasTraj(data_set=pdf,
                        latitude='lat',
                        longitude='lon',
                        datetime='DateTime',
                        traj_id='Id')
print("Size of the Dataset: {}".format(starkey.shape))
starkey.head(5)


Size of the Dataset: (287136, 15)


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,D,379662,5010734,95,13:13:00,02:39:00,1.47
880109D01,1995-04-15 12:16:15,45.250521,-118.530438,230069775,20:16:15,19950415,19950415,12:16:15,409,D,379895,5011927,95,13:09:00,02:41:00,1.59
880109D01,1995-04-15 21:39:38,45.247943,-118.541455,230103578,05:39:38,19950416,19950415,21:39:38,409,D,379039,5011656,95,13:07:00,02:43:00,1.34
880109D01,1995-04-16 03:32:14,45.247429,-118.53953,230124734,11:32:14,19950416,19950416,03:32:14,409,D,379188,5011581,95,13:07:00,02:43:00,1.5
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,D,378938,5011567,95,13:07:00,02:43:00,1.34


In [2]:
%%time

# Now, we import the spatial features from the NumMobility
# library and generate several spatial features on the dataset.
from Nummobility.features.spatial_features import SpatialFeatures
from Nummobility.features.temporal_features import TemporalFeatures

starkey = SpatialFeatures.create_distance_between_consecutive_column(starkey)
starkey = TemporalFeatures.create_date_column(starkey)
starkey.head()

CPU times: user 1.25 s, sys: 229 ms, total: 1.48 s
Wall time: 1.86 s


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt,Distance_prev_to_curr,Date
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,229902006,21:40:06,19950413,19950413,13:40:06,409,D,379662,5010734,95,13:13:00,02:39:00,1.47,,1995-04-13
880109D01,1995-04-15 12:16:15,45.250521,-118.530438,230069775,20:16:15,19950415,19950415,12:16:15,409,D,379895,5011927,95,13:09:00,02:41:00,1.59,1224.551334,1995-04-15
880109D01,1995-04-15 21:39:38,45.247943,-118.541455,230103578,05:39:38,19950416,19950415,21:39:38,409,D,379039,5011656,95,13:07:00,02:43:00,1.34,908.878736,1995-04-15
880109D01,1995-04-16 03:32:14,45.247429,-118.53953,230124734,11:32:14,19950416,19950416,03:32:14,409,D,379188,5011581,95,13:07:00,02:43:00,1.5,161.204428,1995-04-16
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,230126908,12:08:28,19950416,19950416,04:08:28,409,D,378938,5011567,95,13:07:00,02:43:00,1.34,241.258531,1995-04-16


In [3]:
%%time

# Finally, lets run hampel filter on Distance between
# consecutive points so as to clean up the data a little bit.

from Nummobility.preprocessing.filters import Filters

small_starkey = Filters.hampel_outlier_detection(starkey,
                                                 'Distance_prev_to_curr')
print(f"Original DF length: {len(starkey)}")
print(f"Filtered DF length: {len(small_starkey)}")



Original DF length: 287136
Filtered DF length: 262207
CPU times: user 1.03 s, sys: 238 ms, total: 1.27 s
Wall time: 24.1 s


In [4]:
# # Here drop the points with the same trajectory ID and the
# # same DateTime values are they are not allowed in Cubic
# # interpolation since the times need to be strictly increasing.
#
# filtered_starkey = small_starkey.reset_index().drop_duplicates(subset=['DateTime', 'traj_id'], keep='first')
# print(f"Original DF length: {len(small_starkey)}")
# print(f"Filtered DF length: {len(filtered_starkey)}")

In [5]:
# Now, lets split the dataset using. The size of the
# training dataset is 70% of original dataset and the
# rest is testing dataset.

# Setting up the train database. Here, it is to be noted
# that for the purpose of fitting in the classifier, only
# the lat, lon, datetime, traj_id and Species columns are
# used and other columns are not being considered for now.

ids_ = list(small_starkey.traj_id.value_counts().keys())

train_df = []
for i in range(int(len(ids_) * 0.7)):
    small = small_starkey.reset_index().loc[small_starkey.reset_index()['traj_id'] == ids_[i],
    ['DateTime', 'traj_id', 'lat', 'lon', 'Species']]
    train_df.append(small)

np_train_starkey = NumPandasTraj(pd.concat(train_df),
                                 latitude='lat',
                                 longitude='lon',
                                 datetime='DateTime',
                                 traj_id='traj_id')

In [6]:
# Setting up the test database. Here, it is to be noted
# that for the purpose of fitting in the classifier, only
# the lat, lon, datetime, traj_id and Species columns are
# used and other columns are not being considered for now.

test_df = []
for i in range(int(len(ids_) * 0.7)+1, len(ids_)):
    small = small_starkey.reset_index().loc[small_starkey.reset_index()['traj_id'] == ids_[i],
    ['DateTime', 'traj_id', 'lat', 'lon', 'Species']]
    test_df.append(small)

np_test_starkey = NumPandasTraj(pd.concat(test_df),
                                 latitude='lat',
                                 longitude='lon',
                                 datetime='DateTime',
                                 traj_id='traj_id')

In [7]:
# Modifying the data of the Species column to indicate the
# results as follows:
#   D (Deer) : 0
#   E (Elk): 1
#   C (Cattle): 2

# First, doing the above mentioned conversion for the test
# dataset.
int_test = []
for i in range(len(np_test_starkey['Species'])):
    if np_test_starkey['Species'].iloc[i] == 'D':
        int_test.append(0)
    elif np_test_starkey['Species'].iloc[i] == 'E':
        int_test.append(1)
    else:
        int_test.append(2)
np_test_starkey['Species'] = int_test
np_test_starkey.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Species
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
880109D01,1995-04-13 13:40:06,45.239682,-118.533204,0
880109D01,1995-04-16 03:32:14,45.247429,-118.53953,0
880109D01,1995-04-16 04:08:28,45.247117,-118.542579,0
880109D01,1995-04-16 05:30:40,45.246359,-118.538736,0
880109D01,1995-04-16 06:03:39,45.246655,-118.536833,0


In [8]:
# Now, doing the conversion for the train dataset.
int_train = []
for i in range(len(np_train_starkey['Species'])):
    if np_train_starkey['Species'].iloc[i] == 'D':
        int_train.append(0)
    elif np_train_starkey['Species'].iloc[i] == 'E':
        int_train.append(1)
    else:
        int_train.append(2)
np_train_starkey['Species'] = int_train
np_train_starkey.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Species
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
880119D02,1993-05-18 14:04:36,45.245158,-118.527618,0
880119D02,1993-05-18 14:41:26,45.244873,-118.528757,0
880119D02,1993-05-18 15:17:29,45.245148,-118.528382,0
880119D02,1993-05-18 15:54:58,45.243803,-118.527964,0
880119D02,1993-05-18 17:05:50,45.243814,-118.527199,0


In [9]:
## Getting the training x and y ready for the DecisionTree.
train_x = np_train_starkey.drop(columns=['Species'])
train_y = np_train_starkey.reset_index()['Species']

In [10]:
# Getting the testing x and y ready for the DecisionTree.
test_x = np_test_starkey.drop(columns=['Species'])
test_y = np_test_starkey.reset_index()['Species']

In [11]:
# Now, setting up the DecisionTree model from the scikit-learn
# library. Here, we are trying to see if we can predict the
# Distance_prev_to_curr.

from sklearn.tree import DecisionTreeClassifier

dtc_model = DecisionTreeClassifier()
dtc_model.fit(train_x, train_y)

print(f"The depth of decision tree is: {dtc_model.get_depth()}")

The depth of decision tree is: 40


In [12]:
# Now predicting the values of the training dataset.

dtc_predict_train = dtc_model.predict(train_x)
print(f"Target on train data: {dtc_predict_train}")

Target on train data: [0 0 0 ... 0 2 2]


In [13]:
# Now, predicting the values of the test dataset.
dtc_predict_test = dtc_model.predict(test_x)
print(f"Target on the test data: {dtc_predict_test}")

Target on the test data: [0 1 1 ... 1 2 1]


In [14]:
from sklearn.metrics import accuracy_score

dtc_train_accuracy = accuracy_score(train_y, dtc_predict_train)
print(f"The Training accuracy is: {dtc_train_accuracy*100}%")

dtc_test_accuracy = accuracy_score(test_y, dtc_predict_test)
print(f"The Testing accuracy is: {dtc_test_accuracy*100}%")

The Training accuracy is: 81.21551923224962%
The Testing accuracy is: 46.40501675856849%


In [15]:
%%time

# Now, lets try another classification model which is the
# Support Vector Machine and check its accuracy.
# from sklearn.svm import SVC
#
# svc_model = SVC()
# svc_model.fit(train_x, train_y)
#

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [16]:
# %%time
#
# # Now, lets predict the Species on the train and test dataset.
# svc_predict_train = svc_model.predict(train_x)
# print(f"Target on train data: {svc_predict_train}")
#
# svc_predict_test = svc_model.predict(test_x)
# print(f"Target on test data: {svc_predict_test}")
#

In [17]:
# # Finally, lets check the accuracy of the SVC module.
# svc_train_accuracy = accuracy_score(train_y, svc_predict_train)
# print(f"The Training accuracy og SVC is: {svc_train_accuracy*100}%")
#
# svc_test_accuracy = accuracy_score(test_y, svc_predict_test)
# print(f"The Training accuracy og SVC is: {svc_test_accuracy*100}%")

In [18]:
%%time
# Now, lets try the same classification with the Naive
# Bayes algorithm. Repeat the same steps:
#   1. Setup the model and train it.
#   2. Try to predict on the Training dataset.
#   3. Try to predict on the Testing dataset.
#   4. Measure accuracy for both.

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(train_x, train_y)

CPU times: user 36.4 ms, sys: 3.89 ms, total: 40.3 ms
Wall time: 39.7 ms


GaussianNB()

In [19]:
%%time

# Now, lets predict the Species on the train and test dataset.
nb_predict_train = nb_model.predict(train_x)
print(f"Target on train data: {nb_predict_train}")

nb_predict_test = nb_model.predict(test_x)
print(f"Target on test data: {nb_predict_test}")

Target on train data: [0 0 0 ... 0 0 0]
Target on test data: [1 1 1 ... 1 1 1]
CPU times: user 26.7 ms, sys: 0 ns, total: 26.7 ms
Wall time: 25.1 ms


In [20]:
# Finally, lets check the accuracy of the SVC module.
nb_train_accuracy = accuracy_score(train_y, nb_predict_train)
print(f"The Training accuracy of NB is: {nb_train_accuracy*100}%")

nb_test_accuracy = accuracy_score(test_y, nb_predict_test)
print(f"The Testing accuracy of NB is: {nb_test_accuracy*100}%")

The Training accuracy of NB is: 61.56329920539204%
The Testing accuracy of NB is: 30.6699823404332%


In [21]:
%%time
# Now, lets try the same classification with K Nearest
# Neighbors algorithm. Repeat the same steps:
#   1. Setup the model and train it.
#   2. Try to predict on the Training dataset.
#   3. Try to predict on the Testing dataset.
#   4. Measure accuracy for both.

from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=75)
knn_model.fit(train_x, train_y)

CPU times: user 154 ms, sys: 28 µs, total: 154 ms
Wall time: 152 ms


KNeighborsClassifier(n_neighbors=75)

In [22]:
%%time

# Now, lets predict the Species on the train and test dataset.
knn_predict_train = knn_model.predict(train_x)
print(f"Target on train data: {knn_predict_train}")

knn_predict_test = knn_model.predict(test_x)
print(f"Target on test data: {knn_predict_test}")

Target on train data: [0 0 0 ... 0 2 0]
Target on test data: [0 1 1 ... 1 2 1]
CPU times: user 11.2 s, sys: 116 ms, total: 11.4 s
Wall time: 11.4 s


In [23]:
# Finally, lets check the accuracy of the SVC module.
knn_train_accuracy = accuracy_score(train_y, knn_predict_train)
print(f"The Training accuracy of KNN is: {knn_train_accuracy*100}%")

knn_test_accuracy = accuracy_score(test_y, knn_predict_test)
print(f"The Testing accuracy of KNN is: {knn_test_accuracy*100}%")

The Training accuracy of KNN is: 74.74446810876464%
The Testing accuracy of KNN is: 47.81417810934516%


In [24]:
%%time
# Now, lets try the same classification with K means
# clustering algorithm. Repeat the same steps:
#   1. Setup the model and train it.
#   2. Try to predict on the Training dataset.
#   3. Try to predict on the Testing dataset.
#   4. Measure accuracy for both.

from sklearn.cluster import KMeans

km_model = KMeans(n_clusters=3)
km_model.fit(train_x, train_y)

CPU times: user 11 s, sys: 195 ms, total: 11.2 s
Wall time: 1.02 s


KMeans(n_clusters=3)

In [25]:
%%time

# Now, lets predict the Species on the train and test dataset.
km_predict_train = km_model.predict(train_x)
print(f"Target on train data: {km_predict_train}")

km_predict_test = km_model.predict(test_x)
print(f"Target on test data: {km_predict_test}")

Target on train data: [2 2 2 ... 2 2 2]
Target on test data: [2 2 2 ... 2 2 2]
CPU times: user 124 ms, sys: 11.9 ms, total: 136 ms
Wall time: 11.7 ms


In [26]:
# Finally, lets check the accuracy of the SVC module.
km_train_accuracy = accuracy_score(train_y, km_predict_train)
print(f"The Training accuracy of KM is: {km_train_accuracy*100}%")

km_test_accuracy = accuracy_score(test_y, knn_predict_test)
print(f"The Testing accuracy of KM is: {km_test_accuracy*100}%")

The Training accuracy of KM is: 32.97623018825965%
The Testing accuracy of KM is: 47.81417810934516%


In [27]:
%%time
# Finally, lets try the same classification with Random
# Forests Classification algorithm. Repeat the same steps:
#   1. Setup the model and train it.
#   2. Try to predict on the Training dataset.
#   3. Try to predict on the Testing dataset.
#   4. Measure accuracy for both.

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(train_x, train_y)

CPU times: user 31 s, sys: 200 ms, total: 31.2 s
Wall time: 29.2 s


RandomForestClassifier()

In [28]:
%%time

# Now, lets predict the Species on the train and test dataset.
rf_predict_train = rf_model.predict(train_x)
print(f"Target on train data: {rf_predict_train}")

rf_predict_test = rf_model.predict(test_x)
print(f"Target on test data: {rf_predict_test}")

Target on train data: [0 0 0 ... 0 2 2]
Target on test data: [0 1 1 ... 1 2 1]
CPU times: user 4.09 s, sys: 4.03 ms, total: 4.1 s
Wall time: 4.1 s


In [29]:
# Finally, lets check the accuracy of the SVC module.
rf_train_accuracy = accuracy_score(train_y, rf_predict_train)
print(f"The Training accuracy of RF is: {knn_train_accuracy*100}%")

rf_test_accuracy = accuracy_score(test_y, rf_predict_test)
print(f"The Testing accuracy of RF is: {rf_test_accuracy*100}%")

The Training accuracy of RF is: 74.74446810876464%
The Testing accuracy of RF is: 48.93141600893791%
