In [1]:
import pandas as pd
from Nummobility.core.TrajectoryDF import NumPandasTraj

"""
    First, to begin with the following actions ar performed:
        1. Read the database from the csv file using pandas.
        2. Convert the pandas dataframe to NumPandasTraj DataFrame
           in order to represent the data with NumMobility.

    It is to be duly noted that the Starkey Project database provided
    originally has been modified here to meet with the needs of
    NumMobility Library.
"""
pdf = pd.read_csv('./data/starkey.csv')
starkey = NumPandasTraj(data_set=pdf,
                        latitude='lat',
                        longitude='lon',
                        datetime='DateTime',
                        traj_id='Id')
print("Size of the Dataset: {}".format(starkey.shape))
starkey.head(5)


Size of the Dataset: (287136, 15)


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,StarkeyTime,GMDate,GMTime,LocDate,LocTime,RadNum,Species,UTME,UTMN,Year,Grensunr,Grensuns,Obswt
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
880109D01,1970-01-01 00:04:17,45.2387,-118.525916,230803457,08:04:17,19950424,19950424,00:04:17,409,D,380220,5010606,95,12:54:00,02:53:00,1.42
880109D01,1970-01-01 00:24:02,45.241063,-118.530948,230286242,08:24:02,19950418,19950418,00:24:02,409,D,379854,5010865,95,13:04:00,02:45:00,1.43
880109D01,1970-01-01 00:33:08,45.249747,-118.527741,230545988,08:33:08,19950421,19950421,00:33:08,409,D,380122,5011840,95,12:59:00,02:49:00,1.66
880109D01,1970-01-01 00:33:51,45.23844,-118.525144,230978031,08:33:51,19950426,19950426,00:33:51,409,D,380293,5010559,95,12:50:00,02:56:00,1.42
880109D01,1970-01-01 00:36:53,45.254177,-118.539712,230632613,08:36:53,19950422,19950422,00:36:53,409,D,379191,5012351,95,12:57:00,02:50:00,1.48


In [2]:
%%time

# Now, we import the spatial features from the NumMobility
# library and generate several spatial features on the dataset.
from Nummobility.features.spatial_features import SpatialFeatures

starkey = SpatialFeatures.create_distance_between_consecutive_column(starkey)

CPU times: user 865 ms, sys: 171 ms, total: 1.04 s
Wall time: 1.43 s


In [3]:
%%time

# Finally, lets run hampel filter on Distance between
# consecutive points so as to clean up the data a little bit.

from Nummobility.preprocessing.filters import Filters

small_starkey = Filters.hampel_outlier_detection(starkey,
                                                 'Distance_prev_to_curr')
print(f"Original DF length: {len(starkey)}")
print(f"Filtered DF length: {len(small_starkey)}")



Original DF length: 287136
Filtered DF length: 275561
CPU times: user 889 ms, sys: 236 ms, total: 1.12 s
Wall time: 25.2 s


In [4]:
# Here drop the points with the same trajectory ID and the
# same DateTime values are they are not allowed in Cubic
# interpolation since the times need to be strictly increasing.

filtered_starkey = small_starkey.reset_index().drop_duplicates(subset=['DateTime', 'traj_id'], keep='first')
print(f"Original DF length: {len(small_starkey)}")
print(f"Filtered DF length: {len(filtered_starkey)}")

Original DF length: 275561
Filtered DF length: 272778


In [5]:
# Now, lets split the dataset using. The size of the
# training dataset is 70% of original dataset and the
# rest is testing dataset.

# Setting up the train database. Here, it is to be noted
# that for the purpose of fitting in the classifier, only
# the lat, lon, datetime, traj_id and Species columns are
# used and other columns are not being considered for now.

ids_ = list(filtered_starkey.traj_id.value_counts().keys())

train_df = []
for i in range(int(len(ids_) * 0.7)):
    small = filtered_starkey.reset_index().loc[filtered_starkey.reset_index()['traj_id'] == ids_[i],
    ['DateTime', 'traj_id', 'lat', 'lon', 'Species']]
    train_df.append(small)

np_train_starkey = NumPandasTraj(pd.concat(train_df),
                                 latitude='lat',
                                 longitude='lon',
                                 datetime='DateTime',
                                 traj_id='traj_id')

In [6]:
# Setting up the test database. Here, it is to be noted
# that for the purpose of fitting in the classifier, only
# the lat, lon, datetime, traj_id and Species columns are
# used and other columns are not being considered for now.

test_df = []
for i in range(int(len(ids_) * 0.7)+1, len(ids_)):
    small = filtered_starkey.reset_index().loc[filtered_starkey.reset_index()['traj_id'] == ids_[i],
    ['DateTime', 'traj_id', 'lat', 'lon', 'Species']]
    test_df.append(small)

np_test_starkey = NumPandasTraj(pd.concat(test_df),
                                 latitude='lat',
                                 longitude='lon',
                                 datetime='DateTime',
                                 traj_id='traj_id')

In [7]:
# Modifying the data of the Species column to indicate the
# results as follows:
#   D (Deer) : 0
#   E (Elk): 1
#   C (Cattle): 2

# First, doing the above mentioned conversion for the test
# dataset.
int_test = []
for i in range(len(np_test_starkey['Species'])):
    if np_test_starkey['Species'].iloc[i] == 'D':
        int_test.append(0)
    elif np_test_starkey['Species'].iloc[i] == 'E':
        int_test.append(1)
    else:
        int_test.append(2)
np_test_starkey['Species'] = int_test
np_test_starkey.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Species
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
880109D01,1970-01-01 00:04:17,45.2387,-118.525916,0
880109D01,1970-01-01 00:24:02,45.241063,-118.530948,0
880109D01,1970-01-01 00:33:08,45.249747,-118.527741,0
880109D01,1970-01-01 00:33:51,45.23844,-118.525144,0
880109D01,1970-01-01 00:36:53,45.254177,-118.539712,0


In [8]:
# Now, doing the conversion for the train dataset.
int_train = []
for i in range(len(np_train_starkey['Species'])):
    if np_train_starkey['Species'].iloc[i] == 'D':
        int_train.append(0)
    elif np_train_starkey['Species'].iloc[i] == 'E':
        int_train.append(1)
    else:
        int_train.append(2)
np_train_starkey['Species'] = int_train
np_train_starkey.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,Species
traj_id,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
880119D02,1970-01-01 00:00:22,45.242184,-118.52792,0
880119D02,1970-01-01 00:01:10,45.242199,-118.526774,0
880119D02,1970-01-01 00:01:24,45.239769,-118.526709,0
880119D02,1970-01-01 00:02:43,45.243335,-118.5226,0
880119D02,1970-01-01 00:03:35,45.243793,-118.528728,0


In [9]:
## Getting the training x and y ready for the DecisionTree.
train_x = np_train_starkey.drop(columns=['Species'])
train_y = np_train_starkey.reset_index()['Species']

In [10]:
# Getting the testing x and y ready for the DecisionTree.
test_x = np_test_starkey.drop(columns=['Species'])
test_y = np_test_starkey.reset_index()['Species']

In [11]:
# Now, setting up the DecisionTree model from the scikit-learn
# library. Here, we are trying to see if we can predict the
# Distance_prev_to_curr.

from sklearn.tree import DecisionTreeClassifier

dtc_model = DecisionTreeClassifier()
dtc_model.fit(train_x, train_y)

print(f"The depth of decision tree is: {dtc_model.get_depth()}")

The depth of decision tree is: 40


In [12]:
# Now predicting the values of the training dataset.

dtc_predict_train = dtc_model.predict(train_x)
print(f"Target on train data: {dtc_predict_train}")

Target on train data: [0 0 0 ... 1 2 1]


In [13]:
# Now, predicting the values of the test dataset.
dtc_predict_test = dtc_model.predict(test_x)
print(f"Target on the test data: {dtc_predict_test}")

Target on the test data: [0 0 0 ... 1 1 1]


In [14]:
from sklearn.metrics import accuracy_score

dtc_train_accuracy = accuracy_score(train_y, dtc_predict_train)
print(f"The Training accuracy is: {dtc_train_accuracy*100}%")

dtc_test_accuracy = accuracy_score(test_y, dtc_predict_test)
print(f"The Testing accuracy is: {dtc_test_accuracy*100}%")

The Training accuracy is: 81.06507661624576%
The Testing accuracy is: 45.173202039739756%


In [None]:
%%time

# Now, lets try another classification model which is the
# Support Vector Machine and check its accuracy.
from sklearn.svm import SVC

svc_model = SVC()
svc_model.fit(train_x, train_y)

In [None]:
%%time

# Now, lets predict the Species on the train and test dataset.
svc_predict_train = svc_model.predict(train_x)
print(f"Target on train data: {svc_predict_train}")

svc_predict_test = svc_model.predict(test_x)
print(f"Target on test data: {svc_predict_test}")

In [None]:
%%time

# Finally, lets check the accuracy of the SVC module.

svc_train_accuracy = accuracy_score(train_y, svc_predict_train)
print(f"The Training accuracy og SVC is: {svc_train_accuracy*100}%")

svc_test_accuracy = accuracy_score(train_x, svc_predict_test)
print(f"The Training accuracy og SVC is: {svc_test_accuracy*100}%")