In [1]:
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from matplotlib import cm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#from adspy_shared_utilities import load_crime_dataset
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt

In [2]:
os.chdir(r"C:\Users\asif\Python Scripts\Combined Trajectory Reading\Combined Trajectory_Label_Geolife\Trajectoryanalysis")
dfSegments = pd.read_table('TrajectorySegments.csv', sep=',')
df = dfSegments.copy()

In [3]:
dfSegments.drop(['trip_id','segmentid','date_time_first','date_time_last','date_time_count','P(train)','P(subway)'],axis=1, inplace=True)

In [4]:
# filter all segments from dfSegments into df dataframe
dfNonWalk = dfSegments[dfSegments['Transportation_Mode'].isin(['car','bike','bus'])].copy()
dfNonWalk.drop(['P(walk)'], axis=1, inplace=True)

In [5]:
dfWalk = dfSegments[dfSegments['Transportation_Mode']=='walk'].copy()
dfWalk.drop(['P(car)','P(bus)','P(bike)'], axis=1, inplace=True)

# 1- Classification for Walk dataframe 

In [6]:
le = preprocessing.LabelEncoder()

le.fit(dfWalk['time_slice']) 
list(le.classes_)
dfWalk['time_slice'] = le.transform(dfWalk['time_slice']) 

le.fit(dfWalk['Transportation_Mode']) 
list(le.classes_)
dfWalk['Transportation_Mode'] = le.transform(dfWalk['Transportation_Mode'])

In [7]:
train, test, y_train, y_test = train_test_split(dfWalk, dfWalk['Transportation_Mode'], random_state=0) 

In [8]:
train.shape, y_train.shape, test.shape, y_test.shape

((1315, 39), (1315,), (439, 39), (439,))

In [9]:
train = train.drop(['Transportation_Mode'], axis=1)
test = test.drop(['Transportation_Mode'], axis=1)

In [10]:
scaler = StandardScaler()
train_norm = scaler.fit_transform(train)
test_norm = scaler.fit_transform(test)

## 1.1 Modelling

In [11]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### 1.1.1 k fold cross validation

In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

### 1.1.2 cross validation

In [13]:
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_norm, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [14]:
# kNN Score
round(np.mean(score)*100, 2)

100.0

# 2- Classification for Non Walk Segments

In [15]:
le = preprocessing.LabelEncoder()

le.fit(dfNonWalk['time_slice']) 
list(le.classes_)
dfNonWalk['time_slice'] = le.transform(dfNonWalk['time_slice']) 

le.fit(dfNonWalk['Transportation_Mode']) 
list(le.classes_)
dfNonWalk['Transportation_Mode'] = le.transform(dfNonWalk['Transportation_Mode'])

In [16]:
train, test, y_train, y_test = train_test_split(dfNonWalk, dfNonWalk['Transportation_Mode'], random_state=0) 

In [17]:
train.shape, y_train.shape, test.shape, y_test.shape

((1905, 41), (1905,), (636, 41), (636,))

In [18]:
train = train.drop(['Transportation_Mode'], axis=1)
test = test.drop(['Transportation_Mode'], axis=1)

In [19]:
scaler = StandardScaler()
train_norm = scaler.fit_transform(train)
test_norm = scaler.fit_transform(test)

## 2.1 Modelling

In [20]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### 2.1.1 k fold cross validation

In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

### 2.1.2 KNN

In [22]:
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_norm, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.67015707 0.61780105 0.62303665 0.57591623 0.57068063 0.58421053
 0.57368421 0.61052632 0.64210526 0.63684211]


In [23]:
# kNN Score
round(np.mean(score)*100, 2)

61.05

### 2.1.3 Decison Tree

In [24]:
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.76963351 0.69109948 0.71727749 0.65968586 0.64921466 0.68421053
 0.66315789 0.73157895 0.66842105 0.65789474]


In [25]:
# Decision Tree Score
round(np.mean(score)*100, 2)

68.92

### 2.1.4 Random Forest

In [26]:
clf = RandomForestClassifier(n_estimators=13)
scoring = 'accuracy'
score = cross_val_score(clf, train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.81675393 0.80104712 0.77486911 0.68586387 0.68586387 0.69473684
 0.71052632 0.78421053 0.73157895 0.68947368]


In [27]:
# Random Forest Score
round(np.mean(score)*100, 2)

73.75

### 2.1.5 Naive Bayes

In [28]:
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.56020942 0.5026178  0.52879581 0.51308901 0.53926702 0.54210526
 0.48947368 0.51578947 0.54736842 0.52631579]


In [29]:
# Naive Bayes Score
round(np.mean(score)*100, 2)

52.65

## 2.2 Testing

In [30]:
clf = RandomForestClassifier(n_estimators=13)
clf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
prediction = clf.predict(test)

In [32]:
clf.score(test, y_test)

0.7216981132075472

# 3- Evaluation

In [37]:
test.columns

Index(['userid', 'grid_index_first', 'grid_index_last', 'acceleration_mean',
       'acceleration_Top_1', 'acceleration_Top_2', 'acceleration_Top_3',
       'acceleration_count', 'acceleration_85_percentile',
       'acceleration_median', 'acceleration_min', 'velocity_Top_1',
       'velocity_Top_2', 'velocity_Top_3', 'stop_points', 'velocity_sum',
       'velocity_count', 'velocity_85_percentile', 'velocity_median',
       'velocity_min', 'velocity_change_pts', 'acc_change_pts', 'bearing_sum',
       'bearing_Top_1', 'bearing_Top_2', 'bearing_Top_3', 'bearing_Frequency',
       'bearing_rate_sum', 'rate_bearing_rate_mean', 'Vincenty_distance_sum',
       'time_delta', 'mean_velocity', 'stop_rate', 'velocity_changerate',
       'acc_changerate', 'time_slice', 'Covariance', 'P(bike)', 'P(bus)',
       'P(car)'],
      dtype='object')

In [70]:
submission = pd.DataFrame({
        "prediction": prediction,
        "Actual": y_test,
        "userid": test["userid"],
        "distance": test['Vincenty_distance_sum']
    })

In [71]:
submission[['userid','Actual','prediction','distance']]


Unnamed: 0,userid,Actual,prediction,distance
1320,163,2,2,196.477404
1819,167,0,1,367.209946
1838,179,0,1,911.411393
2721,68,1,1,756.077888
1402,167,1,1,469.026962
2294,62,2,1,451.732595
2336,62,1,1,4449.480184
3222,68,1,1,1313.700907
2680,68,0,0,3980.071310
3403,84,1,1,899.463059


In [86]:
submission[submission['userid']==10]

Unnamed: 0,Actual,distance,prediction,userid
9,2,7645.370969,2,10
1,1,3029.002935,2,10


In [93]:
flags = np.where(submission['Actual'].eq(submission['prediction']), 'correct', 'incorrect')
res_count = pd.crosstab(submission['userid'], flags, margins=True)
res = pd.crosstab(submission['userid'], flags, margins=True, values=submission['distance'], aggfunc=np.sum)
res=res.reset_index()
res_count = res_count.reset_index()

In [94]:
res_count

col_0,userid,correct,incorrect,All
0,10,1,1,2
1,20,11,2,13
2,52,27,11,38
3,62,22,5,27
4,64,6,1,7
5,65,8,8,16
6,68,120,14,134
7,69,1,0,1
8,75,0,1,1
9,78,1,1,2


In [91]:
1.049564e+06/1.322456e+06

0.7936475769326163

In [95]:
list(le.classes_)

['bike', 'bus', 'car']