In [1]:
import glob
import pandas as pd 

# Folder that holds the preprocessed CSV files to load.
INPUT_FOLDER      = './data_preprocessed3/'
COLUMNS = ['velocity', 'acceleration', 'trajectory_id', 'subfolder',
        'labels', 'v_ave', 'v_med', 'v_max', 'v_std', 'a_ave', 'a_med', 'a_max', 'a_std']

# glob returns paths in arbitrary, filesystem-dependent order. Sorting them
# keeps the row order of df_metadata stable, so the seeded train/test split
# further down selects the same rows on every machine.
csv_paths = sorted(glob.glob(INPUT_FOLDER + '*.csv'))
if not csv_paths:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which does not point at the real problem (wrong or empty folder).
    raise FileNotFoundError("No .csv files found in " + INPUT_FOLDER)

# Read every CSV once and stack them into a single frame with a fresh 0..n-1 index.
list_df_metadata = [pd.read_csv(csv_path) for csv_path in csv_paths]
df_metadata = pd.concat(list_df_metadata, ignore_index=True)

# 'Unnamed: 0' is a column present in the loaded files that is not needed here.
df_metadata = df_metadata.drop(['Unnamed: 0'], axis=1)
df_metadata.head()

Unnamed: 0,lat,long,altitude,datetime,timedelta,distance,velocity,acceleration,trajectory_id,subfolder,...,a_ave,a_med,a_max,a_min,a_std,a_rol,a_rsd,a_qu1,a_qu2,a_qu3
0,39.9664,116.30916,1003.9,2009-03-12 03:42:17,0 days 00:00:02.000000000,1.929597,0.964798,-0.060098,20090312003145,38,...,0.01188,0.0,29.74198,-12.008664,1.537051,-0.167892,1.13822,-0.69268,-0.134601,0.165613
1,39.968847,116.307993,292.0,2009-03-12 03:43:01,0 days 00:00:04.000000000,0.0,0.0,0.0,20090312003145,38,...,0.01188,0.0,29.74198,-12.008664,1.537051,0.576158,5.959252,-0.193682,0.017087,0.238215
2,39.96888,116.308142,288.7,2009-03-12 03:43:45,0 days 00:00:02.000000000,0.0,0.0,0.0,20090312003145,38,...,0.01188,0.0,29.74198,-12.008664,1.537051,0.591054,5.930925,-0.062513,0.0,0.108236
3,39.968593,116.307162,282.2,2009-03-12 03:44:17,0 days 00:00:02.000000000,12.44809,6.224045,-0.480035,20090312003145,38,...,0.01188,0.0,29.74198,-12.008664,1.537051,0.077697,0.398285,-0.149349,0.0,0.264757
4,39.968617,116.306595,272.3,2009-03-12 03:44:47,0 days 00:00:02.000000000,9.555823,4.777912,-0.746941,20090312003145,38,...,0.01188,0.0,29.74198,-12.008664,1.537051,0.040122,0.552961,-0.190018,0.099273,0.329978


In [2]:
# Feature columns fed to the classifier: the `v_*` group followed by the `a_*` group.
X_COLUMNS = (
    ['v_rol', 'v_rsd', 'v_qu1', 'v_qu2', 'v_qu3']
    + ['a_rol', 'a_rsd', 'a_qu1', 'a_qu2', 'a_qu3']
)
# Target column, kept as a one-element list so that selecting it yields a DataFrame.
Y_COLUMN = ['labels']

In [10]:
print(df_metadata.shape)
# Only the feature columns and the label are used by the model, so a row is
# unusable only when one of *those* values is missing. A bare dropna() would
# also throw away rows whose NaN sits in a column the model never reads.
df_metadata2 = df_metadata.dropna(subset=X_COLUMNS + Y_COLUMN)
print(df_metadata2.shape)

# X: feature matrix; Y: one-column DataFrame holding the label
# (later cells index it as Y['labels']).
X = df_metadata2[X_COLUMNS]
Y = df_metadata2[Y_COLUMN]

(1566825, 31)
(307289, 31)


In [13]:
# Number of rows per label in the filtered frame; dropna=False makes a NaN
# bucket show up too if any label were still missing.
df_metadata2['labels'].value_counts(dropna=False)

walk          82472
bus           77527
bike          47420
train         37289
car           32569
subway        15718
taxi          13304
airplane        603
boat            235
run             130
motorcycle       22
Name: labels, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. stratify=Y keeps the label
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123, stratify=Y)

In [17]:
# Per-label row counts of the training targets, then of the test targets.
print(y_train['labels'].value_counts())
print(y_test['labels'].value_counts())

walk          57730
bus           54269
bike          33194
train         26102
car           22798
subway        11003
taxi           9313
airplane        422
boat            165
run              91
motorcycle       15
Name: labels, dtype: int64
walk          24742
bus           23258
bike          14226
train         11187
car            9771
subway         4715
taxi           3991
airplane        181
boat             70
run              39
motorcycle        7
Name: labels, dtype: int64


In [21]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Let's train a Random Forest:
## 1. Define the search space
## 2. Print the results
## 3. Show the top-ranked candidates

In [30]:
# Exhaustive grid: every combination of the values below is evaluated.
param_grid = dict(
    max_depth=[3, None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

In [31]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexable by candidate position. 'rank_test_score' is
        compared element-wise against a rank, so it has to be array-like in
        the NumPy sense.
    n_top : int
        Ranks 1..n_top are reported; candidates tied on a rank are all listed.
    """
    score_template = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds exactly this rank.
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for position in tied_positions:
            mean_score = results['mean_test_score'][position]
            score_spread = results['std_test_score'][position]
            print("Model with rank: {0}".format(rank))
            print(score_template.format(mean_score, score_spread))
            print("Parameters: {0}".format(results['params'][position]))
            print("")

In [32]:
# Seed the forest with the same value as the train/test split so that two
# runs of the search rank the candidates identically; without a seed the
# bootstrap samples and feature draws differ from run to run.
clf = RandomForestClassifier(n_estimators=20, n_jobs=4, random_state=123)

# Exhaustive search over param_grid using GridSearchCV's default cross-validation.
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
# y_train is a one-column DataFrame; ravel() hands scikit-learn the 1-D
# target vector it expects.
grid_search.fit(X_train, y_train.values.ravel())
# Stop the clock right after fitting so the report below is not timed.
elapsed = time() - start

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (elapsed, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

GridSearchCV took 131.03 seconds for 4 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.706 (std: 0.004)
Parameters: {'criterion': 'entropy', 'max_depth': None}

Model with rank: 2
Mean validation score: 0.705 (std: 0.003)
Parameters: {'criterion': 'gini', 'max_depth': None}

Model with rank: 3
Mean validation score: 0.618 (std: 0.003)
Parameters: {'criterion': 'entropy', 'max_depth': 3}



In [None]:
# Maps each raw label to the coarser label it is grouped under
# (e.g. 'bus' and 'taxi' both become 'car').
dict_mapping = dict([
    ('walk', 'walk'),
    ('bus', 'car'),
    ('bike', 'bike'),
    ('train', 'train'),
    ('car', 'car'),
    ('subway', 'subway'),
    ('taxi', 'car'),
    ('airplane', 'airplane'),
    ('boat', 'boat'),
    ('run', 'walk'),
    ('motorcycle', 'bike'),
])

In [None]:
# Seeded so the fitted forest (and every score/plot derived from it) is
# reproducible; 123 matches the seed used for the train/test split.
rf_classifier = RandomForestClassifier(n_estimators = 40, n_jobs=3, random_state=123)

# 1-D target vectors: y_train / y_test are one-column DataFrames, and passing
# them as-is makes scikit-learn warn about a column-vector y.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

#Random Forest
# `time` here is the function bound by `from time import time`, so the former
# `time.clock()` call raised AttributeError (and time.clock no longer exists
# in Python 3.8+). This measures wall-clock seconds, like the grid-search cell.
t_start = time()
rf_classifier.fit(X_train, y_train_1d)
t_end = time()
t_diff = t_end - t_start

# Mean accuracy on the data the model has seen vs. the held-out data.
train_score = rf_classifier.score(X_train, y_train_1d)
test_score = rf_classifier.score(X_test, y_test_1d)
# Predictions on the held-out set, reused by the confusion-matrix cell.
y_pred_rf= rf_classifier.predict(X_test)
print("trained Random Forest in {:.2f} s.\t Score on training / test set: {} / {}".format(t_diff, train_score, test_score))


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Importance of each feature according to the whole forest.
importances = rf_classifier.feature_importances_
# Spread of the same quantity across the individual trees (one row per tree,
# standard deviation taken column-wise); used as error bars in the plot.
std = np.asarray(
    [estimator.feature_importances_ for estimator in rf_classifier.estimators_]
).std(axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]


In [None]:
# Preview the feature matrix; the column order shown here is what the
# positional feature indices in the ranking refer to.
X.head()

In [None]:
# List the features from most to least important (position in X, then score).
print("Feature ranking:")

n_features = X_train.shape[1]
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Bar chart of the same ranking; error bars show the per-tree spread.
bar_positions = range(n_features)
plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. The axes are labelled 'True label' (rows) and
        'Predicted label' (columns), so `cm` must be laid out that way.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool
        When True, every row is divided by its own sum before printing and
        plotting. Rows that sum to zero are shown as all zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the image.

    The function does not create a figure and does not call `plt.show()`;
    the caller is responsible for both.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any sample has an all-zero row. Dividing it by its
        # sum would yield NaN cells and a NaN colour threshold below, so
        # those rows are left at 0.0 instead.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    # np.ndindex walks the cells row by row, and keeps this function
    # independent of `itertools`, which is only imported in a later cell.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import confusion_matrix
import itertools


# Sorted label list, passed explicitly to confusion_matrix below so that the
# tick labels and the matrix rows/columns are guaranteed to share one order.
# (unique() alone returns labels in order of first appearance, which does not
# match the sorted order confusion_matrix uses by default.)
class_names = sorted(Y['labels'].unique())

# Compute confusion matrix.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true labels and columns the predictions, which is what the plot's axis
# labels assume. The arguments were previously passed the other way round.
cnf_matrix = confusion_matrix(y_test.values.ravel(), y_pred_rf, labels=class_names)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(10,8))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(10,8))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

In [None]:
# Shape of the part of Y labelled 'train'; the first entry is the row count.
Y[Y['labels'] == 'train'].shape