Approach
- Pick 5 users and get 100 records per each walking surface
- Enumerate records
- Each enumerated id is a partition and surfaces are classes

In [23]:
import pandas as pd

from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

In [2]:
data = pd.read_csv("sampled_data/data.csv")
data.describe()

Unnamed: 0,UserId,PacketCounter,SampleTimeFine,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,...,OriInc_q3,Roll,Pitch,Yaw,Latitude,Longitude,Altitude,Vel_X,Vel_Y,Vel_Z
count,1708236.0,1708236.0,0.0,1693804.0,1693804.0,1693804.0,1693804.0,1693804.0,1693804.0,1693804.0,...,1693804.0,1693804.0,1693804.0,1693804.0,0.0,0.0,1708236.0,0.0,0.0,0.0
mean,15.50002,32256.32,,9.838823,0.9067474,-0.07830717,-0.1514832,0.1432863,-0.1293384,0.04703301,...,-0.0006346451,38.95021,-70.9017,-0.6138306,,,0.0,,,
std,8.654662,19179.77,,149.0878,201.0112,89.39607,135.4799,136.3478,112.6015,2.040233,...,0.003301078,99.22891,9.823791,100.302,,,0.0,,,
min,1.0,0.0,,-137694.7,-185464.1,-90.34852,-124612.8,-79.1027,-103428.5,-195.9829,...,-0.07032,-180.0,-89.99649,-179.9997,,,0.0,,,
25%,8.0,15079.0,,7.076802,-0.8746855,-2.809921,-2.008377,-2.66811,-2.727015,-1.031616,...,-0.002157,-3.304743,-78.26868,-79.61625,,,0.0,,,
50%,16.0,32163.0,,9.597096,0.8244235,-0.7924415,-0.0408685,0.002297,-0.079477,0.0466475,...,-0.000237,29.74054,-71.64629,-32.91615,,,0.0,,,
75%,23.0,49211.0,,13.06224,2.87003,1.985747,1.995095,2.776245,2.630229,1.115163,...,0.001069,130.6391,-65.12706,82.25846,,,0.0,,,
max,30.0,65535.0,,119.4948,93.40336,85776.15,64.5567,125381.7,156.9266,200.027,...,0.050715,179.9999,73.46872,179.9999,,,0.0,,,


In [3]:
# Drop unneeded and NaN columns
data.dropna(axis=1, how="all", inplace=True)
data.drop(["PacketCounter", "Altitude", "SensorLoc"], axis=1, inplace=True)
data.head(5)

Unnamed: 0,UserId,Surface,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,Gyr_X,Gyr_Y,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,1,CALIB,9.49226,0.130429,-2.558672,-0.0,-0.0,0.019241,-0.025877,-0.028406,...,0.094926,0.001303,-0.025573,1.0,-0.000129,-0.000142,2.4e-05,177.082561,-74.903636,103.717809
1,1,CALIB,9.521406,0.118498,-2.408461,-0.046313,0.186881,0.007437,-0.002117,-0.030708,...,0.095218,0.001189,-0.02407,1.0,-1.1e-05,-0.000154,4.3e-05,177.231488,-74.656365,104.388684
2,1,CALIB,9.544091,0.187095,-2.20406,-0.019889,0.402747,-0.023921,-0.001215,-0.028015,...,0.095444,0.001874,-0.022027,1.0,-6e-06,-0.00014,3.6e-05,177.248779,-74.640533,104.370032
3,1,CALIB,9.573117,0.1629,-2.1681,-0.052578,0.442471,-0.005893,-0.000991,-0.038044,...,0.095735,0.00163,-0.021663,1.0,-5e-06,-0.00019,9e-06,177.257435,-74.619164,104.360623
4,1,CALIB,9.653494,0.251947,-2.118363,0.018708,0.531586,0.059392,-0.007314,-0.03384,...,0.096539,0.002515,-0.021167,1.0,-3.7e-05,-0.000169,-3.4e-05,177.242447,-74.59961,104.371823


In [4]:
data.dropna(how="any", inplace=True)

In [5]:
from sklearn import preprocessing

# Encode data
le = preprocessing.LabelEncoder()
data["Surface"] = le.fit_transform(data["Surface"])

In [6]:
# Pick 5 users
data_5 = data[data["UserId"] <= 5]

for user_id in data_5.UserId.unique():
    print(f"Surfaces of user {user_id}:", " ".join(map(str, data_5[data_5.UserId == user_id].Surface.unique())))

Surfaces of user 1: 2 6 4 7 0 1 5 9 3 8
Surfaces of user 2: 2 6 4 7 0 1 5 9 3 8
Surfaces of user 3: 2 6 4 7 0 1 5 9 3 8
Surfaces of user 4: 2 6 4 7 0 1 5 9 3 8
Surfaces of user 5: 2 6 4 7 0 1 5 3


Each user appears to have the same amount of surfaces, except for the last one\
Select a few surfaces and get 100 records per each user and surface pair

In [9]:
def get_data_sample(slice_start: int, slice_end: int, surfaces: list):
    datasets = []
    for user_id in data_5.UserId.unique():
        for surface in surfaces:
            tmp = data_5[(data_5.UserId == user_id) & (data_5.Surface == surface)].iloc[slice_start:slice_end]
            tmp.insert(0, "id", list(range(surface * 100, surface * 100 + len(tmp))))
            datasets.append(tmp)

    df = pd.concat(datasets).reset_index(drop=True)
    # Sort values by id so that measurements from different users per each surface type are grouped together
    df.sort_values(by=["id"], inplace=True)

    # Add time column
    df.insert(0, "Time", list(range(1, len(df) + 1)))

    return df


surfaces = [1, 3, 5]
data_ts = get_data_sample(0, 100, surfaces)
data_ts

Unnamed: 0,Time,id,UserId,Surface,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,1,100,1,1,9.655841,-0.802532,-2.329289,0.000000,0.000000,0.152498,...,0.096563,-0.008009,-0.023281,1.000000,-0.000458,-0.000087,0.000278,-161.015603,-75.696432,106.508134
300,2,100,2,1,9.456831,-0.693162,-0.868848,-0.000000,-0.000000,-0.290765,...,0.094564,-0.006954,-0.008722,1.000000,0.000140,0.000343,-0.000248,-141.435076,-83.272463,85.065291
900,3,100,4,1,9.660328,0.825617,-1.808154,-0.043403,-0.090206,0.049512,...,0.096605,0.008266,-0.018066,1.000000,-0.000129,-0.000177,0.000128,152.699818,-78.243198,167.945963
600,4,100,3,1,9.562259,0.458874,-2.338050,-0.000000,0.000000,0.041948,...,0.095625,0.004577,-0.023374,1.000000,0.000070,-0.000064,-0.000145,168.921902,-76.013552,131.784646
1200,5,100,5,1,9.001673,3.554755,-1.284438,-0.000000,0.000000,-0.049687,...,0.090007,0.035580,-0.012824,1.000000,-0.000702,-0.000500,0.000456,109.821241,-67.208230,-159.711687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,1496,599,4,5,10.639774,1.445141,-1.199068,-2.039918,2.974678,0.371661,...,0.106408,0.014484,-0.011859,0.999991,0.004222,-0.000659,-0.000168,-178.922481,-65.709882,148.398690
599,1497,599,2,5,5.464766,-4.541182,8.435724,-4.742301,0.565656,0.129030,...,0.054433,-0.045629,0.084379,0.999991,0.003074,-0.002960,0.000780,-7.093975,-49.988155,-52.934872
299,1498,599,1,5,4.969698,0.495670,3.027671,0.286370,0.591673,-4.009424,...,0.049865,0.005071,0.029980,0.999974,-0.004649,0.005492,-0.000505,-2.219655,-61.094297,-49.491748
899,1499,599,3,5,5.809286,-3.256449,3.884960,-2.585187,-0.797670,-2.592934,...,0.058415,-0.032427,0.038478,0.999977,0.000115,0.006314,0.002428,-9.049496,-58.352218,-57.773940


In [10]:
# Map each id to a corresponding surface type
# NOTE: id is a 3 digit number and the hundreds digit is a surface code
classes = pd.Series(data_ts.id.unique() // 100, data_ts.id.unique())

In [11]:
features = extract_features(
    data_ts.drop(["UserId", "Surface"], axis=1),
    column_id="id",
    column_sort="Time",
    impute_function=impute
)

features

Feature Extraction: 100%|██████████| 30/30 [00:56<00:00,  1.87s/it]


Unnamed: 0,Acc_X__variance_larger_than_standard_deviation,Acc_X__has_duplicate_max,Acc_X__has_duplicate_min,Acc_X__has_duplicate,Acc_X__sum_values,Acc_X__abs_energy,Acc_X__mean_abs_change,Acc_X__mean_change,Acc_X__mean_second_derivative_central,Acc_X__median,...,Yaw__permutation_entropy__dimension_6__tau_1,Yaw__permutation_entropy__dimension_7__tau_1,Yaw__query_similarity_count__query_None__threshold_0.0,"Yaw__matrix_profile__feature_""min""__threshold_0.98","Yaw__matrix_profile__feature_""max""__threshold_0.98","Yaw__matrix_profile__feature_""mean""__threshold_0.98","Yaw__matrix_profile__feature_""median""__threshold_0.98","Yaw__matrix_profile__feature_""25""__threshold_0.98","Yaw__matrix_profile__feature_""75""__threshold_0.98",Yaw__mean_n_absolute_max__number_of_maxima_7
100,0.0,0.0,0.0,0.0,47.336932,448.455769,0.265291,-0.163542,-0.060263,9.562259,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,0.0,0.0,0.0,0.0,47.540364,452.385290,0.344005,0.018880,0.093837,9.638948,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,0.0,0.0,0.0,0.0,47.612158,453.812420,0.348937,-0.077748,0.115482,9.626419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,0.0,0.0,0.0,0.0,47.821013,457.915780,0.374132,-0.131939,-0.138688,9.658233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,0.0,0.0,0.0,0.0,47.345462,448.598057,0.154907,0.154907,-0.087512,9.584026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,1.0,0.0,0.0,0.0,34.370459,273.318774,3.879417,-0.281672,0.423266,7.039631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,1.0,0.0,0.0,0.0,34.904235,273.849359,2.622464,-1.883641,0.072662,6.681120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,1.0,0.0,0.0,0.0,35.274402,273.114884,2.199727,1.093637,0.779130,6.294757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,1.0,0.0,0.0,0.0,35.222331,269.955110,3.702765,0.167779,-1.367381,6.042614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(features, classes, test_size=.2)

In [13]:
relevant_features = set()

for label in classes.unique():
    targets_binary = y_train == label
    print(len(targets_binary))
    features_filtered = select_features(X_train, targets_binary)
    print("Number of relevant features for class {}: {}/{}".format(label, features_filtered.shape[1], features.shape[1]))
    relevant_features = relevant_features.union(set(features_filtered.columns))

240
Number of relevant features for class 1: 635/17358
240
Number of relevant features for class 3: 647/17358
240
Number of relevant features for class 5: 623/17358


### Let's train a model

In [14]:
print("Amount of relevant features:", len(relevant_features))

X_train_filtered = X_train[list(relevant_features)]
X_test_filtered = X_test[list(relevant_features)]

Amount of relevant features: 1173


In [15]:
classifier_selected = DecisionTreeClassifier()
classifier_selected.fit(X_train_filtered, y_train)

print(classification_report(y_test, classifier_selected.predict(X_test_filtered)))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        22
           3       1.00      0.94      0.97        16
           5       0.96      1.00      0.98        22

    accuracy                           0.98        60
   macro avg       0.99      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60



In [16]:

classifier_full = DecisionTreeClassifier()
classifier_full.fit(X_train, y_train)
print(classification_report(y_test, classifier_full.predict(X_test)))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        22
           3       1.00      1.00      1.00        16
           5       1.00      1.00      1.00        22

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60



#### Precision of 100% is absolutely unrealistic, there seems to be no variance between train and test data
#### Let's check how our model performs with unseen data

In [17]:
surfaces = [1, 3, 5]
data_ts = get_data_sample(800, 900, surfaces)
data_ts

Unnamed: 0,Time,id,UserId,Surface,Acc_X,Acc_Y,Acc_Z,FreeAcc_X,FreeAcc_Y,FreeAcc_Z,...,VelInc_X,VelInc_Y,VelInc_Z,OriInc_q0,OriInc_q1,OriInc_q2,OriInc_q3,Roll,Pitch,Yaw
0,1,100,1,1,-0.803406,-2.071275,-4.595924,-0.086164,-4.248959,-12.640564,...,-0.007875,-0.020889,-0.045907,0.999988,-0.003719,-0.003148,0.000716,-4.548070,-61.554669,-56.582956
300,2,100,2,1,15.591149,-5.502933,16.183249,-8.069631,4.284502,11.440798,...,0.153215,-0.058827,0.163044,0.999687,0.019510,-0.015145,-0.004049,-4.317328,-65.302512,-53.251243
900,3,100,4,1,17.531020,-1.686792,-2.906607,-2.125449,-1.790845,7.818901,...,0.174689,-0.018760,-0.031487,0.999803,-0.009551,0.014830,-0.009142,164.437101,-77.320976,169.137469
600,4,100,3,1,12.599671,2.543136,1.036400,-3.080940,5.858032,1.254564,...,0.125944,0.025522,0.010776,0.999994,0.001005,-0.003065,0.000802,171.895120,-64.121287,141.336521
1200,5,100,5,1,4.857637,4.854209,-0.980034,3.807336,-1.366017,-4.177562,...,0.048840,0.048476,-0.008738,0.999852,0.015398,-0.006465,-0.004222,48.271501,-59.427731,-110.335825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,1496,599,4,5,15.744431,2.118684,2.694547,-7.423905,6.599238,2.874682,...,0.157621,0.020970,0.026065,0.999980,0.002147,0.005872,-0.001020,-171.816081,-63.367240,148.353598
599,1497,599,2,5,32.464102,0.155519,11.478311,1.164892,1.304030,24.575977,...,0.325328,0.005015,0.112696,0.999905,-0.009891,0.006301,0.007192,-9.522515,-70.621780,-46.588363
299,1498,599,1,5,2.984693,-0.124149,-0.032814,0.807783,-1.141641,-7.172849,...,0.029848,-0.001257,-0.000039,0.999937,-0.006118,-0.009447,-0.000479,-8.619569,-63.015345,-49.375057
899,1499,599,3,5,-1.118383,0.798236,-8.602852,4.747116,-5.946602,-14.054461,...,-0.011857,0.006676,-0.086046,0.999849,-0.015390,0.007947,-0.001538,-1.156497,-67.705410,-57.096309


In [18]:
features = extract_features(
    data_ts.drop(["UserId", "Surface"], axis=1),
    column_id="id",
    column_sort="Time",
    impute_function=impute
)
features

Feature Extraction: 100%|██████████| 30/30 [01:06<00:00,  2.20s/it]


Unnamed: 0,Acc_X__variance_larger_than_standard_deviation,Acc_X__has_duplicate_max,Acc_X__has_duplicate_min,Acc_X__has_duplicate,Acc_X__sum_values,Acc_X__abs_energy,Acc_X__mean_abs_change,Acc_X__mean_change,Acc_X__mean_second_derivative_central,Acc_X__median,...,Yaw__permutation_entropy__dimension_6__tau_1,Yaw__permutation_entropy__dimension_7__tau_1,Yaw__query_similarity_count__query_None__threshold_0.0,"Yaw__matrix_profile__feature_""min""__threshold_0.98","Yaw__matrix_profile__feature_""max""__threshold_0.98","Yaw__matrix_profile__feature_""mean""__threshold_0.98","Yaw__matrix_profile__feature_""median""__threshold_0.98","Yaw__matrix_profile__feature_""25""__threshold_0.98","Yaw__matrix_profile__feature_""75""__threshold_0.98",Yaw__mean_n_absolute_max__number_of_maxima_7
100,1.0,0.0,0.0,0.0,49.776071,733.414397,7.751952,1.415261,-4.022765,12.599671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,1.0,0.0,0.0,0.0,64.848807,1113.722007,13.385055,1.415403,3.570169,13.353338,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,1.0,0.0,0.0,0.0,87.340741,1959.501231,10.949699,2.938197,-4.370324,19.453159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,1.0,0.0,0.0,0.0,97.610107,2428.813743,17.033417,4.635230,3.836674,18.942164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,1.0,0.0,0.0,0.0,77.164842,1324.090569,9.388101,-0.507681,2.830072,17.454538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,1.0,0.0,0.0,0.0,30.922496,323.821926,5.382637,-3.105492,-1.113546,6.024110,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,1.0,0.0,0.0,0.0,38.203487,487.782995,12.658515,0.403449,4.263833,12.003591,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,1.0,0.0,0.0,0.0,49.444702,894.629867,15.458310,3.255663,-4.106964,12.726877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,1.0,0.0,0.0,0.0,63.484013,1567.842603,15.868965,-8.135335,0.939882,14.505172,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Full feature set
print(classification_report(classes, classifier_full.predict(features)))

              precision    recall  f1-score   support

           1       0.39      0.79      0.52       100
           3       0.00      0.00      0.00       100
           5       0.61      0.58      0.59       100

    accuracy                           0.46       300
   macro avg       0.33      0.46      0.37       300
weighted avg       0.33      0.46      0.37       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Filtered feature set
X_test_filtered = features[list(relevant_features)]
print(classification_report(classes, classifier_selected.predict(X_test_filtered)))

              precision    recall  f1-score   support

           1       0.42      0.95      0.58       100
           3       1.00      0.06      0.11       100
           5       0.92      0.61      0.73       100

    accuracy                           0.54       300
   macro avg       0.78      0.54      0.48       300
weighted avg       0.78      0.54      0.48       300



In [31]:
rdf_full = RandomForestClassifier()
rdf_full.fit(X_train, y_train)
print(classification_report(y_test, rdf_full.predict(X_test)))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        22
           3       1.00      1.00      1.00        16
           5       1.00      1.00      1.00        22

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60



In [29]:
# classifier_selected = DecisionTreeClassifier()
# classifier_selected.fit(X_train_filtered, y_train)

# print(classification_report(y_test, classifier_selected.predict(X_test_filtered)))
rdf_selected = RandomForestClassifier()
rdf_selected.fit(X_train_filtered, y_train)
print(classification_report(classes, rdf_selected.predict(X_test_filtered)))

              precision    recall  f1-score   support

           1       0.50      1.00      0.67       100
           3       0.00      0.00      0.00       100
           5       1.00      1.00      1.00       100

    accuracy                           0.67       300
   macro avg       0.50      0.67      0.56       300
weighted avg       0.50      0.67      0.56       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Result: Decision Tree classifier performance on unseen data is quite poor