In [1]:
from pathlib import Path
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [2]:
# loading data

path = Path("hw5_data")
df = pd.DataFrame(columns=["activity", "accelerometer"])

for dir in path.iterdir():
    if not dir.is_dir():
        continue
    for file in dir.iterdir():
        if not file.suffix.lower() == ".csv":
            continue
        sample = pd.read_csv(file)
        df.loc[len(df.index)] = [dir.name, sample]

df.shape

(6462, 2)

In [3]:
# Спробуємо використати в якості вхідних параметрів "сирі" дані з акселерометрів.
# Для цього підготуємо датасет: перенесемо дані по всім осям для кожного семпла в колонки:
df_flatten = df.copy()

for i in range(df_flatten.shape[0]):
    sample = df_flatten.loc[i, "accelerometer"]
    sample = sample.stack().reset_index()
    sample["index"] = sample["level_1"] + "_" + sample["level_0"].astype(str)
    sample = sample.drop(["level_0", "level_1"], axis=1).set_index("index").T
    df_flatten.loc[i, sample.columns] = sample.iloc[0, :]

df_flatten


Unnamed: 0,activity,accelerometer,accelerometer_X_0,accelerometer_Y_0,accelerometer_Z_0,accelerometer_X_1,accelerometer_Y_1,accelerometer_Z_1,accelerometer_X_2,accelerometer_Y_2,...,accelerometer_Z_26,accelerometer_X_27,accelerometer_Y_27,accelerometer_Z_27,accelerometer_X_28,accelerometer_Y_28,accelerometer_Z_28,accelerometer_X_29,accelerometer_Y_29,accelerometer_Z_29
0,running,accelerometer_X accelerometer_Y accelero...,-3.682282,4.831499,2.351106,20.719421,5.635951,8.398860,3.960010,-11.482592,...,28.304253,0.565032,-6.847417,-3.438074,3.888184,12.502522,3.083732,-6.052541,6.023812,-1.489193
1,running,accelerometer_X accelerometer_Y accelero...,12.579136,7.723695,-2.169147,-7.244854,4.917691,-4.371812,2.121263,-15.054740,...,9.255983,37.708675,21.485565,6.545748,16.266207,-1.675941,10.979809,-6.895301,7.316681,-1.714248
2,running,accelerometer_X accelerometer_Y accelero...,7.335834,-14.087482,0.411803,5.358224,15.289372,2.662352,34.323277,31.704020,...,5.008670,22.989124,28.572401,7.120356,0.809240,0.679953,-7.029376,-1.297657,6.488287,-1.915361
3,running,accelerometer_X accelerometer_Y accelero...,-8.130708,8.331822,-4.410119,6.952762,-4.922478,-9.054871,5.655105,10.730812,...,3.744532,6.612785,19.019539,17.022774,-6.191406,6.919243,4.060566,6.852206,-8.939949,-0.205901
4,running,accelerometer_X accelerometer_Y accelero...,0.857124,-12.387600,-0.641646,8.638280,20.470425,2.087744,-3.327940,7.240066,...,-5.453991,11.291056,15.964537,6.066907,11.836933,22.610842,0.181959,3.524265,-7.996633,-2.485181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6457,walking,accelerometer_X accelerometer_Y accelero...,-2.351106,-7.972691,-2.762909,0.866701,-11.913548,-3.356671,-2.983175,-3.859453,...,-2.030283,-3.083732,-2.513912,1.106121,-0.837971,-8.101978,-0.033519,1.182736,-6.995857,0.665588
6458,walking,accelerometer_X accelerometer_Y accelero...,-6.057330,-10.098742,-7.335834,-6.890512,-12.770672,-1.690306,-6.464344,-10.462662,...,0.167594,-2.614468,-2.652775,3.897761,-1.321599,-8.729259,-1.522712,-4.707000,-10.563218,-1.762132
6459,walking,accelerometer_X accelerometer_Y accelero...,-9.054871,-23.616405,-17.022774,-2.264915,-12.114660,-0.967257,-18.598158,-16.438590,...,0.924162,-1.340753,-8.283937,1.173159,0.339977,-2.916138,3.428497,-2.863465,-8.676587,0.450110
6460,walking,accelerometer_X accelerometer_Y accelero...,-7.417237,-20.748152,-3.294421,4.917691,-11.267113,-1.388637,-8.724471,-23.678656,...,5.966351,0.981623,-8.935161,-1.762132,-0.900220,-16.060305,5.224149,15.169662,-11.453861,38.426937


In [4]:
# Навчимо класифікатори SVC та Random Forest

X = df_flatten.iloc[:, 2:-1]
y = df_flatten["activity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model_svm = SVC()
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

print("SVC")
print(classification_report(y_test, y_pred))


model_forest = RandomForestClassifier()
model_forest.fit(X_train, y_train)
y_pred = model_forest.predict(X_test)

print("Random Forest")
print(classification_report(y_test, y_pred))

# Висновок: навіть використовуючи "сирі" дані з акселерометрів можна отримати результати:
# Вдалось вдало класифікувати дані для діяльностей: idle та running, досить непогано walking і
# незадовільно stairs. Причому обидва класифікатора забезпечили непогану
# precision (SVC та Random Forest: 0.89 та 1.00 відповідно), але дуже поганий
# recall (SVC та Random Forest: 0.15 та 0.04 відповідно)


SVC
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       312
     running       1.00      1.00      1.00      1038
      stairs       0.89      0.15      0.26        52
     walking       0.92      1.00      0.96       537

    accuracy                           0.98      1939
   macro avg       0.95      0.79      0.80      1939
weighted avg       0.98      0.98      0.97      1939

Random Forest
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       312
     running       1.00      1.00      1.00      1038
      stairs       1.00      0.04      0.07        52
     walking       0.91      1.00      0.96       537

    accuracy                           0.97      1939
   macro avg       0.98      0.76      0.76      1939
weighted avg       0.98      0.97      0.96      1939



In [5]:
# Також погані результати для stairs можуть бути пов'язані з малою кількістю даних для активності.
# Порівняно з іншими діяльностями, кількість зразків для stairs на порядок менша і 
# модель не може натренуватись належним чином.

df["activity"].value_counts()

activity
running    3408
walking    1850
idle       1039
stairs      165
Name: count, dtype: int64

In [6]:
# Спробуємо покращити роботу алгоритмів виокремивши часові ознаки (time domain features).
# Часові ознаки визначаємо для кожної осі. Візьмемо наступні часові ознаки: 
# mean, variance, standard deviation, median, range maximum and minimum value, skewness,
# indexes of minimum and maximum value, kurtosis, interquartile range
# 
# Інші вказані в статті ознаки, такі як: root main square metric, signal magnitude area,
# power, energy, entropy, mean absolute deviation of signal, cross correlation of binary combinations of x, y, and z,
# не використовуються, т.я. хз що це таке і як це рахувати

def extract_features(sample: pd.DataFrame):
    output = pd.DataFrame()
    
    # mean
    output["X_mean"] = [sample["accelerometer_X"].mean()]
    output["Y_mean"] = [sample["accelerometer_Y"].mean()]
    output["Z_mean"] = [sample["accelerometer_Z"].mean()]

    # variance
    output["X_variance"] = [sample["accelerometer_X"].var()]
    output["Y_variance"] = [sample["accelerometer_Y"].var()]
    output["Z_variance"] = [sample["accelerometer_Z"].var()]

    # standard deviation
    output["X_std"] = [sample["accelerometer_X"].std()]
    output["Y_std"] = [sample["accelerometer_Y"].std()]
    output["Z_std"] = [sample["accelerometer_Z"].std()]

    # median
    output["X_median"] = [sample["accelerometer_X"].median()]
    output["Y_median"] = [sample["accelerometer_Y"].median()]
    output["Z_median"] = [sample["accelerometer_Z"].median()]

    # range
    output["X_range"] = [sample["accelerometer_X"].max() - sample["accelerometer_X"].min()]
    output["Y_range"] = [sample["accelerometer_Y"].max() - sample["accelerometer_Y"].min()]
    output["Z_range"] = [sample["accelerometer_Z"].max() - sample["accelerometer_Z"].min()]

    # skewness
    output["X_skew"] = [sample["accelerometer_X"].skew()]
    output["Y_skew"] = [sample["accelerometer_Y"].skew()]
    output["Z_skew"] = [sample["accelerometer_Z"].skew()]

    # indexes of minimum value
    output["X_min_index"] = [sample["accelerometer_X"].idxmin()]
    output["Y_min_index"] = [sample["accelerometer_Y"].idxmin()]
    output["Z_min_index"] = [sample["accelerometer_Z"].idxmin()]

    # indexes of maximum value
    output["X_max_index"] = [sample["accelerometer_X"].idxmax()]
    output["Y_max_index"] = [sample["accelerometer_Y"].idxmax()]
    output["Z_max_index"] = [sample["accelerometer_Z"].idxmax()]

    # kurtosis
    output["X_kurtosis"] = [sample["accelerometer_X"].kurtosis()]
    output["Y_kurtosis"] = [sample["accelerometer_Y"].kurtosis()]
    output["Z_kurtosis"] = [sample["accelerometer_Z"].kurtosis()]

    # interquartile range
    output["X_interquartile"] = [sample["accelerometer_X"].quantile(q=0.75) - sample["accelerometer_X"].quantile(q=0.25)]
    output["Y_interquartile"] = [sample["accelerometer_Y"].quantile(q=0.75) - sample["accelerometer_Y"].quantile(q=0.25)]
    output["Z_interquartile"] = [sample["accelerometer_Z"].quantile(q=0.75) - sample["accelerometer_Z"].quantile(q=0.25)]

    return output


df_features = df.copy()

for i in range(df_features.shape[0]):
    sample = extract_features(df_features.loc[i, "accelerometer"])
    df_features.loc[i, sample.columns] = sample.iloc[0, :]

df_features




Unnamed: 0,activity,accelerometer,X_mean,Y_mean,Z_mean,X_variance,Y_variance,Z_variance,X_std,Y_std,...,Z_min_index,X_max_index,Y_max_index,Z_max_index,X_kurtosis,Y_kurtosis,Z_kurtosis,X_interquartile,Y_interquartile,Z_interquartile
0,running,accelerometer_X accelerometer_Y accelero...,4.566381,7.436869,2.243846,64.119802,174.706312,52.711674,8.007484,13.217652,...,24.0,11.0,25.0,26.0,-0.064587,-0.877788,4.558716,9.507376,16.773777,7.260417
1,running,accelerometer_X accelerometer_Y accelero...,7.700072,7.544289,1.132458,131.551278,181.058807,72.094691,11.469581,13.455809,...,24.0,10.0,10.0,14.0,1.529297,-1.008418,3.019652,11.634623,22.565350,10.699687
2,running,accelerometer_X accelerometer_Y accelero...,5.446011,7.525455,1.171084,100.625040,224.935138,48.165712,10.031203,14.997838,...,8.0,2.0,20.0,20.0,1.126711,-0.836969,-0.071832,11.316194,18.555063,9.218874
3,running,accelerometer_X accelerometer_Y accelero...,4.053703,8.423440,0.551784,48.849347,139.110957,34.975436,6.989231,11.794531,...,3.0,23.0,3.0,27.0,0.951948,-0.722399,1.284777,8.726865,17.284939,6.191406
4,running,accelerometer_X accelerometer_Y accelero...,8.022172,6.138574,2.580630,61.575806,184.793547,44.752780,7.847025,13.593879,...,6.0,15.0,4.0,21.0,0.452715,-1.310561,0.243562,9.724049,24.672250,9.326613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6457,walking,accelerometer_X accelerometer_Y accelero...,-4.184266,-8.588799,0.147802,26.186655,44.632519,89.092913,5.117290,6.680757,...,18.0,5.0,5.0,16.0,-0.104649,1.322585,5.596353,7.231686,5.661090,3.750517
6458,walking,accelerometer_X accelerometer_Y accelero...,-3.155079,-9.280085,-0.679155,9.689139,26.494532,43.906227,3.112738,5.147284,...,18.0,23.0,23.0,23.0,-0.103315,2.755212,6.762499,3.685874,4.891355,3.299210
6459,walking,accelerometer_X accelerometer_Y accelero...,-3.983153,-9.760681,-1.529096,33.355324,35.010711,63.500625,5.775407,5.916985,...,2.0,25.0,22.0,11.0,0.795680,0.237591,2.149723,4.288015,7.229293,4.129997
6460,walking,accelerometer_X accelerometer_Y accelero...,0.149717,-11.431515,-0.154825,28.543598,60.824442,89.711594,5.342621,7.799003,...,9.0,29.0,18.0,29.0,1.000014,0.092546,9.404882,4.554969,8.883685,8.175002


In [8]:
# Навчимо класифікатори SVC та Random Forest

X = df_features.iloc[:, 2:-1]
y = df_features["activity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model_svm = SVC()
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

print("SVC")
print(classification_report(y_test, y_pred))


model_forest = RandomForestClassifier()
model_forest.fit(X_train, y_train)
y_pred = model_forest.predict(X_test)

print("Random Forest")
print(classification_report(y_test, y_pred))

# Висновки: використавши часові ознаки вдалось покращити роботу алгоритму Random Forest:
# для idle та running метрики precision, recall, f1-score залишились рівними 1.
# Для walking метрики precision та f1-score покращились до 0.99, recall = 1.
# Значно покращилася робота класифікатора для stairs. Не дивлячись на малу кількість зразків,
# досягнуто досить непоганих показників precision, recall, f1-score (1.00, 0.87, 0.93 відповідно). 
# 
# Покращення роботи алгоритму SVC не спостерігається


SVC
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       312
     running       1.00      1.00      1.00      1038
      stairs       0.00      0.00      0.00        52
     walking       0.91      1.00      0.95       537

    accuracy                           0.97      1939
   macro avg       0.73      0.75      0.74      1939
weighted avg       0.95      0.97      0.96      1939



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00       312
     running       1.00      1.00      1.00      1038
      stairs       1.00      0.87      0.93        52
     walking       0.99      1.00      0.99       537

    accuracy                           1.00      1939
   macro avg       1.00      0.97      0.98      1939
weighted avg       1.00      1.00      1.00      1939

