In [4]:
import pandas as pd
import numpy as np
from scipy.fftpack import fft
from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# WISDMデータセットのダウンロード
!wget "http://www.cis.fordham.edu/wisdm/includes/datasets/latest/WISDM_ar_latest.tar.gz"

In [15]:
# 圧縮ファイルの解凍
!tar -xzf "WISDM_ar_latest.tar.gz"

In [17]:
# データの読み込み
column_names = ["user", "activity", "timestamp", "x-axis", "y-axis", "z-axis"]

file = open('WISDM_ar_v1.1/WISDM_ar_v1.1_raw.txt')
lines = file.readlines()

processedList = []

for i, line in enumerate(lines):
    try:
        line = line.split(',')
        last = line[5].split(';')[0]
        last = last.strip()
        if last == '':
            break;
        temp = [line[0], line[1], line[2], line[3], line[4], last]
        processedList.append(temp)
    except:
        pass

In [7]:
data = pd.DataFrame(data = processedList, columns = column_names )
data['x-axis'] = data['x-axis'].astype(float)
data['y-axis'] = data['y-axis'].astype(float)
data['z-axis'] = data['z-axis'].astype(float)
data.head()

Unnamed: 0,user,activity,timestamp,x-axis,y-axis,z-axis
0,33,Jogging,49105962326000,-0.694638,12.680544,0.503953
1,33,Jogging,49106062271000,5.012288,11.264028,0.953424
2,33,Jogging,49106112167000,4.903325,10.882658,-0.081722
3,33,Jogging,49106222305000,-0.612916,18.496431,3.023717
4,33,Jogging,49106332290000,-1.18497,12.108489,7.205164


In [8]:
# 欠損値の除去
data.dropna(axis=0, how="any", inplace=True)

# タイムスタンプの除去（この例では使用しないため）
data.drop(["timestamp"], axis=1, inplace=True)

# ラベルエンコーダーのインスタンス化
le = LabelEncoder()
data["activity"] = le.fit_transform(data["activity"])

In [9]:
# 時系列データを固定長のセグメントに分割する関数を定義
def create_segments(data, window_size=80, step_size=40):
    segments = []
    labels = []
    for i in range(0, len(data) - window_size, step_size):
        xs = data["x-axis"].values[i:i + window_size]
        ys = data["y-axis"].values[i:i + window_size]
        zs = data["z-axis"].values[i:i + window_size]
        label = data["activity"].values[i]
        segments.append([xs, ys, zs])
        labels.append(label)
    return segments, labels

# 時間領域の特徴量抽出関数を定義
def extract_time_features(segments):
    features = []
    for segment in segments:
        xs, ys, zs = segment
        mean_xs = np.mean(xs)
        mean_ys = np.mean(ys)
        mean_zs = np.mean(zs)
        std_xs = np.std(xs)
        std_ys = np.std(ys)
        std_zs = np.std(zs)
        features.append([mean_xs, mean_ys, mean_zs, std_xs, std_ys, std_zs])
    return np.array(features)
    
# 周波数領域の特徴量抽出関数を定義
def extract_frequency_features(segments, sampling_rate=50):
    features = []
    for segment in segments:
        xs, ys, zs = segment
        fft_xs = np.abs(fft(xs))
        fft_ys = np.abs(fft(ys))
        fft_zs = np.abs(fft(zs))

       # Peak frequency and maximum amplitude
        peak_indices_x, _ = find_peaks(fft_xs)
        peak_indices_y, _ = find_peaks(fft_ys)
        peak_indices_z, _ = find_peaks(fft_zs)
        peak_freq_x = peak_indices_x[np.argmax(fft_xs[peak_indices_x])] / len(xs) * sampling_rate
        peak_freq_y = peak_indices_y[np.argmax(fft_ys[peak_indices_y])] / len(ys) * sampling_rate
        peak_freq_z = peak_indices_z[np.argmax(fft_zs[peak_indices_z])] / len(zs) * sampling_rate
        max_amplitude_x = np.max(fft_xs[peak_indices_x])
        max_amplitude_y = np.max(fft_ys[peak_indices_y])
        max_amplitude_z = np.max(fft_zs[peak_indices_z])

        # Signal energy
        energy_x = np.sum(xs ** 2) / len(xs)
        energy_y = np.sum(ys ** 2) / len(ys)
        energy_z = np.sum(zs ** 2) / len(zs)

        features.append([peak_freq_x, peak_freq_y, peak_freq_z, max_amplitude_x, max_amplitude_y, max_amplitude_z,
                         energy_x, energy_y, energy_z])


    return np.array(features)



In [10]:
# セグメントを用意し、時間領域の特徴量と周波数領域の特徴量を抽出
window_size = 80
step_size = 40
segments, labels = create_segments(data, window_size, step_size)

time_features = extract_time_features(segments)
frequency_features = extract_frequency_features(segments)

# 時間領域の特徴量と周波数領域の特徴量を統合
combined_features = np.concatenate((time_features, frequency_features), axis=1)

# データセットの分割
X = combined_features
y = np.asarray(labels, dtype=np.float32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# データの正規化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# ランダムフォレスト分類器をインスタンス化
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# モデルの学習
clf.fit(X_train_scaled, y_train)

In [12]:
# 予測の実行
y_pred = clf.predict(X_test_scaled)

# 評価指標の計算
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

Confusion Matrix:
[[ 198    7    0    0   18   11]
 [   4  961    0    0    3    3]
 [   0    0   29    0    1    0]
 [   0    1    0   32    0    0]
 [  19    7    0    0  229   11]
 [   7    6    0    0    3 1026]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.85      0.86       234
         1.0       0.98      0.99      0.98       971
         2.0       1.00      0.97      0.98        30
         3.0       1.00      0.97      0.98        33
         4.0       0.90      0.86      0.88       266
         5.0       0.98      0.98      0.98      1042

    accuracy                           0.96      2576
   macro avg       0.95      0.94      0.95      2576
weighted avg       0.96      0.96      0.96      2576

Accuracy Score:
0.9607919254658385
