In [13]:
import pandas as pd
import numpy as np
from scipy.fftpack import fft
from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
from matplotlib.patches import Patch



In [8]:
rng = np.random.RandomState(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
cmap_group = plt.cm.Paired
cmap_y = plt.cm.coolwarm

def visualize_groups(classes, groups):
    # Visualize dataset groups
    fig, ax = plt.subplots(dpi=200) 
    ax.scatter(
        range(len(groups)),
        [0.5] * len(groups),
        c=groups,
        marker="_",
        lw=50,
        cmap=cmap_data,
    )
    ax.scatter(
        range(len(groups)),
        [3.5] * len(groups),
        c=classes,
        marker="_",
        lw=50,
        cmap=cmap_data,
    )
    ax.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
    )


def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Plot the data classes and groups at the end
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data 
    )

    ax.scatter(
        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data 
    )



    # Formatting
    yticklabels = list(range(n_splits)) + ["class", "group"]
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        xlim=[0, len(X)],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)
    return ax

In [9]:
def create_segments(data, target_x, target_y, target_group,  window_size=80, step_size=40):
    segments = []
    labels = []
    group = []

    for i in range(0, len(data) - window_size, step_size):
        segments_data = []
        for x in target_x:
            x_values = data[x].values[i:i + window_size]
            segments_data.append(x_values)
        label = data[target_y].values[i]
        member = data[target_group].values[i]

        segments.append(segments_data)
        labels.append(label)
        group.append(member)
    return segments, labels, group


def create_label_segments(data, target, window_size=80, step_size=40):
    ys = []
    for i in range(0, len(data) - window_size, step_size):
        segments_data = []
        y = data[target].values[i]
        y.append(y)
    return ys


def extract_time_features(segments):
    features = []
    for segment in segments:
        segment_features = []
        for axis in segment:
            mean_axis = np.mean(axis)
            std_axis = np.std(axis)
            segment_features.extend([mean_axis, std_axis])
        features.append(segment_features)
    return np.array(features)

def extract_frequency_features(segments, sampling_rate=100):
    features = []
    for segment in segments:
        segment_features = []
        for axis in segment:
            fft_axis = np.abs(fft(axis))

            # Peak frequency and maximum amplitude
            peak_indices, _ = find_peaks(fft_axis)

            if len(peak_indices) > 0:
                peak_freq = peak_indices[np.argmax(fft_axis[peak_indices])] / len(axis) * sampling_rate
                max_amplitude = np.max(fft_axis[peak_indices])
            else:
                peak_freq = 0.0  # デフォルトの値を設定
                max_amplitude = 0.0  # デフォルトの値を設定

            # Signal energy
            energy = np.sum(axis ** 2) / len(axis)

            segment_features.extend([peak_freq, max_amplitude, energy])

        features.append(segment_features)

    return np.array(features)

In [10]:
def show_graph(df, target_name):
    label_counts = df[target_name].value_counts()

    # 棒グラフで表示
    label_counts.plot(kind='bar')
    plt.ylabel('Count')
    plt.xlabel('Label')
    plt.title('Number of data points per label')
    plt.show()

In [None]:
#トイデータの読み込み
data = pd.read_csv('02_shortstick_toy_dataset.csv', index_col=0)

# 欠損値の除去
data.dropna(axis=0, how="any", inplace=True)


# セグメントを用意し、時間領域の特徴量と周波数領域の特徴量を抽出
window_size = 80
step_size = 40
target_x = ['Quaternion.X', 'Quaternion.Y', 'Quaternion.Z']
target_y = "action" 
terget_group = "user_id"
target_names = list(data[target_y].unique())

# ラベルエンコーダーのインスタンス化
le0 = LabelEncoder()
data["action"] = le0.fit_transform(data["action"])

le1 = LabelEncoder()
data["user_id"] = le1.fit_transform(data["user_id"])

segments, labels, groups = create_segments(data, target_x, target_y, terget_group, window_size, step_size)

#特徴量抽出
time_features = extract_time_features(segments)
frequency_features = extract_frequency_features(segments)

# 時間領域の特徴量と周波数領域の特徴量を統合
combined_features = np.concatenate((time_features, frequency_features), axis=1)

# データセットの分割
X = combined_features
y = np.asarray(labels, dtype=np.float32)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# パラメータのグリッドを定義
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40, 50]
}

# グリッドサーチの初期化
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)

# グリッドサーチの実行
grid_search.fit(X_train, y_train)

# 最適なパラメータの表示
print(f"Best parameters: {grid_search.best_params_}")

# 最適なパラメータで学習したモデルの取得
best_clf = grid_search.best_estimator_

Best parameters: {'max_depth': None, 'n_estimators': 300}


In [16]:
# LeaveOneGroupOutのインスタンスを作成
logo = LeaveOneGroupOut()

all_y_true = []
all_y_pred = []

# 分割数、ここでは5分割
for train_index, test_index in logo.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    

    # モデルの学習
    best_clf.fit(X_train, y_train)
    
    # 予測
    y_pred = best_clf.predict(X_test)

    # 保存
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)
    
    # 予測結果の評価（ここでは正解率）
    print(accuracy_score(y_test, y_pred))


all_y_true = le0.inverse_transform(np.array(all_y_true).astype(np.int32))
all_y_pred = le0.inverse_transform(np.array(all_y_pred).astype(np.int32))

# 全ユーザの結果に基づく混同行列と分類レポートを表示
print("Confusion Matrix:")
print(confusion_matrix(all_y_true, all_y_pred, labels=target_names))

print("\nClassification Report:")
print(classification_report(all_y_true, all_y_pred, labels=target_names))


0.7574791564492398
0.8233838540055676
0.7697197341808726
0.7032238626092197
0.7548953409858203
Confusion Matrix:
[[3362   35   91   19  296   26   18  192]
 [ 101 1322   24  102  234    3    2   60]
 [ 246   43 1792  257   48  138   19   18]
 [  15  110  176 2679    7  134   21   12]
 [ 241  156   12    0 1185   11    0   31]
 [ 116   47  167  219   57 1809  166   13]
 [  32    0   56   33    0  181 1147    4]
 [ 265   83   10    7   69    8    0  807]]

Classification Report:
              precision    recall  f1-score   support

       exe-a       0.77      0.83      0.80      4039
       exe-b       0.74      0.72      0.73      1848
       exe-c       0.77      0.70      0.73      2561
       exe-d       0.81      0.85      0.83      3154
       exe-e       0.62      0.72      0.67      1636
       exe-f       0.78      0.70      0.74      2594
       exe-g       0.84      0.79      0.81      1453
       exe-h       0.71      0.65      0.68      1249

    accuracy                  