In [3]:
import pandas as pd
import numpy as np
from scipy.fftpack import fft
from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
from matplotlib.patches import Patch



In [4]:
# Seeded NumPy random state (not referenced by the cells visible here).
rng = np.random.RandomState(seed=1338)

# Colormaps shared by the plotting helpers below: "Paired" colours the
# class/group rows, "coolwarm" colours the train/test membership of a split.
cmap_data = cmap_group = plt.cm.Paired
cmap_cv = cmap_y = plt.cm.coolwarm

def visualize_groups(classes, groups):
    """Draw the group and class membership of every sample as two colour bands.

    A new 200-dpi figure is created. The lower band (y = 0.5) is coloured by
    ``groups`` and the upper band (y = 3.5) by ``classes``; both use the
    module-level ``cmap_data`` colormap. Nothing is returned.

    Parameters
    ----------
    classes : sequence
        Per-sample class values, used as scatter colours.
    groups : sequence
        Per-sample group values, used as scatter colours. Its length also
        determines how many sample positions are drawn.
    """
    n_samples = len(groups)
    _, axis = plt.subplots(dpi=200)

    # Same marker style for both bands; only the height and colours differ.
    for height, colours in ((0.5, groups), (3.5, classes)):
        axis.scatter(
            range(n_samples),
            [height] * n_samples,
            c=colours,
            marker="_",
            lw=50,
            cmap=cmap_data,
        )

    axis.set(
        ylim=[-1, 5],
        yticks=[0.5, 3.5],
        yticklabels=["Data\ngroup", "Data\nclass"],
        xlabel="Sample index",
    )


def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Plot the train/test assignment of every sample for each split of ``cv``.

    Parameters
    ----------
    cv : object
        Cross-validator exposing ``split(X=..., y=..., groups=...)``.
    X, y, group : sequences
        Data, class values and group values; ``y`` and ``group`` are also
        drawn as two extra rows below the last split.
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of split rows used for the tick labels and the y-limits.
    lw : int, default 10
        Line width of the ``"_"`` markers.

    Returns
    -------
    The ``ax`` that was passed in.
    """
    n_samples = len(X)
    positions = range(n_samples)

    # One row per split: 0 = training sample, 1 = test sample, NaN = sample
    # that appears in neither index set.
    for split_no, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y, groups=group)):
        membership = np.full(n_samples, np.nan)
        membership[test_idx] = 1
        membership[train_idx] = 0

        ax.scatter(
            positions,
            [split_no + 0.5] * n_samples,
            c=membership,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Class row first, then group row, placed right after the last split row.
    for offset, colours in ((1.5, y), (2.5, group)):
        ax.scatter(
            positions, [split_no + offset] * n_samples, c=colours, marker="_", lw=lw, cmap=cmap_data
        )

    # Axis cosmetics; the y-axis is inverted so that split 0 is at the top.
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=list(range(n_splits)) + ["class", "group"],
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        xlim=[0, n_samples],
    )
    ax.set_title(type(cv).__name__, fontsize=15)
    return ax

In [5]:
def _window_starts(n_rows, window_size, step_size):
    """Return the start offsets of every complete window over ``n_rows`` rows."""
    # "+ 1" keeps the window that ends exactly on the last row; without it the
    # final complete window is dropped whenever (n_rows - window_size) is a
    # multiple of step_size (e.g. n_rows == window_size gave no window at all).
    return range(0, n_rows - window_size + 1, step_size)


def create_segments(data, target_x, target_y, target_group, window_size=80, step_size=40):
    """Cut ``data`` into fixed-length sliding windows.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; rows are taken in their existing order.
    target_x : sequence of str
        Columns whose values are windowed (one array per column and window).
    target_y : str
        Label column. The label of a window is the value on its FIRST row.
    target_group : str
        Group column. The group of a window is the value on its FIRST row.
    window_size : int, default 80
        Number of rows per window.
    step_size : int, default 40
        Row offset between the starts of consecutive windows.

    Returns
    -------
    segments : list
        One entry per window; each entry is a list with one array of
        ``window_size`` values per column in ``target_x``.
    labels : list
        Label of each window.
    group : list
        Group of each window.

    Notes
    -----
    Only complete windows are produced; fewer than ``window_size`` rows give
    three empty lists.
    """
    # Look the columns up once instead of once per window.
    channel_values = [data[column].values for column in target_x]
    label_values = data[target_y].values
    group_values = data[target_group].values

    segments = []
    labels = []
    group = []
    for start in _window_starts(len(data), window_size, step_size):
        stop = start + window_size
        segments.append([values[start:stop] for values in channel_values])
        labels.append(label_values[start])
        group.append(group_values[start])
    return segments, labels, group


def create_label_segments(data, target, window_size=80, step_size=40):
    """Return the value of column ``target`` on the first row of every window.

    Uses the same window grid as ``create_segments`` so that the result lines
    up one-to-one with its ``segments`` output.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table.
    target : str
        Column to sample.
    window_size : int, default 80
        Number of rows per window.
    step_size : int, default 40
        Row offset between the starts of consecutive windows.

    Returns
    -------
    list
        One value per complete window.
    """
    target_values = data[target].values
    # The original appended each value to itself (``y.append(y)``) instead of
    # to the result list, which raised AttributeError on the first window.
    return [target_values[start] for start in _window_starts(len(data), window_size, step_size)]


def extract_time_features(segments):
    """Compute the mean and standard deviation of every channel of every segment.

    Parameters
    ----------
    segments : iterable
        Each segment is an iterable of channels (1-D value sequences).

    Returns
    -------
    numpy.ndarray
        One row per segment laid out as
        ``[mean_ch0, std_ch0, mean_ch1, std_ch1, ...]``.
    """
    rows = [
        [stat for channel in segment for stat in (np.mean(channel), np.std(channel))]
        for segment in segments
    ]
    return np.array(rows)

def extract_frequency_features(segments, sampling_rate=100):
    """Compute spectral-peak and energy features for every channel of every segment.

    For each channel three values are produced:

    * ``peak_freq`` - position of the strongest local maximum of the FFT
      magnitude, scaled as ``index / len(channel) * sampling_rate``;
    * ``max_amplitude`` - FFT magnitude at that maximum;
    * ``energy`` - mean of the squared samples.

    When the magnitude spectrum has no local maximum, ``peak_freq`` and
    ``max_amplitude`` are both ``0.0``.

    Parameters
    ----------
    segments : iterable
        Each segment is an iterable of channels (1-D numeric sequences; plain
        lists and integer arrays are accepted).
    sampling_rate : number, default 100
        Scale applied to the normalised peak position.

    Returns
    -------
    numpy.ndarray
        One row per segment laid out as
        ``[peak_freq_ch0, max_amplitude_ch0, energy_ch0, peak_freq_ch1, ...]``.
    """
    features = []
    for segment in segments:
        segment_features = []
        for axis in segment:
            # Work on a float array: plain lists do not support ``** 2`` and
            # narrow integer dtypes (e.g. int16) would overflow when squared.
            samples = np.asarray(axis, dtype=float)
            spectrum = np.abs(fft(samples))

            # Peak frequency and maximum amplitude
            peak_indices, _ = find_peaks(spectrum)

            if len(peak_indices) > 0:
                # Locate the strongest peak once and reuse its index for both
                # the frequency and the amplitude.
                strongest = peak_indices[np.argmax(spectrum[peak_indices])]
                peak_freq = strongest / len(samples) * sampling_rate
                max_amplitude = spectrum[strongest]
            else:
                # No local maximum found: fall back to default values
                peak_freq = 0.0
                max_amplitude = 0.0

            # Signal energy (mean squared amplitude)
            energy = np.sum(samples ** 2) / len(samples)

            segment_features.extend([peak_freq, max_amplitude, energy])

        features.append(segment_features)

    return np.array(features)

In [7]:
# Load the toy dataset; the first CSV column is used as the row index.
data = pd.read_csv('04_zel_toy_dataset.csv', index_col=0)

# Drop every row that contains at least one missing value.
data.dropna(axis=0, how="any", inplace=True)

# Windowing parameters and column roles used to build the segments from which
# the time-domain and frequency-domain features are extracted.
window_size = 80
step_size = 40
target_x = ['Solar.A','Solar.B','Piezo']
target_y = "place" 
# NOTE(review): "terget_group" is a misspelling of "target_group"; the name is
# left unchanged because it is a notebook-level variable.
terget_group = "user_id"
# Captured BEFORE label encoding, so this list holds the original
# (un-encoded) values of the label column, in order of first appearance.
target_names = list(data[target_y].unique())

# Instantiate one label encoder per categorical column and replace each
# column with its integer codes. le2 is kept so that predictions can be
# decoded back to the original "place" values later.
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()
data["action"] = le1.fit_transform(data["action"])
data["place"] = le2.fit_transform(data["place"])
data["user_id"] = le3.fit_transform(data["user_id"])

# Sliding windows over the input columns, with one label and one group value
# per window.
segments, labels, groups = create_segments(data, target_x, target_y, terget_group, window_size, step_size)

time_features = extract_time_features(segments)
frequency_features = extract_frequency_features(segments)

# Concatenate time-domain and frequency-domain features column-wise.
combined_features = np.concatenate((time_features, frequency_features), axis=1)

# Feature matrix and label vector (the encoded labels are stored as float32).
X = combined_features
y = np.asarray(labels, dtype=np.float32)

In [8]:
# Instantiate the random forest classifier (100 trees, fixed random_state).
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [9]:
# Leave-one-group-out cross-validation: each iteration holds out every segment
# of one group and trains on all remaining groups.
logo = LeaveOneGroupOut()

# Ground truth and predictions pooled over all held-out groups.
all_y_true, all_y_pred = [], []

# One split per distinct value in `groups` (not a fixed number of folds).
for train_index, test_index in logo.split(X, y, groups):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Refit the shared classifier on the training groups, then predict the
    # held-out group.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # Keep the results for the pooled confusion matrix / report.
    all_y_true += list(y_test)
    all_y_pred += list(y_pred)

    # Accuracy on the held-out group.
    print(accuracy_score(y_test, y_pred))



0.509693996729736
0.5134734239802224
0.7057692307692308
0.6502183406113538
0.8100734522560336
0.5591160220994476
0.6317147548674642
0.4639097744360902
0.6835886214442013
0.6743951612903226
0.5817137809187279


In [10]:

# Decode the pooled class codes (stored as floats) back to the original
# label values via the encoder fitted on the "place" column.
all_y_true = le2.inverse_transform(np.asarray(all_y_true).astype(np.int32))
all_y_pred = le2.inverse_transform(np.asarray(all_y_pred).astype(np.int32))

# Confusion matrix and classification report over the pooled results of all
# users; rows/columns follow the order of target_names.
print("Confusion Matrix:", confusion_matrix(all_y_true, all_y_pred, labels=target_names), sep="\n")

print("\nClassification Report:", classification_report(all_y_true, all_y_pred, labels=target_names), sep="\n")

Confusion Matrix:
[[4837 1434   20  109   88  877 1464]
 [ 762 2721   19  881   76  346  291]
 [  59  103  502   39  121  605   37]
 [  64  669   11 2360   19    7  251]
 [  53   77   43   69 8834 1703   41]
 [ 381  180  147   14 1315 6658  385]
 [1631  619   30 1098   84 1380 1783]]

Classification Report:
              precision    recall  f1-score   support

         lab       0.62      0.55      0.58      8829
        hall       0.47      0.53      0.50      5096
    elevator       0.65      0.34      0.45      1466
      stairs       0.52      0.70      0.59      3381
    outdoors       0.84      0.82      0.83     10820
       store       0.58      0.73      0.64      9080
      toilet       0.42      0.27      0.33      6625

    accuracy                           0.61     45297
   macro avg       0.58      0.56      0.56     45297
weighted avg       0.61      0.61      0.60     45297



In [11]:
# Evaluation of personalised models: one model per group, trained and tested
# only on that group's own segments.
groups = np.array(groups)

# Stratified 5-fold cross-validation, applied to each group's data separately.
skf = StratifiedKFold(n_splits=5)


# Ground truth and predictions pooled over all groups and folds.
all_y_true = []
all_y_pred = []

for group in np.unique(groups):
    # Build the selection mask for this group once.
    group_mask = groups == group
    group_X = X[group_mask]
    group_y = y[group_mask]

    # StratifiedKFold ignores a `groups` argument, so none is passed.
    for train_index, test_index in skf.split(group_X, group_y):
        X_train, X_test = group_X[train_index], group_X[test_index]
        y_train, y_test = group_y[train_index], group_y[test_index]

        clf.fit(X_train, y_train)

        # Predict once and derive the accuracy from that prediction; this is
        # what clf.score computes, without a second pass over the test fold.
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f"Test score for group {group}: {score}")

        # Keep the results for the pooled confusion matrix / report.
        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)

Test score for group 0: 0.8553092182030338
Test score for group 0: 0.897196261682243
Test score for group 0: 0.8960280373831776
Test score for group 0: 0.8212616822429907
Test score for group 0: 0.7091121495327103
Test score for group 1: 0.7688504326328801
Test score for group 1: 0.9517923362175525
Test score for group 1: 0.8726823238566132
Test score for group 1: 0.9480840543881335
Test score for group 1: 0.7552533992583437
Test score for group 2: 0.7991452991452992
Test score for group 2: 0.8301282051282052
Test score for group 2: 0.8675213675213675
Test score for group 2: 0.8643162393162394
Test score for group 2: 0.7574786324786325
Test score for group 3: 0.7663755458515283
Test score for group 3: 0.8766375545851528
Test score for group 3: 0.7729257641921398
Test score for group 3: 0.9072052401746725
Test score for group 3: 0.8034934497816594
Test score for group 4: 0.8391608391608392
Test score for group 4: 0.965034965034965
Test score for group 4: 0.9702797202797203
Test score fo

In [13]:
# Decode the pooled per-group cross-validation results (class codes stored as
# floats) back to the original label values via the "place" encoder.
all_y_true = le2.inverse_transform(np.asarray(all_y_true).astype(np.int32))
all_y_pred = le2.inverse_transform(np.asarray(all_y_pred).astype(np.int32))

# Confusion matrix and classification report over the pooled results of all
# users; rows/columns follow the order of target_names.
print("Confusion Matrix:", confusion_matrix(all_y_true, all_y_pred, labels=target_names), sep="\n")

print("\nClassification Report:", classification_report(all_y_true, all_y_pred, labels=target_names), sep="\n")


Confusion Matrix:
[[7798  174   37   72   72  448  228]
 [ 242 2953   50  949   57  294  551]
 [  51   99  732   46   56  436   46]
 [  93  529   29 2557   13   31  129]
 [  49   93   44   83 9918  619   14]
 [ 329  179  180    9  525 7572  286]
 [ 448  460   36  124    2  452 5103]]

Classification Report:
              precision    recall  f1-score   support

         lab       0.87      0.88      0.87      8829
        hall       0.66      0.58      0.62      5096
    elevator       0.66      0.50      0.57      1466
      stairs       0.67      0.76      0.71      3381
    outdoors       0.93      0.92      0.92     10820
       store       0.77      0.83      0.80      9080
      toilet       0.80      0.77      0.79      6625

    accuracy                           0.81     45297
   macro avg       0.76      0.75      0.75     45297
weighted avg       0.81      0.81      0.81     45297

