In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the training and test sets; the "subject" column is dropped from both
# as soon as they are read.
# Paths use forward slashes: these resolve on Windows, Linux and macOS,
# whereas a backslash separator is only understood on Windows.
df = pd.read_csv("Data/Human activity.csv").drop("subject", axis=1)
df_ac = df["Activity"]               # training labels
df_in = df.drop("Activity", axis=1)  # training features

test = pd.read_csv("Data/test.csv").drop("subject", axis=1)
test_ac = test["Activity"]               # test labels
test_in = test.drop("Activity", axis=1)  # test features

In [3]:
# Baseline classifier: a random forest trained on the unmodified training
# features. The seed is fixed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=20,
    max_depth=20,
    random_state=42,
)
# fit() returns the estimator itself, so `model` ends up bound to the fitted
# forest; assigning the result also keeps the cell from echoing a repr.
model = model.fit(df_in, df_ac)

In [4]:
# Evaluate the baseline forest on the test set.
# Predict once and reuse the result: both reports describe the same
# predictions, so running the forest over the test set twice is wasted work.
test_pred = model.predict(test_in)
print(classification_report(test_ac, test_pred))
print(confusion_matrix(test_ac, test_pred))

                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       537
           SITTING       0.90      0.87      0.88       491
          STANDING       0.88      0.91      0.89       532
           WALKING       0.86      0.96      0.91       496
WALKING_DOWNSTAIRS       0.96      0.87      0.91       420
  WALKING_UPSTAIRS       0.89      0.86      0.87       471

          accuracy                           0.91      2947
         macro avg       0.91      0.91      0.91      2947
      weighted avg       0.91      0.91      0.91      2947

[[537   0   0   0   0   0]
 [  0 425  66   0   0   0]
 [  0  48 484   0   0   0]
 [  0   0   0 475   3  18]
 [  0   0   0  21 365  34]
 [  0   0   0  54  14 403]]


In [5]:
# Prune training rows that lie far from the centre of their K-Means cluster,
# then split the surviving rows back into labels and features.
N_PRUNE_CLUSTERS = 6     # number of K-Means clusters used for pruning
MAX_CENTER_DISTANCE = 5  # rows farther than this from their centre are removed

kmodel = KMeans(N_PRUNE_CLUSTERS, random_state=18).fit(df_in)
kmodel_centers = kmodel.cluster_centers_
kmodel_labels = kmodel.labels_
Feature = df_in.to_numpy()
num_rows, num_cols = Feature.shape

# Euclidean distance from every row to the centre of the cluster it was
# assigned to. Indexing the centres with the label array lines each row up
# with its own centre, so one vectorised call replaces a per-cluster,
# per-row Python loop.
center_distance = np.linalg.norm(Feature - kmodel_centers[kmodel_labels], axis=1)
dist = list(center_distance)  # kept as a plain list, as this cell has always left it

# Keep every row that is NOT beyond the threshold. The mask is positional, so
# it does not depend on the DataFrame's index labels matching row numbers.
keep_row = ~(center_distance > MAX_CENTER_DISTANCE)
# dropna() is retained so that any row with a missing value is still excluded.
df_pruned = df.loc[keep_row].dropna()

df_pruned_activity = df_pruned["Activity"]
df_pruned_inputs = df_pruned.drop("Activity", axis=1)

In [30]:
def with_cluster_column(features, cluster_ids):
    """Return a copy of ``features`` with ``cluster_ids`` as a leading "Cluster" column.

    Args:
        features: DataFrame of model inputs; it is not modified.
        cluster_ids: One cluster id per row of ``features``.

    Returns:
        A new DataFrame whose first column is "Cluster", followed by the
        columns of ``features`` in their original order.
    """
    augmented = features.copy()
    augmented.insert(0, "Cluster", cluster_ids)
    return augmented


accuracies = []

# For each K: cluster the pruned training rows, add the cluster id as an extra
# feature, train a forest on the augmented data and score it on the test set.
for k in range(3, 5):
    kmeans_model = KMeans(n_clusters=k, random_state=18).fit(df_pruned_inputs)
    kmeans_values = kmeans_model.predict(df_pruned_inputs)
    # Rebuilt from the pruned inputs on every pass, so no iteration depends on
    # a column left behind by the previous one.
    df_with_cluster = with_cluster_column(df_pruned_inputs, kmeans_values)
    rand_forest = RandomForestClassifier(n_estimators=20, max_depth=20, random_state=42).fit(df_with_cluster, df_pruned_activity)
    print("Results for K={}".format(k))
    # Test rows get their cluster ids from the same fitted K-Means model and
    # the same column layout as the training frame.
    kmeans_values_test = kmeans_model.predict(test_in)
    test_in2 = with_cluster_column(test_in, kmeans_values_test)
    report = classification_report(test_ac, rand_forest.predict(test_in2), output_dict=True)
    # Accuracy is collected per K; the value printed is the weighted-average F1.
    accuracies.append(report['accuracy'])
    print(report['weighted avg']['f1-score'])


Results for K=3
0.922737780066244
Results for K=4
0.9146074503676753
