In [3]:
import numpy as np
import pandas as pd

In [31]:
# --- 1.lOADING THE DATA ---
#Adjust the paths to match where you've unzipped "UCI HAR Dataset"

PATH = "/Users/ahmed/Desktop/Artificial-Intelligence-Coursework/UCI HAR Dataset"

features_path = PATH + "/features.txt"
activity_labels_path = PATH + "/activity_labels.txt"
X_train_path = PATH + "/train/X_train.txt"
y_train_path = PATH + "/train/y_train.txt"
X_test_path = PATH + "/test/X_test.txt"
y_test_path = PATH + "/test/y_test.txt"



In [32]:
# Load feature names, This appends the column index to any duplicate names.

features_df = pd.read_csv(features_path, sep="\s+", header=None, names=["idx", "feature"])
feature_names = features_df["feature"].tolist()


# his appends the column index to any duplicate names.
features_df["feature"] = features_df["feature"].astype(str) + "_" + features_df.index.astype(str)
feature_names = features_df["feature"].tolist()


# Load activity labels (mapping IDs 1-6 to string names)
activity_labels_df = pd.read_csv(activity_labels_path, sep="\s+", header=None, names=["id", "activity"])
activity_map = dict(zip(activity_labels_df["id"], activity_labels_df["activity"]))

In [34]:
#Load the training data
X_train = pd.read_csv(X_train_path, sep="\s+", header=None, names=feature_names)
y_train = pd.read_csv(y_train_path, sep="\s+", header=None, names=["activity"])
X_test = pd.read_csv(X_test_path, sep="\s+", header=None, names=feature_names)
y_test = pd.read_csv(y_test_path, sep="\s+", header=None, names=["activity"])

In [36]:
#Map activity IDs to their names
y_train["activity"] = y_train["activity"].map(activity_map)
y_test["activity"] = y_test["activity"].map(activity_map)


In [66]:
 # ---2.CONVERT MULTI-CLASS LABELS TO BINARY ---
def to_binary_label(activity):
    if activity in ["WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS"]:
        return 1 #Active
    else:
        return 0 #Inactive
    
y_train_binary = y_train["activity"].apply(to_binary_label)
y_test_binary = y_test["activity"].apply(to_binary_label)


y_train_binary = y_train_binary.astype(int).values.ravel()
y_test_binary = y_test_binary.astype(int).values.ravel()
#Now we have 0/1 labels in y_train["Binary"] and y_test["Binary"]


In [67]:
#Reduction of the number of features
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=50)), #reduce from 561 -> 50 
    ("svm", SVC(class_weight="balanced"))
])

#can tweak n components depending on how much dimension reduction you want vs how much#
#computational time you can spare

In [68]:
#Evaluate the model
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

pipeline.fit(X_train, y_train_binary)
y_pred = pipeline.predict(X_test)

f1 = f1_score(y_test_binary, y_pred)
precision = precision_score(y_test_binary, y_pred)
recall = recall_score(y_test_binary, y_pred)

print("F1 Score: {:.2f}".format(f1))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))

F1 Score: 1.00
Precision: 1.00
Recall: 1.00


In [69]:
#training the baseline svm models with different kernels (linear, polynomial, rbf)
param_grid = [
    {
        "svm__kernel": ["linear"],
        "svm__C": [0.1, 1, 10]
    },
    {
        "svm__kernel": ["poly"],
        "svm__degree": [2, 3, 4],
        "svm__gamma": [0.1, 1],
        "svm__C": [0.1, 1, 10]
    },
    {
        "svm__kernel": ["rbf"],
        "svm__C": [0.1, 1, 10],
        "svm__gamma": [0.1, 1]
    }
]

In [70]:
#grid search
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train.values.ravel())
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters:  {'svm__C': 10, 'svm__kernel': 'linear'}
Best score:  0.8974447192177566


In [71]:
#it took too long, so we are going do a RandomizedSearchCV instead
from sklearn.model_selection import RandomizedSearchCV


random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_train, y_train.values.ravel())
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters:  {'svm__kernel': 'poly', 'svm__gamma': 0.1, 'svm__degree': 3, 'svm__C': 10}
Best score:  0.8892828971653941


In [72]:
from  sklearn.metrics import classification_report
y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test_binary, y_pred))

ValueError: Mix of label input types (string and number)