In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, roc_auc_score

## This version of the script loads the preprocessed files, reserves one of each activity for testing and uses the others for training. 

### depending on setiup of feature_engi_second it may or may not be using the extra feature we calculate

In [None]:
import os
import random

# Directory containing the files
directory = 'preprocessed'

# Initialize dictionaries to store filenames for each activity
filenames = {
    'cycling': [],
    'running': [],
    'walking': []
}

# Loop through the files and categorize them based on activity
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        if 'cycling' in filename:
            filenames['cycling'].append(filename)
        elif 'running' in filename:
            filenames['running'].append(filename)
        elif 'walking' in filename:
            filenames['walking'].append(filename)

# Function to randomly select one file for testing and use the rest for training
def split_filenames(activity_filenames):
    test_filename = random.choice(activity_filenames)
    train_filenames = [f for f in activity_filenames if f != test_filename]
    return train_filenames, test_filename

# Split filenames for each activity
train_cycling, test_cycling = split_filenames(filenames['cycling'])
train_running, test_running = split_filenames(filenames['running'])
train_walking, test_walking = split_filenames(filenames['walking'])

# Combine all training filenames into a single list
train_filenames = train_cycling + train_running + train_walking

# Combine all testing filenames into a single list
test_filenames = [test_cycling, test_running, test_walking]

# Print the results
print("Training filenames:", train_filenames)
print("Testing filenames:", test_filenames)

# Function to load data from filenames
def load_data(filenames, directory):
    data = []
    for filename in filenames:
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)
        data.append(df)
    return pd.concat(data, ignore_index=True)

# Load training and testing data
train_data = load_data(train_filenames, directory)
test_data = load_data(test_filenames, directory)

train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

# Assuming the label column is named 'activity'
X_train = train_data.drop(columns=['date_time','activity'])
y_train = train_data['activity']
X_test = test_data.drop(columns=['date_time','activity'])
y_test = test_data['activity']

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


Training filenames: ['output_summary_cycling 2024-06-07 12-45-31.csv', 'output_summary_cycling 2024-06-08 12-45-52.csv', 'output_summary_running 2024-06-09 12-44-51.csv', 'output_summary_walking 2024-06-09 11-28-48.csv']
Testing filenames: ['output_summary_cycling 2024-06-07 12-40-37.csv', 'output_summary_running 2024-06-09 12-49-21.csv', 'output_summary_walking 2024-06-09 11-42-12.csv']
X_train shape: (13915, 9)
y_train shape: (13915,)
X_test shape: (7738, 9)
y_test shape: (7738,)


In [13]:
bayes = GaussianNB()
bayes.fit(X_train, y_train)
pred_p = bayes.predict_proba(X_test)
pred = bayes.predict(X_test)

print('naive bayes')
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

naive bayes
              precision    recall  f1-score   support

     cycling       0.90      0.85      0.88      2343
     running       0.95      0.84      0.89      1535
     walking       0.86      0.93      0.90      3860

    accuracy                           0.89      7738
   macro avg       0.90      0.87      0.89      7738
weighted avg       0.89      0.89      0.89      7738

ROC AUC Score: 0.968222930245727


In [14]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred_p = knn.predict_proba(X_test)
pred = knn.predict(X_test)

print('KNN')
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

KNN
              precision    recall  f1-score   support

     cycling       0.97      1.00      0.98      2343
     running       1.00      0.95      0.97      1535
     walking       0.99      0.99      0.99      3860

    accuracy                           0.99      7738
   macro avg       0.99      0.98      0.98      7738
weighted avg       0.99      0.99      0.99      7738

ROC AUC Score: 0.9925968801105022


In [15]:
knn_gs = GridSearchCV(KNeighborsClassifier(), 
                      {
                          'n_neighbors':[1, 3, 5, 11],
                          'leaf_size':[3, 5, 10, 15]
                        }, cv=5, scoring='roc_auc_ovr').fit(X_train, y_train)

pred_p = knn_gs.predict_proba(X_test)
pred = knn_gs.predict(X_test)

print('KNN gridsearch')
print(knn_gs.best_params_)
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

KNN gridsearch
{'leaf_size': 3, 'n_neighbors': 11}
              precision    recall  f1-score   support

     cycling       0.97      1.00      0.98      2343
     running       1.00      0.94      0.97      1535
     walking       0.99      0.99      0.99      3860

    accuracy                           0.98      7738
   macro avg       0.98      0.98      0.98      7738
weighted avg       0.98      0.98      0.98      7738

ROC AUC Score: 0.9943692211531179


In [16]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
pred_p = tree.predict_proba(X_test)
pred = tree.predict(X_test)

print('decision tree')
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

decision tree
              precision    recall  f1-score   support

     cycling       0.95      1.00      0.97      2343
     running       0.94      0.94      0.94      1535
     walking       0.99      0.96      0.97      3860

    accuracy                           0.97      7738
   macro avg       0.96      0.96      0.96      7738
weighted avg       0.97      0.97      0.97      7738

ROC AUC Score: 0.9742381854029757


In [17]:
tree_gs = GridSearchCV(DecisionTreeClassifier(), 
                      {
                          'max_depth':[5, 10, 15, 50],
                        }, cv=5, scoring='roc_auc_ovr').fit(X_train, y_train)

pred_p = tree_gs.predict_proba(X_test)
pred = tree_gs.predict(X_test)

print('tree gridsearch')
print(tree_gs.best_params_)
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

tree gridsearch
{'max_depth': 50}
              precision    recall  f1-score   support

     cycling       0.94      1.00      0.97      2343
     running       0.95      0.93      0.94      1535
     walking       0.99      0.96      0.97      3860

    accuracy                           0.96      7738
   macro avg       0.96      0.96      0.96      7738
weighted avg       0.97      0.96      0.96      7738

ROC AUC Score: 0.973095547522654


In [18]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
pred_p = forest.predict_proba(X_test)
pred = forest.predict(X_test)

print('Random forest')
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

Random forest
              precision    recall  f1-score   support

     cycling       0.98      1.00      0.99      2343
     running       0.99      0.97      0.98      1535
     walking       1.00      0.99      0.99      3860

    accuracy                           0.99      7738
   macro avg       0.99      0.99      0.99      7738
weighted avg       0.99      0.99      0.99      7738

ROC AUC Score: 0.9988634183670589


In [19]:
forest_gs = GridSearchCV(RandomForestClassifier(), 
                      {
                          'max_depth':[3, 5, 10, 20],
                          'n_estimators':[10, 50, 100]
                        }, cv=5, scoring='roc_auc_ovr').fit(X_train, y_train)

pred_p = forest_gs.predict_proba(X_test)
pred = forest_gs.predict(X_test)

print('random forest gridsearch')
print(tree_gs.best_params_)
print(classification_report(y_pred=pred, y_true=y_test))

roc_auc = roc_auc_score(y_test, pred_p, multi_class='ovr')
print(f'ROC AUC Score: {roc_auc}')

random forest gridsearch
{'max_depth': 50}
              precision    recall  f1-score   support

     cycling       0.98      1.00      0.99      2343
     running       0.99      0.97      0.98      1535
     walking       1.00      0.99      0.99      3860

    accuracy                           0.99      7738
   macro avg       0.99      0.99      0.99      7738
weighted avg       0.99      0.99      0.99      7738

ROC AUC Score: 0.9991676068615792
