In [2]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import pickle

from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
)

sys.path.append(os.path.join(os.path.dirname(os.path.realpath('__file__')), '../'))
from data import gather_signals_by_class
from exp_utils import (
    normalize_df_columns_0_1,
    display_roc_and_f1, display_conf_matrix,
    save_ml_model, load_ml_model
)
from experiments import d_users_split


In [None]:
def get_df_from_csv(l_csv: list, num_features: int, total_features: int = 63):
    """Load sensor CSV files into one normalised frame, optionally PCA-reduced.

    Every file in ``l_csv`` is read with ``;`` as delimiter and its first row
    as header, and the rows are stacked in list order. The stacked frame is
    passed through ``normalize_df_columns_0_1``; the first ``total_features``
    columns are then used as features and the column right after them as the
    label. When ``num_features`` is smaller than ``total_features`` the
    features are projected onto ``num_features`` principal components.

    Args:
        l_csv: Paths of the CSV files to load. No filtering happens here; the
            caller is expected to pass the already selected files.
        num_features: Number of feature columns in the result
            (``1 <= num_features <= total_features``).
        total_features: Number of leading feature columns in the CSV files;
            the label is read from the column at this position.

    Returns:
        DataFrame with columns ``sensor_0`` .. ``sensor_{num_features - 1}``
        followed by ``label``.

    Raises:
        ValueError: If ``l_csv`` is empty or ``num_features`` is out of range.
    """
    if not l_csv:
        raise ValueError('l_csv must contain at least one CSV path')
    if not 1 <= num_features <= total_features:
        raise ValueError(
            f'num_features must be between 1 and {total_features}, got {num_features}'
        )

    # Read every file first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame on each iteration.
    frames = [pd.read_csv(path, delimiter=';', header=0) for path in l_csv]
    df = pd.concat(frames, ignore_index=True)

    print(' >> Normalizando: [0,1]')
    df = normalize_df_columns_0_1(df)

    X = df.iloc[:, :total_features]  # Features (sensor data)
    y = df.iloc[:, total_features]   # Label column

    sensor_names = [f'sensor_{i}' for i in range(num_features)]

    if num_features < total_features:
        print(' >> Running PCA')
        # NOTE(review): the projection is fitted on whatever files are passed
        # in, so separate calls (e.g. train and test) learn different
        # components - confirm this is intended.
        pca = PCA(n_components=num_features)
        df = pd.DataFrame(pca.fit_transform(X), columns=sensor_names)
    else:
        # Rename by position. Passing a DataFrame together with `columns=` to
        # the DataFrame constructor selects columns by label instead, which
        # produces all-NaN columns when the CSV headers are not sensor_*.
        df = X.copy()
        df.columns = sensor_names

    df['label'] = y.values  # Adding the labels back

    return df

def save_ml_model(ml_model, model_path):
    """Pickle ``ml_model`` to ``model_path``, creating missing parent folders.

    NOTE(review): this definition shadows the ``save_ml_model`` imported from
    ``exp_utils`` at the top of the notebook - keep only one of them.

    Args:
        ml_model: Any picklable object (here: a fitted estimator).
        model_path: Destination file path; overwritten if it already exists.
    """
    parent_dir = os.path.dirname(model_path)
    if parent_dir:
        # open(..., 'wb') does not create directories, so do it up front.
        os.makedirs(parent_dir, exist_ok=True)

    with open(model_path, 'wb') as f:
        pickle.dump(ml_model, f)

def load_ml_model(model_path):
    """Return the object unpickled from the file at ``model_path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising,
    so only use this on model files you created yourself.

    NOTE(review): this definition shadows the ``load_ml_model`` imported from
    ``exp_utils`` at the top of the notebook.
    """
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)


In [None]:
# All CSV files of the dataset. glob returns matches in an arbitrary,
# filesystem-dependent order, so sort them to keep the row order of the
# concatenated data identical from run to run.
paths_csv = sorted(glob.glob('../zhang-wamsley-2019/data/CSV/*.csv'))

# Feature columns kept per row; get_df_from_csv only runs PCA below 63.
num_features = 63
# Number of consecutive rows that are flattened into one sample.
window_size = 40

# Training Preparation

In [None]:
# Keep the CSV files whose path contains a user id from the train or val split.
# NOTE(review): this is a substring test on the whole path, so an id that is a
# prefix of another id would match both - confirm the id format rules this out.
paths_train = [
    csv_path
    for csv_path in paths_csv
    if any(uid in csv_path for uid in d_users_split['train'] + d_users_split['val'])
]

len(paths_train)

In [None]:
# Load, normalise and (if requested) PCA-reduce the training CSV files
df_train = get_df_from_csv(l_csv=paths_train, num_features=num_features)
df_train

In [None]:
# get_df_from_csv returns the sensor_* columns followed by a single 'label'
# column, so everything except 'label' is the feature matrix.
X_train = df_train.drop(columns='label')  # Features (sensor data)
y_train = df_train['label']

In [None]:
# Inspect the training feature columns selected in the previous cell
X_train

In [None]:
# Inspect the training label column selected above
y_train

### Flatten window size slots

In [None]:
# Regroup the rows with the project helper, using window_size signals per group.
# NOTE(review): df_train is overwritten here, so re-running this cell applies
# the helper to its own output.
df_train = gather_signals_by_class(df_train, num_signals=window_size)

# Plain array of the sensor columns only (label excluded)
data_train = df_train.loc[:, [f'sensor_{idx}' for idx in range(num_features)]].to_numpy()
data_train.shape

In [None]:
# Flatten: every run of window_size consecutive rows becomes one sample whose
# length is window_size * (columns per row).
data_reshaped_train = data_train.reshape(-1, window_size * data_train.shape[1])
data_reshaped_train.shape

In [None]:
# One label per flattened sample: the label of the first row of each window
labels_train = df_train.loc[:, 'label'].to_numpy()[0::window_size]
labels_train.shape

# Testing Preparation

In [None]:
# Keep the CSV files whose path contains a user id from the test split.
# NOTE(review): substring test on the whole path - an id that is a prefix of
# another id would match both; confirm the id format rules this out.
paths_test = [
    csv_path
    for csv_path in paths_csv
    if any(uid in csv_path for uid in d_users_split['test'])
]

len(paths_test)

In [None]:
# Load, normalise and (if requested) PCA-reduce the test CSV files
df_test = get_df_from_csv(l_csv=paths_test, num_features=num_features)
df_test

In [None]:
# get_df_from_csv returns the sensor_* columns followed by a single 'label'
# column, so everything except 'label' is the feature matrix.
X_test = df_test.drop(columns='label')  # Features (sensor data)
y_test = df_test['label']

In [None]:
# Inspect the test feature columns selected in the previous cell
X_test

In [None]:
# Inspect the test label column selected above
y_test

### Flatten window size slots

In [None]:
# Regroup the rows with the project helper, using window_size signals per group.
# NOTE(review): df_test is overwritten here, so re-running this cell applies
# the helper to its own output.
df_test = gather_signals_by_class(df_test, num_signals=window_size)

# Plain array of the sensor columns only (label excluded)
data_test = df_test.loc[:, [f'sensor_{idx}' for idx in range(num_features)]].to_numpy()
data_test.shape

In [None]:
# Flatten: every run of window_size consecutive rows becomes one sample whose
# length is window_size * (columns per row).
data_reshaped_test = data_test.reshape(-1, window_size * data_test.shape[1])
data_reshaped_test.shape

In [None]:
# One label per flattened sample: the label of the first row of each window
labels_test = df_test.loc[:, 'label'].to_numpy()[0::window_size]
labels_test.shape

# SVM - Support Vector Machine

In [None]:
# Location of a previously pickled SVM. Built with os.path.join so the
# separators are portable and no backslash is parsed as a string escape.
# NOTE(review): this differs from the path the "Save model" cell writes to
# ('../models/ml_models/svm.pkl'), and the next cell retrains `clf`
# unconditionally, so a model loaded here is replaced - confirm intent.
model_path = os.path.join('..', 'models', 'ml_models', 'svm', '.pkl')

if os.path.exists(model_path):
    print(' >> Loading model')
    clf = load_ml_model(model_path)

In [None]:
# Support vector classifier with an RBF kernel, fitted on the flattened
# training windows. Runs unconditionally, replacing any previously loaded `clf`.
clf = svm.SVC(C=1.0, kernel='rbf')

clf.fit(X=data_reshaped_train, y=labels_train)

In [None]:
# Persist the fitted SVM as a pickle file
save_ml_model(ml_model=clf, model_path='../models/ml_models/svm.pkl')

In [None]:
# SVM class predictions for the training windows
y_pred_train_svm = clf.predict(X=data_reshaped_train)
y_pred_train_svm

In [None]:
# SVM class predictions for the held-out test windows
y_pred_test_svm = clf.predict(X=data_reshaped_test)
y_pred_test_svm

In [None]:
# Show train metrics
# Passes the true and predicted training labels of the SVM to the two
# exp_utils display helpers. These are the windows the model was fitted on.
# NOTE(review): hard class predictions are passed; if display_roc_and_f1
# computes a ROC curve, continuous scores may be what it needs - confirm.
print(' >> Displaying Training Metrics')
display_roc_and_f1(labels_train, y_pred_train_svm)
display_conf_matrix(labels_train, y_pred_train_svm)

In [None]:
# Show test metrics
# Passes the true and predicted test labels of the SVM to the two exp_utils
# display helpers.
print('Displaying Testing Metrics')
display_roc_and_f1(labels_test, y_pred_test_svm)
display_conf_matrix(labels_test, y_pred_test_svm)

# Logistic Regression

In [None]:
# Location of a previously pickled logistic-regression model. Built with
# os.path.join so the separators are portable and no backslash is parsed as a
# string escape.
# NOTE(review): this differs from the path the "Save model" cell writes to
# ('../models/ml_models/logic_regre.pkl'), and the next cell retrains
# `lr_model` unconditionally, so a model loaded here is replaced.
model_path = os.path.join('..', 'models', 'ml_models', 'logic_reg', 'logic_regre_pca.pkl')

if os.path.exists(model_path):
    print(' >> Loading model')
    lr_model = load_ml_model(model_path)

In [None]:
# Class-weight-balanced logistic regression with a fixed seed, fitted on the
# flattened training windows. Runs unconditionally, replacing any previously
# loaded `lr_model`.
lr_model = LogisticRegression(
    random_state=11,
    n_jobs=-1,
    max_iter=10000,
    class_weight='balanced',
)

lr_model.fit(X=data_reshaped_train, y=labels_train)

In [None]:
# Persist the fitted logistic-regression model as a pickle file
save_ml_model(ml_model=lr_model, model_path='../models/ml_models/logic_regre.pkl')

In [None]:
# Logistic-regression class predictions for the training windows
y_pred_train_lr = lr_model.predict(X=data_reshaped_train)
y_pred_train_lr

In [None]:
# Logistic-regression class predictions for the held-out test windows
y_pred_test_lr = lr_model.predict(X=data_reshaped_test)
y_pred_test_lr

In [None]:
# Show train metrics
# Passes the true and predicted training labels of the logistic regression to
# the two exp_utils display helpers. These are the windows it was fitted on.
print(' >> Displaying Training Metrics')
display_roc_and_f1(labels_train, y_pred_train_lr)
display_conf_matrix(labels_train, y_pred_train_lr)

In [None]:
# Show test metrics
# Passes the true and predicted test labels of the logistic regression to the
# two exp_utils display helpers.
print('Displaying Testing Metrics')
display_roc_and_f1(labels_test, y_pred_test_lr)
display_conf_matrix(labels_test, y_pred_test_lr)

# SGD - Stochastic Gradient Descent

In [None]:
# Location of a previously pickled SGD classifier. Built with os.path.join so
# the separators are portable and no backslash is parsed as a string escape.
# NOTE(review): this differs from the path the "Save model" cell writes to
# ('../models/ml_models/sgd.pkl'), and the next cell retrains `sgd_clf`
# unconditionally, so a model loaded here is replaced.
model_path = os.path.join('..', 'models', 'ml_models', 'stochastic', 'sgd_pca.pkl')

if os.path.exists(model_path):
    print(' >> Loading model')
    sgd_clf = load_ml_model(model_path)

In [None]:
# Linear classifier trained by stochastic gradient descent on the log loss,
# seeded for repeatability and fitted on the flattened training windows.
# Runs unconditionally, replacing any previously loaded `sgd_clf`.
sgd_clf = SGDClassifier(
    random_state=42,
    tol=1e-3,
    max_iter=100000,
    loss='log_loss',
)

sgd_clf.fit(X=data_reshaped_train, y=labels_train)

In [None]:
# Persist the fitted SGD classifier as a pickle file
save_ml_model(ml_model=sgd_clf, model_path='../models/ml_models/sgd.pkl')

In [None]:
# SGD classifier predictions for the training windows
y_pred_train_sgd = sgd_clf.predict(X=data_reshaped_train)
y_pred_train_sgd

In [None]:
# SGD classifier predictions for the held-out test windows
y_pred_test_sgd = sgd_clf.predict(X=data_reshaped_test)
y_pred_test_sgd

In [None]:
# Show train metrics
# Passes the true and predicted training labels of the SGD classifier to the
# two exp_utils display helpers. These are the windows it was fitted on.
print(' >> Displaying Training Metrics')
display_roc_and_f1(labels_train, y_pred_train_sgd)
display_conf_matrix(labels_train, y_pred_train_sgd)

In [None]:
# Show test metrics
# Passes the true and predicted test labels of the SGD classifier to the two
# exp_utils display helpers.
print('Displaying Testing Metrics')
display_roc_and_f1(labels_test, y_pred_test_sgd)
display_conf_matrix(labels_test, y_pred_test_sgd)

# Decision Tree

In [None]:
# Location of a previously pickled decision tree. Built with os.path.join so
# the separators are portable and no backslash is parsed as a string escape.
# NOTE(review): this differs from the path the "Save model" cell writes to
# ('../models/ml_models/dtc_g.pkl'), and the next cell retrains `dtc_g`
# unconditionally, so a model loaded here is replaced.
model_path = os.path.join('..', 'models', 'ml_models', 'decision_tree', 'dtc_g_pca.pkl')

if os.path.exists(model_path):
    print(' >> Loading model')
    dtc_g = load_ml_model(model_path)

In [None]:
# Depth-limited, class-weight-balanced decision tree with a fixed seed, fitted
# on the flattened training windows. Runs unconditionally, replacing any
# previously loaded `dtc_g`.
dtc_g = DecisionTreeClassifier(
    max_depth=20,
    random_state=1,
    class_weight='balanced',
)

dtc_g.fit(X=data_reshaped_train, y=labels_train)

In [None]:
# Persist the fitted decision tree as a pickle file
save_ml_model(ml_model=dtc_g, model_path='../models/ml_models/dtc_g.pkl')

In [None]:
# Decision-tree class predictions for the training windows
y_pred_train_dt = dtc_g.predict(X=data_reshaped_train)
y_pred_train_dt

In [None]:
# Decision-tree class predictions for the held-out test windows
y_pred_test_dt = dtc_g.predict(X=data_reshaped_test)
y_pred_test_dt

In [None]:
# Show train metrics
# Passes the true and predicted training labels of the decision tree to the
# two exp_utils display helpers. These are the windows it was fitted on.
print(' >> Displaying Training Metrics')
display_roc_and_f1(labels_train, y_pred_train_dt)
display_conf_matrix(labels_train, y_pred_train_dt)

In [None]:
# Show test metrics
# Passes the true and predicted test labels of the decision tree to the two
# exp_utils display helpers.
print('Displaying Testing Metrics')
display_roc_and_f1(labels_test, y_pred_test_dt)
display_conf_matrix(labels_test, y_pred_test_dt)

# Random Forest

In [None]:
# Location of a previously pickled random forest. Built with os.path.join:
# in the former backslash literal both "\r" sequences were parsed as carriage
# returns, so the path could never match a real file.
# NOTE(review): this differs from the path the "Save model" cell writes to
# ('../models/ml_models/rdf.pkl'), and the next cell retrains `rdf`
# unconditionally, so a model loaded here is replaced.
model_path = os.path.join('..', 'models', 'ml_models', 'random_forest', 'rdf_pca.pkl')

if os.path.exists(model_path):
    print(' >> Loading model')
    rdf = load_ml_model(model_path)

In [None]:
# Random forest of 400 depth-limited, class-weight-balanced trees with a fixed
# seed, fitted on the flattened training windows. Runs unconditionally,
# replacing any previously loaded `rdf`.
rdf = RandomForestClassifier(
    n_estimators=400,
    max_depth=20,
    class_weight='balanced',
    random_state=1,
)

rdf.fit(X=data_reshaped_train, y=labels_train)

In [None]:
# Persist the fitted random forest as a pickle file
save_ml_model(ml_model=rdf, model_path='../models/ml_models/rdf.pkl')

In [None]:
# Random-forest class predictions for the training windows
y_pred_train_rdf = rdf.predict(X=data_reshaped_train)
y_pred_train_rdf

In [None]:
# Random-forest class predictions for the held-out test windows
y_pred_test_rdf = rdf.predict(X=data_reshaped_test)
y_pred_test_rdf

In [None]:
# Show train metrics
# Passes the true and predicted training labels of the random forest to the
# two exp_utils display helpers. These are the windows it was fitted on.
print(' >> Displaying Training Metrics')
display_roc_and_f1(labels_train, y_pred_train_rdf)
display_conf_matrix(labels_train, y_pred_train_rdf)

In [None]:
# Show test metrics
# Passes the true and predicted test labels of the random forest to the two
# exp_utils display helpers.
print('Displaying Testing Metrics')
display_roc_and_f1(labels_test, y_pred_test_rdf)
display_conf_matrix(labels_test, y_pred_test_rdf)

# Ada Boost

In [None]:
# Location of a previously pickled AdaBoost model. Built with os.path.join:
# in the former backslash literal both "\a" sequences were parsed as the BEL
# control character, so the path could never match a real file.
# NOTE(review): this differs from the path the "Save model" cell writes to
# ('../models/ml_models/ada.pkl'), and the next cell retrains `ada`
# unconditionally, so a model loaded here is replaced.
model_path = os.path.join('..', 'models', 'ml_models', 'adaBoost', 'ada.pkl')

if os.path.exists(model_path):
    print(' >> Loading model')
    ada = load_ml_model(model_path)

In [None]:
# AdaBoost ensemble of 100 estimators with a fixed seed, fitted on the
# flattened training windows. Runs unconditionally, replacing any previously
# loaded `ada`.
ada = AdaBoostClassifier(random_state=42, n_estimators=100)

ada.fit(X=data_reshaped_train, y=labels_train)

In [None]:
# Persist the fitted AdaBoost model as a pickle file
save_ml_model(ml_model=ada, model_path='../models/ml_models/ada.pkl')

In [None]:
# AdaBoost class predictions for the training windows
y_pred_train_ada = ada.predict(X=data_reshaped_train)
y_pred_train_ada

In [None]:
# AdaBoost class predictions for the held-out test windows
y_pred_test_ada = ada.predict(X=data_reshaped_test)
y_pred_test_ada

In [None]:
# Show train metrics
# Passes the true and predicted training labels of the AdaBoost model to the
# two exp_utils display helpers. These are the windows it was fitted on.
print(' >> Displaying Training Metrics')
display_roc_and_f1(labels_train, y_pred_train_ada)
display_conf_matrix(labels_train, y_pred_train_ada)

In [None]:
# Show test metrics
# Passes the true and predicted test labels of the AdaBoost model to the two
# exp_utils display helpers.
print('Displaying Testing Metrics')
display_roc_and_f1(labels_test, y_pred_test_ada)
display_conf_matrix(labels_test, y_pred_test_ada)