In [1]:
import glob
import pandas as pd
import numpy as np
import os, sys
import collections
import math
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier 

In [3]:
from numba.typed import List
from numba import jit, njit, vectorize

In [4]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [5]:
sample_rate = 50  # number of observation per second based on dataset documentation(150 samples in 3 second)

sliding_size = int((1/3) * sample_rate)  # number of skipped datapoints to start next window
print(sliding_size)

16


# Feature sets

In [6]:
@njit()
def mean_crossing_rate(col):
    # col = np.array(values)
    normalized = col - col.mean()  # to make elements of array possitive or negetive
    return ((normalized[:-1] * col[1:]) < 0).sum()  # Zero-Crossing_rate

@njit()
def iqr(window):  # inter-quartile range
    Q1 = np.median(window[:len(window)//2])  # First quartile (Q1) 
    Q3 = np.median(window[len(window)//2:])  # Third quartile (Q3) 
    IQR = Q3 - Q1 # Interquartile range (IQR) 
    return(IQR) 
@njit()
def calc_sma_for_window(data):
    return np.sum(data) / len(data)  
@njit()
def get_min(x):
    m = np.min(x)
    return m
@njit()
def get_max(x):
    m = np.max(x)
    return m
@njit()
def get_mean(x):
    m = np.mean(x)
    return m
@njit()
def get_var(x):
    m = np.var(x)
    return m
@njit()
def get_mean(x):
    m = np.mean(x)
    return m
@njit()
def get_sum(x):
    m = x.sum()
    return m 
@njit()
def get_median(x):
    m = np.median(x)
    return m 
@njit()
def get_std(x):
    m = np.median(x)
    return m 
@njit()
def get_rng(x):
    n = np.max(x)
    m = np.min(x)
    z = n-m
    return z 

def get_rms(x, axis=None):
    return np.sqrt(np.mean(x ** 2, axis=axis))

def calc_sma_for_window(data):
    return np.sum(data) / len(data)


def calc_sma_adv_for_window(data):
    return np.sum(data - np.mean(data) / len(data))


def calc_absolutes_for_list(list):
    return ([abs(i) for i in list])

def get_sma(data): 
    sma_sim = calc_sma_for_window(data)
    sma_adv = calc_sma_adv_for_window(data)

    sma_sim_abs = calc_sma_for_window(calc_absolutes_for_list(data))
    sma_adv_abs = calc_sma_adv_for_window(calc_absolutes_for_list(data))

    return sma_sim, sma_adv, sma_sim_abs, sma_adv_abs

def get_entropy(Y):
    """
    Also known as Shanon Entropy
    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    unique, count = np.unique(Y, return_counts=True, axis=0)
    prob = count/len(Y)
    en = np.sum((-1)*prob*np.log2(prob))
    return en

In [7]:
def Energy(frame):
    return sum( [ abs(x)**2 for x in frame ] ) / len(frame)

# Feature extraction and Vector_Normalization on signal

In [8]:
def Features(window):# mean, std,max,min and zero-crossing-rate
    wind = window.iloc[:, :-1]
    win = np.array(wind)
    
    
    features = []
    
    features.append(get_mean(win))
    features.append(get_median(win))
    features.append(get_std(win))
    features.append(get_min(win))
    features.append(get_max(win))
    features.append(get_sum(win))
    features.append(get_entropy(win))
    mean_crossing = [mean_crossing_rate(win[:, i]) for i in range(win.shape[1])]
    features.append(np.array(mean_crossing))
    IQR = iqr(win)
    features.append(np.array(IQR))
    energy_measure = Energy(win)
    features.append(np.array(energy_measure))
 

    
    features = np.hstack(features).tolist()
    
    label = window.iloc[:, -1].mode()[0]  ## select the most frequent label as the label of the window
    features.append(label)
    return features

In [9]:
def windowing_dataset(dataset, win_size, feature_extraction_function, subject_id, overlap=False):
    windowed_dataset = []
    win_count = 0
    if overlap:
        step_size = sliding_size  # for Overlapping technique
    else:
        step_size = win_size  # for Non-overlapping technique

    for index in range(0, dataset.shape[0], step_size):
        start = index
        end = start + win_size
        # to assure all of windows are equal in size
        if (end <= dataset.shape[0]):
            window = dataset.iloc[start:end, :].reset_index(drop=True)
            win_count = win_count + 1
            features = feature_extraction_function(window)

            windowed_dataset.append(features)

    final = pd.DataFrame(windowed_dataset)
    final.insert(0, 'group', subject_id)  # to use in Subject CV
    return final

In [10]:
def Preprocessing(dataset_path, overlapping):
    feature_function = Features
    win_size = 3

    print("Start for win size {}".format(win_size))
    datapoints_per_window = int(win_size * sample_rate)

    print(feature_function.__name__)

    ALL = []
   
    for subject in range(1,18):
        file_path = dataset_path + '\subject{0}_self.csv'.format(subject)
        all_cols = []
        
        
#########################################################################################################################
        for i in range(2, 117, 13):# indices of accelarations
            indices = list(range(i, i + 13))
            all_cols.extend(indices)

        all_cols.append(119)  # label index

        tmp_db = pd.read_csv(file_path, header=None, usecols=all_cols, sep='\t')
        tmp_db.columns = list(range(tmp_db.shape[1]))  # re-index the columns
        X = tmp_db.iloc[:, 0:-1]
        X = np.array(X)
    
        Y = tmp_db.iloc[:, -1]
        Y = np.array(Y)
        data_normalized = preprocessing.normalize(X, norm  = 'l1') # normalization on signal
        X = data_normalized
        X = pd.DataFrame(X)
        Y = pd.DataFrame(Y)
        tmp_db = pd.concat([X, Y], axis=1)
        transformed_db = windowing_dataset(tmp_db, datapoints_per_window, feature_function, subject,
                                                   overlap=overlapping)

        ALL.append(transformed_db)
#########################################################################################################################
       
    #final_dataset = pd.DataFrame()
    ALL_dataset = pd.DataFrame()
    ALL_dataset = ALL_dataset.append(ALL, ignore_index=True)
    
    return ALL_dataset
   

In [11]:
def subject_cross_validation(X, Y, groups, classifier):
    f1 = []
    logo = LeaveOneGroupOut()
    i = 0
    for train_index, test_index in logo.split(X, Y, groups=groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        i += 1
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        f = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
        
        print('Model-',i ,' -',' f1 score: ', f)
       
        f1.append(f)
    return np.mean(f1)

In [12]:
def classifier(dataset, model):
    results = dict()
    win_size = float(3)
    print('window_size = ', win_size,' sec')

    dataset = dataset
    groups = dataset['group']
    X = dataset.iloc[:, 1:-1]
    X = np.array(X)
    
    Y = dataset.iloc[:, -1]
    Y = np.array(Y)

    for model_name, mod in model.items():
        f1 = 0

        f1 = subject_cross_validation(X, Y, groups, mod)

        if win_size in results:
            results[win_size].append(f1)
        else:
            results[win_size] = [f1]


        results = collections.OrderedDict(sorted(results.items()))

        final = []
        col = list(model.keys())
        col.insert(0, "window-size")
        final.append(col)
        for k, v in results.items():
            tmp = []
            tmp.append([k])
            tmp.append(v)
            flattened = [val for sublist in tmp for val in sublist]
            final.append(flattened)

    accuracy = final[1][1]
    
    return accuracy

In [13]:
mod = {'RF': RandomForestClassifier(n_estimators=40, random_state=42, n_jobs=-1)}

In [14]:
dataset_csv_path = r"D:\projec\proj\data_self_null"

overlapping = 1  # input 0 for non overlapping, 1 for overlapping

ALL = Preprocessing(dataset_path=dataset_csv_path, overlapping=bool(int(overlapping)))

Start for win size 3
Features


In [15]:
ALL.head(10)

Unnamed: 0,group,0,1,2,3,4,5,6,7,8,...,233,234,235,236,237,238,239,240,241,242
0,1,-0.001633,0.00054,0.00054,-0.161538,0.118863,-28.662614,7.228819,53.0,72.0,...,1.806114e-06,4.747233e-07,2.2e-05,4.985885e-07,5.342227e-07,1.1e-05,1.1e-05,1.3e-05,1e-05,0
1,1,-0.001606,0.00059,0.00059,-0.161538,0.118863,-28.188735,7.228819,40.0,72.0,...,1.81341e-06,4.980738e-07,2.2e-05,3.60856e-07,7.304786e-07,1e-05,1.2e-05,1.3e-05,9e-06,0
2,1,-0.001724,0.000544,0.000544,-0.161538,0.118863,-30.252523,7.228819,43.0,76.0,...,1.797857e-06,5.242552e-07,2.2e-05,2.754812e-07,9.325743e-07,9e-06,1.3e-05,1.3e-05,9e-06,0
3,1,-0.001764,0.000537,0.000537,-0.161538,0.118863,-30.95555,7.228819,69.0,84.0,...,1.648892e-06,5.272749e-07,2.2e-05,1.652461e-07,1.13786e-06,9e-06,1.4e-05,1.3e-05,9e-06,0
4,1,-0.001775,0.000538,0.000538,-0.161538,0.075149,-31.157406,7.228819,67.0,84.0,...,1.543571e-06,4.698655e-07,2.2e-05,1.081255e-07,1.344332e-06,9e-06,1.4e-05,1.3e-05,8e-06,0
5,1,-0.001785,0.00053,0.00053,-0.161538,0.075149,-31.32788,7.228819,64.0,86.0,...,9.751777e-07,4.00056e-07,2.1e-05,5.670264e-08,1.5591e-06,8e-06,1.5e-05,1.3e-05,8e-06,0
6,1,-0.001741,0.000525,0.000525,-0.161538,0.075149,-30.557354,7.228819,56.0,92.0,...,3.479792e-07,3.795153e-07,2.1e-05,2.831761e-08,1.708452e-06,8e-06,1.5e-05,1.3e-05,8e-06,0
7,1,-0.001669,0.000519,0.000519,-0.161538,0.075149,-29.289748,7.228819,33.0,94.0,...,7.347687e-08,3.372816e-07,2e-05,1.850317e-08,1.810202e-06,8e-06,1.5e-05,1.3e-05,7e-06,0
8,1,-0.001572,0.00051,0.00051,-0.07547,0.070304,-27.596964,7.228819,21.0,89.0,...,5.958167e-08,2.055966e-07,2.1e-05,1.720221e-08,1.888827e-06,8e-06,1.5e-05,1.3e-05,7e-06,0
9,1,-0.00149,0.000496,0.000496,-0.07547,0.070304,-26.149079,7.228819,22.0,88.0,...,2.719066e-08,1.940452e-07,2.1e-05,2.356697e-08,1.9322e-06,8e-06,1.5e-05,1.3e-05,8e-06,0


In [16]:
print('ALL_accuracy = ', )
ALL_accuracy = classifier(dataset=ALL, model=mod)
ALL_accuracy 

ALL_accuracy = 
window_size =  3.0  sec
Model- 1  -  f1 score:  0.8026784854032181
Model- 2  -  f1 score:  0.8744911804613297
Model- 3  -  f1 score:  0.8114098487038643
Model- 4  -  f1 score:  0.9094544872820349
Model- 5  -  f1 score:  0.798965873836608
Model- 6  -  f1 score:  0.68929173693086
Model- 7  -  f1 score:  0.9290284476818584
Model- 8  -  f1 score:  0.9374068554396423
Model- 9  -  f1 score:  0.917526591522464
Model- 10  -  f1 score:  0.9309613928841787
Model- 11  -  f1 score:  0.8578856152512998
Model- 12  -  f1 score:  0.8978134270959235
Model- 13  -  f1 score:  0.7099760027425437
Model- 14  -  f1 score:  0.8811114079615282
Model- 15  -  f1 score:  0.8405354415596084
Model- 16  -  f1 score:  0.821387171561051
Model- 17  -  f1 score:  0.9586045565500407


0.856972266051062

In [17]:
d ={'ALL': [ALL_accuracy]}

In [18]:
df_accuracy = pd.DataFrame(data=d)

In [19]:
df_accuracy

Unnamed: 0,ALL
0,0.856972
