In [1]:
import glob
import pandas as pd
import numpy as np
import os, sys
import collections

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier 

In [3]:
from numba.typed import List
from numba import jit, njit, vectorize

In [4]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [5]:
sample_rate = 50  # number of observation per second based on dataset documentation(150 samples in 3 second)

sliding_size = int((1/3) * sample_rate)  # number of skipped datapoints to start next window
print(sliding_size)

16


# Feature sets

In [6]:
@njit()
def mean_crossing_rate(col):
    # col = np.array(values)
    normalized = col - col.mean()  # to make elements of array possitive or negetive
    return ((normalized[:-1] * col[1:]) < 0).sum()  # Zero-Crossing_rate

@njit()
def iqr(window):  # inter-quartile range
    Q1 = np.median(window[:len(window)//2])  # First quartile (Q1) 
    Q3 = np.median(window[len(window)//2:])  # Third quartile (Q3) 
    IQR = Q3 - Q1 # Interquartile range (IQR) 
    return(IQR) 
@njit()
def calc_sma_for_window(data):
    return np.sum(data) / len(data)  
@njit()
def get_min(x):
    m = np.min(x)
    return m
@njit()
def get_max(x):
    m = np.max(x)
    return m
@njit()
def get_mean(x):
    m = np.mean(x)
    return m
@njit()
def get_var(x):
    m = np.var(x)
    return m
@njit()
def get_mean(x):
    m = np.mean(x)
    return m
@njit()
def get_sum(x):
    m = x.sum()
    return m 
@njit()
def get_median(x):
    m = np.median(x)
    return m 
@njit()
def get_std(x):
    m = np.median(x)
    return m 



In [7]:
def Energy(frame):
    return sum( [ abs(x)**2 for x in frame ] ) / len(frame)

# Histogram_equalization on signal

In [8]:
def histogram_equalize(im):#https://www.researchgate.net/publication/281118372_NumPy_SciPy_Recipes_for_Image_Processing_Intensity_Normalization_and_Histogram_Equalization
    data = im.copy().flatten()
    hist, bins = np.histogram(data, 256, density=True)
    cdf = hist.cumsum()#normalized cumulative histogram H using NumPy’s inbuilt function cumsum
    cdf = 255*cdf/cdf[-1]  #intensity transformation
    img_eq = np.interp(data, bins[:-1], cdf)
    return img_eq.reshape(im.shape)

In [9]:
def Features(window):# mean, std,max,min and zero-crossing-rate
    wind = window.iloc[:, :-1]
    winn = np.array(wind)
    win = histogram_equalize(winn) # histogram normalization on signal
    
    features = []
    
    features.append(get_mean(win))
    features.append(get_median(win))
    features.append(get_var(win))
    features.append(get_std(win))
    features.append(get_min(win))
    features.append(get_max(win))
    features.append(get_sum(win))
    mean_crossing = [mean_crossing_rate(win[:, i]) for i in range(win.shape[1])]
    features.append(np.array(mean_crossing))
    IQR = iqr(win)
    features.append(np.array(IQR))
    energy_measure = Energy(win)
    features.append(np.array(energy_measure))
 

    features = np.hstack(features).tolist()

    label = window.iloc[:, -1].mode()[0]  ## select the most frequent label as the label of the window
    features.append(label)
    return features

In [10]:
def windowing_dataset(dataset, win_size, feature_extraction_function, subject_id, overlap=False):
    windowed_dataset = []
    win_count = 0
    if overlap:
        step_size = sliding_size  # for Overlapping technique
    else:
        step_size = win_size  # for Non-overlapping technique

    for index in range(0, dataset.shape[0], step_size):
        start = index
        end = start + win_size
        # to assure all of windows are equal in size
        if (end <= dataset.shape[0]):
            window = dataset.iloc[start:end, :].reset_index(drop=True)
            win_count = win_count + 1
            features = feature_extraction_function(window)

            windowed_dataset.append(features)

    final = pd.DataFrame(windowed_dataset)
    final.insert(0, 'group', subject_id)  # to use in Subject CV
    return final

In [11]:
def Preprocessing(dataset_path, overlapping):
    feature_function = Features
    win_size = 3
    #for win_size in win_sizes:
    print("Start for win size {}".format(win_size))
    datapoints_per_window = int(win_size * sample_rate)

 
    print(feature_function.__name__)

    windowed_dataset = []

    for subject in range(1,18):
        file_path = dataset_path + '\subject{0}_ideal.csv'.format(subject)
        acc_cols = []
        for i in range(8, 117, 13):# indices of accelarations
            indices = list(range(i, i + 3))
            acc_cols.extend(indices)

        acc_cols.append(119)  # label index

        tmp_db = pd.read_csv(file_path, header=None, usecols=acc_cols, sep='\t')
        tmp_db.columns = list(range(tmp_db.shape[1]))  # re-index the columns
        
        transformed_db = windowing_dataset(tmp_db, datapoints_per_window, feature_function, subject,
                                                   overlap=overlapping)

        windowed_dataset.append(transformed_db)

    final_dataset = pd.DataFrame()
    
    final_dataset = final_dataset.append(windowed_dataset, ignore_index=True)
    return final_dataset
   

In [12]:
def subject_cross_validation(X, Y, groups, classifier):
    f1 = []
    logo = LeaveOneGroupOut()
    i = 0
    for train_index, test_index in logo.split(X, Y, groups=groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        i += 1
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        f = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
        
        print('Model-',i ,' -',' f1 score: ', f)
       
        f1.append(f)
    return np.mean(f1)

In [13]:
def apply_classifiers(dataset, model):
    results = dict()
    win_size = float(3)
    print('window_size = ', win_size,' sec')

    dataset = dataset
    groups = dataset['group']
    X = dataset.iloc[:, 1:-1].values

    Y = dataset.iloc[:, -1].values


    for model_name, mod in model.items():
        f1 = 0


        f1 = subject_cross_validation(X, Y, groups, mod)
        
        if win_size in results:
            results[win_size].append(f1)
        else:
            results[win_size] = [f1]

 
        results = collections.OrderedDict(sorted(results.items()))

        final = []
        col = list(model.keys())
        col.insert(0, "window-size")
        final.append(col)
        for k, v in results.items():
            tmp = []
            tmp.append([k])
            tmp.append(v)
            flattened = [val for sublist in tmp for val in sublist]
            final.append(flattened)

        print('accuracy : ', final[1][1])

In [14]:
model = {'RF': RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)}

In [15]:
dataset_csv_path = r"D:\projec\proj\data"

overlapping = 1   # input 0 for non overlapping, 1 for overlapping

df = Preprocessing(dataset_path=dataset_csv_path, overlapping=bool(int(overlapping)))

df.head(10)

Start for win size 3
Features


Unnamed: 0,group,0,1,2,3,4,5,6,7,8,...,53,54,55,56,57,58,59,60,61,62
0,1,129.275471,129.338097,5408.886601,129.338097,0.503704,255.0,523565.657988,82.0,80.0,...,47377.745431,10371.682363,11545.408099,62506.882653,12624.7323,21231.611811,50725.050814,27778.792211,13756.723181,1
1,1,129.299155,129.436139,5406.098262,129.436139,0.62963,255.0,523661.576694,71.0,82.0,...,47355.307623,10568.000789,11044.208605,62582.721894,13249.39926,21425.593507,51026.296781,28841.06215,9894.540042,1
2,1,129.269826,128.915459,5408.045394,128.915459,0.503704,255.0,523542.795836,66.0,76.0,...,47334.856877,11037.249221,9726.583998,62289.145128,13783.489462,24107.060378,52427.698459,27747.346119,10427.404324,1
3,1,129.314295,129.709394,5405.383461,129.709394,0.062963,255.0,523722.894027,55.0,74.0,...,46854.394187,11485.275829,8266.62394,62398.140651,13638.942024,25290.752899,53402.343486,26183.511052,14457.992016,1
4,1,129.712363,131.062825,5408.145664,131.062825,0.062963,255.0,525335.068207,48.0,34.0,...,46558.013691,10988.153576,8663.806729,62415.534716,12914.324272,24724.611646,53397.488924,25937.496049,16217.049234,1
5,1,129.949156,130.198641,5408.880163,130.198641,0.251852,255.0,526294.082985,88.0,40.0,...,46238.645965,9747.774331,10112.292525,62190.498898,11667.786575,22878.996179,51789.100264,25750.516115,15432.25731,1
6,1,129.978749,128.742802,5408.882754,128.742802,0.251852,255.0,526413.934151,115.0,130.0,...,44836.417525,9011.976898,10013.396954,62065.34413,11055.952131,21393.606529,50083.369571,25590.077404,13629.517629,1
7,1,129.91246,128.404269,5419.886365,128.404269,0.251852,255.0,526145.462302,102.0,119.0,...,43792.968669,9103.909587,9663.493399,61601.216493,11553.855512,21630.596104,50251.602498,25808.438,10730.906952,1
8,1,130.065403,128.813694,5421.319106,128.813694,0.251852,255.0,526764.880604,87.0,110.0,...,42245.74736,9229.789607,8335.625172,60591.575301,11384.391004,22738.883859,49507.491042,24491.204,12440.26006,1
9,1,130.074689,128.894727,5413.147163,128.894727,0.251852,255.0,526802.491178,74.0,110.0,...,40648.212573,9355.315776,7388.163943,59927.098993,10766.54266,21906.301323,49211.680138,23503.63756,13566.385718,1


In [16]:
dataset = df

apply_classifiers(dataset=dataset, model=model)

window_size =  3.0  sec
Model- 1  -  f1 score:  0.7741258741258741
Model- 2  -  f1 score:  0.7278815650334861
Model- 3  -  f1 score:  0.7752
Model- 4  -  f1 score:  0.8733363928407526
Model- 5  -  f1 score:  0.7241379310344829
Model- 6  -  f1 score:  0.7472222222222223
Model- 7  -  f1 score:  0.9778393351800554
Model- 8  -  f1 score:  0.8003072196620584
Model- 9  -  f1 score:  0.8748717948717949
Model- 10  -  f1 score:  0.8166609764424719
Model- 11  -  f1 score:  0.9009375
Model- 12  -  f1 score:  0.8885050251256281
Model- 13  -  f1 score:  0.8346034434095401
Model- 14  -  f1 score:  0.883843717001056
Model- 15  -  f1 score:  0.8284971849285405
Model- 16  -  f1 score:  0.7059897735573412
Model- 17  -  f1 score:  0.8079117535184481
accuracy :  0.8201101005266914
