In [1]:
import glob

import pandas as pd
import numpy as np
import os, sys

In [2]:
from numba.typed import List
from numba import jit, njit, vectorize

In [3]:
# Sampling frequency of the recordings in Hz (observations per second),
# taken from the dataset documentation.
sample_rate = 50

# Stride, in datapoints, between the starts of two consecutive overlapping
# windows: one third of a second of data, truncated to a whole number.
sliding_size = int(sample_rate / 3)
print(sliding_size)

16


# Feature sets

In [4]:
@njit()
def mean_crossing_rate(col):
    """Count how many times a 1-D signal crosses its own mean.

    The signal is centred on its mean; every pair of consecutive centred
    samples whose product is negative (opposite signs) is one crossing.

    Parameters
    ----------
    col : numpy.ndarray
        One-dimensional array of samples.

    Returns
    -------
    int
        Number of consecutive sample pairs lying on opposite sides of the
        mean. This is a count, not a rate normalised by the length.
    """
    centered = col - col.mean()  # samples become positive or negative relative to the mean
    # Both factors must come from the centred signal: multiplying by the raw
    # samples would compare against zero instead of the mean and miss
    # crossings whenever the signal has a non-zero offset.
    return ((centered[:-1] * centered[1:]) < 0).sum()

@njit()
def iqr(window):  # inter-quartile range
    """Return the inter-quartile range (Q3 - Q1) of a 1-D window.

    The samples are sorted first, then Q1 is the median of the lower half and
    Q3 the median of the upper half. Both halves have ``len(window) // 2``
    elements, so for an odd number of samples the middle value belongs to
    neither half.

    Parameters
    ----------
    window : numpy.ndarray
        One-dimensional array of samples; it is not modified.

    Returns
    -------
    float
        Difference between the third and the first quartile.
    """
    # Quartiles are only meaningful on ordered data; np.sort returns a sorted
    # copy, so the caller's array keeps its original order.
    ordered = np.sort(window)
    half = len(ordered) // 2
    q1 = np.median(ordered[:half])  # first quartile (Q1)
    q3 = np.median(ordered[len(ordered) - half:])  # third quartile (Q3)
    return q3 - q1
@njit()
def calc_sma_for_window(data):
    """Return the sum of all elements of ``data`` divided by ``len(data)``.

    ``len`` counts entries along the first axis only, so for a 1-D array this
    is the arithmetic mean of the samples.
    """
    total = np.sum(data)
    count = len(data)
    return total / count
@njit()
def get_min(x):
    """Return the smallest element of ``x``."""
    return np.min(x)
@njit()
def get_max(x):
    """Return the largest element of ``x``."""
    return np.max(x)
@njit()
def get_mean(x):
    """Return the arithmetic mean over all elements of ``x``."""
    return np.mean(x)
@njit()
def get_var(x):
    """Return the population variance (``np.var``) over all elements of ``x``."""
    return np.var(x)
@njit()
def get_sum(x):
    """Return the sum over all elements of the array ``x``."""
    return x.sum()
@njit()
def get_median(x):
    """Return the median over all elements of ``x``."""
    return np.median(x)
@njit()
def get_std(x):
    """Return the population standard deviation over all elements of ``x``.

    Uses ``np.std`` with its default of zero delta degrees of freedom.
    """
    # The standard deviation, not the median, is what the name promises.
    return np.std(x)

In [5]:
def Energy(frame):
    """Return the mean of the squared magnitudes of the samples in ``frame``.

    ``frame`` must be a sized iterable; an empty one raises
    ``ZeroDivisionError``.
    """
    squared_magnitudes = (abs(sample) ** 2 for sample in frame)
    return sum(squared_magnitudes) / len(frame)

In [6]:
def FS3(window):
    """Extract the FS3 feature vector from one window.

    The features are, in order: mean, median and standard deviation of the
    power spectrum, followed by mean, median and standard deviation of the raw
    signal. Each statistic is a single scalar computed over all sensor columns
    together. The window label is appended as the last element.

    Parameters
    ----------
    window : pandas.DataFrame
        One window of data. Rows are consecutive samples; every column except
        the last holds sensor readings and the last column holds the label.

    Returns
    -------
    list
        Six feature values followed by the most frequent label in the window.
    """
    # Select the sensor columns only. Slicing the frame with ``[:-1]`` would
    # drop the last *row* and keep the label column, leaking the target into
    # every statistic computed below.
    signal = np.ascontiguousarray(window.iloc[:, :-1], dtype=float)

    # Rows are time steps, so the transform has to run along axis 0; the
    # default (last axis) would mix the sensor columns instead.
    fourier = np.fft.fft(signal, axis=0)
    fft_ps = np.ascontiguousarray(np.abs(fourier) ** 2)  # power spectrum

    features = [
        get_mean(fft_ps),
        get_median(fft_ps),
        get_std(fft_ps),
        get_mean(signal),
        get_median(signal),
        get_std(signal),
    ]
    features = np.hstack(features).tolist()

    # Select the most frequent label as the label of the window.
    label = window.iloc[:, -1].mode()[0]
    features.append(label)
    return features

In [7]:
def windowing_dataset(dataset, win_size, feature_extraction_function, subject_id, overlap=False):
    """Cut ``dataset`` into fixed-size windows and extract features from each.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to segment, in their original order.
    win_size : int
        Number of rows in every window.
    feature_extraction_function : callable
        Called with each window (a DataFrame whose index restarts at 0); its
        return value becomes one row of the result.
    subject_id
        Value stored in the ``'group'`` column of every row (used for
        subject-wise cross-validation).
    overlap : bool, default False
        When true, consecutive windows start ``sliding_size`` rows apart (a
        notebook-level setting); otherwise they start ``win_size`` rows apart.

    Returns
    -------
    pandas.DataFrame
        One row per complete window, with ``'group'`` as the first column.
    """
    stride = sliding_size if overlap else win_size
    n_rows = dataset.shape[0]

    rows = [
        feature_extraction_function(
            dataset.iloc[first:first + win_size, :].reset_index(drop=True)
        )
        for first in range(0, n_rows, stride)
        if first + win_size <= n_rows  # a trailing partial window is discarded
    ]

    final = pd.DataFrame(rows)
    final.insert(0, 'group', subject_id)  # to use in Subject CV
    return final

In [8]:
def Preprocessing(dataset_path, output_path, overlapping):
    """Build the windowed FS3 feature dataset for all subjects and save it.

    For each of the subjects 1..17 the file ``subject<N>_ideal.csv`` is read
    from ``dataset_path``, segmented into 3-second windows and turned into
    FS3 features. The per-subject results are concatenated and written as a
    tab-separated file to
    ``<output_path>/QUAD_COMBINED/FS3/data<win_size>.csv``.

    Parameters
    ----------
    dataset_path : str
        Directory containing the raw per-subject files.
    output_path : str
        Directory under which the result folders are created.
    overlapping : bool
        True for overlapping sliding windows, False for non-overlapping ones.
    """
    feature_function = FS3
    win_size = 3  # window length in seconds
    print("Start for win size {}".format(win_size))
    datapoints_per_window = int(win_size * sample_rate)

    print(feature_function.__name__)

    # The column selection is identical for every subject, so build it once:
    # nine groups of four consecutive columns (described in the original code
    # as the acceleration indices), plus the label column.
    acc_cols = []
    for i in range(11, 117, 13):
        acc_cols.extend(range(i, i + 4))
    acc_cols.append(119)  # label index

    windowed_dataset = []
    for subject in range(1, 18):
        # os.path.join avoids the invalid '\s' escape and the hard-coded
        # Windows separator of a hand-built path.
        file_path = os.path.join(dataset_path, 'subject{0}_ideal.csv'.format(subject))

        tmp_db = pd.read_csv(file_path, header=None, usecols=acc_cols, sep='\t')
        tmp_db.columns = list(range(tmp_db.shape[1]))  # re-index the columns

        transformed_db = windowing_dataset(tmp_db, datapoints_per_window, feature_function, subject,
                                           overlap=overlapping)
        windowed_dataset.append(transformed_db)

    print("Merging!")
    # DataFrame.append no longer exists in pandas 2.x; concat does the same job.
    final_dataset = pd.concat(windowed_dataset, ignore_index=True)

    out_folder_name = 'QUAD_COMBINED'
    out_dir = os.path.join(output_path, out_folder_name, feature_function.__name__)
    os.makedirs(out_dir, exist_ok=True)  # also creates the parent folders

    # The file name carries the window size, e.g. data3.csv.
    out_file = os.path.join(out_dir, 'data{}.csv'.format(win_size))
    final_dataset.to_csv(out_file, sep='\t', index=False)


In [9]:
'''

 - Reads the raw data from dataset_path
 - Segments the raw datasets into sliding windows (this notebook uses a single window size of 3 seconds)
 - From each window it extracts the FS3 feature set (FS1 and FS2 are not computed in this notebook).
 - Saves results in output_path. 
 
  Parameters:
    -----------
    dataset_path : Path of raw dataset
    
    output_path : Path to save the processed dataset
    
    overlapping : Controls the sliding windows technique;
    1: Overlapping sliding windows
    0: Non-overlapping sliding windows

    

'''

'\n\n - Reads the raw data from input_path\n - Segments the raw datasets into windowed ones by different window sizes  \n - From each window it extracts FS1,FS2 and FS3.\n - Saves results in output_path. \n \n  Parameters:\n    -----------\n    dataset_path : Path of raw dataset\n    \n    output_path : Path to save the processed dataset\n    \n    overlapping : Controls the sliding windows technique;\n    1: Overlapping sliding windows\n    0: Non-overlapping sliding windows\n\n    \n\n'

In [10]:
# Folder holding the raw per-subject files, and folder receiving the results.
input_path = r"D:\projec\proj\data"
output_path = r"D:\projec\proj\result"
# 1: overlapping sliding windows, 0: non-overlapping sliding windows.
overlapping = 1

Preprocessing(
    dataset_path=input_path,
    output_path=output_path,
    overlapping=bool(int(overlapping)),
)


Start for win size 3
FS3
Merging!
