In [36]:
import os
from glob import glob

import pandas as pd

# Reading data from our CSV files that are stored inside this repository (single files)

try:
    single_file_accelerometer = pd.read_csv(
        '../../data/raw/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv')
    single_file_gyroscope = pd.read_csv(
        '../../data/raw/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv')
except FileNotFoundError:
    print("Files that you try to use are not available.")

# Collect all CSV files from data/raw/MetaMotion; they are processed as a list later.
# Only files with the .csv extension match the pattern, anything else in the folder is ignored.

data_path = '../../data/raw/MetaMotion/'

# glob returns paths in arbitrary, OS-dependent order. get_data_from_files numbers the sets
# in the order it receives the files, so sort the list to keep those numbers reproducible.
files = sorted(glob(data_path + '*.csv'))

# Fail with a clear message instead of an IndexError on files[0] when the folder is empty.
if not files:
    raise FileNotFoundError("No CSV files found in " + data_path)

first_file = files[0]

def _combine_sensor_frames(frames):
    """Concatenate the per-file frames of one sensor and index them by epoch timestamp."""
    if not frames:
        # No file of this sensor type was supplied: return an empty frame instead of
        # failing on the missing 'epoch (ms)' column.
        return pd.DataFrame()

    combined = pd.concat(frames)

    # Epoch is the number of milliseconds since 1 January 1970 (UTC), so it does not
    # depend on the time zone the recording was made in.
    combined.index = pd.to_datetime(combined['epoch (ms)'], unit='ms')

    # The remaining time columns duplicate the information now held by the index.
    return combined.drop(columns=['epoch (ms)', 'time (01:00)', 'elapsed (s)'])


def get_data_from_files(files):
    """Read MetaMotion CSV files into one accelerometer and one gyroscope data frame.

    The file name (not its directory) is parsed to label every row, e.g.
    ``A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv``
    gives participant ``A``, exercise ``bench`` and category ``heavy``.

    Args:
        files: Iterable of paths to CSV files. A file is treated as accelerometer and/or
            gyroscope data when its name contains ``Accelerometer`` / ``Gyroscope``;
            other files are read but not included in the result.

    Returns:
        A tuple ``(accelerometer_data_frame, gyroscope_data_frame)``. Each frame is
        indexed by the timestamp built from ``epoch (ms)`` and carries the sensor
        columns followed by ``participant``, ``exercise``, ``category`` and ``set``.
        ``set`` is a per-sensor counter starting at 1 that increases with every file,
        in the order the files are given. A sensor without any file yields an empty frame.

    Raises:
        IndexError: If a file name has fewer than three ``-`` separated parts.
        KeyError: If a file lacks one of the ``epoch (ms)``, ``time (01:00)`` or
            ``elapsed (s)`` columns.
    """
    accelerometer_frames = []
    gyroscope_frames = []

    for file in files:
        # Parse the bare file name only, so directory names (and the path separator
        # used by the operating system) cannot leak into the extracted values.
        file_name = os.path.basename(file)
        name_parts = file_name.split('-')

        participant = name_parts[0]
        exercise = name_parts[1]
        # The third part is e.g. 'heavy2' or 'medium1_MetaWear_2019': cut the
        # '_MetaWear_...' tail first, then remove the trailing set number.
        category = name_parts[2].split('_')[0].rstrip('123')

        data_frame = pd.read_csv(file)

        # Adding new columns inside the data frame
        data_frame['participant'] = participant
        data_frame['exercise'] = exercise
        data_frame['category'] = category

        # assign() returns a copy, so the frame stored for one sensor is never
        # modified afterwards. Collecting frames in lists and concatenating once
        # avoids re-copying all previous data on every iteration.
        if 'Accelerometer' in file_name:
            accelerometer_frames.append(data_frame.assign(set=len(accelerometer_frames) + 1))

        if 'Gyroscope' in file_name:
            gyroscope_frames.append(data_frame.assign(set=len(gyroscope_frames) + 1))

    accelerometer_data_frame = _combine_sensor_frames(accelerometer_frames)
    gyroscope_data_frame = _combine_sensor_frames(gyroscope_frames)

    return accelerometer_data_frame, gyroscope_data_frame

accelerometer_data_frame, gyroscope_data_frame = get_data_from_files(files)


# Merge both sensors into one dataset: the first three accelerometer columns (the axes)
# followed by every gyroscope column (its axes plus participant, exercise, category, set).

dataset = pd.concat([accelerometer_data_frame.iloc[:, :3], gyroscope_data_frame], axis=1)

# No dropna() at this point: the sensors are recorded at different rates (12.5 Hz and 25 Hz
# according to the file names), so rows usually hold values of only one sensor and dropping
# incomplete rows here would throw most of the data away. Missing values are removed after
# resampling below.

# Columns are renamed by position, so the names must follow the order in which
# get_data_from_files created the metadata columns: participant, exercise, category, set.
# The exercise is stored under the name 'label'.
dataset.columns = [
    'acc_x',
    'acc_y',
    'acc_z',
    'gyr_x',
    'gyr_y',
    'gyr_z',
    'participant',
    'label',
    'category',
    'set',
]

# Keep the column layout of the exported file: label, category, participant, set.
dataset = dataset[[
    'acc_x',
    'acc_y',
    'acc_z',
    'gyr_x',
    'gyr_y',
    'gyr_z',
    'label',
    'category',
    'participant',
    'set',
]]

# Resample data (frequency conversion): sensor values are averaged per time bin,
# the metadata columns take the last value seen in the bin.

sampling = {
    'acc_x': 'mean',
    'acc_y': 'mean',
    'acc_z': 'mean',
    'gyr_x': 'mean',
    'gyr_y': 'mean',
    'gyr_z': 'mean',
    'label': 'last',
    'category': 'last',
    'participant': 'last',
    'set': 'last',
}

# Resample one calendar day at a time and drop the 200 ms bins that contain missing values.
days = [day_frame for _, day_frame in dataset.groupby(pd.Grouper(freq="D"))]

data_resampled = pd.concat([df.resample(rule="200ms").apply(sampling).dropna() for df in days])

# 'set' becomes float while NaNs are present; convert it back to an integer.
data_resampled['set'] = data_resampled['set'].astype('int')

data_resampled.info()


# Exporting dataset

data_resampled.to_pickle('../../data/interim/01_data_processed.pkl')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acc_x        9009 non-null   float64
 1   acc_y        9009 non-null   float64
 2   acc_z        9009 non-null   float64
 3   gyr_x        9009 non-null   float64
 4   gyr_y        9009 non-null   float64
 5   gyr_z        9009 non-null   float64
 6   label        9009 non-null   object 
 7   category     9009 non-null   object 
 8   participant  9009 non-null   object 
 9   set          9009 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 774.2+ KB
