In [1]:
import pandas as pd
import os
import zipfile
from datetime import datetime
from tqdm import tqdm
from joblib import Parallel, delayed
import math

In [3]:
def try_convert_to_float(value):
    """Convert ``value`` to ``float``, or return ``None`` if it cannot be converted.

    Parameters
    ----------
    value : object
        Anything accepted by the built-in ``float`` (numbers, numeric strings, ...).

    Returns
    -------
    float or None
        The converted number, or ``None`` when ``float(value)`` fails.
    """
    try:
        return float(value)
    # Only conversion failures are swallowed; a bare ``except`` would also hide
    # KeyboardInterrupt / SystemExit and make long runs impossible to stop.
    except (TypeError, ValueError, OverflowError):
        return None

In [None]:
def unpack_zip_files(main_user_path):
    """Extract every ZIP archive located directly inside ``main_user_path``.

    Archives are extracted into ``main_user_path`` itself. Entries that are not
    ZIP archives (sub-directories, already extracted folders, other files) are
    skipped, so the function can safely be run again on the same directory.

    Parameters
    ----------
    main_user_path : str
        Directory containing the archives of a single user.
    """
    for entry_name in os.listdir(main_user_path):
        entry_path = os.path.join(main_user_path, entry_name)
        # Opening a directory or a non-archive with ZipFile raises, so filter first.
        if not os.path.isfile(entry_path) or not zipfile.is_zipfile(entry_path):
            continue
        with zipfile.ZipFile(entry_path, 'r') as zip_ref:
            zip_ref.extractall(main_user_path)

In [4]:
def count_mean_minute_hr(df):
    """Compute the rounded mean heart rate per minute, separately for each device.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``hr`` (numeric), ``ts`` (strings formatted as
        ``%Y-%m-%dT%H:%M:%S:%f``) and ``device``. Only rows whose device is
        ``'samsung'`` or ``'polar'`` are used. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts`` (timestamp truncated to the minute), ``hr`` (rounded
        mean) and ``device``. Samsung rows come first, then Polar rows, each
        group sorted by minute. An empty frame with these columns is returned
        when there is nothing to aggregate.
    """
    result_columns = ['ts', 'hr', 'device']
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # Vectorised parse + truncation to the minute instead of a per-row strptime.
    minute_ts = pd.to_datetime(df['ts'], format='%Y-%m-%dT%H:%M:%S:%f').dt.floor('min')

    per_device = []
    for device in ('samsung', 'polar'):
        device_mask = df['device'] == device
        if not device_mask.any():
            continue
        # Aggregate only the 'hr' column: averaging the whole frame would also
        # try to average the string 'device' column, which fails in pandas >= 2.
        device_avg = (
            df.loc[device_mask, 'hr']
            .groupby(minute_ts[device_mask])
            .mean()
            .round()
            .reset_index()
        )
        device_avg['device'] = device
        per_device.append(device_avg)

    if not per_device:
        return pd.DataFrame(columns=result_columns)
    # DataFrame.append was removed in pandas 2.0; concat is the supported form.
    return pd.concat(per_device, ignore_index=True)[result_columns]

In [5]:
def read_aggregate_hr_files_for_user(in_path, out_path):
    """Collect a user's heart-rate files and save the per-minute mean HR.

    Every sub-directory of ``in_path`` whose name contains ``SAMSUNG`` or
    ``POLAR`` (and not ``.zip``) is scanned for ``HRM.csv`` (Samsung, HR stored
    in column ``v1``) and ``HR.csv`` (Polar, HR stored in column ``hr``).
    Readings outside 40-210 BPM and non-numeric readings are discarded, the
    rest is averaged per minute with ``count_mean_minute_hr`` and written to
    ``mean_minute_hr.csv`` inside ``out_path``.

    Parameters
    ----------
    in_path : str
        Directory with the user's unpacked device folders.
    out_path : str
        Directory in which the result file is written.

    Notes
    -----
    Files that cannot be read are reported on stdout and skipped. When no
    usable reading exists, a header-only CSV is written.
    """
    # File name -> device label stored in the 'device' column.
    device_by_file = {'HRM.csv': 'samsung', 'HR.csv': 'polar'}
    output_file = os.path.join(out_path, 'mean_minute_hr.csv')
    empty_result = pd.DataFrame(columns=['ts', 'hr', 'device'])

    frames = []
    for folder_name in sorted(os.listdir(in_path)):
        is_device_folder = 'SAMSUNG' in folder_name or 'POLAR' in folder_name
        if not is_device_folder or '.zip' in folder_name:
            continue
        folder_path = os.path.join(in_path, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            if file_name not in device_by_file:
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                hr_data = pd.read_csv(file_path, sep='\t')
                if file_name == 'HRM.csv':
                    # Samsung stores the heart rate in column 'v1'.
                    hr_data = hr_data.rename(columns={'v1': 'hr'})
                hr_data['device'] = device_by_file[file_name]
                frames.append(hr_data[['hr', 'ts', 'device']])
            except Exception as exc:
                # Best effort: report the unreadable file and keep going.
                print(file_path, "rozmiar pliku:", os.path.getsize(file_path), "-", repr(exc))

    if not frames:
        empty_result.to_csv(output_file, index=False)
        return

    # DataFrame.append was removed in pandas 2.0; build the frame in one concat.
    df_hr = pd.concat(frames, ignore_index=True)
    # Coerce every value to float; unparsable values become NaN and are
    # removed by the range filter below (NaN never satisfies a comparison).
    df_hr['hr'] = pd.to_numeric(df_hr['hr'], errors='coerce')

    # Discard readings below 40 BPM or above 210 BPM.
    df_hr_filtered = df_hr[df_hr['hr'].between(40, 210)].reset_index(drop=True)
    if df_hr_filtered.empty:
        empty_result.to_csv(output_file, index=False)
        return

    mean_minute_hr = count_mean_minute_hr(df_hr_filtered)
    mean_minute_hr.to_csv(output_file, index=False)

In [6]:
def calculate_vector(row):
    """Return the Euclidean length of the ``x``/``y``/``z`` components of ``row``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) that can be
    indexed with the keys ``'x'``, ``'y'`` and ``'z'``.
    """
    squared_sum = sum(row[axis] ** 2 for axis in ('x', 'y', 'z'))
    return math.sqrt(squared_sum)

In [7]:
def read_aggregate_acc_files_for_user(in_path, out_path):
    """Down-sample a user's accelerometer files to one sample per second.

    Every sub-directory of ``in_path`` whose name contains ``SAMSUNG`` or
    ``POLAR`` (and not ``.zip``) is checked for an ``ACC.csv`` file. For each
    file the earliest sample of every second is kept, rows with non-numeric
    ``x``/``y``/``z`` are dropped and the vector magnitude is added. The
    combined result is written to ``seconds_acc.csv`` inside ``out_path``.

    Parameters
    ----------
    in_path : str
        Directory with the user's unpacked device folders.
    out_path : str
        Directory in which the result file is written.

    Notes
    -----
    Files that cannot be processed are reported on stdout and skipped. When no
    data is available, a header-only CSV is written.
    """
    columns = ['ts', 'x', 'y', 'z', 'vector', 'device']
    frames = []
    for folder_name in sorted(os.listdir(in_path)):
        is_samsung = 'SAMSUNG' in folder_name
        if not (is_samsung or 'POLAR' in folder_name) or '.zip' in folder_name:
            continue
        folder_path = os.path.join(in_path, folder_name)
        if not os.path.isdir(folder_path) or 'ACC.csv' not in os.listdir(folder_path):
            continue
        file_path = os.path.join(folder_path, 'ACC.csv')
        try:
            raw = pd.read_csv(file_path, sep='\t')
            raw['ts'] = pd.to_datetime(raw['ts'], format='%Y-%m-%dT%H:%M:%S:%f')
            # Group by second and keep the earliest sample of each second.
            earliest_idx = raw.groupby(raw['ts'].dt.floor('s'))['ts'].idxmin()
            # .copy() so the column assignments below do not write into a slice.
            per_second = raw.loc[earliest_idx, ['ts', 'x', 'y', 'z']].copy()
            # Convert the axes to float; unparsable values become NaN.
            for axis in ('x', 'y', 'z'):
                per_second[axis] = pd.to_numeric(per_second[axis], errors='coerce')
            per_second = per_second.dropna(axis=0)
            # Vector magnitude sqrt(x^2 + y^2 + z^2).
            squared_sum = per_second['x'] ** 2 + per_second['y'] ** 2 + per_second['z'] ** 2
            per_second['vector'] = squared_sum.apply(math.sqrt)
            per_second['device'] = 'samsung' if is_samsung else 'polar'
            frames.append(per_second[columns])
        except Exception as exc:
            # Best effort: report the unreadable file and keep going.
            print(file_path, "rozmiar pliku:", os.path.getsize(file_path), "-", repr(exc))

    # DataFrame.append was removed in pandas 2.0; build the frame in one concat.
    df_acc = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
    df_acc.to_csv(os.path.join(out_path, 'seconds_acc.csv'), index=False)

In [8]:
def read_aggregate_gyr_files_for_user(in_path, out_path):
    """Down-sample a user's gyroscope files to one sample per second.

    Every sub-directory of ``in_path`` whose name contains ``SAMSUNG`` (and not
    ``.zip``) is checked for a ``GYR.csv`` file. For each file the earliest
    sample of every second is kept, rows with non-numeric ``x``/``y``/``z`` are
    dropped and the vector magnitude is added. The combined result is written
    to ``seconds_gyr.csv`` inside ``out_path``.

    Parameters
    ----------
    in_path : str
        Directory with the user's unpacked device folders.
    out_path : str
        Directory in which the result file is written.

    Notes
    -----
    Files that cannot be processed are reported on stdout and skipped. When no
    data is available, a header-only CSV is written.
    """
    columns = ['ts', 'x', 'y', 'z', 'vector']
    frames = []
    for folder_name in sorted(os.listdir(in_path)):
        if 'SAMSUNG' not in folder_name or '.zip' in folder_name:
            continue
        folder_path = os.path.join(in_path, folder_name)
        if not os.path.isdir(folder_path) or 'GYR.csv' not in os.listdir(folder_path):
            continue
        file_path = os.path.join(folder_path, 'GYR.csv')
        try:
            raw = pd.read_csv(file_path, sep='\t')
            raw['ts'] = pd.to_datetime(raw['ts'], format='%Y-%m-%dT%H:%M:%S:%f')
            # Group by second and keep the earliest sample of each second.
            earliest_idx = raw.groupby(raw['ts'].dt.floor('s'))['ts'].idxmin()
            # .copy() so the column assignments below do not write into a slice.
            per_second = raw.loc[earliest_idx, ['ts', 'x', 'y', 'z']].copy()
            # Convert the axes to float; unparsable values become NaN.
            for axis in ('x', 'y', 'z'):
                per_second[axis] = pd.to_numeric(per_second[axis], errors='coerce')
            per_second = per_second.dropna(axis=0)
            # Vector magnitude sqrt(x^2 + y^2 + z^2).
            squared_sum = per_second['x'] ** 2 + per_second['y'] ** 2 + per_second['z'] ** 2
            per_second['vector'] = squared_sum.apply(math.sqrt)
            frames.append(per_second[columns])
        except Exception as exc:
            # Best effort: report the unreadable file and keep going.
            print(file_path, "rozmiar pliku:", os.path.getsize(file_path), "-", repr(exc))

    # DataFrame.append was removed in pandas 2.0; build the frame in one concat.
    df_gyr = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
    df_gyr.to_csv(os.path.join(out_path, 'seconds_gyr.csv'), index=False)

In [9]:
def read_aggregate_grav_files_for_user(in_path, out_path):
    """Down-sample a user's gravity-sensor files to one sample per second.

    Every sub-directory of ``in_path`` whose name contains ``SAMSUNG`` (and not
    ``.zip``) is checked for a ``GRAV.csv`` file. For each file the earliest
    sample of every second is kept, rows with non-numeric ``x``/``y``/``z`` are
    dropped and the vector magnitude is added. The combined result is written
    to ``seconds_grav.csv`` inside ``out_path``.

    Parameters
    ----------
    in_path : str
        Directory with the user's unpacked device folders.
    out_path : str
        Directory in which the result file is written.

    Notes
    -----
    Files that cannot be processed are reported on stdout and skipped. When no
    data is available, a header-only CSV is written.
    """
    columns = ['ts', 'x', 'y', 'z', 'vector']
    frames = []
    for folder_name in sorted(os.listdir(in_path)):
        if 'SAMSUNG' not in folder_name or '.zip' in folder_name:
            continue
        folder_path = os.path.join(in_path, folder_name)
        if not os.path.isdir(folder_path) or 'GRAV.csv' not in os.listdir(folder_path):
            continue
        file_path = os.path.join(folder_path, 'GRAV.csv')
        try:
            raw = pd.read_csv(file_path, sep='\t')
            raw['ts'] = pd.to_datetime(raw['ts'], format='%Y-%m-%dT%H:%M:%S:%f')
            # Group by second and keep the earliest sample of each second.
            earliest_idx = raw.groupby(raw['ts'].dt.floor('s'))['ts'].idxmin()
            # .copy() so the column assignments below do not write into a slice.
            per_second = raw.loc[earliest_idx, ['ts', 'x', 'y', 'z']].copy()
            # Convert the axes to float; unparsable values become NaN.
            for axis in ('x', 'y', 'z'):
                per_second[axis] = pd.to_numeric(per_second[axis], errors='coerce')
            per_second = per_second.dropna(axis=0)
            # Vector magnitude sqrt(x^2 + y^2 + z^2).
            squared_sum = per_second['x'] ** 2 + per_second['y'] ** 2 + per_second['z'] ** 2
            per_second['vector'] = squared_sum.apply(math.sqrt)
            frames.append(per_second[columns])
        except Exception as exc:
            # Best effort: report the unreadable file and keep going.
            print(file_path, "rozmiar pliku:", os.path.getsize(file_path), "-", repr(exc))

    # DataFrame.append was removed in pandas 2.0; build the frame in one concat.
    df_grav = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
    df_grav.to_csv(os.path.join(out_path, 'seconds_grav.csv'), index=False)

In [10]:
def read_aggregate_light_files_for_user(in_path, out_path):
    """Down-sample a user's light-sensor files to one sample per second.

    Every sub-directory of ``in_path`` whose name contains ``SAMSUNG`` or
    ``POLAR`` (and not ``.zip``) is checked for a ``LIGHT.csv`` file. For each
    file the earliest sample of every second is kept. Non-numeric readings and
    readings outside the range 0-45875 are discarded, and the result is
    written to ``seconds_light.csv`` inside ``out_path``.

    Parameters
    ----------
    in_path : str
        Directory with the user's unpacked device folders.
    out_path : str
        Directory in which the result file is written.

    Notes
    -----
    Files that cannot be processed are reported on stdout and skipped. When no
    data is available, a header-only CSV is written.
    """
    columns = ['ts', 'light_level']
    frames = []
    for folder_name in sorted(os.listdir(in_path)):
        is_device_folder = 'SAMSUNG' in folder_name or 'POLAR' in folder_name
        if not is_device_folder or '.zip' in folder_name:
            continue
        folder_path = os.path.join(in_path, folder_name)
        if not os.path.isdir(folder_path) or 'LIGHT.csv' not in os.listdir(folder_path):
            continue
        file_path = os.path.join(folder_path, 'LIGHT.csv')
        try:
            raw = pd.read_csv(file_path, sep='\t')
            raw['ts'] = pd.to_datetime(raw['ts'], format='%Y-%m-%dT%H:%M:%S:%f')
            # Group by second and keep the earliest sample of each second.
            earliest_idx = raw.groupby(raw['ts'].dt.floor('s'))['ts'].idxmin()
            frames.append(raw.loc[earliest_idx, columns])
        except Exception as exc:
            # Best effort: report the unreadable file and keep going.
            print(file_path, "rozmiar pliku:", os.path.getsize(file_path), "-", repr(exc))

    # DataFrame.append was removed in pandas 2.0; build the frame in one concat.
    df_light = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

    # Coerce the readings to float; unparsable values become NaN and are
    # removed by the range filter (NaN never satisfies a comparison).
    df_light['light_level'] = pd.to_numeric(df_light['light_level'], errors='coerce')
    in_range = df_light['light_level'].between(0, 45875)
    df_light_filtered = df_light[in_range].reset_index(drop=True)

    df_light_filtered.to_csv(os.path.join(out_path, 'seconds_light.csv'), index=False)

In [11]:
def read_aggregate_pedometer_files_for_user(in_path, out_path):
    """Down-sample a user's pedometer files to one sample per minute.

    Every sub-directory of ``in_path`` whose name contains ``SAMSUNG`` (and not
    ``.zip``) is checked for a ``PEDOMETER.csv`` file. For each file the
    earliest sample of every minute is kept. Rows in which any of ``steps``,
    ``walking_steps``, ``running_steps``, ``distance`` or ``speed`` is negative
    or non-numeric are discarded, and the result is written to
    ``minute_pedometer.csv`` inside ``out_path``.

    Parameters
    ----------
    in_path : str
        Directory with the user's unpacked device folders.
    out_path : str
        Directory in which the result file is written.

    Notes
    -----
    Files that cannot be processed are reported on stdout and skipped. When no
    data is available, a header-only CSV is written.
    """
    value_columns = ['steps', 'walking_steps', 'running_steps', 'distance', 'speed']
    columns = ['ts'] + value_columns
    frames = []
    for folder_name in sorted(os.listdir(in_path)):
        if 'SAMSUNG' not in folder_name or '.zip' in folder_name:
            continue
        folder_path = os.path.join(in_path, folder_name)
        if not os.path.isdir(folder_path) or 'PEDOMETER.csv' not in os.listdir(folder_path):
            continue
        file_path = os.path.join(folder_path, 'PEDOMETER.csv')
        try:
            raw = pd.read_csv(file_path, sep='\t')
            raw['ts'] = pd.to_datetime(raw['ts'], format='%Y-%m-%dT%H:%M:%S:%f')
            # Group by minute and keep the earliest sample of each minute.
            earliest_idx = raw.groupby(raw['ts'].dt.floor('min'))['ts'].idxmin()
            frames.append(raw.loc[earliest_idx, columns])
        except Exception as exc:
            # Best effort: report the unreadable file and keep going.
            print(file_path, "rozmiar pliku:", os.path.getsize(file_path), "-", repr(exc))

    # DataFrame.append was removed in pandas 2.0; build the frame in one concat.
    df_steps = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

    # Make sure the comparison below works even if a column was read as text;
    # unparsable values become NaN and fail the ">= 0" check.
    for column in value_columns:
        df_steps[column] = pd.to_numeric(df_steps[column], errors='coerce')
    non_negative = (df_steps[value_columns] >= 0).all(axis=1)
    df_steps_filtered = df_steps[non_negative].reset_index(drop=True)
    df_steps_filtered.to_csv(os.path.join(out_path, 'minute_pedometer.csv'), index=False)

In [27]:
def resample_user_data(in_path, out_path):
    df_light = pd.DataFrame()
    df_acc = pd.DataFrame()
    df_grav = pd.DataFrame()
    df_gyr = pd.DataFrame()
    for user_file in os.listdir(in_path):
        file_path = os.path.join(in_path, user_file)
        if user_file == 'seconds_light.csv':
            df = pd.read_csv(file_path)
            df['ts'] = pd.to_datetime(df['ts'], format='%Y-%m-%dT%H:%M:%S.%f')
            filtered_df = df.loc[df.groupby(df['ts'].dt.strftime('%Y-%m-%d %H:%M'))['ts'].idxmin()]
            df_light = df_light.append(filtered_df[['ts', 'light_level']], ignore_index=True)

        elif user_file == 'seconds_acc.csv':
            df = pd.read_csv(file_path)
            df['ts'] = pd.to_datetime(df['ts'], format='%Y-%m-%dT%H:%M:%S.%f')
            # grupowanie po urządzeniach
            df_polar = df[df['device'] == 'polar']
            df_samsung = df[df['device'] == 'samsung']
            filtered_df_polar = df_polar.loc[df_polar.groupby(df_polar['ts'].dt.strftime('%Y-%m-%d %H:%M'))['ts'].idxmin()]
            filtered_df_samsung = df_samsung.loc[df_samsung.groupby(df_samsung['ts'].dt.strftime('%Y-%m-%d %H:%M'))['ts'].idxmin()]
            df_acc = df_acc.append(filtered_df_samsung[['ts', 'x', 'y', 'z', 'vector', 'device']], ignore_index=True)
            df_acc = df_acc.append(filtered_df_polar[['ts', 'x', 'y', 'z', 'vector', 'device']], ignore_index=True)

        elif user_file == 'seconds_grav.csv':
            df = pd.read_csv(file_path)
            df['ts'] = pd.to_datetime(df['ts'], format='%Y-%m-%dT%H:%M:%S.%f')
            filtered_df = df.loc[df.groupby(df['ts'].dt.strftime('%Y-%m-%d %H:%M'))['ts'].idxmin()]
            df_grav = df_grav.append(filtered_df[['ts', 'x', 'y', 'z', 'vector']], ignore_index=True)

        elif user_file == 'seconds_gyr.csv':
            df = pd.read_csv(file_path)
            df['ts'] = pd.to_datetime(df['ts'], format='%Y-%m-%dT%H:%M:%S.%f')
            filtered_df = df.loc[df.groupby(df['ts'].dt.strftime('%Y-%m-%d %H:%M'))['ts'].idxmin()]
            df_gyr = df_gyr.append(filtered_df[['ts', 'x', 'y', 'z', 'vector']], ignore_index=True)


    
    df_light.to_csv(out_path+'minute_light.csv', index=False)
    df_acc.to_csv(out_path+'minute_acc.csv', index=False)
    df_grav.to_csv(out_path+'minute_grav.csv', index=False)
    df_gyr.to_csv(out_path+'minute_gyr.csv', index=False)

In [None]:
########### PROCESS THE PHYSIOLOGICAL DATA AND SAVE IT TO FILES (resample the data to one-second resolution) ###########

# 'iteration' instead of 'iter' so the built-in iter() is not shadowed.
for iteration in range(1, 8):
    ###########  REPLACE /.../ WITH THE PATH TO THE PHYSIOLOGICAL DATA ###########
    main_path = f"C:/.../i_0{iteration}"
    for user_name in tqdm(os.listdir(main_path)):
        # os.path.join inserts the separator that plain concatenation omitted.
        in_path = os.path.join(main_path, user_name)
        if not os.path.isdir(in_path):
            continue

        ########### REPLACE /.../ WITH THE PATH WHERE THE PROCESSED DATA SHOULD BE SAVED ###########
        out_path = f"C:/.../aggregated_data/i_0{iteration}/{user_name}/"

        # Unpack the compressed folders.
        unpack_zip_files(in_path)

        # Process the data.
        os.makedirs(out_path, exist_ok=True)

        Parallel(n_jobs=-1)(
            delayed(aggregate)(in_path, out_path)
            for aggregate in (
                read_aggregate_hr_files_for_user,
                read_aggregate_pedometer_files_for_user,
                read_aggregate_acc_files_for_user,
                read_aggregate_gyr_files_for_user,
                read_aggregate_grav_files_for_user,
                read_aggregate_light_files_for_user,
            )
        )

In [None]:
########### RESAMPLE THE DATA (light, acc, gyr, grav) FROM ONE-SECOND TO ONE-MINUTE RESOLUTION ###########

# 'iteration' instead of 'iter' so the built-in iter() is not shadowed.
for iteration in range(1, 8):
    ###########  REPLACE /.../ WITH THE PATH TO THE PHYSIOLOGICAL DATA (already resampled to one-second resolution) ###########
    iter_path = f"C:/.../aggregated_data/i_0{iteration}/"
    for user_name in tqdm(os.listdir(iter_path)):
        in_path = os.path.join(iter_path, user_name)
        if not os.path.isdir(in_path):
            continue

        # The output directory is the user's own folder. Joining with '' adds
        # the trailing separator, so the minute_*.csv files end up inside the
        # folder rather than next to it with the folder name as a prefix.
        resample_user_data(in_path, os.path.join(in_path, ''))