In [1]:
%matplotlib inline

In [2]:
# https://github.com/nrg-projects/sara-on-off-boari/blob/main/classification_kinematics.ipynb

In [3]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
import numpy as np
from sklearn.svm import SVC
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import LeaveOneGroupOut, HalvingGridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from IPython.display import display
from numpy import ravel

### Load csv (output from load kinematics)

In [4]:
df = pd.read_csv('output_kinematics_dataframe.csv',index_col=0).convert_dtypes()
df.head(20)

Unnamed: 0,subject_number,state,trial_number,trunk_lateral_flexion,trunk_rotation,trunk_flx_extension,left_shoulder_add_abduction,left_shoulder_int_external_rotation,left_shoulder_flx_extension,right_shoulder_add_abduction,...,right_steps_per_minute_mean,right_stride_length_mean,right_strides_per_minute_mean,speed,stride_length_mean,stride_width_mean,cycle_time_mean,double_limb_support_time_ave,right_initial_double_limb_support_time_mean,right_terminal_double_limb_support_time_mean
0,22,on,15,2.97897,7.90822,3.46405,2.62925,7.44,3.67538,2.25357,...,111.111107,1.075387,54.216866,0.953966,1.055722,0.043635,1.106667,0.276667,0.14,0.136667
1,22,on,12,2.95278,6.07138,3.67487,3.08497,5.54828,4.4002,2.24671,...,114.65432,1.024454,54.723885,0.930965,1.024061,0.040329,1.1,0.273333,0.146667,0.126667
2,22,on,11,2.86238,7.05173,2.70137,2.72333,5.98249,4.07167,2.71655,...,108.433739,1.02052,53.101505,0.901366,1.015539,0.043939,1.126667,0.28,0.146667,0.133333
3,22,on,16,2.51971,6.33676,4.21162,3.5862,8.06377,6.83211,5.36561,...,115.384636,1.051218,55.05051,0.96528,1.055373,0.056318,1.093333,0.273333,0.14,0.133333
4,22,on,10,2.13155,6.12791,2.98573,3.33085,7.23728,4.59507,1.71036,...,111.840065,1.003775,54.545452,0.907739,1.004565,0.054027,1.106667,0.294444,0.151111,0.143333
5,22,on,13,2.81942,6.70328,2.5504,2.43887,5.2067,2.91801,2.6017,...,116.883102,1.063178,56.603771,0.990234,1.052948,0.051866,1.063333,0.263333,0.136667,0.126667
6,22,on,14,2.29745,6.3868,3.12816,3.31756,8.24121,4.96836,2.90033,...,114.691544,1.066811,55.389332,0.987408,1.0664,0.043348,1.08,0.271111,0.137778,0.133333
7,22,on,5,2.56575,6.13994,3.46533,3.63583,7.22235,5.28541,3.39935,...,113.212029,1.044081,54.216873,0.930171,1.031456,0.054255,1.108889,0.278889,0.143333,0.135556
8,22,on,2,2.23096,5.40391,2.75989,4.79339,12.60892,9.92368,3.80818,...,105.897003,1.039601,52.631573,0.922987,1.044001,0.039526,1.131111,0.265556,0.123333,0.142222
9,22,on,8,2.74622,7.04495,3.09662,3.19618,6.57645,3.67301,2.82616,...,113.924057,1.082253,54.54546,0.960858,1.053741,0.042514,1.096667,0.283333,0.153333,0.13


### Missing values


#### - Identify the indexes with missing values
#### - Print the subject number, state and trial of this missing data
#### - Drop these rows and reset indexes


In [5]:
missing_rows = df.isnull().sum(axis=1)
missing_rows_filtered = missing_rows[missing_rows != 0]
print(missing_rows_filtered.index)

Index([ 19,  21,  22,  23,  24,  25,  26,  30,  31,  32,  33,  34, 231, 233,
       239, 335, 343, 349, 449, 668, 673, 674, 678, 680, 682, 685, 687, 768,
       770, 778, 779, 882, 884, 885],
      dtype='int64')


In [6]:
import pandas as pd
import pathlib as path
import numpy as np
import warnings
import re
import xlrd

In [7]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [28]:
root_folder = path.Path('C3Dfiles')

In [9]:
angles_header = ['Trunk_Lateral_Flexion', 'Trunk_Rotation', 'Trunk_Flx/Extension',
                'Left_Shoulder_Add/Abduction', 'Left_Shoulder_Int/External_Rotation', 'Left_Shoulder_Flx/Extension',
                'Right_Shoulder_Add/Abduction', 'Right_Shoulder_Int/External_Rotation', 'Right_Shoulder_Flx/Extension',
                'Left_Elbow_Add/Abduction', 'Left_Elbow_Pron/Supination', 'Left_Elbow_Flx/Extension',
                'Right_Elbow_Add/Abduction', 'Right_Elbow_Pron/Supination', 'Right_Elbow_Flx/Extension',
                'Left_Pelvic_Obliquity', 'Left_Pelvic_Rotation', 'Left_Pelvic_Tilt',
                'Right_Pelvic_Obliquity', 'Right_Pelvic_Rotation', 'Right_Pelvic_Tilt',
                'Left_Hip_Add/Abduction', 'Left_Hip_Int/External_Rotation', 'Left_Hip_Flexion/Extension',
                'Right_Hip_Add/Abduction', 'Right_Hip_Int/External_Rotation', 'Right_Hip_Flexion/Extension',
                'Left_Knee_Add/Abduction', 'Left_Knee_Int/External_Rotation', 'Left_Knee_Flx/Extension',
                'Right_Knee_Add/Abduction', 'Right_Knee_Int/External_Rotation', 'Right_Knee_Flx/Extension',
                'Left_Ankle_Inv/Eversion', 'Left_Ankle_Add/Abduction', 'Left_Ankle_Dorsi/Plantarflexion',
                'Right_Ankle_Inv/Eversion', 'Right_Ankle_Add/Abduction', 'Right_Ankle_Dorsi/Plantarflexion',
                'Left_Foot_Inv/Eversion', 'Left_Foot_Int/External_Rotation', 'Left_Foot_DF/Plantarflexion',
                'Right_Foot_Inv/Eversion', 'Right_Foot_Int/External_Rotation', 'Right_Foot_DF/Plantarflexion']

# Replace spaces and slashes with underscores in angles_header
angles_header = [header.replace(' ', '_').replace('/', '_') for header in angles_header]

In [10]:
trajectories_header = ['CoM_AP', 'CoM_Vertical', 'CoM_ML', 'CLAV_AP', 'CLAV_Vertical', 'CLAV_ML', 'STRN_AP', 
                       'STRN_Vertical', 'STRN_ML', 'C7_AP', 'C7_Vertical', 'C7_ML', 'T10_AP', 'T10_Vertical',
                       'T10_ML', 'RSHO_AP', 'RSHO_Vertical', 'RSHO_ML', 'LSHO_AP', 'LSHO_Vertical', 'LSHO_ML',
                       'RUPA_AP', 'RUPA_Vertical', 'RUPA_ML', 'REL_AP', 'REL_Vertical', 'REL_ML', 'REM_AP',
                       'REM_Vertical', 'REM_ML', 'RFRA_AP', 'RFRA_Vertical', 'RFRA_ML', 'RWL_AP', 'RWL_Vertical',
                       'RWL_ML', 'RWM_AP', 'RWM_Vertical', 'RWM_ML', 'LUPA_AP', 'LUPA_Vertical', 'LUPA_ML', 'LEL_AP',
                       'LEL_Vertical', 'LEL_ML', 'LEM_AP', 'LEM_Vertical', 'LEM_ML', 'LFRA_AP', 'LFRA_Vertical',
                       'LFRA_ML', 'LWL_AP', 'LWL_Vertical', 'LWL_ML', 'LWM_AP', 'LWM_Vertical', 'LWM_ML', 'R.ASIS_AP',
                       'R.ASIS_Vertical', 'R.ASIS_ML', 'L.ASIS_AP', 'L.ASIS_Vertical', 'L.ASIS_ML', 'R.PSIS_AP',
                       'R.PSIS_Vertical', 'R.PSIS_ML', 'L.PSIS_AP', 'L.PSIS_Vertical', 'L.PSIS_ML', 'R.GTR_AP',
                       'R.GTR_Vertical', 'R.GTR_ML', 'R.Knee_AP', 'R.Knee_Vertical', 'R.Knee_ML', 'R.HF_AP', 
                       'R.HF_Vertical', 'R.HF_ML', 'R.TT_AP', 'R.TT_Vertical', 'R.TT_ML', 'R.Ankle_AP',
                       'R.Ankle_Vertical', 'R.Ankle_ML', 'R.Heel_AP', 'R.Heel_Vertical', 'R.Heel_ML', 'R.MT1_AP',
                       'R.MT1_Vertical', 'R.MT1_ML', 'R.MT5_AP', 'R.MT5_Vertical', 'R.MT5_ML', 'L.GTR_AP',
                       'L.GTR_Vertical', 'L.GTR_ML', 'L.Knee_AP', 'L.Knee_Vertical', 'L.Knee_ML', 'L.HF_AP',
                       'L.HF_Vertical', 'L.HF_ML', 'L.TT_AP', 'L.TT_Vertical', 'L.TT_ML', 'L.Ankle_AP', 
                       'L.Ankle_Vertical', 'L.Ankle_ML', 'L.Heel_AP', 'L.Heel_Vertical', 'L.Heel_ML', 'L.MT1_AP',
                       'L.MT1_Vertical', 'L.MT1_ML', 'L.MT5_AP', 'L.MT5_Vertical', 'L.MT5_ML', 'R.Knee.Medial_AP',
                       'R.Knee.Medial_Vertical', 'R.Knee.Medial_ML', 'R.Ankle.Medial_AP', 'R.Ankle.Medial_Vertical',
                       'R.Ankle.Medial_ML', 'R.MT2_AP', 'R.MT2_Vertical', 'R.MT2_ML', 'L.Knee.Medial_AP',
                       'L.Knee.Medial_Vertical', 'L.Knee.Medial_ML', 'L.Ankle.Medial_AP', 'L.Ankle.Medial_Vertical',
                       'L.Ankle.Medial_ML', 'L.MT2_AP', 'L.MT2_Vertical', 'L.MT2_ML']

# Replace spaces and slashes with underscores in trajectories_header
trajectories_header = [trajectory.replace(' ', '_').replace('/', '_') for trajectory in trajectories_header]

In [27]:
df_angles = pd.DataFrame()
df_spatiotemporal = pd.DataFrame()
df_trajectories = pd.DataFrame()


In [None]:
# Load all folders
for subject in root_folder.glob('**/*'):
    
    #string with subject number
    subject_str = str(subject)
    
    # ignore files, consider just directories
    if subject.is_dir() and "SUB" in subject_str:
        
        # save strings of subject number and condition and load this data as columns of a new dataframe
        # this dataframe will not be relevant, itself, is just used as an organizing tool
        subject_number = int(subject_str[subject_str.find("SUB") + 3 : subject_str.find("SUB") + 5])
        condition = [subject_str[subject_str.find("SUB") + 6 :]]
        df_num_state = pd.DataFrame({'subject_number':subject_number, 'state':condition})
       

        # consider all (but only) csv_angles in the subject directory
        for csv_angles in subject.glob('**/*_angular_kinematics.csv'):
            
           # just like before with the subject number, we save the number of the trial in another dataframe
           trial_str = str(csv_angles)
           trial_number = int(trial_str[trial_str.find("_walk_") + 6 : trial_str.find("_angular_kinematics")])
           df_trial_num = pd.Series(trial_number, name='trial_number')

           #read csv_angles file and delete three first rows (missing info) and the fourth row (units)
           csv_ang = pd.read_excel(csv_angles, engine='xlrd')
           csv_ang.drop([0, 1, 2, 3, 4], inplace=True)

           #delete time and frame columns
           csv_ang.drop(csv_ang.columns[[0, 1]], axis = 1, inplace = True)

           #set first row as dataframe header
           #set all headers to lower letters and replace spaces and slashes with "_"
           csv_ang = csv_ang[0:]
           csv_ang.columns = angles_header
           csv_ang.columns = csv_ang.columns.str.lower()
           csv_ang.reset_index(drop=True, inplace=True)

           #calculate range of motion (ROM) for each joint and set values as a df
           #the ROM is calculated as the maximum registered angle minus the minimum one
           #rom = csv_ang.max() - csv_ang.min()
           #df_rom = pd.DataFrame([rom.values], columns=csv_ang.columns)

           # concatenate the three created databases: subject number, state and ROMS
           # the name of this database is df_angles
           df_trial = pd.concat([df_num_state, df_trial_num, csv_ang], axis=1)
           df_angles = pd.concat([df_angles, df_trial], ignore_index=True)


        # Repeat process for spatiotemporal parameters
        for csv_spatiotemporal in subject.glob('**/*_temporal_distance.txt'):

            # just like before we save the number of the trial in another dataframe
            trial_str = str(csv_spatiotemporal)
            trial_number = int(trial_str[trial_str.find("_walk_") + 6 : trial_str.find("_temporal_distance")])
            df_trial_num = pd.Series(trial_number, name='trial_number')

            # read the trajectories csv seting the correct separator. 
            # Drop unuseful columns. Fix format. Reset index after. 
            csv_spat = pd.read_csv(csv_spatiotemporal, sep = '\t')
            csv_spat.drop([1, 2, 3], inplace=True)
            csv_spat.drop(csv_spat.columns[[0]], axis = 1, inplace = True)
            csv_spat.reset_index(drop=True, inplace=True)
            csv_spat.columns = csv_spat.iloc[0].str.lower()
            csv_spat = csv_spat[1:].reset_index(drop=True)

            # concatenate the three created databases: subject number, state, trial and spatiotemporal parameters
            # the name of this database is df_spatiotemporal
            df_spat = pd.concat([df_num_state, df_trial_num, csv_spat], axis=1)
            df_spatiotemporal = pd.concat([df_spatiotemporal, df_spat], ignore_index=True)




        
        

In [19]:
# Load all folders
for subject in root_folder.glob('**/*'):
    
    #string with subject number
    subject_str = str(subject)
    
    # ignore files, consider just directories
    if subject.is_dir() and "SUB" in subject_str:
        
        # save strings of subject number and condition and load this data as columns of a new dataframe
        # this dataframe will not be relevant, itself, is just used as an organizing tool
        subject_number = int(subject_str[subject_str.find("SUB") + 3 : subject_str.find("SUB") + 5])
        condition = [subject_str[subject_str.find("SUB") + 6 :]]
        df_num_state = pd.DataFrame({'subject_number':subject_number, 'state':condition})
       

        


        # Repeat process for spatiotemporal parameters
        for csv_spatiotemporal in subject.glob('**/*_temporal_distance.txt'):

            # just like before we save the number of the trial in another dataframe
            trial_str = str(csv_spatiotemporal)
            trial_number = int(trial_str[trial_str.find("_walk_") + 6 : trial_str.find("_temporal_distance")])
            df_trial_num = pd.Series(trial_number, name='trial_number')

            # read the trajectories csv seting the correct separator. 
            # Drop unuseful columns. Fix format. Reset index after. 
            csv_spat = pd.read_csv(csv_spatiotemporal, sep = '\t')
            csv_spat.drop([1, 2, 3], inplace=True)
            csv_spat.drop(csv_spat.columns[[0]], axis = 1, inplace = True)
            csv_spat.reset_index(drop=True, inplace=True)
            csv_spat.columns = csv_spat.iloc[0].str.lower()
            csv_spat = csv_spat[1:].reset_index(drop=True)

            # concatenate the three created databases: subject number, state, trial and spatiotemporal parameters
            # the name of this database is df_spatiotemporal
            df_spat = pd.concat([df_num_state, df_trial_num, csv_spat], axis=1)
            df_spatiotemporal = pd.concat([df_spatiotemporal, df_spat], ignore_index=True)




        
        for csv_trajectories in subject.glob('**/*_linear_kinematics.csv'):
            print(csv_trajectories)
                # just like before we save the number of the trial in another dataframe
            trial_str = str(csv_trajectories)
            trial_number = int(trial_str[trial_str.find("_walk_") + 6 : trial_str.find("_linear_kinematics")])
            df_trial_num = pd.Series(trial_number, name='trial_number')
        
            #read file and delete three first rows (missing info) and the fourth (units).
            csv_traj = pd.read_excel(csv_trajectories, engine='xlrd')
            csv_traj.drop([0, 1, 2, 3, 4], inplace=True)
        
            #delete time and frame columns
            csv_traj.drop(csv_traj.columns[[0, 1]], axis = 1, inplace = True)
        
            #set first row as dataframe header
            #set all headers to lower letters
            csv_traj = csv_traj[0:]
            csv_traj.columns = trajectories_header
            csv_traj.columns = csv_traj.columns.str.lower()
            csv_traj.reset_index(drop=True, inplace=True)
        
            #calculate range of motion for each joint and set values as a df
            #rom = csv_traj.max() - csv_traj.min()
            #df_rom = pd.DataFrame([rom.values], columns=csv_traj.columns)
        
            # concatenate subject number, state and roms
            #df_trial = pd.concat([df_num_state, df_trial_num, df_rom], axis=1)
            df_trial = pd.concat([df_num_state, df_trial_num, csv_traj], axis=1)
            df_trajectories = pd.concat([df_trajectories, df_trial], ignore_index=True)
            

C3Dfiles/SUB22_on/SUB22_on_walk_16_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_3_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_14_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_9_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_1_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_7_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_12_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_5_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_10_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_2_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_15_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_8_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_13_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_6_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_11_linear_kinematics.csv
C3Dfiles/SUB22_on/SUB22_on_walk_4_linear_kinematics.csv
C3Dfiles/SUB16_on/SUB16_on_walk_18_linear_kinematics.csv
C3Dfiles/SUB16_on/SUB16_on_walk_8_linear

code fro Chatgpt that makes all the columns have the subject number and 

In [26]:
df_trajectories.to_csv('linear_kinematics_csv.csv', index=True)

In [31]:
df_trajectories.shape

(497444, 138)

In [13]:
df_angles

Unnamed: 0,subject_number,state,trial_number,trunk_lateral_flexion,trunk_rotation,trunk_flx_extension,left_shoulder_add_abduction,left_shoulder_int_external_rotation,left_shoulder_flx_extension,right_shoulder_add_abduction,...,left_ankle_dorsi_plantarflexion,right_ankle_inv_eversion,right_ankle_add_abduction,right_ankle_dorsi_plantarflexion,left_foot_inv_eversion,left_foot_int_external_rotation,left_foot_df_plantarflexion,right_foot_inv_eversion,right_foot_int_external_rotation,right_foot_df_plantarflexion
0,22,on,15,2.97897,7.90822,3.46405,2.62925,7.44000,3.67538,2.25357,...,28.94538,21.10539,21.09880,26.11941,16.88379,15.08970,67.90324,18.90389,15.11434,70.07257
1,22,on,12,2.95278,6.07138,3.67487,3.08497,5.54828,4.40020,2.24671,...,23.84957,23.18557,22.43639,27.35110,11.67824,12.63517,65.35984,23.52206,15.63506,68.27096
2,22,on,11,2.86238,7.05173,2.70137,2.72333,5.98249,4.07167,2.71655,...,25.56256,20.43074,18.83621,24.30115,16.60968,12.20308,61.42190,17.72578,12.67905,67.35457
3,22,on,16,2.51971,6.33676,4.21162,3.58620,8.06377,6.83211,5.36561,...,26.40672,19.24308,19.30332,25.49302,11.07394,12.02889,68.72693,17.14466,14.70980,69.99992
4,22,on,10,2.13155,6.12791,2.98573,3.33085,7.23728,4.59507,1.71036,...,24.79512,20.42808,18.09145,24.39168,12.56443,12.50767,65.63166,16.15997,12.74538,67.42650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,1,on,6,2.52825,5.40877,4.74835,4.62039,12.47673,14.47797,6.03567,...,20.31854,14.95585,13.17234,27.77969,12.09246,12.49548,59.83803,18.82516,12.72124,67.34195
880,1,on,18,5.14639,6.77762,4.43540,2.62209,6.09964,13.27502,8.07956,...,28.65829,15.96603,17.71778,32.40628,17.51061,11.84632,64.85763,21.83822,13.10276,72.81008
881,1,on,12,2.50659,6.05873,4.60405,4.79872,10.49295,16.56628,4.10215,...,27.67690,15.67198,19.29226,31.59534,17.43477,14.79860,65.90461,17.34572,9.90335,75.18902
882,1,on,1,4.69721,3.80865,4.34763,5.93798,9.41827,19.23258,10.50362,...,20.26136,14.34632,10.05948,27.20666,12.97018,15.44581,60.58130,13.90391,13.48655,74.59862


In [14]:
df_spatiotemporal

Unnamed: 0,subject_number,state,trial_number,left_stance_time_mean,left_swing_time_mean,left_step_length_mean,left_steps_per_minute_mean,left_stride_length_mean,left_strides_per_minute_mean,right_stance_time_mean,...,right_steps_per_minute_mean,right_stride_length_mean,right_strides_per_minute_mean,speed,stride_length_mean,stride_width_mean,cycle_time_mean,double_limb_support_time_ave,right_initial_double_limb_support_time_mean,right_terminal_double_limb_support_time_mean
0,22,on,12,0.673333,0.433333,0.522259,104.707802,1.023277,54.216873,0.700000,...,114.654320,1.024454,54.723885,0.930965,1.024061,0.040329,1.100000,0.273333,0.146667,0.126667
1,22,on,7,0.653333,0.416667,0.542492,106.542450,1.069798,56.079666,0.693333,...,118.421051,1.091258,55.555553,1.003371,1.076951,0.061528,1.073333,0.277778,0.146667,0.131111
2,22,on,10,0.700000,0.420000,0.500193,106.542442,1.006144,53.571430,0.706667,...,111.840065,1.003775,54.545452,0.907739,1.004565,0.054027,1.106667,0.294444,0.151111,0.143333
3,22,on,5,0.673333,0.436667,0.506549,103.448265,1.025144,54.054539,0.713333,...,113.212029,1.044081,54.216873,0.930171,1.031456,0.054255,1.108889,0.278889,0.143333,0.135556
4,22,on,3,0.693333,0.430000,0.516432,105.266762,1.027873,53.412937,0.700000,...,108.449478,1.025155,53.892220,0.916935,1.026967,0.051796,1.120000,0.277778,0.140000,0.137778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,1,on,9,0.553333,0.410000,0.546235,115.403580,1.057160,62.320301,0.620000,...,135.129379,1.061523,61.643837,1.093984,1.059341,0.070320,0.968333,0.213333,0.117778,0.095556
881,1,on,1,,,0.499765,126.760574,,,0.626667,...,121.621620,1.054528,62.068966,1.090891,1.054528,0.110601,0.966667,0.286667,0.133333,0.153333
882,1,on,11,0.613333,0.353333,0.511460,124.143829,1.018115,62.068962,0.610000,...,120.854362,1.032870,61.227322,1.053709,1.027952,0.078080,0.975556,0.237778,0.111111,0.126667
883,1,on,3,0.580000,0.420000,0.543415,113.942299,1.097798,60.002663,0.620000,...,126.760559,1.136512,59.602650,1.108240,1.110702,0.093364,1.002222,0.204444,0.106667,0.097778


In [92]:
import pandas as pd
import numpy as np

# Column names
columns = ['subject_number', 'state', 'trial_number', 'trunk', 'knee', 'foot']

# Data
data = [
    [22, 'on', 15, 17.5, 56.9, 13.0],
    [np.nan, np.nan, np.nan, 23.5, 34.5, 14.1],
    [np.nan, np.nan, np.nan, 21.5, 24.5, 15.3],
    [np.nan, np.nan, np.nan, 20.5, 21.2, 16.1],
    [np.nan, np.nan, np.nan, 23.5, 34.5, 14.1],
    [np.nan, np.nan, np.nan, 21.5, 24.5, 15.3],
    [np.nan, np.nan, np.nan, 20.5, 21.2, 16.1],
    [22, 'on', 7.0, 17.5, 56.9, 13.0],
    [np.nan, np.nan, np.nan, 23.5, 24.5, 14.1],
    [np.nan, np.nan, np.nan, 21.0, 14.5, 12.3],
    [np.nan, np.nan, np.nan, 23.5, 34.5, 14.1],
    [np.nan, np.nan, np.nan, 21.5, 24.5, 15.3],
    [np.nan, np.nan, np.nan, 20.5, 21.2, 16.1],
    [10, 'on', 15, 17.5, 56.9, 13.0],
    [np.nan, np.nan, np.nan, 23.5, 34.5, 14.1],
    [np.nan, np.nan, np.nan, 21.5, 24.5, 15.3],
    [np.nan, np.nan, np.nan, 20.5, 21.2, 16.1],
    [10, 'off', 12, 27.5, 56.9, 13.0],
    [np.nan, np.nan, np.nan, 23.5, 34.5, 14.1],
    [np.nan, np.nan, np.nan, 21.5, 24.5, 15.3],
    [np.nan, np.nan, np.nan, 20.5, 21.2, 16.1],
    [np.nan, np.nan, np.nan, 21.5, 24.5, 15.3],
    [np.nan, np.nan, np.nan, 20.5, 21.2, 18.1],
    [np.nan, np.nan, np.nan, 2.5, 22.5, 12.3],
    [np.nan, np.nan, np.nan, 25.5, 51.2, 16.1],

    
]

# Create DataFrame
df = pd.DataFrame(data, columns=columns)

# Display DataFrame
print(df)


    subject_number state  trial_number  trunk  knee  foot
0             22.0    on          15.0   17.5  56.9  13.0
1              NaN   NaN           NaN   23.5  34.5  14.1
2              NaN   NaN           NaN   21.5  24.5  15.3
3              NaN   NaN           NaN   20.5  21.2  16.1
4              NaN   NaN           NaN   23.5  34.5  14.1
5              NaN   NaN           NaN   21.5  24.5  15.3
6              NaN   NaN           NaN   20.5  21.2  16.1
7             22.0    on           7.0   17.5  56.9  13.0
8              NaN   NaN           NaN   23.5  24.5  14.1
9              NaN   NaN           NaN   21.0  14.5  12.3
10             NaN   NaN           NaN   23.5  34.5  14.1
11             NaN   NaN           NaN   21.5  24.5  15.3
12             NaN   NaN           NaN   20.5  21.2  16.1
13            10.0    on          15.0   17.5  56.9  13.0
14             NaN   NaN           NaN   23.5  34.5  14.1
15             NaN   NaN           NaN   21.5  24.5  15.3
16            

In [96]:
i=0
a=3

# Initialize X_data and y_data
X_data = []
y_data = []

current_trial_state = None

while i< len(df):
    
    possible_window = df.iloc[i:i + a, 3:]

    if df.iloc[i:i+a, :1].notna().any().any(): # There are non-NaN values in the first 3 columns for rows a to b, es que hay un nuevo trial dentro de esta ventana
        # count how many non nans are there
        non_nan_count = df.iloc[i:i+a, :3].count().sum()
        
            # esta en la primera posicion?
        if df.iloc[i, :1].notna().any().any() and possible_window.shape[0]==3:
            print("There is a new subject in the first position so this is a new trial")
            window_sample = possible_window

            X_data.append(window_sample.values)
            print("window: ", window_sample)

            row = df.iloc[i]
            state = row['state']
                #current_trial_state = state
                # Append the state to y_data
            y_data.append(state)
            i = i + 3
                
        else:
            

            
                # Entonces procedemos a buscar la possicion desplazando la ventana hasta que no haya más non NaN. 
            buscando = True
            j = i - 1
            while buscando == True:
                possible_window = df.iloc[j:j + a, 3:]
                if df.iloc[j:j+a, :1].notna().any().any():
                    j = j-1
                else:
                    window_sample = possible_window
                        # Append the window sample to X_data
                    X_data.append(window_sample.values)
                    print("window", window_sample)
                    y_data.append(state)
                    buscando = False
                    i = j + a
                
                        
    else: # There is not a new trial in this sample

        window_sample = possible_window
        i = i + 3

        # Append the window sample to X_data
        X_data.append(window_sample.values)
        print("window", window_sample)

        # Append the state to y_data
        y_data.append(state)

    print("i:",i)
# Convert lists to numpy arrays
X_data_array = np.array(X_data)
y_data_array = np.array(y_data)               

# Display the shapes of X_data and y_data
print('X_data shape:', X_data_array.shape)
print('y_data shape:', y_data_array.shape)

There is a new subject in the first position so this is a new trial
window:     trunk  knee  foot
0   17.5  56.9  13.0
1   23.5  34.5  14.1
2   21.5  24.5  15.3
i: 3
window    trunk  knee  foot
3   20.5  21.2  16.1
4   23.5  34.5  14.1
5   21.5  24.5  15.3
i: 6
window    trunk  knee  foot
4   23.5  34.5  14.1
5   21.5  24.5  15.3
6   20.5  21.2  16.1
i: 7
There is a new subject in the first position so this is a new trial
window:     trunk  knee  foot
7   17.5  56.9  13.0
8   23.5  24.5  14.1
9   21.0  14.5  12.3
i: 10
window     trunk  knee  foot
10   23.5  34.5  14.1
11   21.5  24.5  15.3
12   20.5  21.2  16.1
i: 13
There is a new subject in the first position so this is a new trial
window:      trunk  knee  foot
13   17.5  56.9  13.0
14   23.5  34.5  14.1
15   21.5  24.5  15.3
i: 16
window     trunk  knee  foot
14   23.5  34.5  14.1
15   21.5  24.5  15.3
16   20.5  21.2  16.1
i: 17
There is a new subject in the first position so this is a new trial
window:      trunk  knee  foot
17 

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [94]:
# Take it from here and modify the 3 by 3 stuff

In [None]:

import numpy as np
import pandas as pd

# Assuming df is your DataFrame with the specified structure
# Assuming columns: subject_number, state, trial_number, trunk, knee, foot

# Initialize X_data and y_data
X_data = []
y_data = []

# Initialize variables to track the current trial
current_trial_number = None
current_trial_state = None
current_trial_state = None

# Iterate through rows
for i in range(len(df)):
    row = df.iloc[i]
    
    # Check if any of the specified columns is not NaN
    if not pd.isna(row['subject_number']) or not pd.isna(row['state']) or not pd.isna(row['trial_number']):
        # Update current trial information
        current_trial_number = row['trial_number']
        current_trial_state = row['state']
        current_trial_subject = row['subject_number']
        
    # Extract 3 rows for the window sample
    window_sample = df.iloc[i:i + 3, 3:]  # Exclude columns: subject_number, state, trial_number
    
    # If there are less than 3 rows remaining, pad with previous rows
    if i + 3 > len(df):
        window_sample = df.iloc[max(0, i - (3 - (len(df) - i))):i + 3, 3:]
    
    # Append the window sample to X_data
    X_data.append(window_sample.values)
    
    # Append the state to y_data
    y_data.append(current_trial_state)

# Convert lists to numpy arrays
X_data = np.array(X_data)
y_data = np.array(y_data)

# Display the shapes of X_data and y_data
print('X_data shape:', X_data.shape)
print('y_data shape:', y_data.shape)



In [86]:
import numpy as np
import pandas as pd

# Assuming df is your DataFrame with the specified structure
# Assuming columns: subject_number, state, trial_number, trunk, knee, foot

# Initialize X_data and y_data
X_data = []
y_data = []

# Initialize variables to track the current trial
current_trial_number = None
current_trial_state = None
current_trial_subject = None

# Iterate through rows
for i in range(len(df)):
    row = df.iloc[i]
    
    # Check if any of the specified columns is not NaN
    if not pd.isna(row['subject_number']) or not pd.isna(row['state']) or not pd.isna(row['trial_number']):
        # Update current trial information
        current_trial_number = row['trial_number']
        current_trial_state = row['state']
        current_trial_subject = row['subject_number']
    
    # Extract 3 rows for the window sample
    window_sample = df.iloc[i:i + 3, 3:]  # Exclude columns: subject_number, state, trial_number
    
    # If there are less than 3 rows remaining, pad with previous rows
    if i + 3 > len(df):
        window_sample = df.iloc[max(0, i - (3 - (len(df) - i))):i + 3, 3:]
    
    # Append the window sample to X_data
    X_data.append(window_sample.values)
    
    # Append the state to y_data
    y_data.append(current_trial_state)

# Convert lists to numpy arrays
X_data_array = np.array(X_data)
y_data_array = np.array(y_data)

# Display the shapes of X_data and y_data
print('X_data shape:', X_data_array.shape)
print('y_data shape:', y_data_array.shape)


X_data shape: (25, 3, 3)
y_data shape: (25,)


In [87]:
X_data

[array([[17.5, 56.9, 13. ],
        [23.5, 34.5, 14.1],
        [21.5, 24.5, 15.3]]),
 array([[23.5, 34.5, 14.1],
        [21.5, 24.5, 15.3],
        [20.5, 21.2, 16.1]]),
 array([[21.5, 24.5, 15.3],
        [20.5, 21.2, 16.1],
        [23.5, 34.5, 14.1]]),
 array([[20.5, 21.2, 16.1],
        [23.5, 34.5, 14.1],
        [21.5, 24.5, 15.3]]),
 array([[23.5, 34.5, 14.1],
        [21.5, 24.5, 15.3],
        [20.5, 21.2, 16.1]]),
 array([[21.5, 24.5, 15.3],
        [20.5, 21.2, 16.1],
        [17.5, 56.9, 13. ]]),
 array([[20.5, 21.2, 16.1],
        [17.5, 56.9, 13. ],
        [23.5, 24.5, 14.1]]),
 array([[17.5, 56.9, 13. ],
        [23.5, 24.5, 14.1],
        [21. , 14.5, 12.3]]),
 array([[23.5, 24.5, 14.1],
        [21. , 14.5, 12.3],
        [23.5, 34.5, 14.1]]),
 array([[21. , 14.5, 12.3],
        [23.5, 34.5, 14.1],
        [21.5, 24.5, 15.3]]),
 array([[23.5, 34.5, 14.1],
        [21.5, 24.5, 15.3],
        [20.5, 21.2, 16.1]]),
 array([[21.5, 24.5, 15.3],
        [20.5, 21.2, 16.1]

In [23]:
subject_counts = df_trajectories.groupby('subject_number').size().reset_index(name='row_count')

print(subject_counts)

    subject_number  row_count
0              1.0         40
1              2.0         36
2              3.0         40
3              4.0         12
4              5.0         26
5              6.0         41
6              7.0         33
7              8.0         43
8              9.0         39
9             10.0         41
10            11.0         40
11            12.0         37
12            13.0         36
13            14.0         42
14            15.0         13
15            16.0         42
16            17.0         38
17            18.0         40
18            19.0         40
19            20.0         38
20            21.0         37
21            22.0         36
22            23.0         19
23            24.0         36
24            25.0         20
25            26.0         19


In [81]:
df_trial

Unnamed: 0,subject_number,state,trial_number,com_ap,com_vertical,com_ml,clav_ap,clav_vertical,clav_ml,strn_ap,...,r.mt2_ml,l.knee.medial_ap,l.knee.medial_vertical,l.knee.medial_ml,l.ankle.medial_ap,l.ankle.medial_vertical,l.ankle.medial_ml,l.mt2_ap,l.mt2_vertical,l.mt2_ml
0,1.0,on,7.0,-0.18365,0.93122,-0.29898,-0.088375,1.354099,-0.297642,,...,-0.186403,-0.255949,0.489076,-0.292737,-0.426995,0.084365,-0.28795,-0.313388,0.033047,-0.374591
1,,,,-0.175,0.93024,-0.29837,-0.079641,1.353008,-0.296836,,...,-0.186717,-0.249661,0.488357,-0.292817,-0.426193,0.085893,-0.288378,-0.313234,0.033,-0.374639
2,,,,-0.16634,0.92921,-0.29773,-0.070902,1.351881,-0.296002,,...,-0.187265,-0.243204,0.487578,-0.292935,-0.42536,0.087507,-0.288882,-0.313062,0.032952,-0.374692
3,,,,-0.15764,0.92814,-0.29709,-0.06216,1.350725,-0.295136,,...,-0.188019,-0.236591,0.486746,-0.293075,-0.424483,0.089209,-0.289448,-0.312873,0.032905,-0.374747
4,,,,-0.1489,0.92704,-0.29643,-0.053417,1.349549,-0.294236,,...,-0.18895,-0.229831,0.485869,-0.293218,-0.423545,0.091004,-0.290061,-0.312671,0.032858,-0.374803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,,,,2.56194,0.93804,-0.32065,2.645197,1.369729,-0.320529,2.657614,...,-0.27375,2.669958,0.504739,-0.338093,2.414772,0.157927,-0.346371,2.483142,0.072738,-0.440745
334,,,,2.56926,0.9385,-0.32099,2.652581,1.370041,-0.320958,2.664769,...,-0.27381,2.681257,0.50696,-0.337924,2.434938,0.153427,-0.348442,2.506431,0.071252,-0.443223
335,,,,2.57627,0.93906,-0.32125,2.659964,1.370253,-0.321401,2.67192,...,-0.273882,2.692234,0.508981,-0.337602,2.455373,0.148854,-0.350419,2.529949,0.069744,-0.445519
336,,,,2.58364,0.93936,-0.32158,2.667351,1.370364,-0.321857,2.679072,...,-0.273965,2.703018,0.510957,-0.337416,2.47605,0.14425,-0.352288,2.553685,0.068245,-0.44759


In [73]:

       # Repeat process for trajectories
for csv_trajectories in subject.glob('**/*_linear_kinematics.csv'):
    print(csv_trajectories)
        # just like before we save the number of the trial in another dataframe
    trial_str = str(csv_trajectories)
    trial_number = int(trial_str[trial_str.find("_walk_") + 6 : trial_str.find("_linear_kinematics")])
    df_trial_num = pd.Series(trial_number, name='trial_number')

    #read file and delete three first rows (missing info) and the fourth (units).
    csv_traj = pd.read_excel(csv_trajectories, engine='xlrd')
    csv_traj.drop([0, 1, 2, 3, 4], inplace=True)

    #delete time and frame columns
    csv_traj.drop(csv_traj.columns[[0, 1]], axis = 1, inplace = True)

    #set first row as dataframe header
    #set all headers to lower letters
    csv_traj = csv_traj[0:]
    csv_traj.columns = trajectories_header
    csv_traj.columns = csv_traj.columns.str.lower()
    csv_traj.reset_index(drop=True, inplace=True)

    #calculate range of motion for each joint and set values as a df
    #rom = csv_traj.max() - csv_traj.min()
    #df_rom = pd.DataFrame([rom.values], columns=csv_traj.columns)

    # concatenate subject number, state and roms
    #df_trial = pd.concat([df_num_state, df_trial_num, df_rom], axis=1)
    df_trial = pd.concat([df_num_state, df_trial_num, csv_traj], axis=1)
    df_angles = df_angles.append(df_trial, ignore_index=True)

In [63]:
df_angles

Unnamed: 0,subject_number,state,trial_number,trunk_lateral_flexion,trunk_rotation,trunk_flx_extension,left_shoulder_add_abduction,left_shoulder_int_external_rotation,left_shoulder_flx_extension,right_shoulder_add_abduction,...,left_ankle_dorsi_plantarflexion,right_ankle_inv_eversion,right_ankle_add_abduction,right_ankle_dorsi_plantarflexion,left_foot_inv_eversion,left_foot_int_external_rotation,left_foot_df_plantarflexion,right_foot_inv_eversion,right_foot_int_external_rotation,right_foot_df_plantarflexion
0,22,on,15,2.97897,7.90822,3.46405,2.62925,7.44000,3.67538,2.25357,...,28.94538,21.10539,21.09880,26.11941,16.88379,15.08970,67.90324,18.90389,15.11434,70.07257
1,22,on,12,2.95278,6.07138,3.67487,3.08497,5.54828,4.40020,2.24671,...,23.84957,23.18557,22.43639,27.35110,11.67824,12.63517,65.35984,23.52206,15.63506,68.27096
2,22,on,11,2.86238,7.05173,2.70137,2.72333,5.98249,4.07167,2.71655,...,25.56256,20.43074,18.83621,24.30115,16.60968,12.20308,61.42190,17.72578,12.67905,67.35457
3,22,on,16,2.51971,6.33676,4.21162,3.58620,8.06377,6.83211,5.36561,...,26.40672,19.24308,19.30332,25.49302,11.07394,12.02889,68.72693,17.14466,14.70980,69.99992
4,22,on,10,2.13155,6.12791,2.98573,3.33085,7.23728,4.59507,1.71036,...,24.79512,20.42808,18.09145,24.39168,12.56443,12.50767,65.63166,16.15997,12.74538,67.42650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,1,on,6,2.52825,5.40877,4.74835,4.62039,12.47673,14.47797,6.03567,...,20.31854,14.95585,13.17234,27.77969,12.09246,12.49548,59.83803,18.82516,12.72124,67.34195
957,1,on,18,5.14639,6.77762,4.43540,2.62209,6.09964,13.27502,8.07956,...,28.65829,15.96603,17.71778,32.40628,17.51061,11.84632,64.85763,21.83822,13.10276,72.81008
958,1,on,12,2.50659,6.05873,4.60405,4.79872,10.49295,16.56628,4.10215,...,27.67690,15.67198,19.29226,31.59534,17.43477,14.79860,65.90461,17.34572,9.90335,75.18902
959,1,on,1,4.69721,3.80865,4.34763,5.93798,9.41827,19.23258,10.50362,...,20.26136,14.34632,10.05948,27.20666,12.97018,15.44581,60.58130,13.90391,13.48655,74.59862


In [61]:
# Repeat process for trajectories
for csv_trajectories in subject.glob('**/*_linear_kinematics.csv'):
    print(csv_trajectories)
    # just like before we save the number of the trial in another dataframe
    trial_str = str(csv_trajectories)
    trial_number = int(trial_str[trial_str.find("_walk_") + 6 : trial_str.find("_linear_kinematics")])
    df_trial_num = pd.Series(trial_number, name='trial_number')

    if "SUB01" in csv_trajectories.name:
        # Read each CSV file into a DataFrame
        df = pd.read_csv(csv_trajectories)
        
        # Append the DataFrame to the 'forPrint' DataFrame
        forPrint = forPrint.append(df, ignore_index=True)


    # read file and delete three first rows (missing info) and the fourth (units).
    csv_traj = pd.read_excel(csv_trajectories, engine='xlrd')
    csv_traj.drop([0, 1, 2, 3, 4], inplace=True)

    # delete time and frame columns
    csv_traj.drop(csv_traj.columns[[0, 1]], axis=1, inplace=True)

    # set first row as dataframe header
    # set all headers to lowercase
    csv_traj = csv_traj[0:]
    csv_traj.columns = trajectories_header
    csv_traj.columns = csv_traj.columns.str.lower()
    csv_traj.reset_index(drop=True, inplace=True)

    # append entire trajectory data without calculating range of motion
    df_trial = pd.concat([df_num_state, df_trial_num, csv_traj], axis=1)
    df_angles = df_angles.append(df_trial, ignore_index=True)
