In [27]:
import pandas as pd
import numpy as np

In [15]:
# Read one raw signal file (y axis of total acceleration, training split)
# to check its layout before writing the loading helpers.
filepath = '~/Desktop/UCI HAR Dataset/train/Inertial Signals/total_acc_y_train.txt'
# sep=r'\s+' is the supported replacement for delim_whitespace=True,
# which is deprecated (FutureWarning since pandas 2.2).
df = pd.read_csv(filepath, header=None, sep=r'\s+')
df.values.shape

(7352, 128)

In [17]:
# load a single txt file as a numpy array
def load_file(filepath):
    """Read one whitespace-delimited, header-less text file into a numpy array.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    numpy.ndarray
        The parsed table (``DataFrame.values``), one row per line of the file.
    """
    # sep=r'\s+' is the supported replacement for delim_whitespace=True,
    # which is deprecated (FutureWarning since pandas 2.2).
    df = pd.read_csv(filepath, header=None, sep=r'\s+')
    return df.values

In [19]:
# the helper should give the same (rows, columns) shape as the direct read
np.shape(load_file(filepath))

(7352, 128)

In [28]:
# load a group of files, such as x y and z data for a given variable
def load_group(filenames, directory=''):
    """Load several files and stack them along a third axis.

    Each name in ``filenames`` is appended to ``directory`` by plain string
    concatenation and read with ``load_file``. The per-file arrays are then
    combined with ``np.dstack``, so each file becomes one slice along the
    last axis of the result.
    """
    per_file = [load_file(directory + fname) for fname in filenames]
    # dstack puts the individual files (features) in the 3rd dimension
    return np.dstack(per_file)

In [29]:
# stack the three total-acceleration axes of the training split into one array
train_dir = '~/Desktop/UCI HAR Dataset/train/Inertial Signals/'
filenames = [
    'total_acc_x_train.txt',
    'total_acc_y_train.txt',
    'total_acc_z_train.txt',
]
total_acc = load_group(filenames, train_dir)
total_acc.shape

(7352, 128, 3)

In [35]:
# load a dataset group, such as train or test
def load_dataset(group, main_data_dir=''):
    """Load the inertial signals and the class labels for one dataset group.

    Paths are built by string concatenation: the signals are read from
    ``main_data_dir + group + '/Inertial Signals/'`` and the labels from
    ``main_data_dir + group + '/y_' + group + '.txt'``.

    Returns a tuple ``(X, y)``: ``X`` is the result of ``load_group`` over the
    nine signal files, ``y`` is the result of ``load_file`` on the label file.
    """
    signal_dir = main_data_dir + group + '/Inertial Signals/'
    suffix = '_' + group + '.txt'
    # file order: total acceleration, body acceleration, body gyroscope,
    # each as x, y, z
    signal_files = [
        sensor + axis + suffix
        for sensor in ('total_acc_', 'body_acc_', 'body_gyro_')
        for axis in ('x', 'y', 'z')
    ]
    # input data: all 9 files stacked by load_group
    X = load_group(signal_files, signal_dir)
    # class output
    y = load_file(main_data_dir + group + '/y_' + group + '.txt')
    return X, y

In [36]:
# load the training split and report the shapes of the inputs and the labels
trainX, trainy = load_dataset(group='train', main_data_dir='~/Desktop/UCI HAR Dataset/')
print(trainX.shape, trainy.shape)

(7352, 128, 9) (7352, 1)


In [37]:
# load the test split and report the shapes of the inputs and the labels
testX, testy = load_dataset(group='test', main_data_dir='~/Desktop/UCI HAR Dataset/')
print(testX.shape, testy.shape)

(2947, 128, 9) (2947, 1)


In [64]:
# summarize the balance of class in an output variable column
def class_breakdown(data):
    """Print the count and percentage of each class found in column 0 of ``data``.

    ``data`` is wrapped in a DataFrame and only its first column (label ``0``)
    is inspected. One line is printed per distinct value, in ascending order.
    """
    labels = pd.DataFrame(data)[0]
    # absolute number of rows per class
    totals = labels.value_counts()
    # share of rows per class, scaled to percent
    shares = labels.value_counts(normalize=True) * 100
    for label in np.sort(labels.unique()):
        print('Class=%d, total=%d, percentage=%.3f' % (label, totals[label], shares[label]))

In [66]:
# class balance of the training labels
print('Train Dataset')
class_breakdown(trainy)

# class balance of the test labels
print('Test Dataset')
class_breakdown(testy)

# class balance over the train and test labels joined row-wise
print('Both')
combined = np.concatenate((trainy, testy), axis=0)
class_breakdown(combined)

Train Dataset
Class=1, total=1226, percentage=16.676
Class=2, total=1073, percentage=14.595
Class=3, total=986, percentage=13.411
Class=4, total=1286, percentage=17.492
Class=5, total=1374, percentage=18.689
Class=6, total=1407, percentage=19.138
Test Dataset
Class=1, total=496, percentage=16.831
Class=2, total=471, percentage=15.982
Class=3, total=420, percentage=14.252
Class=4, total=491, percentage=16.661
Class=5, total=532, percentage=18.052
Class=6, total=537, percentage=18.222
Both
Class=1, total=1722, percentage=16.720
Class=2, total=1544, percentage=14.992
Class=3, total=1406, percentage=13.652
Class=4, total=1777, percentage=17.254
Class=5, total=1906, percentage=18.507
Class=6, total=1944, percentage=18.876
