In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import gzip
import io
import os
import pandas as pd

In [7]:
def importData(dataPath):
    user_files = os.listdir(dataPath)
    data_list = []
    for user_file in user_files:
        user = user_file.split(".", 1)[0]
        user_file_path = dataPath + user_file
        
        user_data = read_user_data(user_file_path)
        user_data["User"] = user
        data_list.append(user_data)
    
    data = pd.concat(data_list, axis=0)
    
    return data

In [8]:
def parse_header_of_csv(csv_str):
    # Isolate the headline columns:
    headline = csv_str[:csv_str.index('\n')]
    columns = headline.split(',');

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:','');
        pass;
    
    return (feature_names,label_names);

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(io.StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int);
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)];
    
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix
    
    return (X,Y,M,timestamps);

def read_user_data(file_path):

    # Read the entire csv file of the user:
    with gzip.open(file_path,'rb') as fid:
        csv_str = fid.read();
        csv_str = csv_str.decode("utf-8") 
        pass;

    (feature_names,label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);
    
    data = np.concatenate((X,Y), axis = 1)
    data = pd.DataFrame(data, columns = feature_names + label_names)
    data["Timestamp"] = timestamps

    return data;

In [5]:
file_path = '../data/features_labels/1155FF54-63D3-4AB2-9863-8385D0BD0A13.features_labels.csv.gz'

data = read_user_data(file_path)

In [26]:
dataPath = "../data/features_labels/"

data = importData(dataPath)

In [27]:
data

Unnamed: 0,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_spectrum:log_energy_band0,...,STAIRS_-_GOING_UP,STAIRS_-_GOING_DOWN,ELEVATOR,OR_standing,AT_SCHOOL,PHONE_IN_HAND,PHONE_IN_BAG,PHONE_ON_TABLE,WITH_CO-WORKERS,WITH_FRIENDS
1446762297,1.057536,0.040597,-0.048977,0.124759,1.053158,1.057091,1.060935,0.344809,6.683838,5.043598,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762357,1.057436,0.006165,-0.009415,0.018645,1.055086,1.057279,1.060143,1.014093,6.684595,5.042748,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762417,1.056344,0.006302,-0.004635,0.013525,1.053282,1.056208,1.059165,1.429112,6.684594,5.043642,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762487,1.056874,0.004767,-0.002796,0.007088,1.053958,1.057010,1.059996,2.190168,6.684602,5.043075,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762548,1.057353,0.005415,0.006585,0.010781,1.054161,1.057100,1.059976,1.827865,6.684599,5.043392,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762608,1.056179,0.021680,-0.032814,0.066723,1.051915,1.055795,1.059721,0.693928,6.684394,5.045886,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762668,1.056575,0.011453,0.010205,0.034417,1.052840,1.056676,1.059693,0.742608,6.684553,5.043794,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762729,1.058040,0.030386,-0.031571,0.083076,1.051942,1.056722,1.060667,0.757440,6.684191,5.043711,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762789,1.056299,0.011981,0.022003,0.038909,1.052901,1.055774,1.058716,0.729410,6.684549,5.042270,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1446762848,1.057144,0.014733,-0.015455,0.037629,1.053787,1.056747,1.059711,0.821062,6.684514,5.042216,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [14]:
print(data)

            raw_acc:magnitude_stats:mean  raw_acc:magnitude_stats:std  \
1464129912                      1.011438                     0.012573   
1464129950                      1.011233                     0.009356   
1464130031                      1.013422                     0.018068   
1464130109                      1.014891                     0.016400   
1464130130                      1.017487                     0.022632   
1464130190                      1.012101                     0.007316   
1464130262                      1.011455                     0.008626   
1464130343                      1.010191                     0.008902   
1464130371                      1.011445                     0.028343   
1464130451                      1.012060                     0.008663   
1464130510                      1.012909                     0.015646   
1464130577                      1.011265                     0.008835   
1464130623                      1.011883           