In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import gzip
import io
import os
import pandas as pd

In [2]:
os.chdir('/Users/arianbeckmann/Desktop/Uni/Info/MU-Praktikum')
os.getcwd()

'/Users/arianbeckmann/Desktop/Uni/Info/MU-Praktikum'

In [3]:
def parse_header_of_csv(csv_str):
    # Isolate the headline columns:
    headline = csv_str[:csv_str.index('\n')]
    columns = headline.split(',');

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:','');
        pass;
    
    return (feature_names,label_names);

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(io.StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int);
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)];
    
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix
    
    return (X,Y,M,timestamps);

def read_user_data(file_path):

    # Read the entire csv file of the user:
    with gzip.open(file_path,'rb') as fid:
        csv_str = fid.read();
        csv_str = csv_str.decode("utf-8") 
        pass;

    (feature_names,label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);

    return (X,Y,M,timestamps,feature_names,label_names);

In [13]:
file_path = 'data/features_labels/1155FF54-63D3-4AB2-9863-8385D0BD0A13.features_labels.csv.gz'
file_path_2 = 'data/features_labels/8023FE1A-D3B0-4E2C-A57A-9321B7FC755F.features_labels.csv.gz'

(X,Y,M,timestamps,feature_names,label_names) = read_user_data(file_path);
(X_2,Y_2,M_2,timestamps_2,feature_names_2,label_names_2) = read_user_data(file_path);


In [14]:
print(X_2)
print(X_2.shape)

[[ 1.011438  0.012573  0.023013 ...  1.        0.        0.      ]
 [ 1.011233  0.009356 -0.005622 ...  1.        0.        0.      ]
 [ 1.013422  0.018068 -0.008593 ...  1.        0.        0.      ]
 ...
 [ 0.996011  0.039554 -0.034651 ...  0.        0.        0.      ]
 [ 1.079283  0.404817  0.330815 ...  0.        0.        0.      ]
 [ 1.093802  0.496113  0.392577 ...  0.        0.        0.      ]]
(2685, 225)


In [15]:
feature_names == feature_names_2

True

In [16]:
label_names == label_names_2

True

In [18]:
timestamps == timestamps_2

array([ True,  True,  True, ...,  True,  True,  True])

In [10]:
print(Y)
print(Y.shape)

[[False  True False ... False  True False]
 [False  True False ... False  True False]
 [False  True False ... False  True False]
 ...
 [False  True False ... False False False]
 [False False  True ... False False False]
 [False False  True ... False False False]]
(2685, 51)


In [12]:
print(label_names)
print(len(label_names))

['LYING_DOWN', 'SITTING', 'FIX_walking', 'FIX_running', 'BICYCLING', 'SLEEPING', 'LAB_WORK', 'IN_CLASS', 'IN_A_MEETING', 'LOC_main_workplace', 'OR_indoors', 'OR_outside', 'IN_A_CAR', 'ON_A_BUS', 'DRIVE_-_I_M_THE_DRIVER', 'DRIVE_-_I_M_A_PASSENGER', 'LOC_home', 'FIX_restaurant', 'PHONE_IN_POCKET', 'OR_exercise', 'COOKING', 'SHOPPING', 'STROLLING', 'DRINKING__ALCOHOL_', 'BATHING_-_SHOWER', 'CLEANING', 'DOING_LAUNDRY', 'WASHING_DISHES', 'WATCHING_TV', 'SURFING_THE_INTERNET', 'AT_A_PARTY', 'AT_A_BAR', 'LOC_beach', 'SINGING', 'TALKING', 'COMPUTER_WORK', 'EATING', 'TOILET', 'GROOMING', 'DRESSING', 'AT_THE_GYM', 'STAIRS_-_GOING_UP', 'STAIRS_-_GOING_DOWN', 'ELEVATOR', 'OR_standing', 'AT_SCHOOL', 'PHONE_IN_HAND', 'PHONE_IN_BAG', 'PHONE_ON_TABLE', 'WITH_CO-WORKERS', 'WITH_FRIENDS']
51


In [8]:
print(feature_names)
print(len(feature_names))

['raw_acc:magnitude_stats:mean', 'raw_acc:magnitude_stats:std', 'raw_acc:magnitude_stats:moment3', 'raw_acc:magnitude_stats:moment4', 'raw_acc:magnitude_stats:percentile25', 'raw_acc:magnitude_stats:percentile50', 'raw_acc:magnitude_stats:percentile75', 'raw_acc:magnitude_stats:value_entropy', 'raw_acc:magnitude_stats:time_entropy', 'raw_acc:magnitude_spectrum:log_energy_band0', 'raw_acc:magnitude_spectrum:log_energy_band1', 'raw_acc:magnitude_spectrum:log_energy_band2', 'raw_acc:magnitude_spectrum:log_energy_band3', 'raw_acc:magnitude_spectrum:log_energy_band4', 'raw_acc:magnitude_spectrum:spectral_entropy', 'raw_acc:magnitude_autocorrelation:period', 'raw_acc:magnitude_autocorrelation:normalized_ac', 'raw_acc:3d:mean_x', 'raw_acc:3d:mean_y', 'raw_acc:3d:mean_z', 'raw_acc:3d:std_x', 'raw_acc:3d:std_y', 'raw_acc:3d:std_z', 'raw_acc:3d:ro_xy', 'raw_acc:3d:ro_xz', 'raw_acc:3d:ro_yz', 'proc_gyro:magnitude_stats:mean', 'proc_gyro:magnitude_stats:std', 'proc_gyro:magnitude_stats:moment3', '

In [148]:
X.shape # X feature matrix

(2685, 225)

In [149]:
len(label_names)

51

In [150]:
Y.shape # Y binary label matrix

(2685, 51)

In [156]:
sum(sum(Y+M))

41225

In [163]:
n_examples_per_label = np.sum(Y,axis=0);
labels_and_counts = zip(label_names,n_examples_per_label);
sorted_labels_and_counts = sorted(labels_and_counts,reverse=True,key=lambda pair:pair[1]);
print("How many examples does this user have for each contex-label:")
print ("-"*20)
for (label,count) in sorted_labels_and_counts:
    print("label " + str(label) + " " + str(count) + " minutes")
    pass;

How many examples does this user have for each contex-label:
--------------------
label PHONE_ON_TABLE 2076 minutes
label SITTING 1543 minutes
label OR_indoors 1455 minutes
label LOC_home 1438 minutes
label LYING_DOWN 748 minutes
label TALKING 653 minutes
label SLEEPING 633 minutes
label LOC_main_workplace 591 minutes
label PHONE_IN_POCKET 394 minutes
label EATING 310 minutes
label WATCHING_TV 257 minutes
label SURFING_THE_INTERNET 242 minutes
label OR_standing 231 minutes
label FIX_walking 158 minutes
label OR_outside 126 minutes
label WITH_FRIENDS 109 minutes
label PHONE_IN_HAND 106 minutes
label COMPUTER_WORK 85 minutes
label WITH_CO-WORKERS 82 minutes
label DRESSING 57 minutes
label COOKING 47 minutes
label WASHING_DISHES 46 minutes
label ON_A_BUS 42 minutes
label GROOMING 36 minutes
label DRIVE_-_I_M_THE_DRIVER 35 minutes
label TOILET 35 minutes
label AT_SCHOOL 32 minutes
label IN_A_CAR 30 minutes
label DRINKING__ALCOHOL_ 28 minutes
label IN_A_MEETING 21 minutes
label DRIVE_-_I_M_

In [164]:
def get_label_pretty_name(label):
    if label == 'FIX_walking':
        return 'Walking';
    if label == 'FIX_running':
        return 'Running';
    if label == 'LOC_main_workplace':
        return 'At main workplace';
    if label == 'OR_indoors':
        return 'Indoors';
    if label == 'OR_outside':
        return 'Outside';
    if label == 'LOC_home':
        return 'At home';
    if label == 'FIX_restaurant':
        return 'At a restaurant';
    if label == 'OR_exercise':
        return 'Exercise';
    if label == 'LOC_beach':
        return 'At the beach';
    if label == 'OR_standing':
        return 'Standing';
    if label == 'WATCHING_TV':
        return 'Watching TV'
    
    if label.endswith('_'):
        label = label[:-1] + ')';
        pass;
    
    label = label.replace('__',' (').replace('_',' ');
    label = label[0] + label[1:].lower();
    label = label.replace('i m','I\'m');
    return label;

In [167]:
print ("How many examples does this user have for each contex-label:")
print ("-"*20)
for (label,count) in sorted_labels_and_counts:
    label = get_label_pretty_name(label)
    print ("{} - {} minutes".format(label,count))
    pass;

How many examples does this user have for each contex-label:
--------------------
Phone on table - 2076 minutes
Sitting - 1543 minutes
Indoors - 1455 minutes
At home - 1438 minutes
Lying down - 748 minutes
Talking - 653 minutes
Sleeping - 633 minutes
At main workplace - 591 minutes
Phone in pocket - 394 minutes
Eating - 310 minutes
Watching TV - 257 minutes
Surfing the internet - 242 minutes
Standing - 231 minutes
Walking - 158 minutes
Outside - 126 minutes
With friends - 109 minutes
Phone in hand - 106 minutes
Computer work - 85 minutes
With co-workers - 82 minutes
Dressing - 57 minutes
Cooking - 47 minutes
Washing dishes - 46 minutes
On a bus - 42 minutes
Grooming - 36 minutes
Drive - I'm the driver - 35 minutes
Toilet - 35 minutes
At school - 32 minutes
In a car - 30 minutes
Drinking (alcohol) - 28 minutes
In a meeting - 21 minutes
Drive - I'm a passenger - 20 minutes
Bathing - shower - 11 minutes
Strolling - 6 minutes
Singing - 6 minutes
Shopping - 5 minutes
At a restaurant - 3 min