In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import zipfile

import numpy as np
import gzip
from io import StringIO;
import os

import torch 
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import seaborn as sns 

from os import listdir
from os.path import isfile, join
import gzip
import shutil
import pickle
import random

In [2]:
# Directory holding the per-user data files. The trailing slash is required:
# file paths are built from this value by plain string concatenation/formatting.
root_dir = "ExtraSensory.per_uuid_features_labels/"
def parse_header_of_csv(csv_str):
    """Split the header line of a features/labels CSV into feature and label names.

    Parameters
    ----------
    csv_str : bytes
        Raw CSV content. Only the text before the first newline is inspected.

    Returns
    -------
    (feature_names, label_names) : tuple of two lists of bytes
        ``feature_names`` are the columns between ``timestamp`` and the first
        ``label:`` column. ``label_names`` are the following columns up to (but
        excluding) the trailing ``label_source`` column, with the leading
        ``label:`` prefix removed.

    Raises
    ------
    AssertionError
        If the first column is not ``timestamp``, the last column is not
        ``label_source``, or a column in the label range lacks the prefix.
    ValueError
        If the content has no newline, or the header has no ``label:`` column.
    """
    label_prefix = b'label:'

    # Isolate the headline columns:
    headline = csv_str[:csv_str.index(b'\n')]
    columns = headline.split(b',')

    # The first column should be timestamp:
    assert columns[0] == b'timestamp'

    # The last column should be label_source:
    assert columns[-1] == b'label_source'

    # Search for the column of the first label. A header without any label
    # column is reported explicitly instead of failing later on an unbound name.
    first_label_ind = next(
        (ci for (ci, col) in enumerate(columns) if col.startswith(label_prefix)),
        None,
    )
    if first_label_ind is None:
        raise ValueError("CSV header contains no 'label:' column")

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]

    # Then come the labels, till the one-before-last column. Only the leading
    # prefix is stripped, so a label whose own name contains 'label:' survives.
    label_names = []
    for label in columns[first_label_ind:-1]:
        assert label.startswith(label_prefix)
        label_names.append(label[len(label_prefix):])

    return (feature_names, label_names)

def parse_body_of_csv(csv_str,n_features):
    """Parse the numeric body of a features/labels CSV.

    Parameters
    ----------
    csv_str : bytes
        Raw CSV content, UTF-8 encoded, whose first line is the header.
    n_features : int
        Number of feature columns that follow the timestamp column.

    Returns
    -------
    (X, Y, M, timestamps)
        ``X``: float matrix of the feature columns.
        ``Y``: boolean matrix, True where the label value is greater than 0.
        ``M``: boolean matrix, True where the label value is NaN (missing).
        ``timestamps``: integer vector taken from the first column.
        The last column of the file is not included in any of them.
    """
    # Read the entire CSV body into a single numeric matrix. ndmin=2 keeps the
    # result two-dimensional even when the file holds a single data row;
    # otherwise loadtxt returns a 1-D array and the column slicing below fails.
    full_table = np.loadtxt(
        StringIO(csv_str.decode("utf-8")),
        delimiter=',',
        skiprows=1,
        ndmin=2,
    )

    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int)

    # Read the sensor features:
    X = full_table[:,1:(n_features+1)]

    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]  # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat)  # M is the missing label matrix
    Y = np.where(M,0,trinary_labels_mat) > 0.  # Y is the label matrix

    return (X,Y,M,timestamps)

def read_user_data(uuid):
    """Read the data (precomputed sensor-features and labels) for a user.

    This function assumes the user's data file is present.

    The gzip-compressed CSV named after ``uuid`` under ``root_dir`` is read
    fully into memory, then parsed with ``parse_header_of_csv`` and
    ``parse_body_of_csv``.

    Returns the tuple ``(X, Y, M, timestamps, feature_names, label_names)``.
    """
    gz_path = '%s%s.features_labels.csv.gz' % (root_dir, uuid)

    # Pull the whole compressed file into memory as bytes.
    with gzip.open(gz_path, 'rb') as handle:
        raw_csv = handle.read()

    feature_names, label_names = parse_header_of_csv(raw_csv)
    X, Y, M, timestamps = parse_body_of_csv(raw_csv, len(feature_names))

    return (X, Y, M, timestamps, feature_names, label_names)

# Collect the uncompressed per-user CSV files found in root_dir.
# os.listdir returns entries in arbitrary order, so the listing is sorted:
# the files are concatenated in this order and the rows are later split into
# train/test by position, which would otherwise differ between machines.
onlyfiles = sorted(listdir(root_dir))
final_files = [name for name in onlyfiles if ".csv" in name and ".gz" not in name]

In [3]:
# Column layout taken from every file: columns 1-224 are used as sensor
# features, column 225 is skipped, and the label columns run from column 226 up
# to (not including) the last column. Column 0 is presumably the timestamp.
FEATURE_COLS = slice(1, 225)
LABEL_COLS = slice(226, -1)

if not final_files:
    raise FileNotFoundError("no uncompressed .csv files found in " + root_dir)

# Collect the per-file slices and concatenate once at the end. DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop copies all
# previously loaded rows on every iteration.
sensor_parts = []
labels_parts = []
for file_name in final_files:
    raw_d = pd.read_csv('./' + root_dir + file_name)
    sensor_parts.append(raw_d.iloc[:, FEATURE_COLS])
    labels_parts.append(raw_d.iloc[:, LABEL_COLS])

sensor_data = pd.concat(sensor_parts, ignore_index=True)
labels_data = pd.concat(labels_parts, ignore_index=True)


In [4]:
# Replace every missing value with 0 in both tables. For the labels this makes
# a missing entry indistinguishable from a 0 entry; for the sensors the zeros
# take part in the mean/std computed later during normalization.
sensor_data = sensor_data.fillna(0)
labels_data = labels_data.fillna(0)

In [5]:
# Per-column statistics recorded by the most recent normalize_data() call, in
# column order. normalize_test_data() reads them by position.
per_col_mean = []
per_col_std = []
def normalize_data(data):
    """Standardize each column of ``data`` and record the statistics used.

    Every column with a non-zero standard deviation becomes
    ``(column - mean) / std``; a column whose standard deviation is exactly 0
    is copied unchanged. The mean and standard deviation of every column are
    stored, in column order, in the module-level ``per_col_mean`` and
    ``per_col_std`` lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalize. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``data``.
    """
    # Start from a clean slate. Without this, a second call (for example when
    # the cell is re-run) appends after the old entries, and
    # normalize_test_data keeps reading the stale statistics at the front.
    # The lists are cleared in place so existing references stay valid.
    per_col_mean.clear()
    per_col_std.clear()

    # Collect the columns first and build the frame once; inserting hundreds
    # of columns one at a time fragments the DataFrame.
    normalized = {}
    for col in data.columns:
        col_mean = data[col].mean()
        col_std = data[col].std()
        per_col_mean.append(col_mean)
        per_col_std.append(col_std)
        if col_std != 0:
            normalized[col] = (data[col] - col_mean)/col_std
        else:
            normalized[col] = data[col]
    return pd.DataFrame(normalized)
def normalize_test_data(data):
    """Normalize ``data`` with the statistics stored in ``per_col_mean``/``per_col_std``.

    The i-th column of ``data`` is paired with the i-th stored mean and
    standard deviation, so the columns must be in the same order as the frame
    the statistics were computed from. A column whose stored standard
    deviation is exactly 0 is copied unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalize. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``data``.

    Raises
    ------
    IndexError
        If fewer statistics are stored than ``data`` has columns.
    """
    # Collect the columns first and build the frame once; inserting hundreds
    # of columns one at a time fragments the DataFrame.
    normalized = {}
    for i, col in enumerate(data.columns):
        col_mean = per_col_mean[i]
        col_std = per_col_std[i]
        if col_std != 0:
            normalized[col] = (data[col] - col_mean)/col_std
        else:
            normalized[col] = data[col]
    return pd.DataFrame(normalized)


In [6]:
# Seed Python's built-in `random` module. This does not seed NumPy or PyTorch.
random.seed(1346578)

In [7]:
# Total number of rows across all loaded files.
len(sensor_data)

377346

In [8]:
# Index labels of the rows whose label values sum to 0, i.e. rows without any
# positive label. Computed in one vectorized pass rather than a Python-level
# loop over several hundred thousand rows.
row_label_sums = labels_data.sum(axis=1)
empty_label_index = row_label_sums.index[row_label_sums == 0].tolist()

In [9]:
# Remove the rows without any positive label from both tables so they stay
# row-aligned. drop() keeps the original index labels, so the index is no
# longer contiguous afterwards.
labels_data = labels_data.drop(empty_label_index)
sensor_data = sensor_data.drop(empty_label_index)
print(len(sensor_data))

326687


In [10]:
# Share of rows that were dropped for having no positive label. The counts are
# derived from the live data (rows kept + rows dropped) instead of being copied
# by hand from earlier cell outputs.
n_rows_kept = len(sensor_data)
n_rows_total = n_rows_kept + len(empty_label_index)
print("Empty labels percentage:",(1-n_rows_kept/n_rows_total)*100," %")

Empty labels percentage: 13.425079370127147  %


In [11]:
# Positional split: the first two thirds of the rows are used for training and
# the remainder for testing. Rows keep their loading order.
dataset_size = len(sensor_data)
split_at = int(2*dataset_size/3)
all_indices = list(range(dataset_size))
# random.shuffle(all_indices) #pick random training samples
row_train = all_indices[:split_at]
row_test = all_indices[split_at:]

# The labels are split with the same positions as the features.
labels_train = labels_data.iloc[row_train,:]
labels_test = labels_data.iloc[row_test,:]

# Normalization statistics come from the training rows only and are then
# reused for the test rows, so normalize_data must run first.
train_data = normalize_data(sensor_data.iloc[row_train,:])
test_data = normalize_test_data(sensor_data.iloc[row_test,:])

In [12]:
# Bundle the split as float32 NumPy arrays, keyed by role.
extra_sens_data_set = {
    "X_train": train_data.to_numpy(dtype=np.float32),
    "X_test": test_data.to_numpy(dtype=np.float32),
    "Y_train": labels_train.to_numpy(dtype=np.float32),
    "Y_test": labels_test.to_numpy(dtype=np.float32),
}


In [18]:
# with open("./extra_sens_data_set.pkl", "wb") as f:
#     pickle.dump(extra_sens_data_set,f)

# Length of one flattened feature vector; equals input_shape_s1 * input_shape_s2.
input_shape = 224
# Rows and columns of the 2-D grid a feature vector is reshaped to before the
# dataset transform is applied.
input_shape_s1 = 14
input_shape_s2 = 16
class SensorDataset(Dataset):
    """Dataset pairing the rows of a feature array with the rows of a label array."""

    def __init__(self, sensor_data, transform, labels_data):
        """Store the feature array, the optional transform and the label array.

        ``sensor_data`` and ``labels_data`` are indexed as ``array[idx, :]``;
        ``labels_data`` rows are handed to ``torch.from_numpy``, so it is
        expected to be a NumPy array. ``transform`` may be falsy to disable it.
        """
        self.labels_data = labels_data
        self.transform = transform
        self.sensor_data = sensor_data

    def __len__(self):
        """Return the number of samples, taken from the feature array."""
        return len(self.sensor_data)

    def __getitem__(self, idx):
        """Return ``(sample, labels)`` for position ``idx``.

        A tensor index is first converted with ``tolist()``. When a transform
        is set, the feature row is reshaped to
        ``(input_shape_s1, input_shape_s2)`` and passed through it; otherwise
        the raw row is returned. The labels are returned as a tensor created
        with ``torch.from_numpy``.
        """
        row = idx.tolist() if torch.is_tensor(idx) else idx

        features = self.sensor_data[row, :]
        targets = self.labels_data[row, :]

        if not self.transform:
            return features, torch.from_numpy(targets)

        # The transform receives the feature row laid out as a 2-D grid.
        grid = features.reshape((input_shape_s1, input_shape_s2))
        features = self.transform(grid)
        return features, torch.from_numpy(targets)

In [19]:
def create_train_test_sets(esds):
    """Wrap the arrays in ``esds`` as datasets with single-sample, unshuffled loaders.

    ``esds`` must provide the keys ``"X_train"``, ``"Y_train"``, ``"X_test"``
    and ``"Y_test"``. Both datasets share one ``ToTensor`` transform.

    Returns ``(train_dataset, test_dataset, train_loader, test_loader)``.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])

    def build(features_key, labels_key):
        # One dataset plus its loader; batch_size=1 and no shuffling keep the
        # loader order identical to the row order of the arrays.
        dataset = SensorDataset(esds[features_key], transform=to_tensor, labels_data=esds[labels_key])
        loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
        return dataset, loader

    train_set, train_batches = build("X_train", "Y_train")
    test_set, test_batches = build("X_test", "Y_test")

    return train_set, test_set, train_batches, test_batches
# NOTE: this rebinds train_data/test_data from the normalized DataFrames to
# SensorDataset objects. The cell that calls train_data.to_numpy() therefore
# cannot be re-run afterwards without first re-running the split cell.
train_data, test_data, train_loader, test_loader = create_train_test_sets(esds=extra_sens_data_set)

In [20]:
# Preallocate one slot of shape (1, width) per loader batch. The feature width
# reuses input_shape (the same constant the copy loop flattens to), and the
# label width is read from the label array instead of being hard-coded.
n_label_columns = extra_sens_data_set["Y_train"].shape[1]

X_train_raw = torch.zeros(len(train_loader),1,input_shape)
X_test_raw = torch.zeros(len(test_loader),1,input_shape)

Y_train_encd = torch.zeros(len(train_loader),1,n_label_columns)
Y_test_encd = torch.zeros(len(test_loader),1,n_label_columns)

def generate_featuers():
    """Copy every batch of ``train_loader`` and ``test_loader`` into the preallocated tensors.

    Batch number ``k`` of a loader is flattened to ``[N, input_shape]`` and
    written to slot ``k`` of ``X_train_raw`` / ``X_test_raw``; its labels are
    written to slot ``k`` of ``Y_train_encd`` / ``Y_test_encd``. No model is
    involved: the features are stored as delivered by the loader. Nothing is
    returned; the module-level tensors are filled in place.
    """
    def copy_all(loader, feature_store, label_store):
        for slot, (batch_features, y_obs) in enumerate(loader):
            # Reshape mini-batch data to [N, input_shape] matrix
            feature_store[slot,:,:] = batch_features.view(-1, input_shape)
            label_store[slot,:,:] = y_obs

    with torch.no_grad():
        # Training split first, then the test split.
        copy_all(train_loader, X_train_raw, Y_train_encd)
        copy_all(test_loader, X_test_raw, Y_test_encd)

generate_featuers()

In [21]:
# Gather the filled tensors under the same keys as extra_sens_data_set, then
# pickle the dictionary to disk.
extra_sens_data_set_with_features = {
    "X_train": X_train_raw,
    "X_test": X_test_raw,
    "Y_train": Y_train_encd,
    "Y_test": Y_test_encd,
}

with open("./extra_sens_data_set_with_features.pkl", "wb") as f:
    pickle.dump(extra_sens_data_set_with_features, f)