<a href="https://colab.research.google.com/github/bhavanabalraj/Platys-AL/blob/main/ES-Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [173]:
import gzip
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch import optim

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [174]:
def parse_header_of_csv(user_context_data):

    columns = list(user_context_data.columns)
    #print(user_context_data.columns)

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1]

    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:')
        label_names[li] = label.replace('label:','')
        pass
    
    return (feature_names,label_names);

In [175]:
def parse_body_of_csv(user_context_data,n_features):
    # # Read the entire CSV body into a single numeric matrix:
    # full_table = np.loadtxt(StringIO.StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = user_context_data['timestamp']
    
    # Read the sensor features:
    X = user_context_data.iloc[:,1:(n_features+1)]
    
    # Read the binary label values, and the 'missing label' indicators:
    # trinary_labels_mat = user_context_data[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    # M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    # Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix

    Y = user_context_data.iloc[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(Y)

    return (X,Y,M,timestamps)

In [176]:
def read_user_data(uuid):
    user_data_file = '/content/drive/MyDrive/ESDataset/%s.features_labels.csv.gz' % uuid;

    # # Read the entire csv file of the user:
    # with gzip.open(user_data_file,'rb') as fid:
    #     csv_str = fid.read_csv();
    #     pass;

    # (feature_names,label_names) = parse_header_of_csv(csv_str);
    # n_features = len(feature_names);
    # (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);

    # return (X,Y,M,timestamps,feature_names,label_names);

    user_context_data = pd.read_csv(user_data_file, compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False)

    (feature_names,label_names) = parse_header_of_csv(user_context_data)
    n_features = len(feature_names)
    (X,Y,M,timestamps) = parse_body_of_csv(user_context_data,n_features)

    print("Total features: ", n_features)
    return (X,Y,M,timestamps,feature_names,label_names)

    #return user_context_data

In [177]:
uuid_1 = '1155FF54-63D3-4AB2-9863-8385D0BD0A13';
(X1,Y1,timestamps1,M1,feature_names,label_names) = read_user_data(uuid_1)

uuid_2 = '0A986513-7828-4D53-AA1F-E02D6DF9561B';
(X2,Y2,timestamps2,M2,feature_names,label_names) = read_user_data(uuid_2)

X = X1.append(X2)
Y = Y1.append(Y2)
timestamps = timestamps1.append(timestamps2)
M = M1.append(M2)

# multi_label = Y[(Y['label:SITTING'] == 1.0) & (Y['label:AT_SCHOOL'] == 1.0)]

Total features:  225
Total features:  225


In [178]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
# Classifier
class Network(nn.Module):
    
    def __init__(self):
        
        super(Network, self).__init__()
        # Inputs to hidden layer linear transformation
        self.h1 = nn.Linear(225, 64)   
        self.activation = nn.ReLU()           
        self.output = nn.Linear(64, 51)
        
    def forward(self, x):
               
        # Hidden layer with Leaky ReLU activation
        x = self.activation(self.h1(x))
        
        # Output layer with sigmoid activation
        #x = F.softmax(self.output(x), dim=1)
        # x = torch.sigmoid(self.output(x))
        x = self.output(x) 
        return x

# Create model params and config
model = Network()

criterion = nn.BCEWithLogitsLoss()
# criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

epochs = 250
train_losses, test_losses = [], []
batch_size = 32

#Train model
def trainAndTestModel(X_train, y_train, X_test, y_test):
    
    trainLoaderSize = len(X_train)/batch_size
    
    for e in range(epochs):
        
        running_loss = 0 
        
        permutation = torch.randperm(X_train.size()[0])
        
        for i in range(0, X_train.size()[0], batch_size):
            
            optimizer.zero_grad()
            
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train[indices], y_train[indices]
            
            log_ps = model(batch_x)          
            loss = criterion(log_ps.squeeze(), batch_y)
            loss.backward()
            optimizer.step()
        
            running_loss += loss.item()
         
        else:
        
            train_losses.append(running_loss/trainLoaderSize)
            print("Epoch: {}/{}".format(e+1, epochs))   
            print("Training Loss: {:.3f}".format(running_loss/trainLoaderSize))
            validateModel(X_test, y_test)    

In [180]:
def validateModel(X_test, y_test):
            
        test_loss = 0
        accuracy = 0
        total = 0.
        correct = 0.
        
        testLoaderSize = len(X_test)/batch_size

        #permutation = torch.randperm(X_test.size()[0])
    
        for i in range(0, X_test.size()[0], batch_size):
        
            # Turn off gradients for validation, saves memory and computations
            with torch.no_grad():
                
                model.eval()
                #indices = X_test[i:i+batch_size]
                #indices = torch.range(i, i+batch_size, dtype=torch.long)
                #batch_x, batch_y = X_test[indices], y_test[indices]
                batch_x, batch_y = X_test[i:i+batch_size], y_test[i:i+batch_size]
                log_ps = model(batch_x)
                probs = torch.sigmoid(log_ps).cpu()
                probs = np.round(probs)
                # total += batch_y.size(0)
                # correct += (probs == batch_y).sum().item()
                # accuracy = correct / total
                test_loss += criterion(log_ps.squeeze(), batch_y)
                
                # ps = torch.exp(log_ps)

                # top_p, top_class = ps.topk(1, dim=1)
                # equals = top_class == batch_y.view(*top_class.shape)
                # accuracy += torch.mean(equals.type(torch.FloatTensor))

                # Naive accuracy (correct classification rate):
                accuracy = torch.mean((probs == batch_y).float())
                
                # Count occorrences of true-positive, true-negative, false-positive, and false-negative:
                tp = torch.sum(np.logical_and(probs, batch_y));
                tn = torch.sum(np.logical_and(np.logical_not(probs),np.logical_not(batch_y)));
                fp = torch.sum(np.logical_and(probs,np.logical_not(batch_y)));
                fn = torch.sum(np.logical_and(np.logical_not(probs),batch_y));
                
                # Sensitivity (=recall=true positive rate) and Specificity (=true negative rate):
                sensitivity = float(tp) / (tp+fn);
                specificity = float(tn) / (tn+fp);
                
                # Balanced accuracy is a more fair replacement for the naive accuracy:
                balanced_accuracy = (sensitivity + specificity) / 2.;
                
                # Precision:
                # Beware from this metric, since it may be too sensitive to rare labels.
                # In the ExtraSensory Dataset, there is large skew among the positive and negative classes,
                # and for each label the pos/neg ratio is different.
                # This can cause undesirable and misleading results when averaging precision across different labels.
                precision = float(tp) / (tp+fp);
                
                print("-"*10);
                print('Accuracy*:         %.2f' % accuracy);
                print('Sensitivity (TPR): %.2f' % sensitivity);
                print('Specificity (TNR): %.2f' % specificity);
                print('Balanced accuracy: %.2f' % balanced_accuracy);
                print('Precision**:       %.2f' % precision);
                print("-"*10);
      
        model.train()
        
        test_losses.append(test_loss/testLoaderSize)

        print("Test Loss: {:.3f}.. ".format(test_loss/testLoaderSize),
             "Test Accuracy: {:.3f}".format(accuracy/testLoaderSize))


In [182]:
# Prepare training and test data
# M = np.isnan(Y)

# label_ind = label_names.index('SITTING')
# missing_label = M.iloc[:,label_ind]
# existing_label = np.logical_not(missing_label)

# # Select only the examples that are not missing the target label:
# X = X[existing_label.values]
# Y = Y[existing_label.values]

# Also, there may be missing sensor-features (represented in the data as NaN).
# You can handle those by imputing a value of zero (since we standardized, this is equivalent to assuming average value).
# You can also further select examples - only those that have values for all the features.
# For this tutorial, let's use the simple heuristic of zero-imputation:
X[np.isnan(X)] = 0.
Y[np.isnan(Y)] = 0.

X_train = torch.tensor(X[:2000].values)
# y_train = torch.tensor(Y.iloc[:2000,label_ind].values, dtype=torch.float32).flatten()
y_train = torch.tensor(Y.iloc[:2000].values, dtype=torch.float32)
# print(y_train)


X_test = torch.tensor(X[2001:].values)
# y_test = torch.tensor(Y.iloc[2001:,label_ind].values, dtype=torch.float32).flatten()
y_test = torch.tensor(Y.iloc[2001:].values, dtype=torch.float32)


# Train and Evaluate Model
trainAndTestModel(X_train.float(), y_train, X_test.float(), y_test)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Accuracy*:         0.73
Sensitivity (TPR): 0.50
Specificity (TNR): 0.73
Balanced accuracy: 0.62
Precision**:       0.07
----------
----------
Accuracy*:         0.69
Sensitivity (TPR): 0.27
Specificity (TNR): 0.72
Balanced accuracy: 0.49
Precision**:       0.05
----------
----------
Accuracy*:         0.68
Sensitivity (TPR): 0.31
Specificity (TNR): 0.73
Balanced accuracy: 0.52
Precision**:       0.12
----------
----------
Accuracy*:         0.78
Sensitivity (TPR): 0.37
Specificity (TNR): 0.80
Balanced accuracy: 0.58
Precision**:       0.08
----------
----------
Accuracy*:         0.81
Sensitivity (TPR): 0.24
Specificity (TNR): 0.85
Balanced accuracy: 0.54
Precision**:       0.09
----------
----------
Accuracy*:         0.86
Sensitivity (TPR): 0.56
Specificity (TNR): 0.86
Balanced accuracy: 0.71
Precision**:       0.09
----------
----------
Accuracy*:         0.89
Sensitivity (TPR): 0.62
Specificity (TNR): 0.90
Balanced ac

KeyboardInterrupt: ignored

In [None]:
A = torch.tensor([1,0,1,1])
B = torch.tensor([1,0,0,1])

ans = (A == B).sum().item()
print(ans)