In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics

In [2]:
def show_full_data(data, row_size=None, column_size=None, col_width=-1):
    """
    Display ``data`` with pandas' truncation limits overridden for this call only.

    Parameters
    ----------
    data : object
        Object handed to ``display`` (typically a DataFrame or Series).
    row_size : int or None
        Value used for ``display.max_rows``; ``None`` means no limit.
    column_size : int or None
        Value used for ``display.max_columns``; ``None`` means no limit.
    col_width : int or None
        Value used for ``display.max_colwidth``. A negative value is the
        legacy way of asking for "no limit" and is translated to ``None``
        when the installed pandas understands ``None``.
    """
    if col_width is not None and col_width < 0:
        # pandas 1.0 deprecated negative widths in favour of None and newer
        # releases reject them, while older releases only accept integers.
        # Probe once to learn which spelling this pandas version supports.
        try:
            with pd.option_context('display.max_colwidth', None):
                pass
        except ValueError:
            pass  # legacy pandas: keep the negative sentinel untouched
        else:
            col_width = None

    with pd.option_context('display.max_rows', row_size,
                           'display.max_columns', column_size,
                           'display.max_colwidth', col_width):
        display(data)

# Read Data

In [3]:
# Read the combined samples and keep everything except the first column,
# which is not needed for the analysis.
df = pd.read_csv('combined_samples/combined_data.csv').iloc[:, 1:]
df.head()

Unnamed: 0,timestamp,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,...,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
0,2013-03-27 04:00:00,0.0,,,,,,,,-77.0,...,0.0,599.0,0.0,0.0,0.0,274.0,0.0,185.0,140.0,
1,2013-03-27 04:10:00,0.0,,,,,,,,,...,0.0,600.0,0.0,0.0,0.0,557.0,1.0,42.0,0.0,
2,2013-03-27 04:20:00,0.0,,,,,,,,-71.5,...,0.0,600.0,0.0,0.0,0.0,457.0,0.0,143.0,0.0,
3,2013-03-27 04:30:00,0.0,,,,,,,,,...,208.0,600.0,0.0,0.0,0.0,564.0,1.0,35.0,0.0,
4,2013-03-27 04:40:00,200.0,,,,,,,,-62.0,...,600.0,600.0,0.0,0.0,0.0,127.0,94.0,379.0,0.0,


# Create DataFrame with Same-Length Instances

In [4]:
# Number of consecutive rows that make up one instance; passed as `length`
# to the instance-building functions in the cells below.
# NOTE(review): the data preview shows rows 10 minutes apart, so 36 rows would
# span about 6 hours — assuming that spacing holds for the whole file.
sequence_length = 36

In [5]:
def create_same_length_instances(df, length=72, label='STRESSED'):
    """
    Build one fixed-length window of rows for every labelled row of ``df``.

    Each window ends at a row whose ``label`` value is not null and contains
    the rows immediately before it. A window never reaches back to (or past)
    the previous labelled row; when fewer than ``length`` rows are available
    the window is padded at the front with all-NaN rows so that every window
    has exactly ``length`` rows and carries its label on its last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows in chronological order. A ``timestamp`` column, if
        present, is dropped.
    length : int
        Number of rows per window.
    label : str
        Name of the column whose non-null entries mark the window ends.

    Returns
    -------
    pandas.DataFrame
        All windows stacked vertically with a fresh 0..n-1 index
        (``length`` rows per labelled input row). Empty, with the input's
        columns, when ``df`` has no labelled rows.
    """
    df = df.drop(columns=['timestamp'], errors='ignore')
    # Work with positions, not index labels, because rows are sliced via iloc.
    label_positions = np.flatnonzero(df[label].notnull().values)

    windows = []
    previous = -1  # position of the previous labelled row; -1 = none yet, so row 0 is usable
    for position in label_positions:
        position = int(position)
        first = max(previous + 1, position - length + 1)
        window = df.iloc[first:position + 1].reset_index(drop=True)
        missing = length - len(window)
        if missing > 0:
            # Shift the real rows to the end, then let reindex create the
            # leading all-NaN padding rows.
            window.index = window.index + missing
            window = window.reindex(range(length))
        windows.append(window)
        previous = position

    if not windows:
        return pd.DataFrame(columns=df.columns)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(windows, ignore_index=True)

In [6]:
# Build one window of `sequence_length` rows for every labelled row of the raw data.
df_same = create_same_length_instances(df, length=sequence_length)

In [7]:
# Report how many labelled rows exist and how they split between the classes.
print('Total sample size:', df_same['STRESSED'].count(), '\n')
print('Each class size:\n' + str(df_same['STRESSED'].value_counts()))

Total sample size: 2347 

Each class size:
1.0    1614
0.0     733
Name: STRESSED, dtype: int64


In [8]:
# Preview the first rows that carry a label.
df_same.loc[df_same['STRESSED'].notnull()].head()

Unnamed: 0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,...,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
35,0.0,,,,,,,,-61.0,,...,0.0,600.0,0.0,0.0,0.0,595.0,2.0,3.0,0.0,1.0
71,0.0,,,,,,,,,,...,0.0,600.0,0.0,0.0,0.0,356.0,2.0,242.0,0.0,1.0
107,0.0,-89.0,5.618846,7.0,3.0,3.0,1.0,0.0,-84.0,8.447316,...,600.0,598.0,2.0,0.0,0.0,600.0,0.0,0.0,0.0,1.0
143,569.0,,,,,,,,,,...,600.0,600.0,0.0,0.0,0.0,49.0,510.0,41.0,0.0,1.0
179,600.0,,,,,,,,-76.0,20.126268,...,0.0,600.0,0.0,0.0,0.0,40.0,536.0,24.0,0.0,0.0


# Normalize Data

In [9]:
def normalize(df, label_col='STRESSED'):
    """
    Mean-normalize every feature column: ``(x - mean) / (max - min)``.

    The statistics are computed per column over the rows of ``df`` itself.
    The label column is passed through unchanged and the input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and ``label_col``.
    label_col : str
        Name of the column that must not be normalized.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``. Columns whose
        values are all identical come out as 0 (instead of 0/0) for their
        non-null entries.
    """
    # Use the frame that was passed in, not a notebook-level global.
    features = df.drop(columns=label_col)
    centered = features - features.mean(axis=0)
    value_range = features.max(axis=0) - features.min(axis=0)
    # A constant column has a zero range; dividing by 1 keeps it at exactly 0.
    value_range = value_range.replace(0, 1)

    result = df.copy()
    result[features.columns] = centered / value_range
    return result

In [10]:
# Normalize the feature columns of the windowed data, then preview the first
# rows with every column visible.
df_norm = normalize(df_same)
show_full_data(df_norm.head())

Unnamed: 0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
0,0.720337,-0.065832,,-0.091525,-0.062286,-0.085798,-0.002133,-0.027764,-0.003055,0.010469,0.002031,-0.012494,0.015376,-0.074376,-0.365731,-0.198378,-0.456264,0.065718,-0.067289,-0.015943,0.017532,-0.405645,0.347095,0.05857,-6.3e-05,
1,0.720337,,,,,,,,-0.023196,0.033203,0.002031,0.006374,-0.006363,-0.074376,-0.365731,-0.198378,-0.456264,0.114051,-0.067289,-0.015943,-0.030802,-0.105645,0.240428,-0.134763,-6.3e-05,
2,0.653671,-0.066242,0.079029,-0.066525,-0.062286,-0.014369,-0.002133,-0.027764,0.013729,0.031754,0.019273,-0.012494,0.037116,-0.074376,-0.365731,-0.198378,-0.456264,0.097385,-0.058956,-0.015943,-0.022468,-0.013978,0.138761,-0.124763,-6.3e-05,
3,0.122004,,,,,,,,-0.005069,0.004229,0.122721,0.025242,0.102333,0.198351,-0.365731,-0.198378,-0.456264,-0.660949,0.702711,-0.010943,-0.030802,-0.313978,0.067095,0.246904,-6.3e-05,
4,0.720337,-0.066053,0.086446,-0.041525,-0.062286,-0.014369,-0.002133,-0.001449,0.007015,0.014397,0.105479,0.04411,0.058855,0.016533,-0.365731,-0.198378,-0.456264,0.114051,-0.067289,-0.015943,-0.030802,-0.407312,0.573761,-0.16643,-6.3e-05,


# Fill Empty Values

In [11]:
def fill_nulls(df, label_col='STRESSED'):
    """
    Replace missing feature values with 0 while leaving the label column as is.

    Missing labels must stay missing because later steps use non-null labels
    to locate the end of each instance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and, optionally, ``label_col``.
    label_col : str
        Name of the column that must not be filled.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    feature_cols = [col for col in df.columns if col != label_col]
    filled = df.copy()
    if feature_cols:
        filled[feature_cols] = filled[feature_cols].fillna(0)
    return filled

In [12]:
# Fill the missing feature values of the normalized data, then preview the
# first rows with every column visible.
df_filled = fill_nulls(df_norm)
show_full_data(df_filled.head())

Unnamed: 0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,wifi_total_devices_around,wifi_total_far,wifi_total_near,wifi_total_nearer,phone_in_dark,phone_charging,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
0,0.720337,-0.065832,0.0,-0.091525,-0.062286,-0.085798,-0.002133,-0.027764,-0.003055,0.010469,0.002031,-0.012494,0.015376,-0.074376,-0.365731,-0.198378,-0.456264,0.065718,-0.067289,-0.015943,0.017532,-0.405645,0.347095,0.05857,-6.3e-05,
1,0.720337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.023196,0.033203,0.002031,0.006374,-0.006363,-0.074376,-0.365731,-0.198378,-0.456264,0.114051,-0.067289,-0.015943,-0.030802,-0.105645,0.240428,-0.134763,-6.3e-05,
2,0.653671,-0.066242,0.079029,-0.066525,-0.062286,-0.014369,-0.002133,-0.027764,0.013729,0.031754,0.019273,-0.012494,0.037116,-0.074376,-0.365731,-0.198378,-0.456264,0.097385,-0.058956,-0.015943,-0.022468,-0.013978,0.138761,-0.124763,-6.3e-05,
3,0.122004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.005069,0.004229,0.122721,0.025242,0.102333,0.198351,-0.365731,-0.198378,-0.456264,-0.660949,0.702711,-0.010943,-0.030802,-0.313978,0.067095,0.246904,-6.3e-05,
4,0.720337,-0.066053,0.086446,-0.041525,-0.062286,-0.014369,-0.002133,-0.001449,0.007015,0.014397,0.105479,0.04411,0.058855,0.016533,-0.365731,-0.198378,-0.456264,0.114051,-0.067289,-0.015943,-0.030802,-0.407312,0.573761,-0.16643,-6.3e-05,


# Create Tensor Instances

## X Data

In [13]:
def create_instances(df, length=72, label='STRESSED'):
    """
    Turn every labelled row of ``df`` into a sequence of feature tensors.

    For each row whose ``label`` value is not null, the ``length`` rows ending
    at that row (inclusive) are converted into a list of 1-D ``float32``
    tensors, one tensor per row, with the label column removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric features plus the ``label`` column.
    length : int
        Number of rows per instance.
    label : str
        Name of the label column.

    Returns
    -------
    list[list[torch.Tensor]]
        One inner list of ``length`` tensors per labelled row, in row order.

    Raises
    ------
    ValueError
        If a labelled row has fewer than ``length - 1`` rows before it. A
        negative start position would otherwise wrap around and silently
        pull rows from the end of the frame.
    """
    # Positions (not index labels) of the labelled rows.
    label_positions = np.flatnonzero(df[label].notnull().values)
    # Convert the whole feature matrix once instead of one iloc lookup per row.
    features = torch.from_numpy(
        df.drop(columns=[label]).values.astype(np.float32)
    )

    all_data = []
    for end in label_positions:
        end = int(end)
        start = end - length + 1
        if start < 0:
            raise ValueError(
                "labelled row at position %d has only %d preceding rows; "
                "%d are needed for a window of length %d"
                % (end, end, length - 1, length)
            )
        # clone() keeps every row tensor independent of the shared matrix.
        all_data.append([row.clone() for row in features[start:end + 1]])
    return all_data

In [14]:
# Convert every labelled window into a list of per-row feature tensors.
data = create_instances(df_filled, length=sequence_length)

In [15]:
# Sanity check: number of instances, rows per instance and features per row.
print('Total number of instances:', len(data))
print("One sample's sequence length:", len(data[0]))
print("Feature size:", data[0][0].shape)

Total number of instances: 2347
One sample's sequence length: 36
Feature size: torch.Size([25])


## y Data

In [16]:
# Labels of all labelled rows, in row order, as a plain Python list.
y = df_filled['STRESSED'].dropna().tolist()

In [17]:
# Preview the first labels (the message previously misnamed them as X).
print("First 10 values of y:", y[:10], end="")

Firts 10 values of X: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]

# Create Randomly Chosen Train and Test Data with Specific Size

In [18]:
def create_random_sets(x, y, train_size=1000, seed=1, balanced_test=False):
    """
    Randomly split ``x``/``y`` into a class-balanced training set and a test set.

    The training set holds ``int(train_size / 2)`` samples of class 1 and the
    same number of class 0, in shuffled order. For an odd ``train_size`` it
    therefore holds ``train_size - 1`` samples.

    Parameters
    ----------
    x : sequence
        Instances, aligned with ``y``.
    y : sequence
        Labels; entries equal to 1 and 0 form the two classes.
    train_size : int
        Requested number of training samples.
    seed : int
        Seed passed to ``random.seed`` (this reseeds the module-level
        generator of ``random``).
    balanced_test : bool
        If false, the test set is every sample not used for training, in
        original order. If true, the test set holds an equal number of both
        classes, sampled from the unused samples and shuffled; its per-class
        size is the size of the smaller unused class.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` as lists.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a class has fewer than
        ``int(train_size / 2)`` members.
    """
    random.seed(seed)
    one_class_size = int(train_size / 2)
    all_one_indexes = [i for i, label_value in enumerate(y) if label_value == 1]
    all_zero_indexes = [i for i, label_value in enumerate(y) if label_value == 0]
    ones = random.sample(all_one_indexes, one_class_size)
    zeros = random.sample(all_zero_indexes, one_class_size)

    # Shuffle by sampling the whole pool. Using len(pool) instead of
    # train_size keeps odd train_size values from requesting one more
    # element than the pool contains.
    train_pool = ones + zeros
    train_indexes = random.sample(train_pool, len(train_pool))
    new_x = [x[i] for i in train_indexes]
    new_y = [y[i] for i in train_indexes]

    if not balanced_test:
        used = set(train_indexes)  # set lookup instead of scanning a list per index
        test_indexes = [i for i in range(len(y)) if i not in used]
    else:
        not_used_ones = list(set(all_one_indexes) - set(ones))
        not_used_zeros = list(set(all_zero_indexes) - set(zeros))
        test_sample_size = min(len(not_used_ones), len(not_used_zeros))
        test_ones = random.sample(not_used_ones, test_sample_size)
        test_zeros = random.sample(not_used_zeros, test_sample_size)
        test_pool = test_ones + test_zeros
        test_indexes = random.sample(test_pool, len(test_pool))

    test_x = [x[i] for i in test_indexes]
    test_y = [y[i] for i in test_indexes]
    return new_x, new_y, test_x, test_y

In [19]:
# Draw 1200 class-balanced training samples and a class-balanced test set
# from the samples that were not used for training.
X_train, y_train, X_test, y_test = create_random_sets(data, y, train_size=1200, seed=1, balanced_test=True)

# LSTM

In [20]:
class LSTMStress(nn.Module):
    """
    Single-sequence LSTM classifier.

    An LSTM reads the sequence; its final hidden state is passed through
    Linear -> ReLU -> Linear to produce raw (unnormalized) scores.
    """

    def __init__(self, input_dim, hidden_dim, output_dim= 1):
        """
        Parameters
        ----------
        input_dim : int
            Number of features per time step.
        hidden_dim : int
            Size of the LSTM hidden state.
        output_dim : int
            Number of scores produced per sequence.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        # Classification head applied to the last hidden state.
        self.fc1 = nn.Linear(hidden_dim, 64)  # hidden state -> 64 units
        self.fc2 = nn.ReLU()                  # non-linearity between the linear layers
        self.fc3 = nn.Linear(64, output_dim)  # 64 units -> output scores

    def forward(self, x):
        """
        Score one sequence.

        ``x`` is reshaped to ``(len(x), 1, -1)``, i.e. it is treated as a
        single sequence (batch size 1). Returns a tensor of raw scores with
        one row; no sigmoid is applied here.
        """
        sequence = x.view(len(x), 1, -1)
        _, (last_hidden, _last_cell) = self.lstm(sequence)
        # Flatten to a single row because exactly one sequence is processed.
        flat_hidden = last_hidden.view(1, -1)
        activated = self.fc2(self.fc1(flat_hidden))
        return self.fc3(activated)

In [21]:
def create_input_for_LSTM(instance, device="cuda:0"):
    """
    Pack a list of per-step tensors into one LSTM input tensor.

    The tensors in ``instance`` are concatenated and viewed as
    ``(len(instance), 1, -1)``, moved to ``device`` and converted to
    ``float32``.
    """
    steps = len(instance)
    stacked = torch.cat(instance).view(steps, 1, -1)
    on_device = stacked.to(device)
    return on_device.type(torch.float32)

In [22]:
# logits = model(input)
@torch.no_grad()
def acc(logits, ygolds):
    """
    Fraction of correct binary predictions.

    A logit counts as a positive prediction when its sigmoid is at least
    0.5. The number of predictions matching ``ygolds`` is divided by the
    size of the first dimension of ``logits``. No gradients are recorded.
    """
    predicted = torch.sigmoid(logits) >= 0.5
    matches = (predicted == ygolds.byte()).sum().item()
    return matches / logits.shape[0]

In [23]:
# Fixed seed so that weight initialisation and training are repeatable.
SEED = 1

# Seed PyTorch's generator and the CUDA generator explicitly.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [25]:
# initialize model:
input_dim = len(X_train[0][0])
hidden_dim = 64
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = LSTMStress(input_dim, hidden_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()
epochs = 500

# Per-epoch history: mean training loss, training accuracy, test accuracy.
all_loss = []
all_train_acc = []
all_test_acc = []

for epoch in range(epochs):
    tot_loss = 0
    train_corrects = 0

    # Training pass: one optimizer step per instance (batch size 1).
    model.train()
    for instance, label_value in zip(X_train, y_train):
        ygold = torch.tensor(label_value, dtype=torch.float32, device=device).view(1, -1)
        cat_inputs = create_input_for_LSTM(instance, device=device)
        scores = model(cat_inputs)
        # Accuracy is measured with the weights as they were before this step.
        if acc(scores, ygold) == 1.0:
            train_corrects += 1
        optimizer.zero_grad()
        loss = criterion(scores, ygold)
        tot_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Evaluation pass: no gradients are needed, so skip building the graph.
    test_corrects = 0
    model.eval()
    with torch.no_grad():
        for test_instance, test_label in zip(X_test, y_test):
            testygold = torch.tensor(test_label, dtype=torch.float32, device=device).view(1, -1)
            test_cat_inputs = create_input_for_LSTM(test_instance, device=device)
            test_scores = model(test_cat_inputs)
            if acc(test_scores, testygold) == 1.0:
                test_corrects += 1

    if epoch % 5 == 0:
        print("Loss:", tot_loss/len(X_train), '---',
              "Train Acc:", train_corrects/len(X_train), '---',
              "Test Acc:", test_corrects/len(X_test))

    all_loss.append(tot_loss/len(X_train))
    all_train_acc.append(train_corrects/len(X_train))
    all_test_acc.append(test_corrects/len(X_test))

Loss: 0.695443042293191 --- Train Acc: 0.5133333333333333 --- Test Acc: 0.5
Loss: 0.6961376624554396 --- Train Acc: 0.515 --- Test Acc: 0.5300751879699248
Loss: 0.6919965840379397 --- Train Acc: 0.5041666666666667 --- Test Acc: 0.5263157894736842
Loss: 0.6819539993815124 --- Train Acc: 0.5341666666666667 --- Test Acc: 0.5112781954887218
Loss: 0.6739412267754475 --- Train Acc: 0.5758333333333333 --- Test Acc: 0.5263157894736842


In [None]:
# Mean training loss per epoch; the trailing semicolon hides the text repr.
plt.plot(all_loss)
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Mean BCE-with-logits loss");

In [None]:
# Training accuracy per epoch; the trailing semicolon hides the text repr.
plt.plot(all_train_acc)
plt.title("Training accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy");

In [None]:
# Test accuracy per epoch; the trailing semicolon hides the text repr.
plt.plot(all_test_acc)
plt.title("Test accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy");