In [48]:
import pandas as pd
import numpy as np
import pandas_profiling
import torch
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def show_full_data(data, row_size=None, column_size=None, col_width=-1):
    """
    Displays `data` with the pandas display limits temporarily overridden,
    so large frames are not truncated.

    The three limits (max rows, max columns, max column width) are applied
    only for the duration of the call; the previous option values are
    restored by the context manager afterwards.
    """
    display_overrides = (
        'display.max_rows', row_size,
        'display.max_columns', column_size,
        'display.max_colwidth', col_width,
    )
    with pd.option_context(*display_overrides):
        display(data)

# Read Data

In [3]:
# Load the combined samples; the path is relative to the notebook's working directory.
df = pd.read_csv('combined_samples/combined_data.csv')

In [4]:
# Drop the first unnecessary *column* (positional column 0); all rows are kept.
# NOTE(review): presumably this is the index column written out with the CSV — confirm.
# NOTE: the cell reassigns `df`, so re-running it drops one more column each time.
df = df.iloc[:, 1:]

In [5]:
df.head()

Unnamed: 0,timestamp,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,...,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
0,2013-03-27 04:00:00,0.0,,,,,,,,-77.0,...,0.0,599.0,0.0,0.0,0.0,274.0,0.0,185.0,140.0,
1,2013-03-27 04:10:00,0.0,,,,,,,,,...,0.0,600.0,0.0,0.0,0.0,557.0,1.0,42.0,0.0,
2,2013-03-27 04:20:00,0.0,,,,,,,,-71.5,...,0.0,600.0,0.0,0.0,0.0,457.0,0.0,143.0,0.0,
3,2013-03-27 04:30:00,0.0,,,,,,,,,...,208.0,600.0,0.0,0.0,0.0,564.0,1.0,35.0,0.0,
4,2013-03-27 04:40:00,200.0,,,,,,,,-62.0,...,600.0,600.0,0.0,0.0,0.0,127.0,94.0,379.0,0.0,


# Create DataFrame with Same-Length Instances

In [6]:
def create_same_length_instances(df, length=72, label='STRESSED'):
    """
    Builds one fixed-length window of rows for every labelled row.

    For each row whose `label` value is not null, the window consists of that
    row and the rows directly before it, going back at most `length` rows and
    never reaching past the previous labelled row. Windows that would be
    shorter than `length` are left-padded with all-NaN rows, so every window
    occupies exactly `length` consecutive rows of the result and the labelled
    row is always the last row of its window.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows in chronological order. Must contain a 'timestamp' column
        (dropped from the result) and the `label` column.
    length : int
        Number of rows per window.
    label : str
        Name of the column whose non-null entries mark the end of a window.

    Returns
    -------
    pandas.DataFrame
        All windows stacked vertically with a fresh 0..n-1 index. Rows that
        follow the last labelled row are not included. An empty frame (with
        the feature columns) is returned when no row is labelled.
    """
    # Work on positions, not index labels, so a non-default index cannot
    # desynchronise the label lookup and the row selection.
    frame = df.drop(columns=['timestamp']).reset_index(drop=True)
    # .tolist() yields plain Python ints (list * numpy-int would broadcast).
    label_positions = np.flatnonzero(frame[label].notnull().values).tolist()

    PAD = -1  # not a valid position in the RangeIndex -> reindex emits a NaN row
    row_positions = []
    # Position of the previous labelled row; -1 means "none yet", so the very
    # first window may start at row 0 (a start value of 0 would skip row 0 and
    # lose the label entirely when row 0 itself is labelled).
    previous = -1
    for position in label_positions:
        available = min(position - previous, length)
        row_positions.extend([PAD] * (length - available))
        row_positions.extend(range(position - available + 1, position + 1))
        previous = position

    # One reindex instead of repeated DataFrame.append calls: linear instead of
    # quadratic, and independent of the append API removed in pandas 2.0.
    return frame.reindex(row_positions).reset_index(drop=True)

In [7]:
# Build one 72-row window per labelled row (short histories are padded with NaN rows).
df_same = create_same_length_instances(df, length=72)

In [8]:
# def create_numpy_array(df, label='STRESSED'):
#     indexes = list(df[df[label].notnull()].index)
#     labels = df.loc[indexes, 'STRESSED'].values
#     data = df.drop(columns=['STRESSED'])
#     all_data = np.empty((np.empty((indexes[1] - indexes[0], data.shape[1])), len(indexes)))
#     start = 0
#     j = 0
#     for i in indexes:
#         instance = np.array(data.iloc[start:i+1, :].values, labels[j])
#         if j == 0:
#             all_data[j] = instance
#         else:
#             all_data.append(instance, axis=0)
#         start = i+1
#         j += 1
#     return all_data, labels

In [9]:
# create_numpy_array(uza)

In [10]:
df_same[df_same.STRESSED.notnull()].shape

(2347, 26)

In [11]:
df_same.STRESSED.value_counts()

1.0    1614
0.0     733
Name: STRESSED, dtype: int64

In [12]:
df_same[df_same.STRESSED.notnull()].head()

Unnamed: 0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,...,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
71,0.0,,,,,,,,-61.0,,...,0.0,600.0,0.0,0.0,0.0,595.0,2.0,3.0,0.0,1.0
143,0.0,,,,,,,,,,...,0.0,600.0,0.0,0.0,0.0,356.0,2.0,242.0,0.0,1.0
215,0.0,-89.0,5.618846,7.0,3.0,3.0,1.0,0.0,-84.0,8.447316,...,600.0,598.0,2.0,0.0,0.0,600.0,0.0,0.0,0.0,1.0
287,569.0,,,,,,,,,,...,600.0,600.0,0.0,0.0,0.0,49.0,510.0,41.0,0.0,1.0
359,600.0,,,,,,,,-76.0,20.126268,...,0.0,600.0,0.0,0.0,0.0,40.0,536.0,24.0,0.0,0.0


# Normalize Data

In [13]:
def normalize(df, label_col='STRESSED'):
    """
    Mean-centres every feature column and scales it by its value range.

    Each column other than `label_col` is transformed to
    (value - column mean) / (column max - column min). Missing values stay
    missing. Constant columns (range 0) become 0 instead of dividing by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column.
    label_col : str
        Column that is copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index; `df` is not modified.
    """
    # Use the argument, not a notebook-level global, so the function works on
    # whatever frame it is given.
    features = df.drop(columns=label_col)
    centered = features - features.mean(axis=0)
    value_range = features.max(axis=0) - features.min(axis=0)
    # A zero range would produce 0/0 = NaN for every value of that column.
    value_range = value_range.replace(0, 1)

    normalized = df.copy()
    normalized[features.columns] = centered / value_range
    return normalized

In [14]:
# Scale the feature columns; the STRESSED label column is passed through unchanged.
df_norm = normalize(df_same)

In [15]:
df_norm.tail()

Unnamed: 0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,...,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
168979,-0.242789,0.599175,,-0.089087,-0.059428,-0.074681,-0.036086,0.00032,,,...,-0.479851,-0.602776,0.01124,0.090643,0.500928,-0.644548,-0.146252,0.790838,,
168980,-0.242789,,,,,,,,,,...,-0.479851,-0.591109,0.147907,0.085643,0.357595,-0.644548,-0.146252,0.790838,,
168981,-0.037789,,,,,,,,,,...,-0.479851,-0.301109,0.149573,0.09231,0.059262,-0.644548,-0.141252,0.785838,,
168982,-0.242789,,,,,,,,,,...,-0.479851,-0.549442,0.35624,0.138977,0.054262,-0.644548,-0.147918,0.792504,,
168983,-0.242789,0.599175,-0.190635,-0.064087,-0.059428,-0.074681,-0.036086,0.026636,,,...,-0.479851,0.008891,-0.01376,-0.014357,0.019262,-0.644548,-0.144585,0.789171,,0.0


# Fill NaN Values

In [16]:
def fill_nulls(df, label_col='STRESSED', fill_value=0):
    """
    Replaces missing values in every column except the label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column.
    label_col : str
        Column whose missing values are kept (they mark unlabelled rows).
    fill_value : scalar
        Value written in place of each missing feature value.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` is not modified, so frames displayed by earlier
        cells stay valid and re-running the cell is safe.
    """
    filled = df.copy()
    feature_cols = [col for col in filled.columns if col != label_col]
    filled[feature_cols] = filled[feature_cols].fillna(fill_value)
    return filled

In [17]:
# Replace the remaining missing feature values with 0; label NaNs are kept.
df_filled = fill_nulls(df_norm)

In [18]:
df_filled.head()

Unnamed: 0,conversation,bt_level_avg,bt_level_std,bt_total_devices_around,bt_total_far,bt_total_farther,bt_total_near,bt_total_nearer,wifi_level_avg,wifi_level_std,...,phone_locked,activity_inference_0,activity_inference_1,activity_inference_2,activity_inference_3,audio_inference_0,audio_inference_1,audio_inference_2,audio_inference_3,STRESSED
0,0.757211,-0.059292,0.012913,0.135913,0.065572,-0.012181,0.101845,0.026636,0.050303,-0.045424,...,-0.479851,0.103891,-0.060427,-0.014357,-0.029072,-0.237882,0.385415,-0.147496,-3.6e-05,
1,0.745545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.479851,0.103891,-0.060427,-0.014357,-0.029072,-0.081215,0.243748,-0.162496,-3.6e-05,
2,0.435545,-0.059324,0.057321,0.110913,0.065572,0.050319,0.067362,0.00032,0.025127,-0.041241,...,-0.479851,0.103891,-0.060427,-0.014357,-0.029072,-0.587882,0.557082,0.030838,-3.6e-05,
3,0.757211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.091883,-0.069595,...,-0.479851,-0.371109,0.414573,-0.014357,-0.029072,-0.599548,0.667082,-0.067496,-3.6e-05,
4,0.147211,-0.05945,-0.1085,-0.014087,0.065572,-0.074681,-0.001603,-0.025995,-0.087937,-0.081701,...,-0.479851,-0.896109,0.939573,-0.014357,-0.029072,-0.236215,0.273748,-0.037496,-3.6e-05,


# Create Instances

In [19]:
def create_instances(df, length=72, label='STRESSED'):
    """
    Turns every labelled row into a list of per-time-step feature tensors.

    For each row whose `label` value is not null, the `length` rows ending at
    that row (inclusive) are taken, the label column is dropped, and each row
    becomes one 1-D tensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of equal-length windows whose last row carries the label.
    length : int
        Number of time steps per instance.
    label : str
        Name of the label column.

    Returns
    -------
    list[list[torch.Tensor]]
        One inner list of `length` tensors per labelled row, in row order.

    Raises
    ------
    ValueError
        If a labelled row has fewer than `length - 1` rows before it. A
        negative start position would otherwise silently wrap around and pull
        rows from the end of the frame.
    """
    # Positional lookups throughout: index labels and iloc positions only
    # coincide for a default RangeIndex.
    label_positions = np.flatnonzero(df[label].notnull().values).tolist()
    # Convert once to a writable, C-ordered array instead of one iloc call per row.
    features = np.array(df.drop(columns=[label]).values, order='C')

    all_data = []
    for end in label_positions:
        start = end - length + 1
        if start < 0:
            raise ValueError(
                'labelled row at position {} has only {} preceding rows; '
                '{} are required for length={}'.format(end, end, length - 1, length)
            )
        all_data.append([torch.from_numpy(features[row]) for row in range(start, end + 1)])
    return all_data

In [20]:
# One instance per labelled row: a list of per-time-step feature tensors.
data = create_instances(df_filled)

In [21]:
len(data)

2347

In [22]:
len(data[2])

72

In [23]:
# Labels of the labelled rows, in row order (the same order in which `data` was built).
y = df_filled.loc[df_filled.STRESSED.notnull(), 'STRESSED'].values.tolist()

In [24]:
len(y)

2347

In [120]:
def create_random_sets(x, y, data_size=1000, seed=1):
    """
    Draws a class-balanced random training set and returns the rest as test set.

    `data_size // 2` samples are drawn from each class (label == 1 and
    label == 0), the selection is shuffled, and every sample that was not
    selected goes to the test set in its original order. For an odd
    `data_size` the training set therefore holds `data_size - 1` samples.

    Parameters
    ----------
    x : sequence
        Instances, aligned with `y`.
    y : sequence
        Binary labels (1 / 0).
    data_size : int
        Requested training-set size.
    seed : int
        Seed passed to `random.seed` (this reseeds the module-level generator).

    Returns
    -------
    tuple
        (new_x, new_y, test_x, test_y)

    Raises
    ------
    ValueError
        If either class has fewer than `data_size // 2` samples.
    """
    random.seed(seed)
    one_class_size = int(data_size / 2)
    one_indexes = [idx for idx, target in enumerate(y) if target == 1]
    zero_indexes = [idx for idx, target in enumerate(y) if target == 0]
    for class_name, pool in (('1', one_indexes), ('0', zero_indexes)):
        if len(pool) < one_class_size:
            raise ValueError(
                'class {} has {} samples, {} requested'.format(class_name, len(pool), one_class_size)
            )
    ones = random.sample(one_indexes, one_class_size)
    zeros = random.sample(zero_indexes, one_class_size)
    selected = ones + zeros
    # Shuffle by sampling the whole selection; asking for `data_size` items
    # here fails for odd sizes because only 2 * (data_size // 2) were drawn.
    train_indexes = random.sample(selected, len(selected))
    # Set membership keeps the test-set filter linear in len(y).
    train_lookup = set(train_indexes)
    new_x = [x[idx] for idx in train_indexes]
    new_y = [y[idx] for idx in train_indexes]
    test_x = [x[idx] for idx in range(len(y)) if idx not in train_lookup]
    test_y = [y[idx] for idx in range(len(y)) if idx not in train_lookup]
    return new_x, new_y, test_x, test_y

In [129]:
# Balanced training split with the function defaults (data_size=1000, seed=1); the rest is the test split.
# NOTE(review): the training cell in this notebook iterates over `data`, not `X_train` — confirm which is intended.
X_train, y_train, X_test, y_test = create_random_sets(data, y)

In [39]:
# First instance, kept as a sample input for the cells below.
i0 = data[0]; # each item in i0 is the input for one time step
def create_input_for_LSTM(instance, device="cuda:0"):
    """
    Packs one instance into a float32 tensor of shape (len(instance), 1, -1).

    Parameters
    ----------
    instance : list[torch.Tensor]
        One tensor per time step.
    device : str or torch.device
        Device the result is moved to.

    Returns
    -------
    torch.Tensor
        The concatenated time steps with a singleton second (batch) dimension.
    """
    # Size the first dimension from the instance itself rather than from a
    # notebook-level sample, which other cells are free to rebind.
    stacked = torch.cat(instance).view(len(instance), 1, -1)
    return stacked.to(device=device, dtype=torch.float32)

In [26]:
# Scratch check: run the sample instance i0 through a standalone LSTM on the GPU.
hidden_dim = 128
lstm1 = torch.nn.LSTM(25, hidden_dim)  # 25 is the LSTM input size per time step
# Concatenate the per-step tensors and reshape to (len(i0), 1, -1): batch size 1.
cat_inputs = torch.cat(i0).view(len(i0), 1, -1).to("cuda:0")
cat_inputs = cat_inputs.type(torch.float32)  # cast the inputs to float32
lstm1.to("cuda:0");
_, state = lstm1(cat_inputs) # second return value is the final (hidden, cell) state pair
hidden, _ = state;

In [27]:
# Scratch check: map the final hidden state to one logit and compute the loss against the first label.
fc1 = torch.nn.Linear(128, 1).to("cuda:0")
output= fc1(hidden).view(1, -1)
criterion = torch.nn.BCEWithLogitsLoss();
output.shape  # not displayed: only the last expression of a cell is shown
loss = criterion(output, torch.tensor(y[0], dtype=torch.float32, device="cuda:0").view(1,-1))

In [28]:
import torch.nn as nn

In [33]:
class LstmTester(nn.Module):
    """
    LSTM encoder followed by a two-layer fully-connected head.

    The final hidden state of the LSTM is passed through
    Linear(hidden_dim, 64) -> ReLU -> Linear(64, output_dim); the result is
    returned as raw (unnormalized) scores.
    """

    def __init__(self, input_dim, hidden_dim, output_dim= 1):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        # Head: affine layer, non-linearity, affine layer.
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.ReLU()
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        # Insert a singleton batch dimension: (steps, 1, features).
        sequence = x.view(len(x), 1, -1)
        _, (final_hidden, _) = self.lstm(sequence)
        # Flatten to a single row; valid because the batch size is 1.
        flat_hidden = final_hidden.view(1, -1)
        activated = self.fc2(self.fc1(flat_hidden))
        return self.fc3(activated)

In [34]:
# Initialize the model, the optimizer and the loss, then train for 30 epochs
# with one optimizer step per instance (batch size 1).
# NOTE(review): this trains on every instance in `data`; the X_train / X_test
# split produced by create_random_sets is not used here — confirm intent.
device = "cuda:0"
m = LstmTester(25,128).to(device)
optimizer = torch.optim.Adam(m.parameters())
# Defined here so the cell does not depend on a scratch cell having been run.
criterion = torch.nn.BCEWithLogitsLoss()
# `epoch` and `sample` are separate names so the inner loop neither reuses the
# epoch counter nor rebinds the notebook-level `i0` sample.
for epoch in range(30):
    tot_loss = 0
    for sample, label in zip(data, y):
        ygold = torch.tensor(label, dtype=torch.float32, device=device).view(1,-1)
        cat_inputs = torch.cat(sample).view(len(sample), 1, -1).to(device)
        cat_inputs = cat_inputs.type(torch.float32)
        optimizer.zero_grad()
        loss = criterion(m(cat_inputs), ygold)
        tot_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean per-sample loss of the epoch.
    print(tot_loss/len(data))

0.6328675698692046
0.6202095730391318
0.6188138847369867
0.6163737343367892
0.6182747476745729
0.6140724856074714
0.6136097076234281
0.6194913712207332
0.619909731048216
0.618826018374145
0.6179172423198774
0.612250177287324
0.6128187730873573
0.6119350554888991
0.6074020916843667
0.6062063742335767
0.602862573180762
0.6020459184077067
0.5975325011220075
0.593733799256524
0.6067163227489087
0.5909544044413533
0.5900878476522209
0.5810883875783397
0.6017944140772049
0.5777996314935722
0.5736657772408497
0.5571681032483352


KeyboardInterrupt: 

m

In [35]:
# logits = model(input)
def acc(logits, ygolds):
    """
    Fraction of predictions that match the gold labels.

    A logit counts as a positive prediction when its sigmoid is at least 0.5.
    The number of matches is divided by the size of the first dimension of
    `logits`. No gradients are tracked.
    """
    with torch.no_grad():
        predicted = torch.sigmoid(logits) >= 0.5
        matches = ygolds.byte() == predicted
        hit_count = matches.sum().item()
    return hit_count / logits.shape[0]

In [133]:
# Count the instances that are both predicted correctly and labelled 1.
# NOTE(review): because of the `y[i] == 1` condition, `corrects` counts only
# correctly classified positive samples, not all correct predictions — confirm.
# NOTE(review): this loops over `data`, the same list the training cell
# iterates over, so the count is not a held-out estimate.
corrects = 0
for i in range(len(data)):
    my_input = create_input_for_LSTM(data[i])
    # With a single prediction per call, acc() yields 1.0 on a match and 0.0 otherwise.
    t = acc(m(my_input),torch.tensor(y[i], dtype=torch.float32, device=device).view(1,-1))
    if t == 1.0 and y[i] == 1:
        corrects += 1
print(corrects)

1552


In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [30]:
# accuracy_score(y_true, y_pred)

In [31]:
# Seed PyTorch's CPU and CUDA random number generators and request deterministic cuDNN kernels.
# NOTE(review): in file order this cell comes after the cell that builds and
# trains `m`; it has to run before model construction to affect weight initialisation.
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
class YasinTagger(nn.Module):
    """
    Single-layer LSTM followed by one linear layer producing raw scores.

    Parameters
    ----------
    input_dim : int
        Number of features per time step.
    hidden_dim : int
        Size of the LSTM hidden state.
    output_dim : int
        Number of scores produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # nn.Module must be initialised before sub-modules can be registered.
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """
        Scores a batch of sequences.

        `x` is passed to the LSTM as-is, i.e. in the default
        (seq_len, batch, input_dim) layout. Returns a tensor of shape
        (batch, output_dim) with unnormalized scores.
        """
        # The LSTM returns (outputs, (h_n, c_n)); only h_n feeds the classifier.
        _, (final_hidden, _) = self.lstm(x)
        # h_n is (num_layers, batch, hidden_dim); take the last layer.
        unnormalized_scores = self.fc1(final_hidden[-1])
        return unnormalized_scores

In [None]:
def main():
    """
    Scaffold for a single optimisation step and a training loop with YasinTagger.

    NOTE(review): not runnable as written. `inputs` and `your_true_labels` are
    never assigned inside this function (the only `inputs` assignment is
    commented out), and `criterio` is presumably a typo for `criterion`.
    """
    model = YasinTagger(26, 128, 2)
    # create your fake inputs
    T = 72 # time sequence (length of your input)
    B = 1 # minibatching (batchsize)
    #inputs = [torch.randn(1, 3) for _ in range(T)]
    # Concatenate the per-step tensors and reshape to (len(inputs), 1, -1).
    cat_input = torch.cat(inputs).view(len(inputs),1, -1)
    scores =  model(cat_input)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())
    optimizer.zero_grad()
    # FIXME: `criterio` is not defined anywhere in this function.
    loss = criterio(scores, your_true_labels)
    loss.backward()
    optimizer.step()
    # TODO:
    # 1 make sure about dimensions
    # 2 implement accuracy (you can convert to numpy arrays and use scikit-learn)

    # your train loop here
    # NOTE(review): `sample` is not used in the loop body (it re-uses `inputs`),
    # a new Adam optimizer is built on every iteration, and neither
    # loss.backward() nor optimizer.step() is called, so no weights are updated.
    epochs = 5
    for i in range(epochs):
        for sample in data:
            cat_input = torch.cat(inputs).view(len(inputs),1, -1)
            scores = model(cat_input)
            criterion = torch.nn.BCEWithLogitsLoss()
            optimizer = torch.optim.Adam(model.parameters())
            optimizer.zero_grad()
            loss = criterion(scores, your_true_labels)

In [None]:
# bt_level_avg from `df` as a NumPy array, with missing values replaced by 0.
new_arr = df.bt_level_avg.fillna(0).values

In [None]:
# Line plot of the zero-filled bt_level_avg values against their position in the array.
plt.figure()
plt.plot(new_arr)
plt.show()