In [23]:
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import keras.callbacks
from keras.layers import Dense, Dropout, Activation, TimeDistributed, Input, concatenate
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
import pandas as pd
from itertools import groupby
import numpy.ma as ma
import catboost
from sklearn import metrics
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
import os

In [4]:
event_types = ["bc613fb9d5", "bd18c260dd", "78a254eb1a", "60d7fad2cc", 
               "9f449c8a24", "44f0b93123", "c7863fbab6", "287a406e15", 
               "87d86c4ba1", "718ac49d0b", "1bea63552c", "4739c12685", 
               "09dff9a4e6", "e1218bb17f", "30fe294f41", "f3adcadc86", 
               "8fb049c69a", "a29c238412", "e5e18713a0", "bbfc7ae3f7", 
               "9c67e951dd", "5d5d31ecb1", "81b0435926", "8ccd550d04", 
               "416674c7cf", "96c40ef2e4", "221f9b90a3", "4234879f4b", 
               "444d9e80a6", "8b6000cce4", "d8c799feca"]

In [5]:
def process_row(train_part):
    train_part = json.loads(train_part)
    x_row = []
    y_row = []
    for data_point in train_part:
        x_data_point = [
            np.mean(data_point["lengths_history"]),
            data_point['user_state']["age"],
            data_point['user_state']['height'],
            data_point['user_state']['weight'],
            data_point['user_state']['period_estimate'],
            data_point['user_state']['luteal_estimate'],
            data_point['user_state']['cycle_estimate'],
            int(data_point['period_passed'])
        ]
        evts_map = {x['type'] : np.mean(x['value']) for x in data_point['events']}
        for evt in event_types:
            if evt in evts_map:
                x_data_point.append(np.mean(evts_map[evt]))
            else:
                x_data_point.append(0)
        x_data_point = [float('nan') if x is None else x for x in x_data_point]
        x_row.append(np.array(x_data_point))
        if 'label' in data_point:
            lbl = data_point['label']
            y_row.append(lbl)
    return np.array(x_row), np.array(y_row)

In [9]:
train_parts = []
y_train_parts = []
test_parts = []
i = 0
with open('/Users/vita/Downloads/flo/train_subsample.jsonl', 'r') as train_file, \
        open('/Users/vita/Downloads/flo/test_subsample.jsonl', 'r') as test_file:
    for train_part, test_part in zip(tqdm(train_file), test_file):
        i = i + 1
#         if i > 25000:
#             break
            
        train_x_row, train_y_row = process_row(train_part)
        train_parts.append(train_x_row)
        y_train_parts.append(train_y_row)

#         test_x_row, test_y_row = process_row(test_part)
#         test_parts.append(process_row(test_part))
    y_train_parts = np.array(y_train_parts)
    train_parts = np.array(train_parts)
    test_parts = np.array(test_parts)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)

16it [00:00, 155.36it/s][A
36it [00:00, 175.25it/s][A
56it [00:00, 183.18it/s][A
75it [00:00, 184.66it/s][A
45865it [04:00, 190.98it/s]
110it [00:00, 174.75it/s]Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

129593it [10:36, 203.66it/s]


In [10]:
X_train, X_test, y_train, y_test = train_test_split(train_parts, y_train_parts, test_size=0.3)

In [11]:
# fill nan
def fill_nan(data):
    return np.array([np.where(np.isnan(x), ma.array(x, mask=np.isnan(x)).mean(axis=0), x) for x in data])

X_train = fill_nan(X_train)
X_test = fill_nan(X_test)

In [12]:
X_train = pad_sequences(X_train)
y_train = pad_sequences(y_train)
y_train = np.reshape(y_train, (y_train.shape[0], y_train.shape[1], 1))
X_train.shape, y_train.shape

((90715, 280, 39), (90715, 280, 1))

In [13]:
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])
y_test = pad_sequences(y_test, maxlen=X_train.shape[1])
y_test = np.reshape(y_test, (y_test.shape[0], y_test.shape[1], 1))
X_test.shape, y_test.shape

((38878, 280, 39), (38878, 280, 1))

In [14]:
scaler = MinMaxScaler(feature_range=(0, 1))
shp = X_train.shape
X_train_scaled = np.reshape(X_train, (-1, shp[2]))
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_train_scaled = np.reshape(X_train_scaled, shp)
X_train_scaled.shape



(90715, 280, 39)

In [15]:
shp = X_test.shape
X_test_scaled = np.reshape(X_test, (-1, shp[2]))
X_test_scaled = scaler.transform(X_test_scaled)
X_test_scaled = np.reshape(X_test_scaled, shp)
X_test_scaled.shape

(38878, 280, 39)

In [16]:
X_train_antrop = np.array(X_train_scaled[:,:,0:7])
X_train_evt = np.array(X_train_scaled[:,:,7:])

X_test_antrop = np.array(X_test_scaled[:,:,0:7])
X_test_evt = np.array(X_test_scaled[:,:,7:])

In [17]:
def create_model():
    inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))    
    x = LSTM(4, return_sequences=True)(inputs)
    outputs = TimeDistributed(Dense(1))(x)
    model = Model(inputs, outputs)
    return model

In [18]:
def create_model_2():
    evts_input = Input(shape=(X_train_evt.shape[1], X_train_evt.shape[2]))    
    lstm_out = LSTM(32, return_sequences=True)(evts_input)
    lstm_out = Dropout(.5)(lstm_out)
    input_2 = Input(shape=(X_train_antrop.shape[1], X_train_antrop.shape[2]))    
    x = keras.layers.concatenate([lstm_out, input_2])
    x = Dense(64, activation='relu')(x)
    x = Dropout(.25)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(.25)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(.25)(x)
    outputs = TimeDistributed(Dense(1))(x)
    model = Model(inputs=[evts_input, input_2], outputs=outputs)
    return model

In [60]:
checkpoint_dir = "rnn_2_weights"
filepath = checkpoint_dir + "/weights-{epoch:02d}-{loss:.2f}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, 
                             save_best_only=False, mode='auto', save_weights_only=True)
if not os.path.exists(checkpoint_dir):
    os.mkdir(checkpoint_dir)

model_2 = create_model_2()
model_2.load_weights('rnn_2_weights/weights-07-1.75-1.32.hdf5')
model_2.compile('adam', 'mean_absolute_error')
model_2.fit([X_train_evt, X_train_antrop], y_train, batch_size=64, nb_epoch=20, 
            validation_split=0.1, verbose=1, callbacks=[checkpoint])

  if sys.path[0] == '':


Train on 81643 samples, validate on 9072 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
  256/81643 [..............................] - ETA: 787s - loss: 1.6489

KeyboardInterrupt: 

In [61]:
# pred = model.predict(X_train_scaled[1:2])
pred = model_2.predict([X_test_evt, X_test_antrop])
# list(zip(pred[0,:,0], y_train[0,:,0]))
# list(zip(pred, y_train[0:1]))

In [62]:
def metric(real, predicted):
    assert len(real) == len(predicted)
    
    users_mae_values = []
    for real_labels, predicted_labels in zip(real, predicted):
        assert len(real_labels) == len(predicted_labels)
        real_labels, predicted_labels = np.array(real_labels), np.array(predicted_labels)
        
        user_mae = np.mean(np.abs(real_labels - predicted_labels))
        users_mae_values.append(user_mae)
    
    return np.mean(users_mae_values)

In [63]:
real = []
predicted = []
for pair in zip(pred[:,:,0], y_test[:,:,0]):
    ind = next((i for i, v in enumerate(pair[1]) if v > 0), -1)
    predicted.append(pair[0][ind:])
    real.append(pair[1][ind:])

In [64]:
print("RNN = {}".format(metric(real, predicted)))

RNN = 2.9029015900173714


In [66]:
i=5
X_test[i, -1, 0:7], real[i], predicted[i]

(array([ 30,  33, 154,  55,   9,   0,   0], dtype=int32),
 array([25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
        25, 25, 25, 25, 25, 25, 25, 25, 25, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 47, 47, 47, 47, 47, 47, 47,
        47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
        47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
        47, 47, 47, 47, 47, 47, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
        27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27], dtype=int32),
 array([ 24.15056992,  24.23118019,  24.44118118,  24.35130882,
         24.42392731,  24.76114655,  24.70909882,  24.72706985,
         24.89683151,  25.2110424 ,  25.30344772,  25.25684357,
         25.11906624,  24.91732025,  24.81240845,  24.72669792,
         24.66095734,  24.6485939 ,  24.64139938,  24.77434921,
         24.62004662,  24.69812012,  24.75393677,  24.79447365,
         24.865

In [33]:
X_test[0,-1,:7]

array([ 31,  19, 165,   0,   5,   0,  28], dtype=int32)

In [None]:
# scaler.inverse_transform(X_train_antrop[0:10])