In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import keras.callbacks
from keras.layers import Dense, Dropout, Activation, TimeDistributed, Input, concatenate
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
import pandas as pd
from itertools import groupby
import numpy.ma as ma
import catboost
from sklearn import metrics
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
event_types = ["bc613fb9d5", "bd18c260dd", "78a254eb1a", "60d7fad2cc", 
               "9f449c8a24", "44f0b93123", "c7863fbab6", "287a406e15", 
               "87d86c4ba1", "718ac49d0b", "1bea63552c", "4739c12685", 
               "09dff9a4e6", "e1218bb17f", "30fe294f41", "f3adcadc86", 
               "8fb049c69a", "a29c238412", "e5e18713a0", "bbfc7ae3f7", 
               "9c67e951dd", "5d5d31ecb1", "81b0435926", "8ccd550d04", 
               "416674c7cf", "96c40ef2e4", "221f9b90a3", "4234879f4b", 
               "444d9e80a6", "8b6000cce4", "d8c799feca"]

In [3]:
def process_row(train_part):
    train_part = json.loads(train_part)
    x_row = []
    y_row = []
    for data_point in train_part:
        x_data_point = [
            np.mean(data_point["lengths_history"]),
            data_point['user_state']["age"],
            data_point['user_state']['height'],
            data_point['user_state']['weight'],
            data_point['user_state']['period_estimate'],
            data_point['user_state']['luteal_estimate'],
            data_point['user_state']['cycle_estimate'],
            int(data_point['period_passed'])
        ]
        evts_map = {x['type'] : np.mean(x['value']) for x in data_point['events']}
        for evt in event_types:
            if evt in evts_map:
                x_data_point.append(np.mean(evts_map[evt]))
            else:
                x_data_point.append(0)
        x_data_point = [float('nan') if x is None else x for x in x_data_point]
        x_row.append(np.array(x_data_point))
        if 'label' in data_point:
            lbl = data_point['label']
            y_row.append(lbl)
    return np.array(x_row), np.array(y_row)

In [47]:
train_parts = []
y_train_parts = []
test_parts = []
i = 0
with open('/Users/vita/Downloads/flo/train_subsample.jsonl', 'r') as train_file, \
        open('/Users/vita/Downloads/flo/test_subsample.jsonl', 'r') as test_file:
    for train_part, test_part in zip(tqdm(train_file), test_file):
        i = i + 1
#         if i > 25000:
#             break
            
        train_x_row, train_y_row = process_row(train_part)
        train_parts.append(train_x_row)
        y_train_parts.append(train_y_row)

#         test_x_row, test_y_row = process_row(test_part)
#         test_parts.append(process_row(test_part))
    y_train_parts = np.array(y_train_parts)
    train_parts = np.array(train_parts)
    test_parts = np.array(test_parts)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
129593it [10:04, 214.43it/s]


In [48]:
X_train, X_test, y_train, y_test = train_test_split(train_parts, y_train_parts, test_size=0.3)

In [67]:
# fill nan
def fill_nan(data):
    return np.array([np.where(np.isnan(x), ma.array(x, mask=np.isnan(x)).mean(axis=0), x) for x in data])

X_train = fill_nan(X_train)
X_test = fill_nan(X_test)

In [50]:
X_train = pad_sequences(X_train)
y_train = pad_sequences(y_train)
y_train = np.reshape(y_train, (y_train.shape[0], y_train.shape[1], 1))
X_train.shape, y_train.shape

((90715, 280, 39), (90715, 280, 1))

In [68]:
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])
y_test = pad_sequences(y_test, maxlen=X_train.shape[1])
y_test = np.reshape(y_test, (y_test.shape[0], y_test.shape[1], 1))
X_test.shape, y_test.shape

((38878, 280, 39), (38878, 280, 1))

In [51]:
scaler = MinMaxScaler(feature_range=(0, 1))
shp = X_train.shape
X_train_scaled = np.reshape(X_train, (-1, shp[2]))
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_train_scaled = np.reshape(X_train_scaled, shp)
X_train_scaled.shape



(90715, 280, 39)

In [69]:
shp = X_test.shape
X_test_scaled = np.reshape(X_test, (-1, shp[2]))
X_test_scaled = scaler.transform(X_test_scaled)
X_test_scaled = np.reshape(X_test_scaled, shp)
X_test_scaled.shape

(38878, 280, 39)

In [70]:
X_train_antrop = np.array(X_train_scaled[:,:,0:7])
X_train_evt = np.array(X_train_scaled[:,:,7:])

X_test_antrop = np.array(X_test_scaled[:,:,0:7])
X_test_evt = np.array(X_test_scaled[:,:,7:])

In [65]:
def create_model():
    inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))    
    x = LSTM(4, return_sequences=True)(inputs)
#     x = Dropout(.5)(x)
#     x = Dense(512)(x)
#     x = Dropout(.5)(x)
#     x = Dense(512)(x)
#     x = Dropout(.5)(x)
    outputs = TimeDistributed(Dense(1))(x)
    model = Model(inputs, outputs)
    model.compile('adam', 'mean_absolute_error')
    return model

In [53]:
def create_model_2():
    evts_input = Input(shape=(X_train_evt.shape[1], X_train_evt.shape[2]))    
    lstm_out = LSTM(32, return_sequences=True)(evts_input)
    lstm_out = Dropout(.5)(lstm_out)
    input_2 = Input(shape=(X_train_antrop.shape[1], X_train_antrop.shape[2]))    
    x = keras.layers.concatenate([lstm_out, input_2])
    x = Dense(64, activation='relu')(x)
    x = Dropout(.25)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(.25)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(.25)(x)
    outputs = TimeDistributed(Dense(1))(x)
    model = Model(inputs=[evts_input, input_2], outputs=outputs)
    model.compile('adam', 'mean_absolute_error')
    return model

In [59]:
# model = create_model()
# model.fit(X_train_scaled, y_train, batch_size=32, nb_epoch=10, validation_split=0.1, verbose=1)
model_2 = create_model_2()
model_2.fit([X_train_evt, X_train_antrop], y_train, batch_size=64, nb_epoch=20, validation_split=0.1, verbose=1)

  after removing the cwd from sys.path.


Train on 81643 samples, validate on 9072 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

KeyboardInterrupt: 

In [73]:
# pred = model.predict(X_train_scaled[1:2])
pred = model_2.predict([X_test_evt, X_test_antrop])
# list(zip(pred[0,:,0], y_train[0,:,0]))
# list(zip(pred, y_train[0:1]))

In [72]:
ind = 7
list(zip(pred[ind,:,0], y_test[ind,:,0]))

[(-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.3719934e-05, 0),
 (-6.37199

In [74]:
def metric(real, predicted):
    assert len(real) == len(predicted)
    
    users_mae_values = []
    for real_labels, predicted_labels in zip(real, predicted):
        assert len(real_labels) == len(predicted_labels)
        real_labels, predicted_labels = np.array(real_labels), np.array(predicted_labels)
        
        user_mae = np.mean(np.abs(real_labels - predicted_labels))
        users_mae_values.append(user_mae)
    
    return np.mean(users_mae_values)

In [87]:
y_test[:,:,0]

array([[ 0,  0,  0, ..., 27, 27, 27],
       [ 0,  0,  0, ..., 25, 25, 25],
       [ 0,  0,  0, ..., 25, 25, 25],
       ..., 
       [ 0,  0,  0, ..., 31, 31, 31],
       [ 0,  0,  0, ..., 29, 29, 29],
       [ 0,  0,  0, ..., 25, 25, 25]], dtype=int32)

In [93]:
real = []
predicted = []
for pair in zip(pred[:,:,0], y_test[:,:,0]):
    ind = next((i for i, v in enumerate(pair[0]) if v > 0), -1)
    predicted.append(pair[0][ind:])
    real.append(pair[1][ind:])

In [100]:
i=4
real[i], predicted[i]

(array([49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
        49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
        49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 29,
        29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
        29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
        36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
        30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
        31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
        31, 31, 31, 31, 31, 31, 31, 31, 34, 34, 34, 34, 34, 34, 34, 34, 34,
        34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
        34, 34, 34, 34, 34, 34, 34, 34], dtype=int32),
 array([ 38.7779274 ,  38.7114219

In [95]:
print("RNN = {}".format(metric(real, predicted)))

RNN = 2.9013877300328494
