In [3]:
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import keras.callbacks
from keras.layers import Dense, Dropout, Activation, TimeDistributed, Input
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
import pandas as pd
from itertools import groupby
import numpy.ma as ma
import catboost
from sklearn import metrics
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

In [4]:
event_types = ["bc613fb9d5", "bd18c260dd", "78a254eb1a", "60d7fad2cc", 
               "9f449c8a24", "44f0b93123", "c7863fbab6", "287a406e15", 
               "87d86c4ba1", "718ac49d0b", "1bea63552c", "4739c12685", 
               "09dff9a4e6", "e1218bb17f", "30fe294f41", "f3adcadc86", 
               "8fb049c69a", "a29c238412", "e5e18713a0", "bbfc7ae3f7", 
               "9c67e951dd", "5d5d31ecb1", "81b0435926", "8ccd550d04", 
               "416674c7cf", "96c40ef2e4", "221f9b90a3", "4234879f4b", 
               "444d9e80a6", "8b6000cce4", "d8c799feca"]

In [5]:
def process_row(train_part):
    train_part = json.loads(train_part)
    x_row = []
    y_row = []
    for data_point in train_part:
        x_data_point = [
            np.mean(data_point["lengths_history"]),
            data_point['user_state']["age"],
            data_point['user_state']['height'],
            data_point['user_state']['weight'],
            data_point['user_state']['period_estimate'],
            data_point['user_state']['luteal_estimate'],
            data_point['user_state']['cycle_estimate'],
            int(data_point['period_passed'])
        ]
        evts_map = {x['type'] : np.mean(x['value']) for x in data_point['events']}
        for evt in event_types:
            if evt in evts_map:
                x_data_point.append(np.mean(evts_map[evt]))
            else:
                x_data_point.append(0)
        x_data_point = [float('nan') if x is None else x for x in x_data_point]
        x_row.append(np.array(x_data_point))
        if 'label' in data_point:
            lbl = data_point['label']
            y_row.append(lbl)
    return np.array(x_row), np.array(y_row)

In [None]:
train_parts = []
y_train_parts = []
test_parts = []
i = 0
with open('/Users/vita/Downloads/flo/train_subsample.jsonl', 'r') as train_file, \
        open('/Users/vita/Downloads/flo/test_subsample.jsonl', 'r') as test_file:
    for train_part, test_part in zip(tqdm(train_file), test_file):
        i = i + 1
#         if i > 20000:
#             break
            
        train_x_row, train_y_row = process_row(train_part)
        train_parts.append(train_x_row)
        y_train_parts.append(train_y_row)

#         test_x_row, test_y_row = process_row(test_part)
#         test_parts.append(process_row(test_part))
    y_train_parts = np.array(y_train_parts)
    train_parts = np.array(train_parts)
    test_parts = np.array(test_parts)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
86729it [09:04, 159.21it/s]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_parts, y_train_parts, test_size=0.3)

In [None]:
# fill nan
def fill_nan(data):
    return np.array([np.where(np.isnan(x), ma.array(x, mask=np.isnan(x)).mean(axis=0), x) for x in data])

X_train = fill_nan(X_train)
# X_test = fill_nan(X_test)

In [None]:
X_train = pad_sequences(X_train)
y_train = pad_sequences(y_train)

y_train = np.reshape(y_train, (y_train.shape[0], y_train.shape[1], 1))

X_train.shape, y_train.shape

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
shp = X_train.shape
X_train_scaled = np.reshape(X_train, (-1, shp[2]))
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_train_scaled = np.reshape(X_train_scaled, shp)
X_train_scaled.shape

In [None]:
def create_model():
    inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))    
    x = LSTM(16, return_sequences=True)(inputs)
    x = Dropout(.25)(x)
    outputs = TimeDistributed(Dense(1))(x)
    model = Model(inputs, outputs)
    model.compile('adam', 'mean_absolute_error')
    return model

In [None]:
model = create_model()
model.fit(X_train_scaled, y_train, batch_size=128, nb_epoch=100, validation_split=0.1, verbose=1)

In [50]:
pred = model.predict(X_train_scaled[0:1])
list(zip(pred[0,:,0], y_train[0,:,0]))
# list(zip(pred, y_train[0:1]))

[(0.41366661, 0),
 (0.26722085, 0),
 (0.15752387, 0),
 (0.079743207, 0),
 (0.029780686, 0),
 (0.003290236, 0),
 (-0.0054576993, 0),
 (-0.0037497282, 0),
 (0.0011374354, 0),
 (0.004301846, 0),
 (0.0043039322, 0),
 (0.002217114, 0),
 (-0.00017303228, 0),
 (-0.0016187429, 0),
 (-0.0017358661, 0),
 (-0.00076162815, 0),
 (0.00081145763, 0),
 (0.0025081038, 0),
 (0.0039942861, 0),
 (0.0050982237, 0),
 (0.0057778955, 0),
 (0.0060732961, 0),
 (0.0060620904, 0),
 (0.0058327317, 0),
 (0.0054649711, 0),
 (0.0050249696, 0),
 (0.0045601726, 0),
 (0.0041049123, 0),
 (0.0036794543, 0),
 (0.0032948256, 0),
 (0.0029571652, 0),
 (0.0026658773, 0),
 (0.0024186969, 0),
 (0.0022124052, 0),
 (0.0020418763, 0),
 (0.0019027591, 0),
 (0.0017901063, 0),
 (0.0016999841, 0),
 (0.0016283989, 0),
 (0.0015724301, 0),
 (0.0015295148, 0),
 (0.001496017, 0),
 (0.0014712214, 0),
 (0.0014528632, 0),
 (0.001439929, 0),
 (0.0014311671, 0),
 (0.0014253259, 0),
 (0.0014223456, 0),
 (0.0014208555, 0),
 (0.0014212132, 0),
 (0.