In [487]:
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import keras.callbacks
from keras.layers import Dense, Dropout, Activation, TimeDistributed
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import pandas as pd
from itertools import groupby
import numpy.ma as ma
import catboost
from sklearn import metrics

In [614]:
train_parts = []
test_parts = []
i = 0
with open('/Users/vita/Downloads/flo/train_subsample.jsonl', 'r') as train_file, \
        open('/Users/vita/Downloads/flo/test_subsample.jsonl', 'r') as test_file:
    for train_part, test_part in zip(tqdm(train_file), test_file):
        i = i + 1
        if i > 100:
            break
        train_part = json.loads(train_part)
        test_part = json.loads(test_part)
        train_parts.append(train_part)
        test_parts.append(test_part)

97it [00:00, 481.04it/s]


In [3]:
event_types = ["bc613fb9d5", "bd18c260dd", "78a254eb1a", "60d7fad2cc", 
               "9f449c8a24", "44f0b93123", "c7863fbab6", "287a406e15", 
               "87d86c4ba1", "718ac49d0b", "1bea63552c", "4739c12685", 
               "09dff9a4e6", "e1218bb17f", "30fe294f41", "f3adcadc86", 
               "8fb049c69a", "a29c238412", "e5e18713a0", "bbfc7ae3f7", 
               "9c67e951dd", "5d5d31ecb1", "81b0435926", "8ccd550d04", 
               "416674c7cf", "96c40ef2e4", "221f9b90a3", "4234879f4b", 
               "444d9e80a6", "8b6000cce4", "d8c799feca"]

In [399]:
def metric(real, predicted):
    assert len(real) == len(predicted)
    
    users_mae_values = []
    for real_labels, predicted_labels in zip(real, predicted):
        assert len(real_labels) == len(predicted_labels)
        real_labels, predicted_labels = np.array(real_labels), np.array(predicted_labels)
        
        user_mae = np.mean(np.abs(real_labels - predicted_labels))
        users_mae_values.append(user_mae)
    
    return np.mean(users_mae_values)

### baseline - среднее всех предыдущих циклов

In [458]:
def predict_mean(parts):
    real = []
    predicted = []
    for train_part in tqdm(parts):
        days = map(lambda x: (x['lengths_history'], x['label']), train_part)
        days = filter(lambda x: len(x[0]) > 0, days)
        days = map(lambda x: (np.mean(x[0]), x[1]), days)
        days = np.array(list(days)).T
        if days.shape[0] > 0:
            real.append(days[1])
            predicted.append(days[0])
    return real, predicted
real, predicted = predict_mean(train_parts)

100%|██████████| 1000/1000 [00:01<00:00, 817.05it/s]


In [459]:
print("baseline = {}".format(metric(real, predicted)))

baseline = 4.540725539546519


### dataframe

In [615]:
data_train, data_test = train_test_split(train_parts, test_size=0.3)

In [555]:
def prep_dataset(parts):
    X_all = []
    y_all = []
    cycle_dp_lens = []
    for train_part in tqdm(parts):
        cycle_dp_len = []
        for key, group in groupby(train_part, lambda x: x['cycle_num']):
            x_cycle = []
            ccl_len = 0
            for data_point in group:
                ccl_len = ccl_len + 1
                x_data_point = [
                    np.mean(data_point["lengths_history"]),
                    data_point['user_state']["age"],
                    data_point['user_state']['height'],
                    data_point['user_state']['weight'],
                    data_point['user_state']['period_estimate'],
                    data_point['user_state']['luteal_estimate'],
                    data_point['user_state']['cycle_estimate']
#                     int(data_point['period_passed'])
                ]
                evts_map = {x['type'] : np.mean(x['value']) for x in data_point['events']}
                for evt in event_types:
                    if evt in evts_map:
                        x_data_point.append(np.mean(evts_map[evt]))
                    else:
                        x_data_point.append(0)
                x_data_point = [float('nan') if x is None else x for x in x_data_point]
                label = data_point['label']
                x_cycle.append(np.array(x_data_point))
            X_all.append(np.array(x_cycle))
            y_all.append(label)
            cycle_dp_len.append(ccl_len)
        cycle_dp_lens.append(cycle_dp_len)
    X_all = np.array(X_all)
    y_all = np.array(y_all)
    return X_all, y_all, cycle_dp_lens
    
X_test, y_test, test_cycle_lens = prep_dataset(data_test)
X_train, y_train, train_cycle_lens = prep_dataset(data_train)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)

  1%|          | 26/3000 [00:00<00:11, 249.38it/s][A
  2%|▏         | 48/3000 [00:00<00:12, 231.23it/s][A
  2%|▏         | 70/3000 [00:00<00:12, 227.20it/s][A
  3%|▎         | 90/3000 [00:00<00:13, 220.10it/s][A
  4%|▍         | 117/3000 [00:00<00:12, 225.65it/s][A
  5%|▍         | 141/3000 [00:00<00:12, 227.21it/s][A
  5%|▌         | 162/3000 [00:00<00:13, 214.87it/s][A
  6%|▋         | 190/3000 [00:00<00:12, 220.07it/s][A
  7%|▋         | 214/3000 [00:00<00:12, 221.18it/s][A
  8%|▊         | 244/3000 [00:01<00:12, 223.35it/s][A
  9%|▉         | 267/3000 [00:01<00:12, 220.74it/s][A
 10%|▉         | 293/3000 [00:01<00:12, 223.56it/s][A
 11%|█         | 316/3000 [00:01<00:12, 222.59it/s][A
 11%|█▏        | 340/3000 [00:01<00:11, 222.19it/s][A
 12%|█▏        | 367/3000 [00:01<00:11, 225.07it/s][A
 13%|█▎        | 393/3000 [00:01<00:11, 225.90it/s][A
 14%|█▍        | 417/3000 [00:01<00:11, 224.41it/s][A
 15%|█▍  

In [636]:
train_parts[0]

[{'cycle_num': 0,
  'events': [],
  'global_index': 0,
  'label': 25.96,
  'lengths_history': [],
  'period_passed': False,
  'user_state': {'age': 36.22,
   'cycle_estimate': 28,
   'height': 170.0,
   'luteal_estimate': None,
   'period_estimate': 5,
   'weight': None}},
 {'cycle_num': 0,
  'events': [],
  'global_index': 1,
  'label': 25.96,
  'lengths_history': [],
  'period_passed': False,
  'user_state': {'age': 36.22,
   'cycle_estimate': 28,
   'height': 170.0,
   'luteal_estimate': None,
   'period_estimate': 5,
   'weight': None}},
 {'cycle_num': 0,
  'events': [],
  'global_index': 2,
  'label': 25.96,
  'lengths_history': [],
  'period_passed': False,
  'user_state': {'age': 36.22,
   'cycle_estimate': 28,
   'height': 170.0,
   'luteal_estimate': None,
   'period_estimate': 5,
   'weight': None}},
 {'cycle_num': 0,
  'events': [],
  'global_index': 3,
  'label': 25.96,
  'lengths_history': [],
  'period_passed': True,
  'user_state': {'age': 36.22,
   'cycle_estimate': 28,

In [631]:
def prep_dataframe(parts):
    X_all = []
    cycle_dp_lens = []
    for train_part in tqdm(parts):
        cycle_dp_len = []
        for key, group in groupby(train_part, lambda x: x['cycle_num']):
            x_cycle = {}
            ccl_len = 0
            group = list(group)
            
            data_point = group[0]
            x_cycle['age'] = data_point['user_state']["age"]
            x_cycle['height'] = data_point['user_state']["height"]
            x_cycle['weight'] = data_point['user_state']["weight"]
            x_cycle['period_estimate'] = data_point['user_state']["period_estimate"]
            x_cycle['luteal_estimate'] = data_point['user_state']["luteal_estimate"]
            x_cycle['cycle_estimate'] = data_point['user_state']["cycle_estimate"]
            x_cycle['luteal_estimate'] = data_point['user_state']["luteal_estimate"]
            x_cycle['label'] = data_point["label"]
            
            x_cycle['prev_len_mean'] = np.mean(data_point['lengths_history'])
            
            X_all.append(x_cycle)
            cycle_dp_len.append(ccl_len)
        cycle_dp_lens.append(cycle_dp_len)
    return pd.DataFrame(X_all), cycle_dp_lens
    
df_test, test_cycle_lens = prep_dataframe(data_test)
df_train, train_cycle_lens = prep_dataframe(data_train)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 30/30 [00:00<00:00, 6761.37it/s]
100%|██████████| 70/70 [00:00<00:00, 5882.85it/s]


In [633]:
df_train.head(100)

Unnamed: 0,age,cycle_estimate,height,label,luteal_estimate,period_estimate,prev_len_mean,weight
0,16.17,,160.02,30.00,,5.0,,
1,15.41,29.0,161.00,28.00,,5.0,,49.60
2,15.49,29.0,161.00,22.00,,5.0,28.000000,49.60
3,18.02,30.0,162.00,29.00,,5.0,,54.79
4,18.10,30.0,162.00,28.00,,5.0,29.000000,54.79
5,18.18,30.0,162.00,31.00,,5.0,28.500000,54.79
6,18.26,30.0,162.00,29.96,,5.0,29.333333,54.79
7,18.34,30.0,162.00,28.00,,5.0,29.490000,54.79
8,18.42,30.0,162.00,32.00,,5.0,29.192000,54.79
9,18.51,30.0,162.00,32.00,,5.0,29.660000,54.79


In [556]:
max_cycle_len = max([x.shape[0] for x in X_all])

# сделать все циклы одной длинны
def fill_cycle_to_max(ccl):
    if ccl.shape[0] == max_cycle_len:
        return ccl
    cycle_mean = [np.mean(x) for x in ccl.T]
    for i in range(max_cycle_len - ccl.shape[0]):
        ccl = np.vstack((cycle_mean, ccl))
    return ccl

X_train = np.array([fill_cycle_to_max(x) for x in X_train])
X_test = np.array([fill_cycle_to_max(x) for x in X_test])

60


In [557]:
# fill nan
def fill_nan(x):
    shp = x.shape
    cols = x.reshape(-1, x.shape[2])
    cols = np.where(np.isnan(cols), ma.array(cols, mask=np.isnan(cols)).mean(axis=0), cols)
    x = cols.reshape(shp)
    return x

X_train = fill_nan(X_train)
X_test = fill_nan(X_test)

### gb

In [576]:
scaler = MinMaxScaler(feature_range=(0, 1))

# X_train_1 = X_train.reshape(X_train.shape[0], -1)
X_train_1 = X_train[:,0,:]
X_train_1 = scaler.fit_transform(X_train_1)

# X_test_1 = X_test.reshape(X_test.shape[0], -1)
X_test_1 = X_test[:,0,:]
X_test_1 = scaler.transform(X_test_1)

cbr = catboost.CatBoostRegressor()
cbr.fit(X_train_1, y_train)

<catboost.core.CatBoostRegressor at 0x1fda1e748>

In [577]:
preds = cbr.predict(X_test_1)

In [578]:
preds[:10]

array([ 32.38697369,  30.43245206,  32.43529136,  30.3610592 ,
        33.76024973,  31.46038591,  34.40076977,  28.80294867,
        26.67866034,  26.67634087])

In [579]:
y_test[:10]

array([ 35.  ,  30.  ,  34.  ,  33.96,  36.  ,  33.  ,  37.  ,  23.  ,
        20.  ,  21.11])

In [580]:
real = []
predicted = []
ind = 0
for row in test_cycle_lens:
    if len(row) == 0:
        continue
    row_pred = []
    row_real = []
    for el in row:        
        row_pred = row_pred + [preds[ind]] * el
        row_real = row_real + [y_test[ind]] * el
    predicted.append(row_pred)
    real.append(row_real)
    ind = ind + 1

In [581]:
print("catboost = {}".format(metric(real, predicted)))

catboost = 2.427640868661121


In [582]:
imp = np.argsort(cbr.feature_importances_)[::-1]
imp

array([10,  0, 26, 12, 14, 16, 18,  6,  7, 29, 13, 24, 21, 20, 30, 19, 11,
       17, 15,  8, 28, 23, 22,  9, 27,  1, 25, 32, 31, 33,  3, 37, 36,  5,
        2, 35,  4, 34])

In [585]:
[(i, X_test_1[0][i], cbr.feature_importances_[i]) for i in imp]

[(10, 0.0, 14.958418176585939),
 (0, 0.32841945217869473, 9.812488407837233),
 (26, 0.028571428571428571, 7.156810984446003),
 (12, 0.028571428571428571, 6.39689035091215),
 (14, 0.0, 6.112480908913668),
 (16, 0.057142857142857141, 5.308356774617925),
 (18, 0.0, 4.734013080063212),
 (6, 0.16666666666666663, 4.255960613430969),
 (7, 0.0, 3.9494878258180273),
 (29, 0.0, 3.804458189301016),
 (13, 0.0, 3.5921153972526265),
 (24, 0.0, 3.3052103509063757),
 (21, 0.028571428571428571, 3.284505044195603),
 (20, 0.0, 3.142613208505591),
 (30, 0.0, 2.7756417405252183),
 (19, 0.0, 2.569039453743105),
 (11, 0.057142857142857141, 2.1141815843837906),
 (17, 0.0, 2.0922836888647116),
 (15, 0.0, 1.7737189536266031),
 (8, 0.028571428571428571, 1.6692034510847447),
 (28, 0.0, 1.5827593557814559),
 (23, 0.0, 1.3001768435082253),
 (22, 0.0, 1.1897079602916618),
 (9, 0.057142857142857141, 0.8145995892242759),
 (27, 0.0, 0.6465697515445686),
 (1, 0.29990779160903619, 0.2991207627168384),
 (25, 0.0, 0.201488

### first RNN

In [590]:
data_points_count = 60
features_count = X_train.shape[2]

In [610]:
model = Sequential()
model.add(LSTM(4, input_shape=(features_count, 1)))
# model.add(Dropout(0.2))
model.add(Dense(1, activation= 'linear'))
model.compile(loss='mean_squared_error', optimizer='adam')

In [613]:
counter = 0
for row in tqdm(zip(X_train, y_train)):
    x = row[0].reshape(60,38,1)
    y = row[1]
    print(x.shape, y)
    model.fit(x, y, epochs=1, batch_size=x.shape[0], verbose=0, shuffle=False)
    model.reset_states()
    if counter % 250 == 0:
        testPredict = model.predict(X_test)
        y_test_rnn = [y_test]
        testScore = math.sqrt(mean_squared_error(y_test_rnn[0], testPredict[:, 0]))
        print('After {} steps testScore = {}'.format(counter, testScore))

0it [00:00, ?it/s]

(60, 38, 1) 29.0





ValueError: Error when checking target: expected dense_6 to have 2 dimensions, but got array with shape ()

### gb 2