In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config Completer.use_jedi = False  # to make autocompletion faster

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import gc
%matplotlib


In [None]:
# Display the installed TensorFlow version.
tf.__version__

In [None]:
"""
# load kaggle environment if in google colab
from google.colab import files
files.upload() #upload kaggle.json
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
"""

In [None]:
"""
!kaggle competitions download -c web-traffic-time-series-forecasting
!yes|unzip web-traffic-time-series-forecasting.zip
!mkdir logs
!mkdir saved_model
"""

## Make dataset

In [None]:
# Star import of the project helpers. Names such as get_model_inputs, SmapeLoss and
# SmapeMetric are not defined in this notebook — presumably they come from here
# (verify against webtraffic_utils).
from webtraffic_utils import *
# Number of time steps to forecast; used below as the output width of the models.
output_len = 62

In [None]:
# features like (access, spectral tones) created by feature_engineering.ipynb

from ast import literal_eval
# Load the per-page features; the "tones" column is parsed back from its Python-literal text form.
df_feat = pd.read_csv("features_computed.csv.zip",converters={"tones":literal_eval}).set_index("Page")
# Boolean flag per page: True when at least one tone lies within 1e-2 of 1/7
# (presumably a frequency in cycles per day, i.e. a weekly period — confirm the units).
weekly_tone = df_feat["tones"].apply(lambda x: (np.abs(np.array(x)-1./7.)<1e-2).any()).rename("week")

In [None]:
# Load the traffic table indexed by page; missing values become 0 and everything is cast to int32.
df_ds = pd.read_csv("train_2.csv.zip", header=0).set_index("Page").fillna(0).astype(np.int32)

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[RNN](#RNN)  

## Exploration

In [None]:
# remarkable pages

rem_pages = [
    'Acier_inoxydable_fr.wikipedia.org_desktop_all-agents',
]
#rem_pages = df_ds.loc[weekly_tone & (df_feat["access"] == "desktop_all-agents")].index



# Use the first listed page; its last sample is dropped before plotting.
page = rem_pages[0]
traffic_t =  df_ds.loc[page].values[:-1].astype(int)
# One row of three panels; `fax` iterates over the axes in order.
f,vax = plt.subplots(1,3, figsize=(20,4))
fax = vax.flat


# Panel 1: raw traffic series.
ax=next(fax)
ax.plot(traffic_t)
ax.set_title("time traffic")


# Panel 2: autocorrelation, with ticks at lags 0, 365 and 730.
ax=next(fax)
ax.plot(estimated_autocorrelation(traffic_t))
ax.set_xticks([0,365,2*365])
ax.set_title("autocorrelation")
ax.grid()

# Panel 3: spectral estimate drawn by the project helper plot_spectrest.
ax=next(fax)
plot_spectrest(traffic_t, ax)
ax.set_title("spectral estimation")




plt.suptitle(page, fontsize=15)
plt.show()

## Models

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[RNN](#RNN)  

### repeat last value

In [None]:
import tensorflow_probability as tfp

class Median(tf.keras.layers.Layer):
    """Keras layer that forecasts a constant: the median of the most recent samples.

    The forecast width is read from the notebook-level ``output_len``.
    """

    def __init__(self, median_depth=40):
        """Remember how many trailing time steps feed the median."""
        super().__init__()
        self.median_depth = median_depth

    def call(self, inputs):
        """Return, for each row, the lower median of the last ``median_depth``
        columns, repeated ``output_len`` times along axis 1."""
        recent_window = inputs[:, -self.median_depth:]
        row_median = tfp.stats.percentile(
            recent_window, 50.0, axis=1, interpolation='lower')
        as_column = tf.expand_dims(row_median, axis=1)
        return tf.tile(as_column, [1, output_len])

In [None]:
# Baseline model: the page input is accepted, but only the traffic input feeds the Median layer.
I_traffic = tf.keras.layers.Input(shape=(None,))
I_page = tf.keras.layers.Input(shape=(), dtype=object)
outputs = Median(40)(I_traffic)  
med = tf.keras.models.Model([I_page,I_traffic],outputs)

# Compiled with loss and metric only (no optimizer argument); the model is evaluated below, not fitted.
med.compile(loss=SmapeLoss(), metrics=[SmapeMetric()])

# Evaluate on every page of the dataset.
features, target = get_model_inputs(df_ds)
med.evaluate(features, target, batch_size=1000)

# Saved so it can be reloaded later from "saved_model/median".
med.save("saved_model/median", save_format='tf')
_=gc.collect()

In [None]:
# Same evaluation, restricted to the pages NOT flagged by `weekly_tone`.
features, target = get_model_inputs(df_ds.loc[~weekly_tone])
med.evaluate(features, target, batch_size=1000)


In [None]:
# Check the median model on the first remarkable page via the project helper plot_check_result.
plot_check_result(df_ds, rem_pages[0], [med])

### linear model

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 10 epochs, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index supplied by the callback.
    lr : float
        Current learning rate.

    Returns
    -------
    float
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.05)``.
        A plain Python float is returned (not a tensor), which is what
        ``LearningRateScheduler`` documents as the schedule's return type.
    """
    if epoch < 10:
        return float(lr)
    # np.exp keeps the computation outside the TF graph and yields a scalar
    # that converts cleanly to a Python float.
    return float(lr * np.exp(-0.05))
# Keras callback that applies `scheduler` at each epoch.
# NOTE(review): lr_cb is not passed to any `fit` call in this notebook.
lr_cb = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Reset Keras global state before building a new model.
tf.keras.backend.clear_session()
tb_cb = create_tb_cb("linear")
# Number of most recent traffic samples fed to the dense layer.
Ldelay=100

# NOTE(review): the traffic input is declared with a fixed length of 1000 — confirm this matches
# what get_model_inputs produces. The page input is declared but not connected to the output.
traffic = tf.keras.layers.Input(shape=(1000,))
page = tf.keras.layers.Input(shape=())

# A single dense layer maps the last Ldelay samples to the output_len forecast values.
outputs = tf.keras.layers.Dense(units=output_len,input_dim=Ldelay)(traffic[:,-Ldelay:])

model_linear = tf.keras.Model(inputs=[page, traffic], outputs=[outputs])

model_linear.summary()

model_linear.compile(loss=SmapeLoss(), optimizer=tf.optimizers.Adam(learning_rate=1e-4), 
                     metrics=[SmapeMetric()])

In [None]:
# Early stopping watches the validation SMAPE (the same `val_smape` key the RNN
# training cell monitors): validation data is passed to `fit`, so stopping on the
# training metric would ignore it. The best weights seen are restored on stop.
es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)

# Train only on pages flagged by `weekly_tone`; the validation set is built from the
# same pages with the last 50 columns removed.
features, target = get_model_inputs(df_ds.loc[weekly_tone])
feat_val, target_val = get_model_inputs(df_ds.iloc[:, :-50].loc[weekly_tone])
model_linear.fit(features, target, epochs=100, callbacks=[tb_cb, es_cb], batch_size=32, validation_data=(feat_val, target_val))
model_linear.save("saved_model/model_linear", save_format='tf')

In [None]:
# Compare the linear and median models on the first remarkable page.
plot_check_result(df_ds, rem_pages[0], [model_linear, med])

In [None]:
# Kernel of the dense layer (first element returned by get_weights()).
weights = model_linear.get_layer("dense").get_weights()[0]

# Plot the absolute value of the kernel's first column, i.e. the weights applied to each
# of the input samples for the first output unit.
f,ax = plt.subplots()
ax.plot(np.abs(weights[:,0]))
ax.grid()

### RNN

In [None]:
#!rm -Rf logs/*

In [None]:
# RNN settings, passed to get_rnn_model and get_model_inputs in the cells below.
Nneurons = 20
Nlayers = 1
MaxTs = 100
usePastYear = False
useMetadata = False
Seq2seq = True

In [None]:
# Fix the TensorFlow random seed before the model is created.
tf.random.set_seed(42)

# Run name encoding MaxTs, Nneurons and Nlayers; handed to create_tb_cb.
simn = 'Ts'+str(MaxTs)+'-Nn'+str(Nneurons)+'-Nl'+str(Nlayers)

tf.keras.backend.clear_session()
tb_cb = create_tb_cb(simn)

# The model is built by the project helper get_rnn_model from the settings above.
model_rnn = get_rnn_model(Seq2seq, Nneurons, Nlayers, MaxTs, usePastYear, useMetadata)

#model_rnn.summary()

In [None]:
# Stop on the validation SMAPE and restore the best weights seen.
es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)

# NOTE(review): weekly_pages is computed here but not used anywhere in this notebook.
weekly_pages = df_ds.loc[weekly_tone & (df_feat["access"] == "desktop_all-agents")].index

# Training inputs: pages flagged by `weekly_tone`. `Seq2seq*MaxTs` evaluates to MaxTs when
# Seq2seq is True and to 0 when it is False.
features, target = get_model_inputs(df_ds.loc[weekly_tone], 
                                    return_seq=Seq2seq*MaxTs)
# Validation inputs: same pages with the last 62 columns removed
# (62 equals output_len here — presumably intentional; confirm).
feat_val, target_val = get_model_inputs(df_ds.iloc[:,:-62].loc[weekly_tone], 
                                        return_seq=Seq2seq*MaxTs)

# NOTE(review): with epochs=1 the early-stopping patience of 5 can never trigger.
model_rnn.fit(features, target, epochs=1, callbacks=[tb_cb, es_cb] , batch_size=1, validation_data=(feat_val, target_val))
model_rnn.save("saved_model/model_rnn")
# Release the training arrays before moving on.
del feat_val, target_val, features, target
gc.collect()

### Mixed model

For each web page, select the better of the median and RNN models.

In [None]:
# Reload the three saved models; the custom loss, metric and layer classes are supplied
# through custom_objects.
med = tf.keras.models.load_model('saved_model/median', custom_objects={'SmapeMetric': SmapeMetric,'SmapeLoss': SmapeLoss})
model_linear = tf.keras.models.load_model('saved_model/model_linear', custom_objects={'SmapeMetric': SmapeMetric,'SmapeLoss': SmapeLoss})
model_rnn = tf.keras.models.load_model('saved_model/model_rnn', custom_objects={'SmapeMetric': SmapeMetric,'SmapeLoss': SmapeLoss, 
                                                                                'OneHotEncodingLayer':OneHotEncodingLayer})

In [None]:
# Build a seq2vec model with the same weights to save memory and CPU in predict
rnn_seq2vec = get_rnn_model(False, Nneurons=Nneurons, Nlayers=Nlayers, max_delay=MaxTs)
# Copy the trained weights layer by layer: "gru0" -> "gru0", and the seq2seq model's
# "td" layer -> "dense0" of the seq2vec model.
rnn_seq2vec.get_layer("gru0").set_weights(model_rnn.get_layer("gru0").get_weights())
rnn_seq2vec.get_layer("dense0").set_weights(model_rnn.get_layer("td").get_weights())
#rnn_seq2vec.summary()

In [None]:
def smape_row(df_train, model):
    """Per-page SMAPE (in percent) of ``model`` on the tail of ``df_train``.

    Only the last ``MaxTs + output_len`` columns of ``df_train`` are handed to
    ``get_model_inputs``. The returned Series is indexed like ``df_train`` and
    holds, for each row, the mean over axis 1 of
    ``100 * 2 * |truth - pred| / (|pred| + |truth| + eps)``.
    """
    tail = df_train.iloc[:, -MaxTs - output_len:]
    features, ytrue = get_model_inputs(tail)
    predicted = model.predict(features, batch_size=1000, verbose=1)

    # Machine epsilon keeps the ratio finite when both values are zero.
    tiny = np.finfo(float).eps
    denominator = np.abs(predicted) + np.abs(ytrue) + tiny
    pointwise = 100 * (2 * np.abs(ytrue - predicted) / denominator)
    per_row = np.mean(pointwise, axis=1)
    return pd.Series(per_row, index=df_train.index)

In [None]:
# Find the best choice between the median and RNN models for each page, over cv_sz folds.
# Each fold uses a window of MaxTs + output_len columns, shifted `decal` columns further
# into the past than the previous fold.
cv_sz = 3
decal = 50

# best_accu accumulates, per page, the fraction of folds in which the median model
# ("model_0") had the lowest SMAPE.
best_accu = pd.Series(0, index=df_ds.index)
for ii in range(cv_sz):
    last_samp = df_ds.shape[1]-1-ii*decal
    df_train = df_ds.iloc[:,last_samp-MaxTs-output_len:last_samp]
    smape_scores = []
    # A separate index name is used here so the fold index `ii` is not overwritten
    # (it is reported in the print below).
    for model_idx, model in enumerate([med, rnn_seq2vec]):
        smape_scores.append(smape_row(df_train, model).rename("model_"+str(model_idx)))
    # One column per model; built once and reused for both the report and the selection.
    scores_df = pd.concat(smape_scores, axis=1)
    print("fold : {}, mixed smape : {:.1f}".format(ii, scores_df.min(axis=1).mean()))
    best_accu = best_accu + (scores_df.idxmin(axis=1) == "model_0").astype(float)/cv_sz
    
_ = gc.collect()


In [None]:
# Per-page selector: 1 when the median model won at least one fold (best_accu > 0), else 0.
med_select = (best_accu>0).rename("feat_median_sel")*1
med_select.to_csv("median_select.csv")

In [None]:
tf.keras.backend.clear_session()
tb_cb = create_tb_cb("mixed")

# Inputs: the last MaxTs traffic samples, the page identifier, and a per-page selector
# (1 -> use the median forecast, 0 -> use the RNN forecast).
traffic = tf.keras.layers.Input(shape=(MaxTs,), dtype=tf.int32)
page = tf.keras.layers.Input(shape=(), dtype=object)
algo_select = tf.keras.layers.Input(shape=(), dtype=tf.int32)


# Both sub-models are run on the same inputs.
o_rnn = rnn_seq2vec([page, traffic])
o_med = med([page, traffic])

# Blend with complementary 0/1 masks tiled across the forecast width. The width is
# output_len, the same value the sub-models use for their outputs.
outputs =  o_rnn * tf.tile(tf.expand_dims(tf.cast(1-algo_select, o_rnn.dtype),1),[1,output_len])
outputs =  outputs + o_med * tf.tile(tf.expand_dims(tf.cast(algo_select, o_med.dtype),1),[1,output_len])

mixed_model = tf.keras.Model(inputs=[page, traffic, algo_select], outputs=[outputs])

mixed_model.compile(loss=SmapeLoss(), optimizer=tf.optimizers.Adam(learning_rate=1e-4), metrics=[SmapeMetric()])

In [None]:
# Evaluate the mixed model on a window of MaxTs + output_len columns ending 50 columns
# before the end of the data, with the per-page selector joined as an extra column.
features, ytrue = get_model_inputs(df_ds.iloc[:,-MaxTs-output_len -50:-50].join(med_select))
mixed_model.evaluate(features, ytrue, batch_size=1000)

In [None]:
# Median model alone on the same window, for comparison.
features, ytrue = get_model_inputs(df_ds.iloc[:,-MaxTs-output_len -50:-50])
med.evaluate(features, ytrue, batch_size=1000)

In [None]:
# Save the mixed model under "saved_model/mixed_model" using the 'tf' save format.
mixed_model.save("saved_model/mixed_model", save_format='tf')

In [None]:
# Drop the evaluation arrays and trigger a garbage collection.
del features, ytrue
gc.collect()

In [None]:
# Check the mixed and median models on the first remarkable page, using the last
# MaxTs + output_len columns with the per-page selector joined as an extra column.
plot_check_result(df_ds.iloc[:,-MaxTs-output_len:].join(med_select), rem_pages[0], [mixed_model, med])

### output

In [None]:
# Load the submission key table, indexed by its "Page" column.
key = pd.read_csv("key_2.csv.zip").set_index("Page")

In [None]:
# Default every row to 0 visits; rows with a prediction are overwritten later.
key["Visits"] = 0

In [None]:
def output_form(features, _model=None):
    """Predict with ``_model`` and format the result as a submission series.

    Parameters
    ----------
    features :
        Model inputs, passed unchanged to ``_model.predict``. Rows must be in
        the same order as ``df_ds.index``, which is used to label predictions.
    _model :
        Object exposing ``predict(features, batch_size=..., verbose=...)``.
        Required, despite the ``None`` default kept for interface compatibility.

    Returns
    -------
    pandas.Series
        Named "Visits" and indexed by "<Page>_<YYYY-MM-DD>" strings: one entry
        per page and per day from 2017-09-13 to 2017-11-13 inclusive.

    Raises
    ------
    ValueError
        If ``_model`` is None.
    """
    if _model is None:
        raise ValueError("output_form requires a model: pass it as `_model`")

    out_date = pd.date_range(start="2017-09-13", end="2017-11-13", freq="1D").strftime("%Y-%m-%d").to_list()
    raw_pred = _model.predict(features, batch_size=1000, verbose=1)
    # Cast to integers first, then floor negative forecasts at 0.
    num_pred = np.clip(raw_pred.astype(int), a_min=0, a_max=None)
    index = df_ds.index  # np.char.decode(features[0].numpy().astype(bytes))
    # Wide (page x date) table -> long series with a (page, date) MultiIndex.
    ret = pd.DataFrame(num_pred, columns=out_date, index=index).stack().rename("Visits")
    # Flatten the MultiIndex into "<page>_<date>" keys.
    ret.index = [page_name + "_" + day for page_name, day in ret.index]
    return ret

In [None]:
# Inputs for the mixed model, in the order [page, traffic, selector]: page names,
# the last MaxTs traffic samples of every page, and the per-page median selector.
features = [tf.convert_to_tensor(df_ds.index), df_ds.values[:,-MaxTs:], tf.convert_to_tensor(med_select)]

Visits_pred = output_form(features, mixed_model)
Visits_pred.head()

In [None]:
# Write the predictions into the key table, matching rows on the "<Page>_<date>" index labels.
key.loc[Visits_pred.index, "Visits"] = Visits_pred.astype(int)

In [None]:
# key is indexed by Page, so index=False leaves the page labels out of the file;
# only the remaining columns are written.
key.to_csv("subm_mixed.csv", encoding='utf-8', index=False)

In [None]:
# Submit the CSV to the Kaggle competition through the kaggle command-line tool.
!kaggle competitions submit -f subm_mixed.csv -m "mixed med/rnn (fav med)"  web-traffic-time-series-forecasting

In [None]:
# Drop the submission objects and trigger a garbage collection.
del Visits_pred, key ,features
gc.collect()