In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config Completer.use_jedi = False  # to make autocompletion faster

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
tf.__version__

In [None]:
"""
# load kaggle environment if in google colab
from google.colab import files
files.upload() #upload kaggle.json
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!mkdir logs
"""

In [None]:
"""
!kaggle competitions download -c web-traffic-time-series-forecasting
!yes|unzip web-traffic-time-series-forecasting.zip
!yes|unzip train_2.csv.zip
!tail -n +2 train_2.csv|shuf --random-source train_2.csv > train_2_shuffled.csv
!head -n -10000 train_2_shuffled.csv > train_set.csv
!tail -n 10000 train_2_shuffled.csv > validation_set.csv
"""

## Make dataset

In [None]:
# Number of trailing values of each series that process_line returns as the target.
output_len = 62
# When True, process_line divides each series by its maximum.
normalize_ds = False
# Batch size passed to make_dataset for every dataset built below.
batch_size = 4096

def process_line(line, normalize_ds=False):
    """Parse one CSV line of the traffic file into model features and target.

    The first CSV field is the page identifier, read here as
    ``<title>_<project>_<access>_<agent>``; the 803 following fields are the
    traffic values (empty fields are read as 0.).

    Args:
        line: scalar string tensor holding one CSV record.
        normalize_ds: when True, the series is divided by its maximum
            (plus 1e-10 so an all-zero series does not divide by zero).

    Returns:
        ``((agent, access, project, traffic_history, page), traffic_target)``
        where ``traffic_target`` is the last ``output_len`` values of the
        series and ``traffic_history`` is everything before them.
    """
    fields = tf.io.decode_csv(line, record_defaults=[""] + [0.] * 803)
    # Split the identifier once instead of once per categorical feature.
    id_parts = tf.strings.split(fields[0], sep="_")
    agent = id_parts[-1]
    access = id_parts[-2]
    project = id_parts[-3]
    # The title may itself contain underscores, so rebuild it from every token
    # that precedes the last three instead of keeping only its last token.
    page = tf.strings.reduce_join(id_parts[:-3], separator="_")
    traffic = tf.stack(fields[1:])

    if normalize_ds:
        traffic = traffic / (tf.reduce_max(traffic) + 1e-10)
    return (agent, access, project, traffic[:-output_len], page), traffic[-output_len:]

    
def make_dataset(ds0,nmax=None, normalize_ds=False , batch_size = 32):
    """Turn a dataset of raw CSV lines into cached, batched (features, target) pairs.

    Args:
        ds0: dataset yielding one CSV line per element.
        nmax: if not None, keep only the first ``nmax`` parsed lines.
        normalize_ds: forwarded to process_line.
        batch_size: number of lines per batch.

    Returns:
        The batched dataset, cached and with one batch prefetched.
    """
    ds1 = ds0.map(lambda x: process_line(x, normalize_ds))
    if nmax is not None:
        ds1 = ds1.take(nmax)
    # Cache first and prefetch last: with the cache placed after the prefetch,
    # the prefetch buffer is bypassed as soon as the cache is filled, so later
    # epochs get no overlap between input delivery and the training step.
    return ds1.batch(batch_size).cache().prefetch(1)

# All three datasets share the same normalisation and batch settings.
_ds_options = dict(normalize_ds=normalize_ds, batch_size=batch_size)

ds = make_dataset(tf.data.TextLineDataset("train_set.csv"), **_ds_options)
val_ds = make_dataset(tf.data.TextLineDataset("validation_set.csv"), **_ds_options)
# Reduced training set: only the first 50000 lines of the training file.
ds_short = make_dataset(tf.data.TextLineDataset("train_set.csv"), nmax=50000, **_ds_options)


##### categories

In [None]:
# Parse a single batch of up to 50000 training lines; it is used to collect
# the values taken by the categorical features.
ds_cat = (
    tf.data.TextLineDataset("train_set.csv")
    .map(lambda x: process_line(x, normalize_ds))
    .batch(50000)
    .take(1)
)
np_cat = [batch for batch in ds_cat.as_numpy_iterator()]

In [None]:
from webtraffic_utils import OneHotEncodingLayer

In [None]:
def _fit_onehot(values):
    """Create a OneHotEncodingLayer adapted on the distinct entries of `values`."""
    layer = OneHotEncodingLayer()
    layer.adapt(list(np.unique(values)))
    return layer


# np_cat[0][0] is the feature tuple of the sampled batch; its first three
# entries are the agent, access and project columns.
_cat_agent, _cat_access, _cat_project = np_cat[0][0][:3]
onehotAgent = _fit_onehot(_cat_agent)
onehotAccess = _fit_onehot(_cat_access)
onehotProject = _fit_onehot(_cat_project)

In [None]:
onehotProject.get_config()

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[RNN](#RNN)  

##### training utils

In [None]:
def smape(A, F):
    """Symmetric mean absolute percentage error (in percent) averaged over all elements."""
    abs_error = tf.math.abs(F - A)
    # 1e-16 keeps the ratio defined when both tensors are zero.
    scale = tf.math.abs(A) + tf.math.abs(F) + 1e-16
    return tf.reduce_mean(2 * abs_error / scale) * 100

def smape_reg(A, F):
    """Element-wise SMAPE (in percent) whose denominator is floored at 0.5 + 1e-3.

    No reduction is applied: the result has the broadcast shape of A and F.
    """
    epsilon = 1e-3
    abs_error = tf.abs(A - F)
    # The floor bounds the ratio when both values are close to zero.
    denominator = tf.maximum(tf.abs(A) + tf.abs(F) + epsilon, 0.5 + epsilon)
    return abs_error / denominator * 2.0 * 100

def smape_np(A, F):
    """NumPy SMAPE (in percent) averaged over every element of A."""
    abs_error = np.abs(F - A)
    # Machine epsilon keeps the ratio defined when both arrays are zero.
    scale = np.abs(A) + np.abs(F) + np.finfo(float).eps
    return 100/A.size * np.sum(2 * abs_error / scale)

In [None]:
#tensorboard callbacks
from datetime import datetime

def create_tb_cb(model_name):
    """Build a TensorBoard callback logging under ./logs/<model_name>-<HH-MM-SS>."""
    started_at = datetime.now().strftime("%H-%M-%S")
    run_dir = "./logs/" + model_name + "-" + started_at
    return tf.keras.callbacks.TensorBoard(log_dir=run_dir, histogram_freq=10)

In [None]:
def plot_check_result(x_check, predict_func, ax):
    """Plot a series together with a forecast of its last ``output_len`` points.

    Args:
        x_check: np.array holding the full series, forecast window included.
        predict_func: callable receiving the series without its last
            ``output_len`` points and returning the predicted values.
        ax: matplotlib axes to draw on.
    """
    history = x_check[:-output_len]
    forecast = predict_func(history)
    # The forecast is drawn over the last output_len positions of the series.
    forecast_start = len(x_check) - output_len
    ax.plot(x_check)
    ax.plot(np.arange(output_len) + forecast_start, forecast)


## Exploration

In [None]:
def estimated_autocorrelation(x):
    """
    Estimate the autocorrelation of a 1-D array for every lag from 0 to len(x)-1.

    http://stackoverflow.com/q/14297012/190597
    http://en.wikipedia.org/wiki/Autocorrelation#Estimation
    """
    n = len(x)
    variance = x.var()
    centered = x - x.mean()
    # The last n entries of the full correlation are the lags 0..n-1.
    r = np.correlate(centered, centered, mode='full')[-n:]
    # Sanity check against the explicit lagged-product sums.
    direct = np.array([np.sum(centered[:n - lag] * centered[lag:]) for lag in range(n)])
    assert np.allclose(r, direct)
    # Lag k is built from n-k products, hence the n, n-1, ..., 1 divisor.
    samples_per_lag = np.arange(n, 0, -1)
    return r / (variance * samples_per_lag)

In [None]:
def plot_spectrest(x, ax):
    """Plot the magnitude of the real FFT of the mean-removed series on log-log axes.

    Args:
        x: 1-D array holding the series.
        ax: matplotlib axes to draw on.
    """
    spectrum = tf.signal.rfft(x - np.mean(x))
    T = len(spectrum)
    ax.plot(np.abs(spectrum))
    ax.set_yscale("log")
    ax.grid()
    ax.set_xscale("log")
    # Tick positions are computed as 2*T/period for each labelled period.
    period_by_label = {"weekly": 7., "monthly": 30.5, "yearly": 365.}
    ax.set_xticks([2*T/period for period in period_by_label.values()])
    ax.set_xticklabels(list(period_by_label), rotation=30)

In [None]:
# Load a sample of pages from which interesting series can be picked
df_examples = pd.read_csv("train_2.csv.zip", header=0, nrows=50000).set_index("Page")


def _page_token(position):
    """Return, for every page of df_examples, the '_'-separated token at `position` of its name."""
    tokens = df_examples.index.map(lambda name: name.split("_")[position])
    return pd.Series(tokens, df_examples.index)


agent = _page_token(-1)
access = _page_token(-2)
project = _page_token(-3)
# Pages whose last token is "all-agents", and a row iterator over them.
df_allagent = df_examples.loc[agent=="all-agents"]
iter_allagent = df_allagent.iterrows()

In [None]:
agent.nunique(),access.nunique(),project.nunique()

In [None]:
# remarkable pages
rem_pages = [
    'Acier_inoxydable_fr.wikipedia.org_desktop_all-agents',
]
page = rem_pages[0]
traffic_t = df_examples.loc[page].values

# One row of three panels: raw series, autocorrelation and spectrum.
f, vax = plt.subplots(1, 3, figsize=(20, 4))
ax_time, ax_autocorr, ax_spectrum = vax

ax_time.plot(traffic_t)
ax_time.set_title("time traffic")

ax_autocorr.plot(estimated_autocorrelation(traffic_t))
ax_autocorr.set_xticks([0, 365, 2*365])
ax_autocorr.set_title("autocorrelation")
ax_autocorr.grid()

plot_spectrest(traffic_t, ax_spectrum)
ax_spectrum.set_title("spectral estimation")

plt.suptitle(page, fontsize=15)
plt.show()

## Models

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[RNN](#RNN)  

### repeat last value

In [None]:
class RepeatLastValue(tf.keras.Model):
    """Baseline model that forecasts every step with the last observed value.

    The model defines no layers; it only reshapes its input.
    """
    def call(self, inputs):
        """Repeat the last value of the traffic history ``output_len`` times.

        Args:
            inputs: feature sequence whose element at index 3 is the traffic
                history, indexed below as (batch, time).

        Returns:
            Tensor with ``output_len`` columns, each equal to the last
            time step of the history.
        """
        Xtraff = inputs[3]
        # Slice with -1: so the *last* time step is repeated; -2:-1 selected
        # the one before it. The slice keeps the time axis for tf.tile.
        return tf.tile(Xtraff[:, -1:], tf.constant([1, output_len], tf.int32))

In [None]:
rlv = RepeatLastValue()
rlv.compile(loss=smape_reg, metrics=[smape,"mae"])

In [None]:
rlv.evaluate(ds)

In [None]:
def rlv_estimator(x):
    """
    Forecast one series with the repeat-last-value model.

    Args:
        x np.array, len=741
    """
    # Scale by the series maximum only when the datasets are normalised too.
    scale = np.max(x) if normalize_ds else 1.0
    # The categorical features are not used by this model; empty placeholders
    # fill their positions.
    placeholder = np.array([b""], dtype=object)
    model_inputs = (placeholder, placeholder, placeholder, x.reshape(1,-1)/scale)
    return rlv.predict(model_inputs)[0] * scale

In [None]:
f, ax = plt.subplots()
plot_check_result(df_examples.loc[page].values, rlv_estimator, ax)
ax.set_title(page)
plt.show()

### linear model

In [None]:
tf.keras.backend.clear_session()
tb_cb = create_tb_cb("linear")

class preprocessing(tf.keras.layers.Layer):
    """Keep only the last 150 time steps of the traffic history."""
    def call(self, inputs):
        return inputs[:,-150:]

# The categorical inputs carry an ``I_`` prefix so that they do not overwrite
# the notebook-level `agent`, `access`, `project` Series and the `page` string,
# which later plotting cells still read. They are declared as strings to match
# the string features produced by process_line; the linear model does not use
# them, they only keep the input structure aligned with the dataset.
inputs = tf.keras.layers.Input(shape=(741,))
I_agent = tf.keras.layers.Input(shape=(), dtype=tf.string)
I_access = tf.keras.layers.Input(shape=(), dtype=tf.string)
I_project = tf.keras.layers.Input(shape=(), dtype=tf.string)
I_page = tf.keras.layers.Input(shape=(), dtype=tf.string)
x = preprocessing()(inputs)
outputs = tf.keras.layers.Dense(units=output_len)(x)

# The input order follows the feature tuple returned by process_line.
model_linear = tf.keras.Model(inputs=[I_agent, I_access, I_project, inputs, I_page], outputs=[outputs])

model_linear.summary()

In [None]:
model_linear.compile(loss=smape_reg, optimizer=tf.optimizers.Adam(learning_rate=1e-4), 
                     metrics=[smape,"mae"],)

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for the first 10 epochs, then exponential decay.

    Args:
        epoch: epoch index.
        lr: current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 10``, otherwise ``lr * exp(-0.05)``
        as a plain Python float.
    """
    if epoch < 10:
        return lr
    # np.exp keeps the result a plain scalar instead of a tf.Tensor, which
    # not every version of the LearningRateScheduler callback accepts.
    return float(lr * np.exp(-0.05))
lr_cb = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Debug callback printing statistics of the "dense" layer after each training batch."""
    def on_train_batch_end(self, batch, logs=None):
        # `logs` is not needed here; the former `list(logs.keys())` was unused
        # and raised when `logs` kept its default value of None.
        # get_weights()[0] is the first weight array of the layer named "dense".
        weights = self.model.get_layer("dense").get_weights()[0]
        print(" weights mean ", np.mean(weights), " max ", np.max(weights))

In [None]:
#ds1.cache()
model_linear.fit(ds, epochs=100, callbacks=[tb_cb, lr_cb], validation_data=val_ds)

In [None]:
weights = model_linear.get_layer("dense").get_weights()[0]

f,ax = plt.subplots()
ax.plot(np.abs(weights[:,0]))
ax.grid()

In [None]:
def linear_estimator(x):
    """
    Forecast one series with the linear model.

    Args:
        x np.array, len=741

    Returns:
        1-D array holding the model output for this series, rescaled to the
        units of ``x`` when the datasets are normalised.
    """
    fake_feature = np.array([b""], dtype=object)
    # Apply the same scaling as the other estimators so that a model trained
    # on normalised data receives normalised input.
    fnorm = 1.0
    if normalize_ds:
        fnorm = np.max(x)
    # model_linear is built with five inputs (agent, access, project, traffic,
    # page); the categorical ones are unused, so placeholders fill their slots.
    model_inputs = (fake_feature, fake_feature, fake_feature, x.reshape(1,-1)/fnorm, fake_feature)
    return model_linear.predict(model_inputs)[0] * fnorm

#linear_estimator(df_examples.loc[page].values[:-output_len])

In [None]:
f, ax = plt.subplots()
plot_check_result(df_examples.loc[page].values, linear_estimator, ax)
ax.set_title(page)
plt.show()

### RNN

In [None]:
#!rm -Rf logs/*

In [None]:
class normalize_rnn(tf.keras.layers.Layer):
    """Divide the input by its maximum along axis 1 and also return that maximum."""
    def call(self, inputs):
        # keepdims=True keeps axis 1 so the maximum broadcasts against `inputs`.
        row_max = tf.reduce_max(inputs, axis=1, keepdims=True)
        # 1e-10 avoids dividing by zero when the maximum is zero.
        scaled = inputs / (row_max + 1e-10)
        return scaled, row_max


class denormalize_rnn(tf.keras.layers.Layer):
    """Multiply the input by the scaling factor returned by normalize_rnn."""
    def call(self, inputs, fact):
        # Alternative kept for reference (floors the values and clips them at 0):
        #   tf.maximum(tf.floor(tf.multiply(inputs, fact)), 0)
        return inputs * fact

#xtry = tf.constant(np.random.randint(-1000, 1000, size=(10,5)), dtype=tf.float32)
#xn, fact = normalize_rnn()(xtry)
#denormalize_rnn()(xn,fact),xtry

In [None]:
# Hyper-parameters of the recurrent model.
# Units in each GRU layer.
Nneurons = 20
# Number of stacked GRU layers.
Nlayers = 2
# Number of most recent time steps fed to the network.
MaxTs = 200
# Add, as an extra channel, the window taken 365 - output_len steps earlier.
usePastYear = True
# Append the one-hot agent and access encodings to every time step.
useMetadata = True
tf.random.set_seed(42)

# Run name used for the TensorBoard log directory.
simn = 'Ts'+str(MaxTs)+'-Nn'+str(Nneurons)+'-Nl'+str(Nlayers)

tf.keras.backend.clear_session()
tb_cb = create_tb_cb(simn)

class preprocessing_rnn(tf.keras.layers.Layer):
    """Assemble the sequence fed to the GRU stack.

    The output keeps the last ``MaxTs`` steps of the traffic as first channel,
    followed by the optional metadata and past-year channels.
    """
    def call(self, inputs, agent1h, access1h):
        # Last MaxTs steps, with a trailing channel axis.
        ret = inputs[:,-MaxTs:,np.newaxis]
        if useMetadata:
            # Repeat each one-hot vector along the time axis.
            agent_broadcast = tf.tile(agent1h[:,np.newaxis,:],[1,MaxTs,1])
            ret = tf.concat([ret, agent_broadcast], axis=2)
            access_broadcast = tf.tile(access1h[:,np.newaxis,:],[1,MaxTs,1])
            ret = tf.concat([ret, access_broadcast], axis=2)
            
        if usePastYear:
            # Window of MaxTs steps shifted back by 365 - output_len steps.
            pastYear = inputs[:, -MaxTs-365+output_len:-365+output_len, np.newaxis]
            ret = tf.concat([ret, pastYear], axis=2)
        return ret

inputs = tf.keras.layers.Input(shape=(741,))
I_agent = tf.keras.layers.Input(shape=(), dtype=object)
agent1h = onehotAgent(I_agent)
I_access = tf.keras.layers.Input(shape=(), dtype=object)
access1h = onehotAccess(I_access)

# project and page are not used by the network; the inputs only keep the
# structure aligned with the dataset features. I_page gets the same dtype as
# the other categorical inputs, since process_line yields it as a string too.
I_project = tf.keras.layers.Input(shape=(), dtype=object)
I_page = tf.keras.layers.Input(shape=(), dtype=object)


# Normalise each series, run the GRU stack, then restore the original scale.
x, factors = normalize_rnn()(inputs)
x = preprocessing_rnn()(x, agent1h, access1h)
for ii in range(Nlayers-1):
    x = tf.keras.layers.GRU(Nneurons, return_sequences=True)(x)
x = tf.keras.layers.GRU(Nneurons)(x)
x= tf.keras.layers.Dense(output_len)(x)
outputs= denormalize_rnn()(x, factors)

model_rnn = tf.keras.Model(inputs=[I_agent, I_access, I_project, inputs, I_page], outputs=[outputs])

model_rnn.summary()

In [None]:
model_rnn.compile(loss=smape_reg, optimizer=tf.optimizers.Adam(learning_rate=1e-4),metrics=[smape])

In [None]:
model_rnn.fit(ds_short, epochs=100, callbacks=[tb_cb], validation_data=val_ds)

In [None]:
def rnn_estimator(x):
    """
    Forecast one series with the recurrent model.

    Args:
        x np.array, len=741

    Returns:
        1-D array holding the model output for this series, rescaled to the
        units of ``x`` when the datasets are normalised.
    """
    fake_feature = np.array([b""], dtype=object)
    fnorm = 1.0
    if normalize_ds:
        fnorm = np.max(x)
    # model_rnn is built with five inputs (agent, access, project, traffic,
    # page), so a placeholder is also supplied for the page slot.
    model_inputs = (fake_feature, fake_feature, fake_feature, x.reshape(1,-1)/fnorm, fake_feature)
    return model_rnn.predict(model_inputs)[0] * fnorm


In [None]:
%matplotlib inline
f, ax = plt.subplots()
plot_check_result(df_examples.loc[page].values, rnn_estimator, ax)
ax.set_title(page)
plt.show()

#### error analysis

In [None]:
def get_ds_pred(ds_short):
    """Run model_rnn on a batched dataset and gather its inputs and targets.

    Args:
        ds_short: batched dataset yielding ``(features, target)`` pairs where
            ``features[3]`` is the traffic history.

    Returns:
        ``(pred, xtrain, ytrue)``: the model predictions, the concatenated
        traffic histories and the concatenated targets.
    """
    pred = model_rnn.predict(ds_short)
    lds = list(ds_short.as_numpy_iterator())
    # Only the traffic history and the target are returned; the agent and
    # access columns that used to be concatenated here were never used.
    xtrain = np.concatenate([features[3] for features, _ in lds])
    ytrue = np.concatenate([target for _, target in lds])
    return pred, xtrain, ytrue

In [None]:
def smape_row(A, F):
    """SMAPE (in percent) of each row of two 2-D arrays, averaged along axis 1."""
    abs_error = np.abs(F - A)
    # Machine epsilon keeps the ratio defined when both entries are zero.
    scale = np.abs(A) + np.abs(F) + np.finfo(float).eps
    return np.mean(100 * (2 * abs_error / scale), axis=1)

In [None]:
pred_train, x_train, y_train = get_ds_pred(ds_short)
pred_val, x_val, y_val = get_ds_pred(val_ds)

In [None]:
# Distribution of the per-series SMAPE on the training and validation sets.
f, ax = plt.subplots()
kwargs = {"alpha": 0.5, "bins": 50}
smape_train = smape_row(pred_train, y_train)
smape_val = smape_row(pred_val, y_val)
ax.hist(smape_train, **kwargs, label="train")
ax.hist(smape_val, **kwargs, label="validation")
# The labels above are only shown once the legend is drawn.
ax.legend()
# `pred` and `ytrue` exist only inside get_ds_pred, so the title is built from
# the arrays defined at notebook level.
ax.set_title("mean SMAPE - train: {:.2f}, validation: {:.2f}".format(
    np.mean(smape_train), np.mean(smape_val)))
plt.show()


In [None]:
xsm = smape_row(pred_train, y_train)
np.mean(xsm[xsm < 100])

In [None]:
vsmape = pd.Series(smape_row(pred_train, y_train))
vsmape[(vsmape>150) ].head(20)

In [None]:
%matplotlib notebook
%matplotlib notebook
import matplotlib.pyplot as plt

In [None]:
%matplotlib
ii= 49
f, ax = plt.subplots()
plot_check_result(np.r_[x_train[ii], y_train[ii]], rnn_estimator, ax)
#ax.set_title(page)
#ax.set_yscale("log")
#plt.show()

In [None]:
pred_train[ii].shape

### output

In [None]:
key = pd.read_csv("key_2.csv.zip").set_index("Page")

In [None]:
key["Visits"] = None
key.head()

In [None]:
def output_form(_df, _estimator=None):
    """Return the predictions as a Series indexed by "<Page>_<date>".

    Args:
        _df: DataFrame with a "Page" column followed by the traffic columns.
        _estimator: callable receiving the 2-D array of traffic values
            (missing values replaced by 0, one row per page) and returning
            one row of 62 predictions per page.

    Returns:
        Series named "Visits" with one entry per page and per day from
        2017-09-13 to 2017-11-13.

    Raises:
        TypeError: if `_estimator` is not callable.
    """
    # Fail early with an explicit message instead of "NoneType is not callable".
    if not callable(_estimator):
        raise TypeError("output_form needs a callable `_estimator`, got {!r}".format(_estimator))
    out_date = pd.date_range(start="2017-09-13", end="2017-11-13", freq="1D").strftime("%Y-%m-%d").to_list()
    num_hist = _df.drop(columns="Page").fillna(0).values
    num_pred = _estimator(num_hist)
    # One row per page and one column per day, stacked into a (page, day) Series.
    ret = pd.DataFrame(num_pred, columns=out_date, index=_df["Page"]).stack().rename("Visits")
    # Flatten the (page, day) index into a single "<page>_<day>" string.
    ret.index = [page_name + "_" + day for page_name, day in ret.index]
    return ret

def linear_predict(num_hist):
    """Batch prediction of the linear model, in the form expected by output_form.

    Args:
        num_hist: 2-D array of traffic values, one row per page.

    Returns:
        2-D array with one row of predictions per page.
    """
    # model_linear takes 741 time steps: feed it the most recent ones.
    x_hist = num_hist[:, -741:]
    fnorm = 1.0
    if normalize_ds:
        # One scaling factor per page; 1e-10 avoids dividing by zero.
        fnorm = np.max(x_hist, axis=1, keepdims=True) + 1e-10
    # The categorical inputs are unused by the linear model; placeholders of
    # the right length fill their slots.
    fake_feature = np.array([b""] * len(num_hist), dtype=object)
    model_inputs = (fake_feature, fake_feature, fake_feature, x_hist / fnorm, fake_feature)
    return model_linear.predict(model_inputs) * fnorm


# Quick check of the output format on the first 10000 pages.
# linear_estimator handles a single series, so the batch predictor is used.
chunk = pd.read_csv("train_2.csv.zip", nrows=10000)
Visits_pred = output_form(chunk, linear_predict)

In [None]:
df_chunk = pd.read_csv("train_2.csv.zip", chunksize=10000)


def make_rnn_predict(pages):
    """Build the batch estimator passed to output_form for one chunk of pages.

    Args:
        pages: Series of page identifiers, read here as
            ``<title>_<project>_<access>_<agent>``.

    Returns:
        Callable mapping the 2-D array of traffic values of these pages to
        the 2-D array of model_rnn predictions.
    """
    # Split from the right: only the last three tokens are metadata, the
    # title itself may contain underscores.
    parts = pages.str.rsplit("_", n=3, expand=True)
    title, project, access, agent = (parts[col].to_numpy(dtype=object) for col in range(4))

    def rnn_predict(num_hist):
        # model_rnn takes 741 time steps: feed it the most recent ones.
        x_hist = num_hist[:, -741:]
        fnorm = 1.0
        if normalize_ds:
            # One scaling factor per page; 1e-10 avoids dividing by zero.
            fnorm = np.max(x_hist, axis=1, keepdims=True) + 1e-10
        return model_rnn.predict((agent, access, project, x_hist / fnorm, title)) * fnorm

    return rnn_predict


for ii, chunk in enumerate(df_chunk):
    print("Prediction {}".format(ii))
    predictions = output_form(chunk, make_rnn_predict(chunk["Page"]))
    # Visits are counts: clip negative outputs and round before the int cast.
    key.loc[predictions.index, "Visits"] = predictions.clip(lower=0).round().astype(int)

In [None]:
key.to_csv("subm_gru.csv", encoding='utf-8', index=False)