In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config Completer.use_jedi = False  # to make autocompletion faster

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Display the installed TensorFlow version.
tf.__version__

In [None]:
"""
# load kaggle environment if in google colab
from google.colab import files
files.upload() #upload kaggle.json
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!mkdir logs
"""

In [None]:
"""
!kaggle competitions download -c web-traffic-time-series-forecasting
!yes|unzip web-traffic-time-series-forecasting.zip
!yes|unzip train_2.csv.zip
!tail -n +2 train_2.csv|shuf --random-source train_2.csv > train_2_shuffled.csv
!head -n -10000 train_2_shuffled.csv > train_set.csv
!tail -n 10000 train_2_shuffled.csv > validation_set.csv
"""

## Make dataset

In [None]:
from webtraffic_utils import *

In [None]:
output_len = 62  # number of trailing values of each series split off as the prediction target
normalize_ds = False  # when True, process_line divides each series by its own maximum
batch_size = 32  # batch size passed to make_dataset for the datasets built below

def process_line(line, normalize_ds=False):
    """Parse one CSV line into ``((page, history), target)``.

    The line is decoded as one string field (the page name) followed by 803
    float fields; empty fields take the defaults ``""`` and ``0.``.

    Args:
        line: scalar string tensor holding one CSV row.
        normalize_ds: when true, divide the whole series by its maximum
            (plus 1e-10 to avoid a division by zero).

    Returns:
        ``(page, traffic[:-output_len])`` as features and
        ``traffic[-output_len:]`` as target, where ``output_len`` is the
        notebook-level constant.
    """
    fields = tf.io.decode_csv(line, record_defaults=[""] + [0.] * 803)
    page, *daily_values = fields
    traffic = tf.stack(daily_values)

    if normalize_ds:
        peak = tf.reduce_max(traffic)
        traffic = traffic / (peak + 1e-10)

    history = traffic[:-output_len]
    target = traffic[-output_len:]
    return (page, history), target

    
def make_dataset(ds0,nmax=None, normalize_ds=False , batch_size = 32):
    """Turn a dataset of raw CSV lines into cached, batched training examples.

    Args:
        ds0: dataset yielding one CSV line per element.
        nmax: if not None, keep only the first ``nmax`` parsed lines.
        normalize_ds: forwarded to ``process_line``.
        batch_size: number of examples per batch.

    Returns:
        A dataset of ``((page, history), target)`` batches.
    """
    ds1 = ds0.map(lambda x: process_line(x, normalize_ds))
    if nmax is not None:
        ds1 = ds1.take(nmax)
    # Cache first, prefetch last: prefetch should be the final stage so that it
    # overlaps reading the (cached) batches with the training step. With
    # prefetch placed before cache, the cached epochs get no overlap at all.
    return ds1.batch(batch_size).cache().prefetch(1)

# Full training set.
ds = make_dataset(
    tf.data.TextLineDataset("train_set.csv"),
    normalize_ds=normalize_ds,
    batch_size=batch_size,
)
# Held-out validation set.
val_ds = make_dataset(
    tf.data.TextLineDataset("validation_set.csv"),
    normalize_ds=normalize_ds,
    batch_size=batch_size,
)
# First 10000 training lines only, for quicker experiments.
ds_short = make_dataset(
    tf.data.TextLineDataset("train_set.csv"),
    nmax=10000,
    normalize_ds=normalize_ds,
    batch_size=batch_size,
)


In [None]:
#df = pd.read_csv("train_2.csv.zip", header=0, nrows=1000).fillna(0)
#list(ds_from_dataframe(df).take(1).as_numpy_iterator())

##### categories

In [None]:
# Build the category vocabularies from the page names of the first 1000 training lines.
df_cat = pd.read_csv("train_set.csv", header=None, nrows=1000)
# Last two "_"-separated fields of the page name, re-joined with "_".
voc_access = df_cat[0].map(lambda name: "_".join(name.split("_")[-2:])).unique()
# Third "_"-separated field counted from the end of the page name.
voc_project = df_cat[0].map(lambda name: name.split("_")[-3]).unique()

In [None]:
# One encoding layer per vocabulary extracted above.
# NOTE(review): OneHotEncodingLayer is not defined in this notebook; presumably it comes
# from the webtraffic_utils star import - confirm its expected input format there.
onehotAccess = OneHotEncodingLayer(voc_access)
onehotProject = OneHotEncodingLayer(voc_project)

In [None]:
# Manual check of the access encoder on a mix of bytes and str values; "tutu" is not a
# valid access string - presumably it exercises the unknown-value path (confirm).
onehotAccess(tf.constant([b'mobile-web_all-agents', b'all-access_all-agents','desktop_all-agents', 'all-access_spider',"tutu"],dtype=object))

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[RNN](#RNN)  

## Exploration

In [None]:
# Load the first 50000 rows to pick out some interesting series.
df_examples = pd.read_csv("train_2.csv.zip", header=0, nrows=50000)

In [None]:
# Pages worth a closer look
rem_pages = [
    'Acier_inoxydable_fr.wikipedia.org_desktop_all-agents',
]
page = rem_pages[0]
traffic_t = df_examples.set_index("Page").loc[page].values

# One row, three panels: raw series, autocorrelation, spectral estimate.
f, (ax_time, ax_acf, ax_spec) = plt.subplots(1, 3, figsize=(20, 4))

ax_time.plot(traffic_t)
ax_time.set_title("time traffic")

ax_acf.plot(estimated_autocorrelation(traffic_t))
ax_acf.set_xticks([0, 365, 2 * 365])  # ticks at 0, 1 and 2 times 365
ax_acf.set_title("autocorrelation")
ax_acf.grid()

plot_spectrest(traffic_t, ax_spec)
ax_spec.set_title("spectral estimation")

plt.suptitle(page, fontsize=15)
plt.show()

## Models

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[RNN](#RNN)  

### repeat last value

In [None]:
import tensorflow_probability as tfp

class Median(tf.keras.Model):
    """Baseline model that repeats the median of the most recent values.

    Args:
        median_depth: number of trailing time steps the median is taken over.
    """

    def __init__(self, median_depth=40):
        super().__init__()
        self.median_depth = median_depth

    def call(self, inputs):
        """Return the per-row median tiled ``output_len`` times.

        ``inputs[1]`` is the traffic tensor; the 50th percentile with
        ``interpolation='lower'`` is taken along axis 1 over its last
        ``median_depth`` columns.
        """
        recent = inputs[1][:, -self.median_depth:]
        row_median = tfp.stats.percentile(recent, 50.0, interpolation='lower', axis=1)
        return tf.tile(row_median[:, tf.newaxis], [1, output_len])
    
# Median over the last 40 values; the model has no trainable part, so it is only
# compiled (loss + metrics) and evaluated on the validation set.
med = Median(40)
med.compile(loss=smape_reg, metrics=[smape,"mae"])
#med.evaluate(ds)
med.evaluate(val_ds)

In [None]:
# Check the median baseline on the first remarkable page.
# NOTE(review): plot_check_result is not defined in this notebook (presumably from webtraffic_utils).
plot_check_result(df_examples, rem_pages[0], med)

### linear model

In [None]:
# Reset Keras global state before building a new model.
tf.keras.backend.clear_session()
# NOTE(review): create_tb_cb is not defined here; judging by its use in fit(callbacks=...)
# it returns a Keras callback (presumably TensorBoard) labelled "linear" - confirm.
tb_cb = create_tb_cb("linear")

class preprocessing(tf.keras.layers.Layer):
    """Keep only the last 150 columns (axis 1) of the input."""

    def call(self, inputs):
        recent_window = inputs[:, -150:]
        return recent_window

# 741 input steps = the 803 values parsed per CSV line minus the output_len (62) target values.
traffic = tf.keras.layers.Input(shape=(741,))
# The datasets feed the page name as a string, so the input has to be declared as a
# string tensor (a default float32 Input does not match it). The RNN model further
# down declares its page input as a string-like dtype as well.
page = tf.keras.layers.Input(shape=(), dtype=tf.string)
x = preprocessing()(traffic)
# A single Dense layer without activation: a purely linear map to the output_len forecasts.
outputs = tf.keras.layers.Dense(units=output_len)(x)

# `page` is accepted so the model can consume the dataset structure, but it is not
# connected to the output.
model_linear = tf.keras.Model(inputs=[page, traffic], outputs=[outputs])

model_linear.summary()

In [None]:
# Train with the smape_reg loss and Adam (learning rate 1e-4); report smape and MAE.
model_linear.compile(loss=smape_reg, optimizer=tf.optimizers.Adam(learning_rate=1e-4), 
                     metrics=[smape,"mae"],)

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 10 epochs, then exponential decay.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate.

    Returns:
        A Python float: ``lr`` unchanged while ``epoch < 10``, otherwise
        ``lr * exp(-0.05)``.
    """
    import math  # local import keeps this cell self-contained

    if epoch < 10:
        return float(lr)
    # Return a plain float rather than a tf.Tensor: LearningRateScheduler
    # expects the schedule function to output a float.
    return float(lr) * math.exp(-0.05)
# Keras callback that applies `scheduler` during training.
lr_cb = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Stop when val_smape has not improved by at least 0.1 for 5 epochs, then restore the best weights.
es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)

#ds1.cache()
# Train for at most 20 epochs on the full training set, validating on val_ds.
model_linear.fit(ds, epochs=20, callbacks=[tb_cb, lr_cb, es_cb], validation_data=val_ds)

In [None]:
# Check the linear model on the first remarkable page.
plot_check_result(df_examples, rem_pages[0], model_linear)

In [None]:
# First weight array of the layer named "dense" (for a Dense layer this is the kernel).
# NOTE(review): "dense" is the auto-generated layer name; it relies on clear_session()
# having been called right before the model was built.
weights = model_linear.get_layer("dense").get_weights()[0]

f,ax = plt.subplots()
# Absolute weight of every input step for output column 0.
ax.plot(np.abs(weights[:,0]))
ax.grid()

### RNN

In [None]:
#!rm -Rf logs/*

In [None]:
class normalize_rnn(tf.keras.layers.Layer):
    """Scale every row by its own maximum and hand the maxima back."""

    def call(self, inputs):
        """Return ``(inputs / (row_max + 1e-10), row_max)``.

        ``row_max`` is the maximum along axis 1, kept as a column
        (``keepdims=True``) so it can later be multiplied back.
        """
        row_max = tf.reduce_max(inputs, axis=1, keepdims=True)
        scaled = inputs / (row_max + 1e-10)
        return scaled, row_max


class denormalize_rnn(tf.keras.layers.Layer):
    """Undo ``normalize_rnn``: multiply the values by the stored factors."""

    def call(self, inputs, fact):
        """Return the element-wise product ``inputs * fact``."""
        return inputs * fact

#xtry = tf.constant(np.random.randint(-1000, 1000, size=(10,5)), dtype=tf.float32)
#xn, fact = normalize_rnn()(xtry)
#denormalize_rnn()(xn,fact),xtry

In [None]:
Nneurons = 20  # units in each GRU layer
Nlayers = 2  # number of stacked GRU layers
MaxTs = 150  # number of most recent time steps fed to the RNN
usePastYear = True  # add a second feature channel taken from an earlier window of the series
useMetadata = False  # add the one-hot access encoding as extra feature channels
tf.random.set_seed(42)

# Run label built from the hyper-parameters above, passed to create_tb_cb.
simn = 'Ts'+str(MaxTs)+'-Nn'+str(Nneurons)+'-Nl'+str(Nlayers)

# Reset Keras global state before building the RNN model.
tf.keras.backend.clear_session()
tb_cb = create_tb_cb(simn)

class preprocessing_rnn(tf.keras.layers.Layer):
    """Assemble the per-time-step feature channels fed to the GRU stack.

    Reads the notebook-level settings ``MaxTs``, ``useMetadata``,
    ``usePastYear`` and ``output_len``.
    """

    def call(self, inputs, access1h):
        """Return the features stacked along a new last axis.

        Channels, in order: the last ``MaxTs`` values of ``inputs``; the
        ``access1h`` encoding repeated over the ``MaxTs`` steps (only if
        ``useMetadata``); a ``MaxTs``-long window of ``inputs`` shifted
        ``365 - output_len`` steps into the past (only if ``usePastYear``).
        """
        channels = [inputs[:, -MaxTs:, np.newaxis]]

        if useMetadata:
            channels.append(tf.tile(access1h[:, np.newaxis, :], [1, MaxTs, 1]))

        if usePastYear:
            window_start = -MaxTs - 365 + output_len
            window_stop = -365 + output_len
            channels.append(inputs[:, window_start:window_stop, np.newaxis])

        if len(channels) == 1:
            return channels[0]
        return tf.concat(channels, axis=2)

# Model inputs: the 741-step traffic history and the page name.
I_traffic = tf.keras.layers.Input(shape=(741,))
I_page = tf.keras.layers.Input(shape=(), dtype=object)

# Encode the page name with the access encoder (only used when useMetadata is set).
access1h = onehotAccess(I_page)

# Scale each series by its maximum and keep the factors to rescale the forecast.
x, factors = normalize_rnn()(I_traffic)
x = preprocessing_rnn()(x, access1h)
# All GRU layers but the last return the full sequence so they can be stacked.
for ii in range(Nlayers-1):
    x = tf.keras.layers.GRU(Nneurons, return_sequences=True)(x)
x = tf.keras.layers.GRU(Nneurons)(x)
# Project the final GRU state to the output_len forecast values.
x= tf.keras.layers.Dense(output_len)(x)
# Multiply by the per-series factors to undo the normalization.
outputs= denormalize_rnn()(x, factors)

model_rnn = tf.keras.Model(inputs=[I_page, I_traffic], outputs=[outputs])

model_rnn.summary()

In [None]:
# Same loss and optimizer settings as the linear model; only smape is tracked as a metric.
model_rnn.compile(loss=smape_reg, optimizer=tf.optimizers.Adam(learning_rate=1e-4),metrics=[smape])

In [None]:
# ds_highpop is otherwise only created in the error-analysis section further down, so on
# a fresh kernel (Restart & Run All) this cell would fail with a NameError. Build it here
# with the same expressions: series whose median over the history part is at least 200.
s_median = pd.Series(np.median(df_examples.drop(columns="Page").values[:,:-output_len], axis=1), df_examples.index)
ds_highpop = ds_from_dataframe(df_examples[s_median>=200]).batch(32)

# No validation data is passed, so early stopping monitors the training smape.
es_cb = tf.keras.callbacks.EarlyStopping(monitor='smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)
model_rnn.fit(ds_highpop, epochs=100, callbacks=[tb_cb, es_cb]) #, validation_data=val_ds)

In [None]:
# Check the RNN model on the first remarkable page.
plot_check_result(df_examples, rem_pages[0], model_rnn)

#### error analysis

In [None]:
# Median of every example series, computed on the history part only
# (the last output_len columns are left out).
s_median = pd.Series(
    np.median(df_examples.drop(columns="Page").values[:, :-output_len], axis=1),
    df_examples.index,
)

# Box plot of the medians, without outliers; NaN medians are dropped first.
f, ax = plt.subplots()
ax.boxplot(s_median.dropna(), showfliers=False)
ax.grid()
ax.set_title("median traffic distribution")
plt.show()

In [None]:
# Split the example pages at a median traffic of 200 and batch each subset by 32.
# NOTE(review): ds_from_dataframe is not defined in this notebook (presumably from webtraffic_utils).
ds_lowpop = ds_from_dataframe(df_examples[s_median<200]).batch(32)
ds_highpop = ds_from_dataframe(df_examples[s_median>=200]).batch(32)

In [None]:
# Evaluate every model on both popularity subsets. A loop replaces six copy-pasted
# lines; the printed text and the evaluation order are unchanged.
for model_label, model in (("linear", model_linear), ("median", med), ("rnn", model_rnn)):
    for pop_label, subset in (("low", ds_lowpop), ("high", ds_highpop)):
        print("{} {} popularity: ".format(model_label, pop_label), model.evaluate(subset, verbose=0))

In [None]:
def get_ds_pred(ds_short, model=None):
    """Collect predictions, inputs and targets for a whole batched dataset.

    Args:
        ds_short: batched dataset yielding ``((page, traffic), target)``.
        model: object with a ``predict`` method; defaults to the
            notebook-level ``model_rnn``.

    Returns:
        ``(pred, xtrain, ytrue)``: the model predictions, the concatenated
        traffic inputs and the concatenated targets.
    """
    if model is None:
        model = model_rnn
    pred = model.predict(ds_short)
    batches = list(ds_short.as_numpy_iterator())
    # The features are the 2-tuple (page, traffic); the traffic history is at
    # index 1 (the previous index 3 was out of range).
    xtrain = np.concatenate([features[1] for features, _ in batches])
    ytrue = np.concatenate([target for _, target in batches])
    return pred, xtrain, ytrue

In [None]:
def smape_row(A, F):
    """Row-wise SMAPE (in percent) between actual values ``A`` and forecasts ``F``.

    Each element contributes ``100 * 2 * |F - A| / (|A| + |F| + eps)``, where
    ``eps`` is the float machine epsilon guarding against a zero denominator;
    the contributions are averaged along axis 1.
    """
    abs_error = np.abs(F - A)
    scale = np.abs(A) + np.abs(F) + np.finfo(float).eps
    per_step = 100 * (2 * abs_error / scale)
    return np.mean(per_step, axis=1)

In [None]:
# Predictions, inputs and targets for the short training subset and for the validation set.
pred_train, x_train, y_train = get_ds_pred(ds_short)
pred_val, x_val, y_val = get_ds_pred(val_ds)

In [None]:
# Distribution of the per-series SMAPE on the training subset and on the validation set.
f, ax = plt.subplots()
kwargs = {"alpha": 0.5, "bins": 50}
smape_train = smape_row(pred_train, y_train)
smape_val = smape_row(pred_val, y_val)
ax.hist(smape_train, **kwargs, label="train")
ax.hist(smape_val, **kwargs, label="validation")
ax.legend()  # the labels above are only shown once a legend is drawn
ax.set_xlabel("SMAPE per series")
# The title previously used `pred` and `ytrue`, which are not defined at notebook
# level; report the mean SMAPE of both plotted sets instead.
ax.set_title("mean SMAPE - train: {:.2f}, validation: {:.2f}".format(np.mean(smape_train), np.mean(smape_val)))
plt.show()


In [None]:
# Mean training SMAPE restricted to the series whose SMAPE is below 100.
xsm = smape_row(pred_train, y_train)
np.mean(xsm[xsm < 100])

In [None]:
# First 20 training series whose SMAPE exceeds 150 (the index is the row position in pred_train).
vsmape = pd.Series(smape_row(pred_train, y_train))
vsmape[(vsmape>150) ].head(20)

In [None]:
# Switch to the interactive "notebook" backend.
# NOTE(review): the magic is issued twice and pyplot is re-imported - presumably a
# workaround for the backend not activating on the first call; confirm it is still needed.
%matplotlib notebook
%matplotlib notebook
import matplotlib.pyplot as plt

In [None]:
%matplotlib
# Row of the training subset to inspect.
ii= 49
f, ax = plt.subplots()
# NOTE(review): `rnn_estimator` is not defined in any visible cell, and this call passes
# (series, model, ax) whereas the other cells call plot_check_result(df, page, model).
# As written this cell cannot run on a fresh kernel - confirm the intended helper/arguments.
plot_check_result(np.r_[x_train[ii], y_train[ii]], rnn_estimator, ax)
#ax.set_title(page)
#ax.set_yscale("log")
#plt.show()

In [None]:
# Shape of the prediction for the inspected row.
pred_train[ii].shape

### output

In [None]:
# Load the key file and index it by its "Page" column.
key = pd.read_csv("key_2.csv.zip").set_index("Page")

In [None]:
# Start with 0 visits everywhere; the prediction loop below overwrites the rows it covers.
key["Visits"] = 0
key.info()

In [None]:
def output_form(_df, _model=None):
    """Return the model's forecasts as a Series indexed by ``<Page>_<date>``.

    Args:
        _df: DataFrame with a "Page" column followed by the traffic columns;
            missing values are replaced by 0 and only the last 741 traffic
            columns are fed to the model.
        _model: object whose ``predict((pages, traffic))`` returns one row of
            62 values per page. Required, despite the ``None`` default kept
            for interface compatibility.

    Returns:
        Series named "Visits" with one non-negative integer per page and per
        day from 2017-09-13 to 2017-11-13.

    Raises:
        ValueError: if ``_model`` is None.
    """
    if _model is None:
        # Fail with a clear message instead of an AttributeError on None.predict.
        raise ValueError("output_form requires a model exposing predict(); got _model=None")

    out_date = pd.date_range(start="2017-09-13", end="2017-11-13", freq="1D").strftime("%Y-%m-%d").to_list()
    I_page = np.array(_df["Page"])
    I_traffic = _df.fillna(0).drop(columns="Page").values[:, -741:]

    raw_pred = _model.predict((I_page, I_traffic))
    # Truncate to integers, then floor negative forecasts at 0.
    num_pred = np.clip(raw_pred.astype(int), a_min=0, a_max=None)

    ret = pd.DataFrame(num_pred, columns=out_date, index=_df["Page"]).stack().rename("Visits")
    # Flatten the (page, date) MultiIndex into "<page>_<date>" labels.
    ret.index = [page_name + "_" + day for page_name, day in ret.index]
    return ret

# Try the formatter on the first 10000 rows with the median baseline.
chunk = pd.read_csv("train_2.csv.zip", nrows=10000)
Visits_pred = output_form(chunk, med)
Visits_pred

In [None]:
# Read the training file in chunks of 10000 rows.
df_chunk = pd.read_csv("train_2.csv.zip", chunksize=10000)

for ii, chunk in enumerate(df_chunk):
    print("Prediction {}".format(ii))
    predictions = output_form(chunk, med).astype(int)
    # Write this chunk's forecasts into the matching "<page>_<date>" rows of the key table.
    key.loc[predictions.index, "Visits"] = predictions.astype(int)

In [None]:
# Write the submission file; index=False leaves the "Page" index out of the CSV.
key.to_csv("subm_med.csv", encoding='utf-8', index=False)

In [None]:
# Submit the median-baseline file with the Kaggle command-line tool.
!kaggle competitions submit -f subm_med.csv -m "median 40days"  web-traffic-time-series-forecasting