In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config Completer.use_jedi = False  # to make autocompletion faster

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import gc

In [None]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
# Disabled setup cell: the shell commands below sit inside a string literal, so
# nothing is executed (the cell only echoes the string).
"""
# load kaggle environment if in google colab
from google.colab import files
files.upload() #upload kaggle.json
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!mkdir logs
"""

In [None]:
# Disabled data-download cell: kept as a string literal, so none of these
# commands run. NOTE(review): train_set.csv / validation_set.csv written here
# are not read by any cell visible in this notebook.
"""
!kaggle competitions download -c web-traffic-time-series-forecasting
!yes|unzip web-traffic-time-series-forecasting.zip
!yes|unzip train_2.csv.zip
!tail -n +2 train_2.csv|shuf --random-source train_2.csv > train_2_shuffled.csv
!head -n -10000 train_2_shuffled.csv > train_set.csv
!tail -n 10000 train_2_shuffled.csv > validation_set.csv
"""

## Make dataset

In [None]:
# NOTE(review): the wildcard import hides where helper names come from. The
# visible cells use estimated_autocorrelation, plot_spectrest, SmapeLoss,
# SmapeMetric, get_model_inputs, plot_check_result, create_tb_cb and
# OneHotEncodingLayer without defining them — presumably all from webtraffic_utils.
from webtraffic_utils import *
output_len = 62  # number of values every model outputs (Dense units / tiled median width)
normalize_ds = False  # not referenced by any cell visible in this notebook
batch_size = 4096  # not referenced either: fit/evaluate/predict pass their own literal batch sizes

In [None]:
# features like (access, spectral tones) created by feature_engineering.ipynb

from ast import literal_eval


def _has_weekly_tone(tones):
    """Return True when at least one tone lies within 1e-2 of 1/7."""
    distance = np.abs(np.array(tones) - 1./7.)
    return (distance < 1e-2).any()


# The "tones" column is stored as text; literal_eval turns each cell back into
# a Python object while the CSV is parsed.
df_feat = pd.read_csv(
    "features_computed.csv.zip",
    converters={"tones": literal_eval},
).set_index("Page")

# One boolean per page, named "week"
weekly_tone = df_feat["tones"].apply(_has_weekly_tone).rename("week")

In [None]:
# One row per page: NaN cells are replaced by 0 and every column is cast to int32
df_ds = (
    pd.read_csv("train_2.csv.zip", header=0)
    .set_index("Page")
    .fillna(0)
    .astype(np.int32)
)

# add features to df: every feature column gets a "feat_" prefix
add_feats = pd.concat([weekly_tone], axis=1)
add_feats = add_feats.rename(columns=lambda col: "feat_" + str(col))
df_ds = pd.concat([df_ds, add_feats], axis=1)
add_feats.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20000 pages. `random_state` is pinned so that the same pages land in
# each subset on every run (Restart & Run All reproduces the same split).
df_train, df_test, week_train, week_test = train_test_split(
    df_ds, weekly_tone, test_size=20000, random_state=42
)

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[mixed model](#mixed-model)  
[RNN](#RNN)  

## Exploration

In [None]:
# remarkable pages
rem_pages = [
    'Acier_inoxydable_fr.wikipedia.org_desktop_all-agents',
    '2NE1_zh.wikipedia.org_all-access_spider',
    '3C_zh.wikipedia.org_all-access_spider'
]
page = rem_pages[0]

# The page is looked up in the full dataset: the train/test split shuffles the
# rows, so a given page is not guaranteed to be part of df_train.
# Selecting the row first and dropping the feature labels afterwards avoids
# copying the whole frame just to read one row.
# NOTE(review): `[:-1]` leaves out the last sample — confirm this is intended.
traffic_t = df_ds.loc[page].drop(add_feats.columns).values[:-1].astype(int)

f, (ax_time, ax_acf, ax_spec) = plt.subplots(1, 3, figsize=(20, 4))

# raw series
ax_time.plot(traffic_t)
ax_time.set_title("time traffic")

# autocorrelation, with ticks at 0, 365 and 2*365
ax_acf.plot(estimated_autocorrelation(traffic_t))
ax_acf.set_xticks([0, 365, 2*365])
ax_acf.set_title("autocorrelation")
ax_acf.grid()

# spectral estimate drawn by the external helper on the third axis
plot_spectrest(traffic_t, ax_spec)
ax_spec.set_title("spectral estimation")

plt.suptitle(page, fontsize=15)
plt.show()

## Models

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)  
[mixed model](#mixed-model)  
[RNN](#RNN)  

### repeat last value

In [None]:
import tensorflow_probability as tfp

class Median(tf.keras.layers.Layer):
    """Baseline forecaster: repeat the median of the most recent samples.

    For every row, the 50th percentile (``interpolation='lower'``) of the last
    ``median_depth`` values is computed and tiled ``output_len`` times, where
    ``output_len`` is the notebook-level constant read when the layer is called.

    Parameters
    ----------
    median_depth : int, default 40
        Number of trailing samples of each row used for the median.
    **kwargs
        Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...), forwarded
        to the base class so the layer can be named and rebuilt from its config.
    """
    def __init__(self, median_depth=40, **kwargs):
        super().__init__(**kwargs)
        self.median_depth = median_depth

    def call(self, inputs):
        recent = inputs[:, -self.median_depth:]
        Xtraff = tfp.stats.percentile(recent, 50.0, interpolation='lower', axis=1)
        # (batch,) -> (batch, 1) -> (batch, output_len)
        return tf.tile(tf.expand_dims(Xtraff, axis=1), [1, output_len])

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({"median_depth": self.median_depth})
        return config

In [None]:
# Baseline model: the output is Median(40) applied to the traffic input.
# I_page is declared as a model input but is not connected to the output.
I_traffic = tf.keras.layers.Input(shape=(741,))
I_page = tf.keras.layers.Input(shape=(), dtype=object)
outputs = Median(40)(I_traffic)  
med = tf.keras.models.Model([I_page,I_traffic],outputs)

# No optimizer is given: the model is only evaluated below, never fitted.
med.compile(loss=SmapeLoss(), metrics=[SmapeMetric()])
# get_model_inputs is not defined in this notebook — presumably from webtraffic_utils.
features, target = get_model_inputs(df_train)

med.evaluate(features, target, batch_size=1000)
med.save("saved_model/median", save_format='tf')
# Assigning to `_` keeps the collect() count out of the cell output.
_=gc.collect()

In [None]:
# Check plot for the first remarkable page with the median baseline
# (plot_check_result is an external helper; its exact output is not visible here).
plot_check_result(df_ds, rem_pages[0], [med])

### linear model

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant at first, then exponential decay.

    For ``epoch < 10`` the incoming ``lr`` is returned unchanged; from epoch 10
    on it is multiplied by ``exp(-0.05)`` at every call.
    """
    if epoch >= 10:
        return lr * tf.math.exp(-0.05)
    return lr
# NOTE(review): lr_cb is not included in the `callbacks=[...]` list of any fit
# call visible in this notebook, so the schedule is currently unused.
lr_cb = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
tf.keras.backend.clear_session()
tb_cb = create_tb_cb("linear")

class preprocessing(tf.keras.layers.Layer):
    """Crop every input row to its most recent samples.

    Parameters
    ----------
    n_last : int, default 150
        Number of trailing samples kept from each row.
    **kwargs
        Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """
    def __init__(self, n_last=150, **kwargs):
        super().__init__(**kwargs)
        self.n_last = n_last

    def call(self, inputs):
        return inputs[:, -self.n_last:]

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({"n_last": self.n_last})
        return config

# Two inputs are declared; only `traffic` is connected to the output.
traffic = tf.keras.layers.Input(shape=(741,))
page = tf.keras.layers.Input(shape=())
x = preprocessing()(traffic)
# A single Dense layer without activation: one linear map from the cropped
# window to the output_len forecast values.
outputs = tf.keras.layers.Dense(units=output_len)(x)

model_linear = tf.keras.Model(inputs=[page, traffic], outputs=[outputs])

#model_linear.summary()

model_linear.compile(loss=SmapeLoss(), optimizer=tf.optimizers.Adam(learning_rate=1e-4),
                     metrics=[SmapeMetric()])

In [None]:
# Early stopping watches 'smape', a training-set quantity here because no
# validation data is passed to fit() (presumably the name reported by
# SmapeMetric — confirm in webtraffic_utils).
es_cb = tf.keras.callbacks.EarlyStopping(monitor='smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)

#ds1.cache()
features, target = get_model_inputs(df_train)
model_linear.fit(features, target, epochs=100, callbacks=[tb_cb, es_cb], batch_size=32) #, validation_data=val_ds)
model_linear.save("saved_model/model_linear_nw", save_format='tf')

In [None]:
# NOTE(review): the median cell passes df_ds to plot_check_result; here df_train
# is passed, and since the train/test split shuffles the rows, rem_pages[0] is
# not guaranteed to be one of its rows.
plot_check_result(df_train, rem_pages[0], [model_linear, med])

In [None]:
# First weight array of the layer named "dense" — an auto-generated name,
# presumably stable because clear_session() is called before the model is built.
weights = model_linear.get_layer("dense").get_weights()[0]

f,ax = plt.subplots()
# magnitude of the weights in column 0
ax.plot(np.abs(weights[:,0]))
ax.grid()

### mixed model

In [None]:
# Training and held-out inputs/targets, built by the external helper.
features, target = get_model_inputs(df_train)
feat_val, target_val = get_model_inputs(df_test)


tf.keras.backend.clear_session()
# Own TensorBoard run name, so these logs are not filed under the "linear" run
# used by the linear model.
tb_cb = create_tb_cb("mixed")

class preprocessing(tf.keras.layers.Layer):
    """Keep only the last 150 samples of every input row."""
    def call(self, inputs):
        return inputs[:,-150:]

traffic = tf.keras.layers.Input(shape=(741,))
page = tf.keras.layers.Input(shape=())
weekly = tf.keras.layers.Input(shape=())
x = preprocessing()(traffic)
o_lin = tf.keras.layers.Dense(units=output_len)(x)
o_med = Median(40)(traffic)

# Per-row gate built from the `weekly` input and repeated over the output_len
# forecast values: rows with weekly == 1 take the linear output, rows with
# weekly == 0 take the median output.
gate = tf.tile(tf.expand_dims(tf.cast(weekly, o_lin.dtype), 1), [1, output_len])
outputs = o_lin * gate + o_med * (1 - gate)


mixed_model = tf.keras.Model(inputs=[page, traffic, weekly], outputs=[outputs])

mixed_model.compile(loss=SmapeLoss(), optimizer=tf.optimizers.Adam(learning_rate=1e-4),
                     metrics=[SmapeMetric()])


In [None]:
# Unlike the linear model, validation data is supplied, and early stopping
# monitors 'val_smape' (presumably the validation value of SmapeMetric — confirm).
es_cb = tf.keras.callbacks.EarlyStopping(monitor='val_smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)
mixed_model.fit(features, target, epochs=100, callbacks=[tb_cb, es_cb], batch_size=32, validation_data=(feat_val, target_val))

In [None]:
# Persist the fitted mixed model in TensorFlow SavedModel format.
mixed_model.save("saved_model/mixed_model", save_format='tf')

In [None]:
# Force a garbage-collection pass; the cell output is collect()'s return value.
gc.collect()

In [None]:
# NOTE(review): df_train is passed while the median cell passes df_ds; after the
# shuffled train/test split rem_pages[0] is not guaranteed to be a row of df_train.
plot_check_result(df_train, rem_pages[0], [mixed_model, med])

### RNN

In [None]:
#!rm -Rf logs/*

In [None]:
class normalize_rnn(tf.keras.layers.Layer):
    """Scale every row by its own maximum.

    Returns a pair ``(scaled, row_max)``: the scaled rows and the per-row maxima
    (reduced along axis 1 with ``keepdims=True``), so the scaling can be undone
    afterwards.
    """
    def call(self, inputs):
        row_max = tf.reduce_max(inputs, axis=1, keepdims=True)
        # 1e-10 keeps the division defined when a row's maximum is 0
        scaled = inputs / (row_max + 1e-10)
        return scaled, row_max


class denormalize_rnn(tf.keras.layers.Layer):
    """Multiply ``inputs`` element-wise by the factors ``fact``."""
    def call(self, inputs, fact):
        # A floored, clipped-at-zero variant is left disabled:
        #   tf.maximum(tf.floor(tf.multiply(inputs, fact)), 0)
        return inputs * fact

#xtry = tf.constant(np.random.randint(-1000, 1000, size=(10,5)), dtype=tf.float32)
#xn, fact = normalize_rnn()(xtry)
#denormalize_rnn()(xn,fact),xtry

In [None]:
def _access_of(page_name):
    """Last two '_'-separated fields of a page name, re-joined with '_'."""
    return "_".join(page_name.split("_")[-2:])


def _project_of(page_name):
    """Third '_'-separated field counted from the end of a page name."""
    return page_name.split("_")[-3]


# e.g. 'Acier_inoxydable_fr.wikipedia.org_desktop_all-agents'
#      -> access 'desktop_all-agents', project 'fr.wikipedia.org'
voc_access = np.unique(df_ds.index.map(_access_of))
voc_project = np.unique(df_ds.index.map(_project_of))

# One encoding layer per vocabulary (OneHotEncodingLayer is an external helper)
onehotAccess = OneHotEncodingLayer(voc_access, name="ohAccess")
onehotProject = OneHotEncodingLayer(voc_project, name="ohProject")

In [None]:
# --- RNN settings ---
Nneurons = 20         # units of every GRU layer
Nlayers = 1           # number of GRU layers
MaxTs = 150           # trailing samples of each series fed to the network
usePastYear = False   # append the window shifted back by 365 samples as a channel
useMetadata = False   # append the one-hot access encoding as channels
tf.random.set_seed(42)

# Run name handed to the TensorBoard callback factory
simn = f"Ts{MaxTs}-Nn{Nneurons}-Nl{Nlayers}"

tf.keras.backend.clear_session()
tb_cb = create_tb_cb(simn)

class preprocessing_rnn(tf.keras.layers.Layer):
    """Build the sequence fed to the GRU stack.

    The last ``MaxTs`` samples of ``inputs`` always form the first channel.
    Depending on the notebook-level switches, the tiled ``access1h`` vector
    (``useMetadata``) and a second window of ``inputs`` located 365 samples
    earlier, shifted forward by ``output_len`` (``usePastYear``), are appended
    along the last axis.
    """
    def call(self, inputs, access1h):
        channels = inputs[:, -MaxTs:, tf.newaxis]

        if useMetadata:
            # repeat the encoding on each of the MaxTs time steps
            repeated = tf.tile(tf.expand_dims(access1h, axis=1), [1, MaxTs, 1])
            channels = tf.concat([channels, repeated], axis=2)

        if usePastYear:
            start = output_len - 365 - MaxTs
            stop = output_len - 365
            past_window = inputs[:, start:stop, tf.newaxis]
            channels = tf.concat([channels, past_window], axis=2)

        return channels

# Three inputs are declared; `weekly` is not connected to the output.
I_page = tf.keras.layers.Input(shape=(), dtype=object)
I_traffic = tf.keras.layers.Input(shape=(741,))
weekly = tf.keras.layers.Input(shape=())

# Encoding of the page input; only concatenated to the sequence when useMetadata is set.
access1h = onehotAccess(I_page)

# Scale each series by its maximum, keeping the factors to undo the scaling at the end.
x, factors = normalize_rnn()(I_traffic)
x = preprocessing_rnn()(x, access1h)
# Nlayers-1 sequence-returning GRUs, then a final GRU that returns only its last state.
for ii in range(Nlayers-1):
    x = tf.keras.layers.GRU(Nneurons, return_sequences=True)(x)
x = tf.keras.layers.GRU(Nneurons)(x)
x= tf.keras.layers.Dense(output_len)(x)
outputs= denormalize_rnn()(x, factors)

model_rnn = tf.keras.Model(inputs=[I_page, I_traffic, weekly], outputs=[outputs])

model_rnn.summary()

In [None]:
# Same loss, optimizer settings and metric as the linear and mixed models.
model_rnn.compile(loss=SmapeLoss(), optimizer=tf.optimizers.Adam(learning_rate=1e-4), metrics=[SmapeMetric()])

In [None]:
# Early stopping watches the training-set 'smape' (no validation data is passed).
es_cb = tf.keras.callbacks.EarlyStopping(monitor='smape', min_delta=0.1, patience=5, verbose=0, restore_best_weights=True)
# week_train is used as a boolean row selector: only training pages flagged
# with a weekly tone are used to fit the RNN.
features, target = get_model_inputs(df_train.loc[week_train])
model_rnn.fit(features, target, epochs=100, callbacks=[tb_cb, es_cb] , batch_size=128) #, validation_data=val_ds)

In [None]:
# Persist the fitted RNN (no save_format is given, unlike the other save calls).
model_rnn.save("saved_model/model_rnn")

In [None]:
# Reload the saved RNN to check the save/load round trip.
# NOTE(review): normalize_rnn, denormalize_rnn and preprocessing_rnn are not
# listed in custom_objects — confirm the model restores correctly without them.
restore = tf.keras.models.load_model('saved_model/model_rnn', custom_objects={'SmapeMetric': SmapeMetric,'SmapeLoss': SmapeLoss, 
                                                                                'OneHotEncodingLayer':OneHotEncodingLayer})

In [None]:
# Compare the in-memory RNN with its reloaded copy on the same page.
# NOTE(review): df_train is passed; after the shuffled train/test split
# rem_pages[0] is not guaranteed to be one of its rows (the median cell uses df_ds).
plot_check_result(df_train, rem_pages[0], [model_rnn, restore])

### output

In [None]:
# Submission key table, indexed by its "Page" column.
key = pd.read_csv("key_2.csv.zip").set_index("Page")

In [None]:
# Start every row at 0 visits; the forecasts are written into this column later.
key["Visits"] = 0
key.info()

In [None]:
def output_form(features, _model=None, index=None):
    """Predict with ``_model`` and return the forecasts as a Series of visits.

    Parameters
    ----------
    features
        Model inputs, passed unchanged to ``_model.predict``.
    _model
        Object exposing ``predict(features, batch_size=..., verbose=...)`` that
        returns a 2-D array with one row per page and one column per day from
        2017-09-13 to 2017-11-13. Required, despite the ``None`` default kept
        for signature compatibility.
    index : sequence of str, optional
        Page names, one per predicted row and in the same order.
        Defaults to ``df_ds.index``.

    Returns
    -------
    pandas.Series
        Named "Visits", with integer values (predictions truncated to int, then
        clipped at 0) labelled "<page>_<YYYY-MM-DD>", page by page.

    Raises
    ------
    ValueError
        If ``_model`` is None, or if the prediction shape does not match
        (number of pages, number of days).
    """
    if _model is None:
        raise ValueError("output_form needs a model: pass it as `_model`")

    out_date = pd.date_range(start="2017-09-13", end="2017-11-13", freq="1D").strftime("%Y-%m-%d").to_list()
    raw_pred = np.asarray(_model.predict(features, batch_size=1000, verbose=1))
    num_pred = np.clip(raw_pred.astype(int), a_min=0, a_max=None)

    if index is None:
        index = df_ds.index
    pages = np.asarray(index, dtype=object)
    if num_pred.shape != (len(pages), len(out_date)):
        raise ValueError(
            "prediction shape %s does not match (%d pages, %d days)"
            % (num_pred.shape, len(pages), len(out_date))
        )

    # Row-major flattening: all days of the first page, then all days of the
    # second page, and so on. Labels are built with array operations instead of
    # a Python loop over every (page, day) pair.
    days = np.asarray(out_date, dtype=object)
    labels = np.repeat(pages, len(days)) + "_" + np.tile(days, len(pages))
    return pd.Series(num_pred.ravel(), index=labels, name="Visits")

# Build the prediction inputs from df_ds itself so that all three are row-aligned.
# - The feature columns are removed from the traffic matrix: passing df_ds.values
#   would feed the "feat_week" flag to the models as if it were the latest sample.
# - Only the most recent 741 samples are kept, the width declared by the models'
#   traffic Input.
# - The weekly flag is read from df_ds["feat_week"] rather than from weekly_tone,
#   whose row order is not guaranteed to match df_ds.
n_hist = 741
traffic_hist = df_ds.drop(columns=add_feats.columns).to_numpy()[:, -n_hist:]
weekly_flag = df_ds["feat_week"].to_numpy(dtype=np.float32)
features = [tf.convert_to_tensor(df_ds.index), traffic_hist, tf.convert_to_tensor(weekly_flag)]

Visits_pred = output_form(features, mixed_model)
Visits_pred.head()

In [None]:
#set(key.index) == set(Visits_pred.index)
# Mean of the boolean flags, i.e. the fraction of pages flagged with a weekly tone.
(weekly_tone).mean()

In [None]:
# Write the forecasts into the key table, matched on the index labels.
# NOTE(review): assumes key's "Page" index uses the same "<page>_<date>" labels
# as Visits_pred — see the commented-out set comparison in the previous cell.
key.loc[Visits_pred.index, "Visits"] = Visits_pred.astype(int)

In [None]:
# index=False: the "Page" index is not written, only the remaining columns.
key.to_csv("subm_mixed.csv", encoding='utf-8', index=False)

In [None]:
# Upload the submission file with the Kaggle CLI (needs network access and credentials).
!kaggle competitions submit -f subm_mixed.csv -m "mixed med/lin (final)"  web-traffic-time-series-forecasting

In [None]:
# Drop the large submission objects and force a collection pass to release memory.
del Visits_pred, key ,features
gc.collect()