In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%config Completer.use_jedi = False  # to make autocompletion faster

In [None]:
import tensorflow as tf
import pandas as pd
import gc

In [None]:
from webtraffic_utils import *

### models

In [None]:
# Load the saved "median" and "model_linear" models. The custom SMAPE objects
# are passed so Keras can resolve them by name while deserializing.
med = tf.keras.models.load_model(
    'saved_model/median',
    custom_objects={'SmapeMetric': SmapeMetric, 'SmapeLoss': SmapeLoss},
)
model_linear = tf.keras.models.load_model(
    'saved_model/model_linear',
    custom_objects={'SmapeMetric': SmapeMetric, 'SmapeLoss': SmapeLoss},
)

In [None]:
# Load the saved RNN model; it additionally needs the custom
# OneHotEncodingLayer to be resolvable by name.
model_rnn = tf.keras.models.load_model(
    'saved_model/model_rnn',
    custom_objects={
        'SmapeMetric': SmapeMetric,
        'SmapeLoss': SmapeLoss,
        'OneHotEncodingLayer': OneHotEncodingLayer,
    },
)

In [None]:
# Load the saved mixed model with the same custom objects as the RNN model.
mixed_model = tf.keras.models.load_model(
    'saved_model/mixed_model',
    custom_objects={
        'SmapeMetric': SmapeMetric,
        'SmapeLoss': SmapeLoss,
        'OneHotEncodingLayer': OneHotEncodingLayer,
    },
)

In [None]:
# Build a fresh model with get_rnn_model(False) and copy trained weights from
# model_rnn into it: layer "gru0" -> "gru0", and layer "td" -> "dense0".
# NOTE(review): presumably `False` selects a variant without the preprocessing
# layers of model_rnn — confirm against the definition of get_rnn_model.
rnn_simple = get_rnn_model(False)
rnn_simple.get_layer("gru0").set_weights(model_rnn.get_layer("gru0").get_weights())
rnn_simple.get_layer("dense0").set_weights(model_rnn.get_layer("td").get_weights())

In [None]:
# Read `max_delay` from the config of the first layer of model_rnn whose
# class name is "preprocessing_rnn".
rnn_delay = [
    layer_cfg
    for layer_cfg in model_rnn.get_config().get("layers")
    if layer_cfg.get("class_name") == "preprocessing_rnn"
][0]["config"]["max_delay"]
# Number of trailing columns handed to get_model_inputs for the RNN.
# NOTE(review): 62 is presumably the forecast horizon in days — confirm.
win_size_rnn = 62 + rnn_delay

### dataset

In [None]:
# Per-page features (e.g. access, spectral tones) are created by feature_engineering.ipynb

from ast import literal_eval
# Load the per-page features; the "tones" column is stored as a Python literal
# and parsed back with literal_eval. Rows are indexed by "Page".
df_feat = pd.read_csv(
    "features_computed.csv.zip",
    converters={"tones": literal_eval},
).set_index("Page")
# True for pages having at least one tone within 1e-2 of 1/7.
weekly_tone = df_feat["tones"].apply(
    lambda tones: (np.abs(np.array(tones) - 1. / 7.) < 1e-2).any()
)

In [None]:
# Load the training series indexed by "Page"; missing values are replaced
# by 0 before casting everything to int32.
df_train = (
    pd.read_csv("train_2.csv.zip", header=0)
    .set_index("Page")
    .fillna(0)
    .astype(np.int32)
)
#df_train = pd.read_csv("train_2.csv.zip", header=0).dropna().set_index("Page").astype(np.int32).reset_index()

In [None]:
# Validation subset: rows of df_train whose "median" feature in df_feat exceeds 400.
df_valid = df_train.loc[df_feat["median"]>400]

In [None]:
# Evaluate the median model on the validation pages with the last `ii` columns
# removed, for ii = 1, 11, ..., 91.
# NOTE(review): presumably get_model_inputs splits the frame into model inputs
# and a trailing target window — confirm against its definition.
for ii in range(1,100,10):
    inputs, target= get_model_inputs(df_valid.iloc[:,:-ii])
    med.evaluate(inputs, target, batch_size=1000)
    gc.collect()  # run the garbage collector between evaluations

In [None]:
# Evaluate rnn_simple on windows of win_size_rnn columns that end `ii` columns
# before the end of df_valid, for ii = 1, 11, ..., 91.
for ii in range(1,100,10):
    inputs, target= get_model_inputs(df_valid.iloc[:, -win_size_rnn-ii:-ii])
    rnn_simple.evaluate(inputs, target, batch_size=1000)
    gc.collect()  # run the garbage collector between evaluations

### error analysis

###### example

In [None]:
# Example page: call plot_check_result with the three models on df_train
# without its last 100 columns.
# NOTE(review): presumably this plots each model's prediction against the
# actual series — confirm against plot_check_result.
plot_check_result(df_train.iloc[:,:-100], 'Acier_inoxydable_fr.wikipedia.org_desktop_all-agents', [med, model_linear, rnn_simple])

###### model comparison

In [None]:
def smape_row(df_train, model):
    """Return the per-row SMAPE (in percent) of `model` on `df_train`.

    `get_model_inputs` supplies the model features and the true targets for
    the frame. The model's predictions are compared with the targets and the
    symmetric percentage error is averaged along axis 1, giving one value
    per row.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame passed to `get_model_inputs`; its index labels the result.
    model : object
        Anything exposing `predict(features, batch_size=..., verbose=...)`.

    Returns
    -------
    pandas.Series
        SMAPE values indexed like `df_train`.
    """
    features, ytrue = get_model_inputs(df_train)
    ypred = model.predict(features, batch_size=1000, verbose=1)
    abs_error = 2 * np.abs(ytrue - ypred)
    # Machine epsilon keeps the ratio finite when prediction and target are both 0.
    scale = np.abs(ypred) + np.abs(ytrue) + np.finfo(float).eps
    per_row = np.mean(100 * (abs_error / scale), axis=1)
    return pd.Series(per_row, index=df_train.index)

In [None]:
# Restrict df_train to its last win_size_rnn columns and to pages with
# median > 400, then compute the per-page SMAPE of each model on that window.
# NOTE(review): this rebinds df_valid to a column-truncated frame; earlier
# cells that slice df_valid will see the truncated frame if re-run afterwards.
df_valid = df_train.iloc[:,-win_size_rnn:].loc[df_feat["median"]>400]
smape_rnn = smape_row(df_valid, rnn_simple).rename("smape_rnn")
smape_med = smape_row(df_valid, med).rename("smape_med")
smape_lin = smape_row(df_valid, model_linear).rename("smape_lin")

# Mean SMAPE of the RNN and of the median model (the linear model is not printed).
print(smape_rnn.mean(), smape_med.mean())
_ = gc.collect()

In [None]:
# Overlay the normalized per-page SMAPE distributions of the three models.
f, ax = plt.subplots()
kwargs = {"bins": 50, "alpha": 0.3, "density": True}
ax.hist(smape_rnn, **kwargs, label ="rnn")
ax.hist(smape_med, **kwargs, label ="med")
ax.hist(smape_lin, **kwargs, label ="lin")
# Label the axes so the figure can be read on its own.
ax.set_xlabel("SMAPE per page (%)")
ax.set_ylabel("density")
ax.legend()

plt.show()

In [None]:
# Mean SMAPE obtained if, for every page, the better of the RNN and the median
# model were picked.
pd.concat([smape_rnn, smape_med], axis=1).min(axis=1).mean()

In [None]:
# Enable matplotlib's default interactive (GUI) backend for the figures below.
%matplotlib

In [None]:
# Scatter the SMAPE gain of the RNN over the median model against the page median.
f,ax = plt.subplots()

# The SMAPE series only cover the validation pages, whereas df_feat covers all
# pages: align the medians on the SMAPE index so x and y have the same length
# and refer to the same pages.
ax.scatter(df_feat["median"].reindex(smape_med.index), smape_med-smape_rnn, s=0.1)
ax.set_xlim((0,10000))
ax.set_xlabel("median")
ax.set_ylabel("smape_med - smape_rnn")

In [None]:
# Mean SMAPE difference (median model minus RNN) per bin of page median,
# using bins of width 200 between 0 and 14800.
df_comp = pd.concat([df_feat["median"], (smape_med-smape_rnn).rename("diff")], axis=1)
vc=pd.cut(df_comp["median"], np.arange(0,15000,200))
# Select "diff" before aggregating so only that column is averaged, and pass
# observed=False explicitly so empty bins stay on the axis regardless of the
# pandas default for categorical groupers.
df_comp.groupby(vc, observed=False)["diff"].mean().plot(kind="bar")

In [None]:
# Fraction of pages in df_feat whose median exceeds 400 (the validation threshold used above).
(df_feat["median"]>400).mean()

##### outliers

In [None]:
def outliers(row):
    """Score how far the maximum of `row` lies above its upper quartile.

    The score is (max - 75th percentile) divided by the spread between the
    median (50th percentile) and the 75th percentile. A small constant is
    added to the spread so the division is defined when both quantiles are
    equal.

    Parameters
    ----------
    row : pandas.Series
        Values of one row.

    Returns
    -------
    float
        The outlier score of the row.
    """
    median = row.quantile(0.5)
    upper_quartile = row.quantile(0.75)
    excess = row.max() - upper_quartile
    spread = upper_quartile - median + 1e-10
    return excess / spread

In [None]:
# Quick check: outlier score of the first five pages.
df_train.head().apply(outliers, axis=1) #.quantile([0.5,0.75],axis=1).T

In [None]:
# Outlier score of every page.
# NOTE(review): this rebinds the name `outliers` from the function to the
# resulting Series, so re-running this cell or the previous one will no longer
# call the function until its defining cell is executed again.
outliers = df_train.apply(outliers, axis=1)