In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
"""
# load kaggle environment
from google.colab import files
files.upload() #upload kaggle.json
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!mkdir logs

!kaggle competitions download -c web-traffic-time-series-forecasting
!unzip train_2.csv.zip
"""

## load dataset

In [None]:
import random

# Seed the stdlib RNG so the same rows are sampled from the CSV on every run.
random.seed(1111)
p = 0.2  # fraction of data rows to keep (20% of the lines)
# Keep the header (row 0); every other row is skipped when a uniform draw
# from [0, 1) is greater than p, so roughly a fraction p of the rows is loaded.
df = pd.read_csv(
    "train_2.csv.zip",
    header=0,
    skiprows=lambda i: i > 0 and random.random() > p,
)
# Shuffle the rows with a fixed seed (so the positional train/valid/test split
# is reproducible) and replace missing values with 0.
df = df.sample(frac=1, random_state=1111).reset_index(drop=True).fillna(0)
Page = df.Page
# Split each page identifier once on "_"; the last three fields are read as
# wikiproject, access and agent, everything before them is the page name.
page_parts = Page.str.split("_")
agent = page_parts.str[-1]
access = page_parts.str[-2]
wikiproject = page_parts.str[-3]
page_name = page_parts.str[:-3].str.join("_")
# Only the numeric columns remain in df after this point.
df = df.drop(columns=["Page"])

##### train test split

In [None]:
# Traffic matrix: one row per sampled page, one column per remaining df column.
Mtraffic = df.values
#Mtraffic = Mtraffic/Mtraffic.max()

output_len = 62  # forecast horizon: the last 62 columns are the targets

Ltst = 1000  # rows held out for validation, and the same number again for test
Ltr = Mtraffic.shape[0] - 2*Ltst  # all remaining rows are used for training
# With Ltr <= 0 the slices below would use negative indices and the three
# splits would silently overlap (or be empty), so fail loudly instead.
assert Ltr > 0, "need more than 2*Ltst rows to build train/valid/test splits"
shift_start = 0  # first input column used; 0 keeps the whole history

# Inputs are every column before the horizon, targets are the horizon itself.
x_train, y_train = Mtraffic[:Ltr,shift_start:-output_len], Mtraffic[:Ltr,-output_len:]
x_valid, y_valid = Mtraffic[Ltr:Ltr+Ltst, shift_start:-output_len], Mtraffic[Ltr:Ltr+Ltst,-output_len:]
x_test, y_test = Mtraffic[Ltr+Ltst:, shift_start:-output_len], Mtraffic[Ltr+Ltst:,-output_len:]

x_train.shape, x_valid.shape, y_train.shape

##### training utils

In [None]:
def smape(A, F):
    """Mean symmetric absolute percentage error of tensors A and F, in percent.

    Each element contributes 2*|F - A| / (|A| + |F| + 1e-16); the small constant
    keeps the ratio defined when both values are zero. The mean over all
    elements is multiplied by 100.
    """
    abs_error = tf.math.abs(F - A)
    magnitude = tf.math.abs(A) + tf.math.abs(F) + 1e-16
    per_element = 2 * abs_error / magnitude
    return tf.reduce_mean(per_element) * 100

def smape_reg(A, F):
    """Element-wise SMAPE-style loss (in percent) with a floored denominator.

    The denominator |A| + |F| + 0.1 is clamped from below at 0.5 + 0.1, which
    bounds the ratio when both values are close to zero. The result is not
    reduced: it has one value per element.
    """
    eps = 0.1
    floor = 0.5 + eps
    denominator = tf.maximum(tf.abs(A) + tf.abs(F) + eps, floor)
    abs_diff = tf.abs(A - F)
    return abs_diff / denominator * 2.0 * 100


In [None]:
def smape_np(A, F):
    """NumPy SMAPE (in percent) between arrays A and F, averaged over A.size.

    Machine epsilon is added to the denominator so that elements where both
    arrays are zero contribute 0 instead of dividing by zero.
    """
    abs_error = np.abs(F - A)
    magnitude = np.abs(A) + np.abs(F) + np.finfo(float).eps
    ratios = 2 * abs_error / magnitude
    return 100/A.size * np.sum(ratios)

In [None]:
#tensorboard callbacks
from datetime import datetime

def create_tb_cb(model_name, histogram_freq=10):
    """Build a TensorBoard callback that logs to ./logs/<model_name>-<HH-MM-SS>.

    Parameters
    ----------
    model_name : str
        Prefix of the run directory created under ./logs/.
    histogram_freq : int, optional
        Passed straight to the TensorBoard callback's histogram_freq argument
        (default 10, the value previously hard-coded).

    Returns
    -------
    tf.keras.callbacks.TensorBoard
    """
    # The time-of-day stamp separates successive runs of the same model.
    run_stamp = datetime.now().strftime("%H-%M-%S")
    return tf.keras.callbacks.TensorBoard(
        log_dir="./logs/" + model_name + "-" + run_stamp,
        histogram_freq=histogram_freq,
    )

In [None]:
def plot_check_result(df_check, predict_func, ax, horizon=62):
    """Plot reference series together with a model's forecast of their tail.

    Parameters
    ----------
    df_check : pandas.DataFrame
        Numeric frame, one series per row; the last `horizon` columns are the
        values to forecast and everything before them is the history.
    predict_func : callable
        Maps the history matrix (rows, history_len) to predictions
        (rows, horizon).
    ax : matplotlib axes
        Axes both the forecast and the reference series are drawn on.
    horizon : int, optional
        Number of trailing columns to forecast (default 62).

    Prints the SMAPE between the forecast and the true tail.
    """
    values = df_check.values
    history, target = values[:, :-horizon], values[:, -horizon:]
    pred = predict_func(history)
    print("smape ", smape_np(pred, target))
    # One column per series, positional x axis.
    df_ref = df_check.reset_index(drop=True).T.reset_index(drop=True)
    # The forecast starts right after the history, so offset its x positions by
    # the history length of the frame actually being plotted.
    df_pred = pd.DataFrame(pred.T, index=np.arange(horizon) + history.shape[1])
    df_pred.plot(ax=ax, style=".-")
    #ax.set_prop_cycle(None)
    df_ref.plot(ax=ax)

[linear model](#linear-model)  
[RNN](#RNN)  


## Exploration

In [None]:
f, ax = plt.subplots()
# Plot the three label series parsed from the Page identifiers on one axes.
ax.plot(agent)
ax.plot(access)
ax.plot(wikiproject)

In [None]:
f, ax = plt.subplots()
# Restrict to non-spider traffic and reuse the same mask for the legend, so each
# line is labelled with its own page. Page.head() alone would label the lines
# with the first five pages overall, spiders included.
not_spider = agent != "spider"
line = ax.plot(df.loc[not_spider].head().T.reset_index(drop=True))
ax.legend(line, Page.loc[not_spider].head())
plt.show()

### SMAPE

In [None]:
def last_repeated_estimator(ts_prev, horizon=62):
    """Naive forecast that repeats the last observed value of every series.

    Parameters
    ----------
    ts_prev : array-like, shape (m, Ts)
        History matrix, one series per row.
    horizon : int, optional
        Number of steps to forecast (default 62).

    Returns
    -------
    numpy.ndarray, shape (m, horizon)
        Each row holds the last value of the matching input row.
    """
    # Slicing with -1: keeps the column axis, so no reshape is needed.
    last_value = np.asarray(ts_prev)[:, -1:]
    return np.tile(last_value, (1, horizon))

In [None]:
# df was already NaN-filled when it was loaded, so fillna(0) is only a safeguard.
# NOTE(review): [:,5:] discards the first five columns; the reason is not
# visible in this notebook — confirm it is intended.
Dnum = df.fillna(0).values[:,5:]

In [None]:
# Reference score: SMAPE between an all-zero matrix and Dnum.
smape_np(np.zeros_like(Dnum), Dnum)

In [None]:
# Reference score: every row of Dnum replaced by its own mean, repeated across
# all columns, and compared against Dnum.
Mmean = np.repeat(np.mean(Dnum, 1)[:, np.newaxis], Dnum.shape[1], axis=1)
smape_np(Mmean, Dnum)

### Autocorrelation

In [None]:
def estimated_autocorrelation(x):
    """
    Estimate the autocorrelation of a 1-D array at every lag 0..n-1.

    The lag-k value is the sum of products of the centred series with itself
    shifted by k, divided by variance * (n - k).

    http://stackoverflow.com/q/14297012/190597
    http://en.wikipedia.org/wiki/Autocorrelation#Estimation

    Returns an array of length n. A constant series has zero variance, so its
    autocorrelation is undefined and an all-NaN array is returned.
    """
    n = len(x)
    variance = x.var()
    if variance == 0:
        # Same values the 0/0 division would give, without runtime warnings.
        return np.full(n, np.nan)
    x = x-x.mean()
    # The last n entries of the full correlation are the lags 0..n-1.
    r = np.correlate(x, x, mode = 'full')[-n:]
    # Lag k only has n-k overlapping terms, hence the decreasing divisor.
    result = r/(variance*(np.arange(n, 0, -1)))
    return result

In [None]:
# Keep only the rows whose parsed agent label is "all-agents".
df_allagent = df.loc[agent=="all-agents"]

f, ax = plt.subplots()
# NOTE(review): row position 20951 is hard-coded; it raises IndexError if fewer
# rows were sampled, and it selects whichever page the shuffle put there.
ax.plot(estimated_autocorrelation(df_allagent.iloc[20951,:].values))
#ax.plot(estimated_autocorrelation(df_allagent.iloc[1,:].values))
#ax.plot(estimated_autocorrelation(df_allagent.iloc[2,:].values))
#ax.plot(estimated_autocorrelation(df_allagent.iloc[3,:].values))
# Red marker at lag 365 on the zero line — presumably a yearly-lag reference.
ax.scatter(365,0,s=10,c="r")
ax.grid()

In [None]:
f, ax = plt.subplots()
# Raw values of the same hard-coded row (position 20951) used for the autocorrelation plot.
ax.plot(df_allagent.iloc[20951,:].values)


In [None]:
# (rows, columns) of the "all-agents" subset.
df_allagent.shape

In [None]:
# Per-row mean over all columns, largest first.
# NOTE(review): head(15000) asks for 15000 rows, far more than a cell output can
# usefully show — confirm the intended count.
df_allagent.mean(axis=1).sort_values(ascending=False).head(15000)

## Models

[repeat last value](#repeat-last-value)  
[linear model](#linear-model)

###### repeat last value

In [None]:
def repeat_lv(X):
    """Naive forecast: repeat the last column of X output_len times per row."""
    last_column = X[:, -1:]
    return np.repeat(last_column, output_len, axis=1)

# SMAPE of the repeat-last-value forecast on the train, validation and test splits.
smape_np(repeat_lv(x_train), y_train), smape_np(repeat_lv(x_valid), y_valid), smape_np(repeat_lv(x_test), y_test)


In [None]:
# Display the repeat-last-value forecast matrix for the training inputs.
repeat_lv(x_train)

### linear model

In [None]:
# Reset Keras' global state before building the linear model, then create the
# TensorBoard callback for this run (logs under ./logs/linear-<time>).
tf.keras.backend.clear_session()
tb_cb = create_tb_cb("linear")

In [None]:
# Linear model: one Dense layer with no activation and one output unit per
# forecast step (output_len).
model_linear = tf.keras.Sequential([
    tf.keras.layers.Dense(units=output_len)])

In [None]:
# Optimize the floored-denominator loss smape_reg; smape and MAE are tracked as metrics.
model_linear.compile(loss=smape_reg, optimizer=tf.optimizers.Adam(learning_rate=1e-4), 
                     metrics=[smape,"mae"])

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 40 epochs, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index supplied by the LearningRateScheduler callback.
    lr : float
        Current learning rate.

    Returns
    -------
    float
        `lr` unchanged while epoch < 40, otherwise `lr * exp(-0.05)`.
    """
    # math.exp keeps the result a plain Python float; a tf.Tensor return value
    # is rejected by newer Keras versions, which require a float here.
    import math
    if epoch < 40:
        return lr
    return lr * math.exp(-0.05)
# Keras callback that applies `scheduler` to the learning rate during training.
lr_cb = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# Cap on the number of training rows; slicing past the end of x_train simply
# returns every available row.
Ntr = 50000
tmp_x = x_train[:Ntr,:]; tmp_y = y_train[:Ntr,:]
# Train with TensorBoard logging and the learning-rate schedule, validating on
# the held-out validation split.
model_linear.fit(tmp_x, tmp_y, epochs=100, batch_size=32, 
                 validation_data= (x_valid, y_valid), 
                 callbacks=[tb_cb, lr_cb])

In [None]:
# Print the layer and parameter overview of the linear model.
model_linear.summary()

In [None]:
def linear_estimator(x):
    """Forecast with model_linear from the trailing x_train.shape[1] columns of x."""
    n_inputs = x_train.shape[1]
    recent_history = x[:, -n_inputs:]
    return model_linear.predict(recent_history)

In [None]:
# Kernel of the model's only layer, fetched by position: the auto-generated
# layer name "dense" becomes e.g. "dense_1" when the model cell is re-run
# without clearing the session, which would break a lookup by name.
weights = model_linear.layers[0].get_weights()[0]

f,ax = plt.subplots()
# Magnitude of every input weight feeding the first output unit.
ax.plot(np.abs(weights[:,0]))

In [None]:
# SMAPE of the linear model's forecast on the training split.
smape_np(linear_estimator(x_train), y_train)

In [None]:
f,ax = plt.subplots()

# Visual check on a single series: the row at position 50 of df
# (double brackets keep it a one-row DataFrame).
plot_check_result(df.iloc[[50],:], linear_estimator, ax)
#ax.set_xlim(left=700)
#ax.set_ylim((0,300))
plt.show()

### RNN

In [None]:
#!rm -Rf logs/*

In [None]:
# (rows, history length) of the training inputs.
x_train.shape

In [None]:
# RNN experiment settings.
Limit_train=5000  # maximum number of rows taken from each split for the RNN
MaxTs = 200       # number of trailing time steps fed to the network
Nneurons = 20     # units per GRU layer
Nlayers = 2       # number of stacked GRU layers (0 selects a Flatten + Dense model)

In [None]:
# Run name encoding the experiment settings; used as the TensorBoard log prefix.
simn = f"Ltr{Limit_train}-Ts{MaxTs}-Nn{Nneurons}-Nl{Nlayers}"

In [None]:
# Reset Keras' global state before building the RNN and create a TensorBoard
# callback named after the run settings.
tf.keras.backend.clear_session()
tb_cb = create_tb_cb(simn)

In [None]:
model = tf.keras.Sequential()

# Input: MaxTs time steps with a single feature each.
model.add(tf.keras.Input((MaxTs,1)))
# With no recurrent layers, flatten the sequence so the Dense head sees it directly.
if Nlayers==0:
    model.add(tf.keras.layers.Flatten())
    
# Stack Nlayers GRU layers of Nneurons units: all but the last return the full
# sequence so the next GRU receives one vector per time step.
for ii in range(Nlayers-1):
    model.add(tf.keras.layers.GRU(Nneurons, return_sequences=True))
if Nlayers>0:
    model.add(tf.keras.layers.GRU(Nneurons))

# Linear head producing one value per forecast step.
model.add(tf.keras.layers.Dense(output_len))

model.summary()


In [None]:
# scaler
from sklearn.preprocessing import MinMaxScaler

def scale_train(x):
    """Per-row scale factor: the row maximum plus 1e-10.

    Parameters
    ----------
    x : numpy.ndarray, shape (m, Ts)
        One series per row.

    Returns
    -------
    numpy.ndarray, shape (m, 1)
        Column vector to divide each row by; the 1e-10 offset keeps the
        division finite for rows whose maximum is 0.
    """
    return np.max(x, axis=1).reshape(-1, 1) + 1e-10

In [None]:
# Shape of a column vector with one entry per training row.
np.ones((x_train.shape[0],1)).shape

In [None]:
# Scale every series by the factor computed from its own input history, keep at
# most Limit_train rows, and feed the network the last MaxTs steps with a
# trailing feature axis. Targets are divided by the same input-based factor.
train_scale = scale_train(x_train)
valid_scale = scale_train(x_valid)

x_train_rnn = (x_train / train_scale)[:Limit_train, -MaxTs:, np.newaxis]
y_train_rnn = (y_train / train_scale)[:Limit_train]
x_valid_rnn = (x_valid / valid_scale)[:Limit_train, -MaxTs:, np.newaxis]
y_valid_rnn = (y_valid / valid_scale)[:Limit_train]

In [None]:
"""%matplotlib inline
f, ax = plt.subplots()
ax.plot(x_train_rnn[0,:,0])
plt.show()"""

In [None]:
# Train directly on smape (computed on the scaled targets); smape and MAE are tracked as metrics.
model.compile(loss=smape, optimizer=tf.optimizers.Adam(learning_rate=1e-3),metrics=[smape, "mae"])

In [None]:
# Fit the GRU model on the scaled tensors, validating on the scaled validation
# split and logging to TensorBoard.
model.fit(x_train_rnn, y_train_rnn, epochs=200, batch_size=32 , #validation_split= 0.2,
          validation_data= (x_valid_rnn, y_valid_rnn), 
          callbacks=[tb_cb])

In [None]:
def rnn_predict(x):
    """Forecast with the GRU model, returning values on the original scale.

    Parameters
    ----------
    x : numpy.ndarray, shape (m, Ts)
        History matrix, one series per row; only the last MaxTs columns are fed
        to the network.

    Returns
    -------
    numpy.ndarray
        Model output multiplied back by the per-row scale factor.
    """
    # Use the same per-row factor as the training data, computed once, for both
    # scaling the input and un-scaling the prediction.
    scale = scale_train(x)
    pred_scaled = model.predict((x / scale)[:, -MaxTs:, np.newaxis])
    return pred_scaled * scale

In [None]:
# GRU forecast for every training row, on the original scale.
pred_train = rnn_predict(x_train)

In [None]:
# SMAPE of the GRU forecast on the training split.
smape_np(pred_train, y_train)

In [None]:
# NOTE(review): this switches the matplotlib backend mid-notebook; the
# "notebook" backend is not available in every Jupyter front end — confirm it
# is needed here.
%matplotlib notebook
f,ax = plt.subplots()

# Visual check of the GRU forecast on the row at position 50 of df.
plot_check_result(df.iloc[[50],:], rnn_predict, ax)
#ax.set_xlim(left=700)
#ax.set_ylim(top=1000,bottom=0)

plt.show()

### output

In [None]:
# Submission key table, indexed by its Page column so predictions can be
# assigned by label.
key = pd.read_csv("key_2.csv.zip").set_index("Page")

In [None]:
# Placeholder column; it is filled chunk by chunk by the prediction loop.
key["Visits"] = None
key.head()

In [None]:
def output_form(_df, _estimator=None):
    """Turn a chunk of the training table into a long Series of predictions.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a "Page" column and numeric history columns.
    _estimator : callable
        Maps the NaN-filled history matrix (rows, Ts) to predictions with one
        column per day from 2017-09-13 to 2017-11-13. Required, despite the
        None default kept for signature compatibility.

    Returns
    -------
    pandas.Series
        Named "Visits", indexed by "<Page>_<YYYY-MM-DD>".

    Raises
    ------
    ValueError
        If no estimator is given.
    """
    if _estimator is None:
        raise ValueError("output_form requires an estimator callable")
    out_date = pd.date_range(start="2017-09-13", end="2017-11-13", freq="1D").strftime("%Y-%m-%d").to_list()
    num_hist = _df.drop(columns="Page").fillna(0).values
    num_pred = _estimator(num_hist)
    # Wide (page x date) table stacked into one long (page, date) series.
    ret = pd.DataFrame(num_pred, columns=out_date, index=_df["Page"]).stack().rename("Visits")
    # Flatten the (page, date) pairs into the "<page>_<date>" labels.
    ret.index = [page + "_" + day for page, day in ret.index]
    return ret

# Trial run of output_form on the first 10000 rows with the linear model.
chunk = pd.read_csv("train_2.csv.zip", nrows=10000)
Visits_pred = output_form(chunk, linear_estimator)

In [None]:
# Stream the full training table in chunks of 10000 rows; the context manager
# closes the underlying file once every chunk has been processed.
with pd.read_csv("train_2.csv.zip", chunksize=10000) as df_chunk:
    for ii, chunk in enumerate(df_chunk):
        print("Prediction {}".format(ii))
        # Visits are counts: round to the nearest integer (a bare int cast
        # truncates) and clip negative model outputs to 0.
        predictions = output_form(chunk, rnn_predict).round().clip(lower=0).astype(int)
        # Index labels are "<Page>_<date>", matching the index of `key`.
        key.loc[predictions.index, "Visits"] = predictions

In [None]:
# Write the submission file; index=False leaves out the Page index, so only the
# remaining key columns and Visits are written.
key.to_csv("subm_gru.csv", encoding='utf-8', index=False)