# Import Statements

In [None]:
import os
os.chdir('../..')
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
import tensorflow_addons as tfa
from keras.preprocessing import sequence
from keras.models import Sequential 
from keras.layers import Dense, LSTM, TimeDistributed
from keras.callbacks import CSVLogger

import plotly.express as px
import plotly.graph_objects as go

# Data Cleaning

In [None]:
def clean_data(df, start='2022-01-03', end='2022-01-24'):
    """Clean a raw window log and restrict it to a time frame.

    The caller's frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw log containing 'MEASUREMENT_TIME', 'ID_INPUT' and 'PRIVATE_DATA'
        plus exactly one further column, which becomes 'window'.
        NOTE(review): the rename below is positional, so after the two drops
        the remaining columns must be ordered (time, window) - confirm
        against the CSV layout.
    start, end : str or datetime-like, optional
        Rows with ``start < time <= end`` are kept. The defaults reproduce
        the previously hard-coded 3-week frame.

    Returns
    -------
    pandas.DataFrame
        Columns ['time', 'window', 'diff'], where 'diff' is the time until
        the next logged row. Windows that appear only once inside the time
        frame and rows containing NaN/NaT are removed.
    """
    # work on a copy so the column assignments below never leak to the caller
    df = df.copy()
    # convert to datetime
    df['MEASUREMENT_TIME'] = pd.to_datetime(df['MEASUREMENT_TIME'])
    # time until the next row; computed before any filtering, so it always
    # refers to the next row of the raw log
    df['diff'] = df['MEASUREMENT_TIME'].diff().shift(-1)
    # drop unnecessary columns
    df = df.drop(columns=['ID_INPUT', 'PRIVATE_DATA'])
    # rename remaining columns (positional)
    df.columns = ['time', 'window', 'diff']
    # select the requested time frame
    df = df[(df['time'] > start) & (df['time'] <= end)]
    # remove windows appearing only once
    df = df.groupby('window').filter(lambda group: len(group) > 1)
    # remove NaNs (e.g. the last raw row, which has no following row)
    return df.dropna()

# Time-Series Regularization

In [None]:
def regularize_timeseries(df, freq):
    """Resample an irregular window log onto a regular time grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'time', 'window' and 'diff' columns; the first and last
        'time' values define the extent of the grid.
    freq : str
        Frequency string passed to ``pandas.date_range`` (e.g. "1min").

    Returns
    -------
    pandas.DataFrame
        A single 'window' column indexed by 'time' (the start of each
        time-step). Each step holds the window with the largest summed
        'diff' among the rows falling in that step; steps without rows are
        forward-filled from the previous step.
    """
    # regularized time-series index at specified frequency
    out_time = pd.date_range(df['time'].iloc[0], df['time'].iloc[-1], freq=freq)
    out_windows = []

    for step_start, step_end in zip(out_time[:-1], out_time[1:]):
        # subquery dataframe to each time-step; both edges are inclusive, so a
        # row exactly on a grid point is seen by two neighbouring steps
        df_small = df[(df['time'] >= step_start) & (df['time'] <= step_end)]

        if len(df_small) == 0:
            # NaN if no windows in time-step (np.NaN was removed in NumPy 2.0)
            out_windows.append(np.nan)
        elif len(df_small) == 1:
            # append window if only one window in time-step
            out_windows.append(df_small['window'].iloc[0])
        else:
            # append window with most time spent if multiple windows during
            # time-step; ties resolve to the first window in group order
            out_windows.append(df_small.groupby('window')['diff'].sum().idxmax())

    # one row per time-step, labelled with the step's start time
    out = pd.DataFrame({'time': out_time[:len(out_windows)], 'window': out_windows})
    # .ffill() replaces the deprecated fillna(method="ffill")
    out = out.ffill()

    return out.set_index('time')

# Encoder & Decoder

In [None]:
def one_hot_encode(sequence, n_unique):
    """Turn a sequence of integer labels into a 2-d one-hot array.

    Each label becomes a row of length ``n_unique`` that is all zeros except
    for a 1 at the label's position.
    """
    def _row(hot_position):
        # start from an all-zero row and switch on the label's slot
        row = [0] * n_unique
        row[hot_position] = 1
        return row

    return np.array([_row(label) for label in sequence])

In [None]:
def to_supervised(sequence, n_in, n_out):
    """Frame an encoded 2-d sequence as (X, y) arrays of sliding windows.

    X has shape (samples, n_in, width): each sample holds ``n_in``
    consecutive rows of ``sequence``, oldest first. y has shape
    (samples, n_out, width) and is taken from the first ``n_out`` steps of
    the same window, so with ``n_out == n_in`` y equals X.
    """
    frame = pd.DataFrame(sequence)
    # lagged copies side by side: largest lag (oldest step) first, lag 0 last
    lagged = [frame.shift(lag) for lag in range(n_in - 1, -1, -1)]
    # rows with missing values (the first n_in-1 rows, which lack history) go
    windows = pd.concat(lagged, axis=1).dropna().values

    n_samples = len(windows)
    width = sequence.shape[1]
    # input/output pairs are reshaped views of the same flat windows
    X = windows.reshape(n_samples, n_in, width)
    y = windows[:, :n_out * width].reshape(n_samples, n_out, width)

    return X, y

In [None]:
def one_hot_decode(encoded_seq):
    """Map each one-hot (or probability) vector to the index of its largest entry."""
    decoded = []
    for vector in encoded_seq:
        decoded.append(np.argmax(vector))
    return decoded

In [None]:
def decode_predictions(pred, test, dim, enc):
    """Decode one-hot encoded predictions into a labelled dataframe.

    Parameters
    ----------
    pred : array-like, indexed as pred[sample][step][class]
        Model output, one window per sample.
    test : pandas.DataFrame
        Frame the windows were built from; its index from position
        ``dim - 1`` onwards labels the decoded predictions.
    dim : int
        Window length used when framing the data. The previous version read
        an undefined global ``n_in`` here instead of this parameter.
    enc : object with ``inverse_transform``
        Encoder used to turn integer labels back into window names.

    Returns
    -------
    pandas.DataFrame
        One 'window' column holding the decoded label of the first step of
        each predicted window.
        NOTE(review): step 0 is the oldest step of a window while the index
        used is that of the window's newest row - confirm that step 0 (and
        not the last step) is the intended prediction.
    """
    target_index = test.index[dim - 1:]
    # arg-max over the class axis for step 0 of every window, in one call
    labels = np.asarray(pred)[:len(target_index), 0, :].argmax(axis=1)

    return pd.DataFrame(enc.inverse_transform(labels), index=target_index, columns=['window'])

# Plots

In [None]:
# Final App Launch Predictions / Total Time Spent
def make_time_series_plot(df1, preds, freq):
    """Show observed and predicted windows as two line traces on one figure.

    ``df1`` and ``preds`` each supply an ``index`` (x-axis) and a ``window``
    column (y-axis); ``freq`` only appears in the figure title.
    """
    def _line(frame, label):
        # one line trace per frame, plotted against the frame's index
        return go.Scatter(x=frame.index, y=frame.window, mode='lines', name=label)

    fig = go.Figure(
        data=[_line(df1, 'Data'), _line(preds, 'Prediction')],
        layout=go.Layout(
            title="App Launch Predictions at {} Frequency".format(freq),
            xaxis=dict(title="Time"),
            yaxis=dict(title="App Executable"),
        ),
    )
    fig.show()

## User 1

In [None]:
# Load user 1's raw window log and clean it in a single step, then display it.
df1 = clean_data(pd.read_csv('data/arjun_window_data.csv'))
df1

In [None]:
# How often each window occurs in the cleaned log.
df1['window'].value_counts()

In [None]:
# Put user 1's cleaned log onto a one-minute grid and display the result.
df1_regular = regularize_timeseries(df1, freq="1min")
df1_regular

In [None]:
# How often each window occurs after regularization.
df1_regular['window'].value_counts()

In [None]:
# Integer-label the window names, then one-hot encode the labels.
enc = LabelEncoder()
# pass the 1-d 'window' column rather than the whole one-column frame
df1_labeled = enc.fit_transform(df1_regular['window'])
# `enc1` was never defined; the encoder fitted above is `enc`
df1_encoded = one_hot_encode(df1_labeled, len(enc.classes_))

User 1 Split

In [None]:
# Chronological split (shuffle=False): the first 60 % of rows are training data,
# and the remaining 40 % is halved into validation (first half) and test (second half).
# `df1_test` temporarily holds the whole 40 % hold-out before being overwritten.
df1_train, df1_test = train_test_split(df1_regular, test_size=0.4, shuffle=False)
df1_valid, df1_test = train_test_split(df1_test, test_size=0.5, shuffle=False)
# The same two-stage split is applied to the one-hot encoded array.
df1_train_encoded, df1_test_encoded = train_test_split(df1_encoded, test_size=0.4, shuffle=False)
df1_valid_encoded, df1_test_encoded = train_test_split(df1_test_encoded, test_size=0.5, shuffle=False)

User 1 Encode for Supervised Learning

In [None]:
# Candidate settings; the cells below only use the entry at index 1 of
# dims, nodes and batch_sizes.
# freqs: regularization frequencies (not referenced by the cells below).
freqs = ["1s", "30s", "1min"]
# dims: window lengths, used as n_in/n_out for to_supervised and as the LSTM input length.
dims = [3,5,10]
# nodes: LSTM unit counts.
nodes = [8, 20, 100]
# batch_sizes: batch sizes for fit/predict.
batch_sizes = [6, 12, 18]
# number of distinct classes seen by the fitted label encoder
encoded_length = len(enc.classes_)

In [None]:
# Frame each split as (X, y) windows, with dims[1] as both input and output length.
(df1_train_X, df1_train_y), (df1_valid_X, df1_valid_y), (df1_test_X, df1_test_y) = (
    to_supervised(split_encoded, dims[1], dims[1])
    for split_encoded in (df1_train_encoded, df1_valid_encoded, df1_test_encoded)
)

LSTM Fit + Predict

In [None]:
# One LSTM layer that returns the full sequence, followed by a softmax
# over the window classes applied at every time step.
model = Sequential([
    LSTM(nodes[1], input_shape=(dims[1], encoded_length), return_sequences=True),
    TimeDistributed(Dense(encoded_length, activation='softmax')),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'poisson', 'kullback_leibler_divergence'],
)

# train LSTM; per-epoch metrics are also logged to outputs/model_<count>.csv
count = 0
history = model.fit(
    df1_train_X,
    df1_train_y,
    validation_data=(df1_valid_X, df1_valid_y),
    epochs=2,
    batch_size=batch_sizes[1],
    shuffle=False,
    verbose=1,
    callbacks=[CSVLogger("outputs/model_{}.csv".format(count))],
)

# predict on the held-out test windows
df1_pred_y = model.predict(df1_test_X, batch_size=batch_sizes[1])

In [None]:
# Per-epoch training and validation metrics as a table.
pd.DataFrame(data=history.history)

In [None]:
# Model.evaluate expects (inputs, targets). The previous call passed
# (targets, predictions), which fed the targets through the network again
# and scored that output against the earlier predictions.
model.evaluate(df1_test_X, df1_test_y, verbose=1)

In [None]:
# decode_predictions takes (pred, test, dim, enc); the window length was
# missing, so `enc` landed in the `dim` slot and the call raised a TypeError.
df1_pred = decode_predictions(df1_pred_y, df1_test, dims[1], enc)

In [None]:
# Observed one-minute series for user 1 overlaid with the decoded predictions.
make_time_series_plot(df1_regular, preds=df1_pred, freq="1 Minute")

In [None]:
def _build_lstm_model(n_nodes, dim, encoded_length):
    """Build and compile a fresh LSTM + per-step softmax model."""
    model = Sequential()
    model.add(LSTM(n_nodes, input_shape=(dim, encoded_length), return_sequences=True))
    model.add(TimeDistributed(Dense(encoded_length, activation='softmax')))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy', 'poisson', 'kullback_leibler_divergence'])
    return model


def evaluate_all_models(df, freqs=None, dims=None, nodes=None, batch_sizes=None, epochs=10):
    """Train and score one LSTM per (freq, dim, nodes, batch_size) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw window log; it is passed through ``clean_data`` first.
    freqs, dims, nodes, batch_sizes : sequence, optional
        Values to sweep. ``None`` selects the original grids
        ["1s", "30s", "1min"], [3, 5, 10], [8, 20, 100] and [6, 12, 18].
    epochs : int, optional
        Training epochs per model (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per trained model with its id, its parameters and the test
        loss, categorical accuracy, poisson and KL-divergence values.

    Side effects
    ------------
    Writes ``data/regularized_<freq>.csv`` for every frequency and
    ``outputs/model_<model_id>.csv`` (per-epoch training log) for every model.
    """
    freqs = ["1s", "30s", "1min"] if freqs is None else list(freqs)
    dims = [3, 5, 10] if dims is None else list(dims)
    nodes = [8, 20, 100] if nodes is None else list(nodes)
    batch_sizes = [6, 12, 18] if batch_sizes is None else list(batch_sizes)

    # CSVLogger does not create missing directories
    os.makedirs("outputs", exist_ok=True)

    cleaned = clean_data(df)

    # Regularize the *cleaned* log once per frequency. The previous version
    # overwrote its input with the regularized frame, so every frequency after
    # the first was applied to the wrong data, and each one was computed twice.
    regularized = []
    for freq in freqs:
        regular = regularize_timeseries(cleaned, freq)
        regular.to_csv("data/regularized_{}.csv".format(freq))
        regularized.append((freq, regular))

    records = []
    count = 0  # running model id; also names the per-model training log
    for freq, regular in regularized:
        enc = LabelEncoder()
        labeled = enc.fit_transform(regular['window'])
        encoded_length = len(enc.classes_)

        encoded = one_hot_encode(labeled, encoded_length)

        # chronological 60 / 20 / 20 split
        train, holdout = train_test_split(encoded, test_size=0.4, shuffle=False)
        valid, test = train_test_split(holdout, test_size=0.5, shuffle=False)

        for dim in dims:
            X_train, y_train = to_supervised(train, dim, dim)
            X_valid, y_valid = to_supervised(valid, dim, dim)
            X_test, y_test = to_supervised(test, dim, dim)

            for node in nodes:
                for batch_size in batch_sizes:
                    # A new model per combination: the previous version kept
                    # adding layers to one shared Sequential and carried the
                    # trained weights over from one batch size to the next.
                    model = _build_lstm_model(node, dim, encoded_length)
                    model.fit(X_train, y_train,
                              epochs=epochs,
                              batch_size=batch_size,
                              validation_data=(X_valid, y_valid),
                              verbose=1,
                              shuffle=False,
                              callbacks=[CSVLogger("outputs/model_{}.csv".format(count))])

                    # evaluate LSTM: Model.evaluate takes (inputs, targets)
                    loss, accuracy, poisson, kl_divergence = model.evaluate(
                        X_test, y_test, batch_size=batch_size, verbose=1)

                    records.append({
                        'model_id': count,
                        'freq': freq,
                        'dim': dim,
                        'nodes': node,
                        'batch_size': batch_size,
                        'pred_loss': loss,
                        'pred_accuracy': accuracy,
                        'pred_poisson': poisson,
                        'pred_kl_divergence': kl_divergence,
                    })
                    count += 1

    return pd.DataFrame(records, columns=['model_id', 'freq', 'dim', 'nodes', 'batch_size',
                                          'pred_loss', 'pred_accuracy', 'pred_poisson',
                                          'pred_kl_divergence'])

In [None]:
def select_top_five():
    """Read three CSV inputs into the locals `parameters`, `models` and `outputs`.

    NOTE(review): this looks like an unfinished stub - nothing is selected
    or returned, and each path ends in '/', i.e. it names a directory rather
    than a CSV file. TODO: supply the real file names and the selection logic.
    """
    # NOTE(review): same placeholder path as `outputs` below - confirm the intended file
    parameters = pd.read_csv('outputs/')
    models = pd.read_csv('data/')
    outputs = pd.read_csv('outputs/')

In [None]:
def make_plots():
    """Placeholder for the result plots; not implemented yet.

    The definition previously had no body, which is a syntax error. It now
    fails explicitly when called instead.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("make_plots has not been implemented yet")

## User 2

In [None]:
# Load user 2's raw window log and clean it in a single step, then display it.
df2 = clean_data(pd.read_csv('data/db0_to_39_window_data.csv'))
df2

In [None]:
# Put user 2's cleaned log onto a one-minute grid and display the result.
df2_regular = regularize_timeseries(df2, freq="1min")
df2_regular