In [None]:
from snowflake.snowpark import (
    Session,
    functions as F, 
    types as T
)
import os
import json
import pandas as pd
import datetime
import calendar
import matplotlib.pyplot as plt
from pandas.tseries.offsets import MonthEnd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.ensemble import IsolationForest


In [None]:
# Build the Snowpark session from the connection JSON stored in the
# AIRFLOW_CONN_SNOWFLAKE_ADMIN environment variable.
connection_params = json.loads(os.environ["AIRFLOW_CONN_SNOWFLAKE_ADMIN"])

# The JSON carries the user name under 'login'; move it to the 'user' key.
connection_params['user'] = connection_params.pop('login')

# Lift the settings nested under 'extra' to the top level. 'schema' is already
# a top-level key and is passed through unchanged, so it needs no remapping.
for extra_key in ('account', 'database', 'region', 'application'):
    connection_params[extra_key] = connection_params['extra'].pop(extra_key)

snowpark_session = Session.builder.configs(connection_params).create()

In [None]:
# Inclusive lower bound applied to USAGE_DATE when the usage data is filtered.
account_start_cutoff_date = datetime.date(year=2022, month=11, day=15)

In [None]:
# Data engineering: Snowpark handle for the organization-level daily
# usage-in-currency table; it is filtered and pivoted in the feature
# engineering cell.
currency_usage = snowpark_session.table('SNOWFLAKE.ORGANIZATION_USAGE.USAGE_IN_CURRENCY_DAILY')

In [None]:
# Feature engineering: one row per day, one numeric column per usage type.

# Keep usage dated from the cutoff (inclusive) up to, but excluding, today.
usage = currency_usage.filter((F.col('USAGE_DATE') >= account_start_cutoff_date) &
                              (F.col('USAGE_DATE') < datetime.date.today()))

# The distinct usage types become the pivoted columns, in this order.
pivot_values = usage.select('USAGE_TYPE').distinct().to_pandas().USAGE_TYPE.to_list()

# Parenthesised chain rather than backslash continuations: a trailing
# backslash on the last call would silently depend on the next line being blank.
usage_df = (
    usage.select('USAGE_DATE', 'USAGE_TYPE', 'USAGE')
    .pivot(pivot_col='USAGE_TYPE', values=pivot_values)
    .sum('USAGE')
    .sort('USAGE_DATE')
    .to_pandas()
)

# Rename positionally: the date column first, then one column per pivot value.
usage_df.columns = ['date'] + pivot_values

# Build the final frame without in-place mutation so re-running the cell is safe:
# datetime index, missing usage treated as 0, every column downcast to float.
usage_df = (
    usage_df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
    .fillna(value=0)
    .apply(pd.to_numeric, downcast='float')
)
usage_df

In [None]:
# Matplotlib defaults (figure size and font size) set via plt.rc.
plt.rc('figure', figsize=(12, 6))
plt.rc('font', size=15)

# Compute and storage usage stacked in two panels that share the date axis.
fig, axs = plt.subplots(2, 1, sharex='all', layout='constrained')
for panel, usage_type in zip(axs, ('compute', 'storage')):
    panel.plot(usage_df[usage_type])
    panel.set_ylabel('Dollars')
    panel.grid(True)

# Only the bottom panel carries the x-axis label.
axs[1].set_xlabel('Date')
plt.show()

In [None]:
# Additive seasonal decomposition of daily compute usage (seasonal_decompose,
# despite the `_stl` suffix in the variable name).
# The weekly period is passed explicitly: when `period` is omitted, statsmodels
# has to infer it from the index frequency and raises if the date index has no
# inferable frequency (e.g. a missing day). 7 is the period statsmodels derives
# for a daily-frequency index, so results are unchanged when inference succeeds.
compute_stl = seasonal_decompose(usage_df.compute, model='additive', period=7)

fig = compute_stl.plot()


In [None]:
# Score each day of compute usage with an isolation forest on the detrended series.

# The decomposition's trend is NaN at both ends of the series. Padding those
# gaps with 0 would leave the edge days un-detrended (residual == raw usage),
# making them look extreme; carry the nearest valid trend value outwards instead.
compute_trend = compute_stl.trend.bfill().ffill().values.reshape(-1, 1)
compute_stationary = usage_df.compute.values.reshape(-1, 1) - compute_trend

# Standardise the detrended values before fitting.
compute_scaled = StandardScaler().fit_transform(compute_stationary)

# Fixed seed so the anomaly scores are reproducible across runs.
compute_model = IsolationForest(random_state=42)
compute_model.fit(compute_scaled)

# score_samples: the lower the score, the more abnormal the day.
usage_df['compute_anomaly'] = compute_model.score_samples(compute_scaled)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# A day is flagged when its isolation-forest score is at or below -0.65 and
# its compute usage is above the mean of the series.
low_score = usage_df.compute_anomaly <= -0.65
above_mean = usage_df.compute > usage_df.compute.mean()
anomalies = usage_df.loc[low_score & above_mean, ['compute']]

# Full series as a black line, flagged days overlaid as red markers.
ax.plot(usage_df.index, usage_df.compute, color='black', label='Normal')
ax.scatter(anomalies.index, anomalies.compute, color='red', label='Anomaly')
ax.legend()
plt.show()


In [None]:
# Additive seasonal decomposition of daily storage usage (seasonal_decompose,
# despite the `_stl` suffix in the variable name).
# The weekly period is passed explicitly: when `period` is omitted, statsmodels
# has to infer it from the index frequency and raises if the date index has no
# inferable frequency (e.g. a missing day). 7 is the period statsmodels derives
# for a daily-frequency index, so results are unchanged when inference succeeds.
storage_stl = seasonal_decompose(usage_df.storage, model='additive', period=7)
fig = storage_stl.plot()

In [None]:
# Score each day of storage usage with an isolation forest on the detrended series.

# The decomposition's trend is NaN at both ends of the series. Padding those
# gaps with 0 would leave the edge days un-detrended (residual == raw usage),
# making them look extreme; carry the nearest valid trend value outwards instead.
storage_trend = storage_stl.trend.bfill().ffill().values.reshape(-1, 1)
storage_stationary = usage_df.storage.values.reshape(-1, 1) - storage_trend

# Standardise the detrended values before fitting.
storage_scaled = StandardScaler().fit_transform(storage_stationary)

# Fixed seed so the anomaly scores are reproducible across runs.
storage_model = IsolationForest(random_state=42)
storage_model.fit(storage_scaled)

# score_samples: the lower the score, the more abnormal the day.
usage_df['storage_anomaly'] = storage_model.score_samples(storage_scaled)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# A day is flagged when its isolation-forest score is at or below -0.65 and
# its storage usage is above the mean of the series.
low_score = usage_df.storage_anomaly <= -0.65
above_mean = usage_df.storage > usage_df.storage.mean()
anomalies = usage_df.loc[low_score & above_mean, ['storage']]

# Full series as a black line, flagged days overlaid as red markers.
ax.plot(usage_df.index, usage_df.storage, color='black', label='Normal')
ax.scatter(anomalies.index, anomalies.storage, color='red', label='Anomaly')
ax.legend()
plt.show()


In [None]:
# Reset usage_df to the per-usage-type feature columns only, discarding the
# *_anomaly score columns that the isolation-forest cells added to it.
# Selecting the pivot columns from the frame already in memory yields the same
# table as repeating the Snowflake pivot query, without another round trip,
# and the cell stays safe to re-run.
usage_df = usage_df[pivot_values].copy()


In [None]:
# LSTM autoencoder over sliding windows of the standardised usage columns.
# train_test_split and Normalizer are not used in this cell; the imports are
# kept in case other cells rely on them.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import tensorflow as tf
import keras

# Number of consecutive days in each window fed to the autoencoder.
SEQUENCE_LENGTH = 7

# Standardise every usage column; `scalar` is referenced again by later cells.
scalar = StandardScaler()
data = scalar.fit_transform(usage_df)

ds = tf.keras.preprocessing.timeseries_dataset_from_array(
     data=data,
     targets=data,
     sequence_length=SEQUENCE_LENGTH)

# The dataset yields (windows, targets) in batches (128 windows per batch by
# default). Taking only its first element would drop every window past the
# first batch, so concatenate all batches to train and score on the full history.
window_batches = list(ds)
X = tf.concat([windows for windows, _ in window_batches], axis=0)
y = tf.concat([targets for _, targets in window_batches], axis=0)  # not used below; kept for compatibility

# Encoder narrows 64 -> 32 -> 16 units, the 16-unit code is repeated once per
# time step, and the decoder widens 16 -> 32 -> 64 before a per-step Dense
# layer maps back to the number of input features.
model = keras.Sequential(
    [
        keras.layers.LSTM(
            units=64,
            kernel_initializer='he_uniform',
            batch_input_shape=(None, X.shape[1], X.shape[2]),
            return_sequences=True,
            name='enc1'),
        keras.layers.LSTM(
            units=32,
            kernel_initializer='he_uniform',
            return_sequences=True,
            name='enc2'),
        keras.layers.LSTM(
            units=16,
            kernel_initializer='he_uniform',
            return_sequences=False,
            name='enc3'),
        # Tied to the window length instead of a second hard-coded 7.
        keras.layers.RepeatVector(
            n=SEQUENCE_LENGTH,
            name='encoder_decoder_bridge'),
        keras.layers.LSTM(
            units=16,
            kernel_initializer='he_uniform',
            return_sequences=True,
            name='dec1'),
        keras.layers.LSTM(
            units=32,
            kernel_initializer='he_uniform',
            return_sequences=True,
            name='dec2'),
        keras.layers.LSTM(
            units=64,
            kernel_initializer='he_uniform',
            return_sequences=True,
            name='dec3'),
        keras.layers.TimeDistributed(layer=keras.layers.Dense(X.shape[2]))
    ]
)

model.compile(loss="mse", optimizer=tf.keras.optimizers.legacy.Adam())
model.build()

# Autoencoder training: the windows are both the input and the target.
# Training stops once val_loss has not improved for 20 epochs.
history = model.fit(
    x=X,
    y=X,
    epochs=200,
    batch_size=128,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor="val_loss",
                                      patience=20,
                                      mode="min")
    ],
)


In [None]:
# Training and validation loss per epoch, drawn on the same axes.
for history_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
import numpy as np

# Reconstruct every window with the trained autoencoder.
X_pred = model.predict(X)

# Squared reconstruction error, averaged over axis 1 (the time steps of each
# window), leaving one value per window and per feature.
squared_error = np.power(X - X_pred, 2)
mse = np.mean(squared_error, axis=1)

In [None]:
# NOTE(review): `mse` holds squared reconstruction errors computed on the
# standardised data, yet it is passed through scalar.inverse_transform, which
# is meant for scaled feature values. The result is probably not a forecast
# in original units; confirm what forecast_df is supposed to contain
# (e.g. the inverse-transformed reconstruction X_pred) before relying on it.
forecast_df = pd.DataFrame(scalar.inverse_transform(mse), columns=usage_df.columns)
# Reconstruction error per window (rows) and per usage column (columns).
pred_df = pd.DataFrame(mse, columns=usage_df.columns)

In [None]:
# Display the row labelled 0 of pred_df: the reconstruction error of the first
# window, one value per usage column.
pred_df.loc[0]

In [None]:
# Display the fourth-from-last row of usage_df, which the author treats as an
# example of a normal day (presumably for comparison with flagged days; TODO confirm).
usage_df.iloc[-4]

In [None]:
# Per-window reconstruction error for the compute and storage columns.
for column, series_label in (("compute", "Compute"), ("storage", "Storage")):
    plt.plot(pred_df[column], label=series_label)
plt.legend()
plt.show()