# Time Series Analysis

## Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import datetime as dt

from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import DeterministicProcess

from pathlib import Path
from warnings import simplefilter
# ignore warnings to clean up output cells
simplefilter("ignore")  

# visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# Make IPython display the result of every expression statement in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show each distinct warning only once.
# NOTE(review): this filter is registered after simplefilter("ignore") above;
# new filters are inserted ahead of existing ones by default, so "once" appears
# to take precedence over "ignore" - confirm which behavior is intended.
import warnings; warnings.filterwarnings(action='once')

## Some additional settings

In [None]:
# Widen the printed representation so wide frames and long cell values are
# not truncated.
pd.set_option('display.width', 1200)
# Use the fully qualified option name: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.max_colwidth', 500)

# Cap how many columns and rows are printed for a frame.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# settings for Vizs
plt.style.use("dark_background")
# plt.style.use("seaborn-whitegrid")

plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

In [None]:
# Default template registered at the plotly.io level.
pio.templates.default = "simple_white"

# Plotly Express defaults: template, continuous color scale and figure size.
px.defaults.template = "plotly_dark"
px.defaults.color_continuous_scale = px.colors.sequential.Blackbody
px.defaults.width = 800
px.defaults.height = 500

# The bare assignment alone only created an unused variable. Keep the name for
# any code that references it, and also register the palette as the Plotly
# Express default so it actually takes effect.
color_discrete_sequence = px.colors.sequential.Oryel
px.defaults.color_discrete_sequence = color_discrete_sequence

## Linear Regression with time series

`Load the data`

In [None]:
# Directory (relative path) that contains the report files.
source_dir = "./reports/"

# CSV file to load from source_dir.
file_users_daily = "at_users_daily.csv"

In [None]:
# Build the CSV path, then read it with the three date columns parsed as
# datetimes.
users_daily_path = os.path.join(source_dir, file_users_daily)
users_daily = pd.read_csv(
    users_daily_path,
    parse_dates=['visit_date', 'week_start_monday', 'visit_month'],
    # dtype={'device': 'category', 'source_id': 'int'},
)

In [None]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
df_users_daily = users_daily.copy(deep=True)

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df_users_daily.info()

In [None]:
# Shorten the names of the order-count and revenue columns.
column_aliases = {'orders_count': 'orders', 'revenue_sum': 'revenue'}
df_users_daily = df_users_daily.rename(columns=column_aliases)
df_users_daily.head()

In [None]:
# Keep only the visit date, DAU and orders columns, indexed by visit date.
dau_columns = ['visit_date', 'dau', 'orders']
df_dau = df_users_daily[dau_columns].set_index('visit_date')
df_dau.head()

### Linear Regression of DAU

`Time-step features for DAU`

In [None]:
# Time dummy: 0, 1, 2, ... one step per row, placed as the first column.
n_obs = len(df_dau)
df_dau = (
    df_dau
    .assign(time=np.arange(n_obs))
    .reindex(columns=['time', 'dau', 'orders'])
)
df_dau.head()

In [None]:
# Raw DAU series as a grey line; seaborn then adds the scatter and linear fit.
# regplot is called without ax=, so it draws on the current axes.
fig, ax = plt.subplots()
ax.plot('time', 'dau', data=df_dau, color='0.50')
ax = sns.regplot(
    data=df_dau,
    x='time',
    y='dau',
    ci=None,
    scatter_kws={'color': '0.75'},
)
ax.set_title('Time plot of DAU (Daily Active Unique Users)')

`Lag features for DAU`

Linear regression with a lag feature produces the model:

target = weight * lag + bias

More generally, lag features let you model serial dependence.

In [None]:
# Lag-1 features: each row receives the previous row's value.
# The first row has no previous observation, so it is left as NaN instead of
# being filled with 0. A fabricated 0 would be treated as a real data point by
# the lag plots and lag regressions, and would turn the dropna() calls in the
# model cells into no-ops. The columns become float because of the NaN.
df_dau['lag_dau'] = df_dau['dau'].shift(periods=1)
df_dau['lag_orders'] = df_dau['orders'].shift(periods=1)

df_dau.head()

In [None]:
# Scatter of DAU against the previous day's DAU with a linear fit; equal
# aspect keeps both axes on the same scale.
fig, ax = plt.subplots()
ax = sns.regplot(
    data=df_dau,
    x='lag_dau',
    y='dau',
    ci=None,
    scatter_kws={'color': '0.75'},
)
ax.set_aspect('equal')
ax.set_title('Lag Plot of DAU (Daily Active Unique Users)')

`Linear Regression Model for DAU`

`The procedure for fitting a linear regression model follows the standard steps for scikit-learn.`

In [None]:
# Feature matrix (time dummy only) and target series.
X_dau = df_dau[['time']]
y_dau = df_dau['dau']

# Fit a linear regression of DAU on the time step.
model = LinearRegression()
model.fit(X_dau, y_dau)

# In-sample predictions, indexed like the training data so both series can
# be drawn against the same index.
y_pred_dau = pd.Series(data=model.predict(X_dau), index=X_dau.index)

In [None]:
# Observed DAU with the shared styling, then the fitted line on the same axes.
ax = y_dau.plot(**plot_params)
ax = y_pred_dau.plot(linewidth=2, ax=ax)
ax.set_title('Time Plot of Fitted Values of DAU (Daily Active Unique Users)')

In [None]:
# Lag feature as a one-column frame, with rows lacking a lag value removed.
X2 = df_dau.loc[:, ['lag_dau']].dropna()

# Target restricted to the rows still present in X2 (inner join on the index).
y2, X2 = df_dau.loc[:, 'dau'].align(X2, join='inner')

# Fit a linear regression of DAU on the previous day's DAU.
model = LinearRegression()
model.fit(X2, y2)

# In-sample predictions on the same index as the features.
y_pred2 = pd.Series(data=model.predict(X2), index=X2.index)

The lag plot shows us how well we were able to fit the relationship between the DAU on one day and the DAU on the previous day.

In [None]:
# Observed (lag, value) pairs as dots and the fitted line over the same x values.
fig, ax = plt.subplots()
lag_values = X2['lag_dau']
ax.plot(lag_values, y2, '.', color='0.50')
ax.plot(lag_values, y_pred2)
ax.set_aspect('equal')
ax.set_ylabel('dau')
ax.set_xlabel('lag_dau')
ax.set_title('Lag Plot of Fitted Values of DAU (Daily Active Unique Users)')

`The following time plot shows us how our forecasts now respond to the behavior of the series in the recent past`

In [None]:
# Observed DAU, then the lag-model predictions drawn explicitly on the same
# axes. Passing ax=ax matches the time-step fitted-values plot and avoids
# depending on whichever axes happen to be current.
ax = y2.plot(**plot_params)
ax = y_pred2.plot(ax=ax)

### Linear Regression with Orders

`Time-step features for Orders`

In [None]:
# Raw orders series as a grey line; seaborn then adds the scatter and linear fit.
# regplot is called without ax=, so it draws on the current axes.
fig, ax = plt.subplots()
ax.plot('time', 'orders', data=df_dau, color='0.50')
ax = sns.regplot(
    data=df_dau,
    x='time',
    y='orders',
    ci=None,
    scatter_kws={'color': '0.75'},
)
ax.set_title('Time plot of Number of Orders')

`Lag feature of order Numbers`

In [None]:
# Scatter of orders against the previous day's orders with a linear fit;
# equal aspect keeps both axes on the same scale.
fig, ax = plt.subplots()
ax = sns.regplot(
    data=df_dau,
    x='lag_orders',
    y='orders',
    ci=None,
    scatter_kws={'color': '0.75'},
)
ax.set_aspect('equal')
ax.set_title('Lag Plot of Number of Orders')

`Linear Regression Model for Orders`

In [None]:
# Feature matrix (time dummy only) and target series.
X_orders = df_dau[['time']]
y_orders = df_dau['orders']

# Fit a linear regression of orders on the time step.
model = LinearRegression()
model.fit(X_orders, y_orders)

# In-sample predictions, indexed like the training data so both series can
# be drawn against the same index.
y_pred_orders = pd.Series(data=model.predict(X_orders), index=X_orders.index)

In [None]:
# Observed orders with the shared styling, then the fitted line on the same axes.
ax = y_orders.plot(**plot_params)
ax = y_pred_orders.plot(linewidth=2, ax=ax)
ax.set_title('Time Plot of Fitted values of Number of Orders')

In [None]:
# Lag feature as a one-column frame, with rows lacking a lag value removed.
X2_orders = df_dau.loc[:, ['lag_orders']].dropna()

# Target restricted to the rows still present in X2_orders (inner join on the index).
y2_orders, X2_orders = df_dau.loc[:, 'orders'].align(X2_orders, join='inner')

# Fit a linear regression of orders on the previous day's orders.
model = LinearRegression()
model.fit(X2_orders, y2_orders)

# In-sample predictions on the same index as the features.
y_pred2_orders = pd.Series(data=model.predict(X2_orders), index=X2_orders.index)

In [None]:
# Observed (lag, value) pairs as dots and the fitted line over the same x values.
fig, ax = plt.subplots()
lag_values_orders = X2_orders['lag_orders']
ax.plot(lag_values_orders, y2_orders, '.', color='0.50')
ax.plot(lag_values_orders, y_pred2_orders)
ax.set_aspect('equal')
ax.set_ylabel('orders')
ax.set_xlabel('lag_orders')
ax.set_title('Lag Plot of Fitted Values of Number of Orders')

In [None]:
# Observed orders, then the lag-model predictions drawn explicitly on the same
# axes. Passing ax=ax matches the time-step fitted-values plot and avoids
# depending on whichever axes happen to be current.
ax = y2_orders.plot(**plot_params)
ax = y_pred2_orders.plot(ax=ax)

## Trend

Created in accordance with the Kaggle tutorial:

https://www.kaggle.com/code/ryanholbrook/trend

`Moving Average Plots`

The idea of a moving average is to smooth out any short-term fluctuations in the series so that only long-term changes remain.

For the DAU and Order series, we chose a window of size 30 to smooth over the monthly period within 12 months.

In [None]:
# Centered 30-day moving averages for both series. The earlier inline comments
# said "7 days window", which contradicted window=30.
# min_periods equals the window, so a value is produced only where a full
# 30-observation window is available; the edges of each series stay NaN.
ma_window = 30
for col in ('dau', 'orders'):
    df_dau[f'moving_avg_{ma_window}d_{col}'] = (
        df_dau[col]
        .rolling(
            window=ma_window,
            center=True,  # put the average at the center of the window
            min_periods=ma_window,
        )
        .mean()
    )


In [None]:
# Show five randomly chosen rows as a quick sanity check of the new columns.
df_dau.sample(n=5)

In [None]:
# Daily DAU in grey with the 30-day moving average overlaid on the same axes.
ax = df_dau['dau'].plot(style='', color='0.5')
df_dau['moving_avg_30d_dau'].plot(
    ax=ax,
    linewidth=2,
    legend=False,
    title='DAU (Daily Active Users) - 30 days Moving Average',
)

In [None]:
# Daily orders in grey with the 30-day moving average overlaid on the same axes.
ax = df_dau['orders'].plot(style='', color='0.5')
df_dau['moving_avg_30d_orders'].plot(
    ax=ax,
    linewidth=2,
    legend=False,
    title='Numbers of Orders - 30 days Moving Average',
)

# End