In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pymc as pm
import arviz as az
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.preprocessing import MaxAbsScaler

from utils import summary_table

In [None]:
# Record the installed PyMC version in the notebook output.
print(f"Running on PyMC v{pm.__version__}")

In [None]:
# Read the CSV from the working directory; `raw` is kept as loaded and `df` is the
# independent copy that later cells add columns to.
raw = pd.read_csv('MMM_test_data.csv')
df=raw.copy()

In [None]:
# Parse the week-start column into pandas datetimes on the working copy.
df['start_of_week'] = pd.to_datetime(df['start_of_week'])
# The set_index call below is disabled, so df keeps its default integer index.
# df.set_index('start_of_week', inplace=True)

In [None]:
# Show an overview of the working frame. summary_table is a helper imported from the
# local `utils` module; its exact output is not visible here — TODO confirm what it reports.
summary_table(df)

In [None]:
# Revenue values are on a very different scale from the channel columns.
df.boxplot()

In [None]:
# Long format: one row per (start_of_week, original column) with the value in `vals`.
rwm = raw.melt('start_of_week', var_name='cols', value_name='vals')
rwm['type'] = 'daily'

# `Series.dt.week` was removed in pandas 2.0; the ISO calendar week replaces it.
# Cast to plain int and keep the original series name so the grouped output has the
# same `start_of_week` key column as before.
obs_dates = pd.to_datetime(rwm['start_of_week'])
week_key = obs_dates.dt.isocalendar().week.astype(int).rename('start_of_week')
month_key = obs_dates.dt.month

# Average only `vals`: taking the mean of the whole frame would also touch the string
# columns (`start_of_week`, `type`), which raises TypeError on pandas >= 2.0.
weekly = rwm.groupby([week_key, 'cols'])['vals'].mean().reset_index()
weekly['type'] = 'weekly'
monthly = rwm.groupby([month_key, 'cols'])['vals'].mean().reset_index()
monthly['type'] = 'monthly'

# `DataFrame.append` was removed in pandas 2.0; pd.concat stacks the three tables
# in the same order with a fresh integer index.
df_table = pd.concat([rwm, weekly, monthly], ignore_index=True)


In [None]:
# Grid of line plots of `vals` against `start_of_week`: one row per `cols` value and one
# column per `type` value, with x and y axes not shared between panels.
sns.relplot(data=df_table, x="start_of_week", y="vals", col="type",row='cols',kind="line", facet_kws=dict(sharex=False,sharey=False))

In [None]:
# Derive calendar features from the week-start date (parsed once instead of per column).
week_start = pd.to_datetime(df['start_of_week'])
df['day'] = week_start.dt.dayofyear
# `Series.dt.week` was removed in pandas 2.0; isocalendar().week is the replacement.
# Cast from UInt32 to plain int so the column dtype matches the other calendar columns.
df['week'] = week_start.dt.isocalendar().week.astype(int)
df['month'] = week_start.dt.month
df['year'] = week_start.dt.year

In [None]:
# Display the working frame, including the day/week/month/year columns.
df

In [None]:
# Number of sin/cos harmonics to generate.
n_order = 1
# Day of year expressed as a fraction of a 365.25-day year.
periods = df["day"] / 365.25

# Build the columns harmonic by harmonic, sin before cos, named "<func>_order_<k>".
# NOTE: a later cell defines a function that is also called `fourier_features`; once that
# cell has run, this name no longer refers to the DataFrame built here.
fourier_columns = {}
for order in range(1, n_order + 1):
    for func in ("sin", "cos"):
        trig = getattr(np, func)
        fourier_columns[f"{func}_order_{order}"] = trig(2 * np.pi * periods * order)
fourier_features = pd.DataFrame(fourier_columns)


In [None]:
# Two stacked panels sharing the x axis: sin columns on top, cos columns below.
fig, ax = plt.subplots(nrows=2, sharex=True, layout="constrained")
for panel, trig_name, colour in zip(ax, ("sin", "cos"), ("C0", "C1")):
    # Select the columns whose name contains the function name and draw them faintly.
    fourier_features.filter(like=trig_name).plot(color=colour, alpha=0.15, ax=panel)
    panel.get_legend().remove()
    panel.set(title=f"Fourier Modes ({trig_name.capitalize()})", xlabel="index (week)")

In [None]:
# Periodogram of the revenue series.
# NOTE(review): plot_periodogram is defined in the cell that follows this one, so on a
# fresh kernel (Restart & Run All) this call raises NameError unless that cell ran first.
plot_periodogram(df.revenue);

In [None]:
def plot_periodogram(ts, detrend='linear', ax=None, fs=365.2425):
    """Draw the periodogram of ``ts`` as a step plot on a log-scaled frequency axis.

    Parameters
    ----------
    ts : array-like
        The observations, passed straight to ``scipy.signal.periodogram``.
    detrend : str or callable or False, default 'linear'
        Detrending option forwarded to ``scipy.signal.periodogram``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure and axes are created when omitted.
    fs : float, default 365.2425
        Sampling frequency in observations per year. The default is the number of
        days in a year, i.e. the value the former
        ``pd.Timedelta("1Y") / pd.Timedelta("1D")`` expression produced. That
        expression raises ``ValueError`` on pandas >= 2.0, where the ambiguous
        ``"Y"`` unit is no longer accepted, so the constant is used directly.
        Pass a different value for data that is not sampled daily
        (e.g. about 52.18 for weekly observations).

    Returns
    -------
    matplotlib Axes
        The axes the periodogram was drawn on.
    """
    from scipy.signal import periodogram

    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are in cycles per year, matching the unit of `fs`.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [None]:
def fourier_features(index, freq, order):
    """Build sin/cos Fourier columns for a given period.

    Parameters
    ----------
    index : sized index-like
        Used for its length (one row per entry) and as the index of the result.
    freq : number
        Period length in rows; also embedded in the column names.
    order : int
        Number of harmonics; harmonics 1..order are generated.

    Returns
    -------
    pandas.DataFrame
        Columns ``sin_<freq>_<i>`` and ``cos_<freq>_<i>`` for each harmonic ``i``,
        in that order, indexed by ``index``.
    """
    # Row counter 0, 1, 2, ... stored as float32.
    steps = np.arange(len(index), dtype=np.float32)
    # Angle of the first harmonic at every row.
    base_angle = 2 * np.pi * (1 / freq) * steps
    columns = {
        f"{trig}_{freq}_{harmonic}": getattr(np, trig)(harmonic * base_angle)
        for harmonic in range(1, order + 1)
        for trig in ("sin", "cos")
    }
    return pd.DataFrame(columns, index=index)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

series = df['revenue']
# df keeps its default integer index, so seasonal_decompose cannot infer the seasonal
# period from the index and raises ValueError unless `period` is given.
# 52 assumes weekly observations with a yearly cycle — TODO confirm against the data
# (statsmodels also needs at least two full cycles, i.e. 104 rows).
result = seasonal_decompose(series, model='additive', period=52)
result.plot()
# The notebook imports matplotlib.pyplot as `plt`; the bare name `pyplot` was undefined
# and raised NameError.
plt.show()

In [None]:
# Pairwise correlations of the numeric columns only. `start_of_week` is a datetime column
# at this point, and DataFrame.corr() on pandas >= 2.0 no longer silently drops
# non-numeric columns, so it is excluded explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# One figure holding a 4x2 grid: each row is one series, ACF on the left, PACF on the right.
# NOTE(review): sales_a..sales_d and the colour `c` are not defined anywhere in the visible
# part of this notebook — confirm where they come from, or this cell raises NameError.
plt.figure(figsize=(12, 8))

for row, sales in enumerate((sales_a, sales_b, sales_c, sales_d)):
    # Grid positions are 1-based and run left to right, top to bottom.
    plt.subplot(4, 2, 2 * row + 1)
    plot_acf(sales, lags=50, ax=plt.gca(), color=c)
    plt.subplot(4, 2, 2 * row + 2)
    plot_pacf(sales, lags=50, ax=plt.gca(), color=c)

plt.show()



In [None]:
# Lag plots of revenue against its own lags 1..24, laid out on 4 rows; the returned
# figure is bound to `_` so it is not echoed as the cell result.
# NOTE(review): plot_lags is defined in the cell that follows this one, so on a fresh
# kernel (Restart & Run All) this call raises NameError unless that cell ran first.
_ = plot_lags(df.revenue, lags=24, nrows=4)

In [None]:
def lagplot(x, y=None, lag=1, standardize=True, ax=None, **kwargs):
    """Scatter a series against a lagged copy of ``x`` with a LOWESS trend line.

    Parameters
    ----------
    x : pandas.Series
        Series that is shifted by ``lag`` and drawn on the x axis.
    y : pandas.Series, optional
        Series drawn on the y axis; when omitted, ``x`` itself is used unchanged.
    lag : int, default 1
        Number of periods passed to ``x.shift``.
    standardize : bool, default True
        When true, the lagged ``x`` (and ``y``, if given) are centred and divided by
        their standard deviation before plotting.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    **kwargs
        Forwarded to ``seaborn.regplot``.

    Returns
    -------
    matplotlib Axes
        The axes returned by ``seaborn.regplot``, annotated with the correlation
        between the two plotted series.
    """
    from matplotlib.offsetbox import AnchoredText

    lagged = x.shift(lag)
    if standardize:
        lagged = (lagged - lagged.mean()) / lagged.std()

    if y is None:
        # No separate target: compare the raw series with its own lag.
        target = x
    elif standardize:
        target = (y - y.mean()) / y.std()
    else:
        target = y

    corr = target.corr(lagged)

    if ax is None:
        _, ax = plt.subplots()

    ax = sns.regplot(
        x=lagged,
        y=target,
        scatter_kws=dict(alpha=0.75, s=3),
        line_kws=dict(color='C3'),
        lowess=True,
        ax=ax,
        **kwargs,
    )

    # Correlation shown in a small framed box in the upper-left corner.
    corr_box = AnchoredText(
        f"{corr:.2f}",
        prop=dict(size="large"),
        frameon=True,
        loc="upper left",
    )
    corr_box.patch.set_boxstyle("square, pad=0.0")
    ax.add_artist(corr_box)
    ax.set(title=f"Lag {lag}", xlabel=lagged.name, ylabel=target.name)
    return ax


def plot_lags(x, y=None, lags=6, nrows=1, lagplot_kwargs={}, **kwargs):
    """Draw a grid of lag plots for lags ``1..lags``.

    Parameters
    ----------
    x : pandas.Series
        Series whose lags are plotted.
    y : pandas.Series, optional
        Target series; when omitted each panel compares ``x`` with its own lag.
    lags : int, default 6
        Number of lags to plot.
    nrows : int, default 1
        Number of rows in the grid; the column count defaults to
        ``ceil(lags / nrows)``.
    lagplot_kwargs : dict
        Extra keyword arguments passed to every ``lagplot`` call (only read here).
    **kwargs
        Forwarded to ``plt.subplots`` (e.g. ``ncols``, ``figsize``).

    Returns
    -------
    matplotlib Figure
        The figure containing the grid.
    """
    import math

    # Fill in grid defaults without overriding anything the caller supplied.
    kwargs.setdefault('nrows', nrows)
    kwargs.setdefault('ncols', math.ceil(lags / nrows))
    kwargs.setdefault('figsize', (kwargs['ncols'] * 2, nrows * 2 + 0.5))

    fig, axs = plt.subplots(sharex=True, sharey=True, squeeze=False, **kwargs)
    n_panels = kwargs['nrows'] * kwargs['ncols']
    for lag_number, ax in zip(range(1, n_panels + 1), fig.get_axes()):
        if lag_number > lags:
            # Grid cell beyond the requested number of lags: hide it.
            ax.axis('off')
            continue
        ax = lagplot(x, y, lag=lag_number, ax=ax, **lagplot_kwargs)
        ax.set_title(f"Lag {lag_number}", fontdict=dict(fontsize=14))
        ax.set(xlabel="", ylabel="")

    # Axis labels only on the bottom row and the left-most column.
    plt.setp(axs[-1, :], xlabel=x.name)
    plt.setp(axs[:, 0], ylabel=y.name if y is not None else x.name)
    fig.tight_layout(w_pad=0.1, h_pad=0.1)
    return fig

In [None]:
week_start = pd.to_datetime(df['start_of_week'])
# The `week` column previously received `.dt.day` (day of the month), which overwrote the
# week number with an unrelated value; use the ISO calendar week instead. The cast turns
# the UInt32 result into a plain int column.
df['week'] = week_start.dt.isocalendar().week.astype(int)
df['month'] = week_start.dt.month
df['year'] = week_start.dt.year


In [None]:
# Column names spend_channel_1 .. spend_channel_7, generated instead of typed out.
spend_channels = [f'spend_channel_{channel}' for channel in range(1, 8)]
# df['total_spent'] = df[spend_channels].sum(axis=1)

In [None]:
# Preview the long format: `start_of_week` stays as the identifier, every other column is
# unpivoted into a (`cols`, `vals`) name/value pair.
raw.melt('start_of_week',var_name='cols', value_name='vals')

In [None]:
# Rebuild the long-format frame from `raw`. This rebinds `rwm`, so the `type` column that
# an earlier cell added to the previous `rwm` is not present on this one.
rwm = raw.melt('start_of_week',var_name='cols', value_name='vals')


In [None]:
# Rows of the long-format frame that came from the `revenue` column.
rwm[rwm['cols']=='revenue']

In [None]:
# Everything except revenue: mask() blanks the revenue rows to NaN and dropna() then removes
# them (along with any other row that already contained a missing value).
rwm.mask(rwm['cols']=='revenue').dropna()

In [None]:
# All long-format rows except revenue, computed once and reused for both data and x.
non_revenue = rwm.mask(rwm['cols'] == 'revenue').dropna()
# `Series.dt.week` was removed in pandas 2.0; use the ISO calendar week. The series keeps
# the name `start_of_week` so the x-axis label is the same as before.
non_revenue_week = (
    pd.to_datetime(non_revenue['start_of_week'])
    .dt.isocalendar()
    .week.astype(int)
    .rename('start_of_week')
)
sns.lineplot(data=non_revenue, x=non_revenue_week, y='vals', hue='cols')

In [None]:
# Rows of the long-format frame that came from the `revenue` column (same selection as
# the earlier display cell).
rwm[rwm['cols']=='revenue']

In [None]:

# Split the long-format table once: every non-revenue column on one side, revenue on the other.
channel_rows = rwm.mask(rwm['cols'] == 'revenue').dropna()
revenue_rows = rwm[rwm['cols'] == 'revenue']

# `Series.dt.week` was removed in pandas 2.0; use the ISO calendar week instead.
channel_week = pd.to_datetime(channel_rows['start_of_week']).dt.isocalendar().week.astype(int)
revenue_week = pd.to_datetime(revenue_rows['start_of_week']).dt.isocalendar().week.astype(int)

fig = plt.figure(figsize=(10, 10))
# Lines: one per non-revenue column, on the left-hand y axis.
ax1 = sns.lineplot(
    x=channel_week,
    y='vals',
    hue='cols',
    data=channel_rows,
    sort=False,
    color='blue',
    linewidth=4
)

# Bars: revenue, on a second y axis that shares the x axis with the lines.
# NOTE(review): barplot positions its bars categorically while lineplot uses the numeric
# week value, so the two layers may not line up week for week — confirm visually.
ax2 = ax1.twinx()
sns.barplot(
    x=revenue_week,
    y='vals',
    data=revenue_rows,
    color='orange',
    alpha=0.5,
    ax=ax2
)
ax1.tick_params(axis='x', rotation=90)
# The previous labels ('avg Trips' / 'Days') did not describe this data: x is the week of
# the year, the left axis carries the channel values and the right axis carries revenue.
ax1.set_ylabel('Channel spend')
ax1.set_xlabel('Week of year')
ax2.set_ylabel('Revenue')

plt.show()


In [None]:
# Linear regression with two predictors, written with the model bound directly in the
# `with` statement.
# NOTE(review): X1, X2 and Y are not defined anywhere in the visible part of this
# notebook — confirm where they come from, or this cell raises NameError.
with pm.Model() as basic_model:
    # Priors: intercept, two slopes, and the observation noise scale.
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta = pm.Normal("beta", mu=0, sigma=10, shape=2)
    sigma = pm.HalfNormal("sigma", sigma=1)

    # Expected value of the outcome: intercept plus one slope per predictor.
    mu = alpha + beta[0] * X1 + beta[1] * X2

    # Likelihood of the observed outcome around that expected value.
    Y_obs = pm.Normal("Y_obs", mu=mu, sigma=sigma, observed=Y)

In [None]:
with basic_model:
    # Draw 1000 posterior samples per chain. The draw count is spelled out to match this
    # comment, and the fixed seed makes the posterior reproducible across kernel restarts.
    idata = pm.sample(draws=1000, random_seed=42)

In [None]:
# Render the model's variables and their dependencies as a Graphviz diagram.
pm.model_to_graphviz(basic_model)

In [None]:
# Display the object returned by pm.sample().
idata


In [None]:
# Trace plot of the sampled variables with combined=True; the trailing semicolon
# suppresses the returned axes repr.
az.plot_trace(idata, combined=True);

In [None]:
# Summary table of the sampled variables, rounded to two decimals.
az.summary(idata, round_to=2)

In [None]:
# Sampler energy diagnostic plot; the trailing semicolon suppresses the returned axes repr.
az.plot_energy(idata);

In [None]:
# Forest plot of the `beta` coefficients with 95% HDI intervals and an r_hat panel,
# with combined=True.
az.plot_forest(idata, var_names=["beta"], combined=True, hdi_prob=0.95, r_hat=True);

In [None]:
# Column names spend_channel_1 .. spend_channel_7, generated instead of typed out.
spend_channels = [f'spend_channel_{channel}' for channel in range(1, 8)]
# Independent copy of just those columns, so later edits do not write back into df.
spends = df[spend_channels].copy()