### Imports

#### Libraries

In [None]:
# Celonis
from pycelonis import get_celonis
from pycelonis.pql import PQL, PQLColumn, PQLFilter

# Standard libs
import numpy as np
import pandas as pd
from pandas import datetime
from pandas import DataFrame
from pandas import read_csv
from matplotlib import pyplot
import matplotlib.pyplot as plt

# Dates and Times
from datetime import datetime
import datetime as dt
from isoweek import Week

# Maths, Models
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# Others
import warnings

#### Other files

In [None]:
# Functions
# NOTE(review): the helpers called later in this notebook (e.g. Fix_Data, cap_outliers,
# calculate_trend) are not defined in any visible cell - presumably this template notebook
# provides them; confirm.
%run Template_Func_Predictions_TS.ipynb
# External data/GDP
# NOTE(review): `ext_data` is not assigned in any visible cell - presumably it is created by
# this template notebook; confirm.
%run Template_Ext_Data.ipynb
# Preview the first rows of the external data
ext_data.head()

### Load input data

In [None]:
# PARAM - DM name, table, column and filter to query input data from DM
dm_name = 'TBD'
table_name = 'TBD'
input_columns = [('col_name','pretty_name'),('col_name_2','pretty_name_2')]
input_filter = "FILTER TBD"

# Look up the datamodel and the source table's columns.
# NOTE(review): `celonis` is not assigned in any visible cell - presumably one of the
# %run notebooks creates it (e.g. via get_celonis()); confirm.
datamodel = celonis.datamodels.find(dm_name)
# NOTE(review): this expression is not the last one in the cell, so its value is not
# displayed; wrap it in display(...) if the column list is meant to be shown.
datamodel.tables.find(table_name).columns

# Assemble the query: one PQLColumn per (column, pretty name) pair, then the filter
query = PQL()
for source_col, shown_name in input_columns:
    query += PQLColumn(source_col, shown_name)
query += PQLFilter(input_filter)

# Query input data from DM
# NOTE(review): _get_data_frame is underscore-prefixed (private by Python convention);
# check whether the installed pycelonis version offers a public equivalent.
df = datamodel._get_data_frame(query)
df.head()

### Model

#### Function: Run Predictions Model for Train df

In [None]:
def run_predictions_model(df,ext_data,val_size_perc,to_adjust_years):
    """Model one time series as Y = Trend + Seasonality + Residuals and predict it.

    Steps, in order:
      1. Pre-process: reset the index, run Fix_Data, cap outliers and optionally
         adjust the baseline.
      2. Trend: compute it with calculate_trend, regress it on an external-data
         column with LinearRegression and predict it with predict_trend.
      3. Seasonality: centered moving average of (Y - Trend), averaged per week
         of year and assigned back to every row.
      4. Residuals: R = Y - Trend - Seasonality, stationarity test, ACF/PACF
         plots and a forecast through get_results_with_val.
      5. Total prediction through calc_y_pred.

    The function prints and plots intermediate results along the way.

    It relies on names that are not defined in this cell:
      * the module-level `y_col_name` (name of the Y column);
      * helper functions such as Fix_Data, cap_outliers, adjust_baseline,
        calculate_trend, combine_ext_data, subsets_to_fit, predict_trend,
        create_r_df, get_results_with_val, add_r, calc_y_pred and the plot_*
        helpers - presumably provided by Template_Func_Predictions_TS.ipynb;
        confirm.

    Values marked "PARAM" are hard-coded template settings to be tuned per use case.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data for one series. After Fix_Data the index must hold
        timestamps, because the seasonality step reads `.week` from it.
    ext_data : object
        External data, passed unchanged to combine_ext_data.
    val_size_perc : float
        Passed to subsets_to_fit and get_results_with_val
        (presumably the validation share - confirm in the helpers).
    to_adjust_years : bool
        When truthy, adjust_baseline is applied to the training frame.

    Returns
    -------
    pandas.DataFrame
        The training frame enriched with the trend, seasonality, residual and
        prediction columns.
    """

    ### PRE-PROCESSING

    # Reindex and print Df
    df = df.reset_index()
    # Clean data: fill empty weeks with 0 value
    df = Fix_Data(df)
    print(df.shape)
    # Create Train Df
    train_df = df.copy()

    # Cap the high outliers to a max value
    # PARAM - Max value
    max_outlier_value = 1000 #TBD
    train_df = cap_outliers(train_df, max_outlier_value)

    # Adjust past data if baseline changed at date D
    if to_adjust_years:
        # PARAM - dates
        change_date = 'TBD'
        end_date = 'TBD'
        train_df = adjust_baseline(train_df,change_date, end_date)

    # Plot
    # PARAM - y margin for y axis
    y_margin = 500000
    plot_clean_y(df,train_df,max_outlier_value,y_margin)

    ### MODEL: Y = Trend + Seasonality + Residuals

    ## Trend: Calculate, Model and Predict future values

    # Single definition of the trend column name, used by every step below.
    # (The original referenced an undefined `trend_col_name` and an undefined `t`.)
    trend_col_name = 'Trend'
    # PARAM - Trend window e.g. 52 if weekly TS with annual seasonality. 7 if daily TS with weekly seasonality
    ts_seasonality = 52
    train_df[trend_col_name] = calculate_trend(train_df,ts_seasonality, center=False)
    # Plot Y and Trend
    # PARAM - y axis
    y_min = 0
    y_max = 9000000
    # NOTE(review): the original passed an undefined name `t` here; the trend column name
    # is assumed to be what plot_y_trend expects - confirm against the helper.
    plot_y_trend(train_df,trend_col_name,y_min,y_max)

    # Use External data/GDP to fit and predict the Trend
    print(train_df.dropna().shape)
    train_df = combine_ext_data(train_df,ext_data,days_to_shift=1)
    # PARAM - External Data/GDP column
    exo_col_name = 'TBD'
    exo_pretty_name = 'TBD'

    # Regression Trend on External data/GDP: define X=GDP and Y=Trend for regression model
    X,Y = subsets_to_fit(train_df,exo_col_name,trend_col_name,val_size_perc)
    # Plot Y, Trend and Exo Regr
    # PARAM - y axis scale for External data/GDP
    y_min_gdp = 100
    y_max_gdp = 200
    plot_y_trend_ext(train_df,exo_col_name,exo_pretty_name,y_min,y_max,y_min_gdp,y_max_gdp)

    # Fit Regression Y=Trend X=Exo
    reg = LinearRegression().fit(X, Y)
    print(reg.coef_)
    print(reg.intercept_)
    print(reg.score(X,Y))

    # Predict Trend with fitted Regression
    trend_pred_col_name = 'Predicted Trend'
    X_F,train_df = predict_trend(train_df,exo_col_name,trend_pred_col_name)
    # Plot Trend, External data/GDP and Predicted Trend
    plot_y_pred_trend_ext(train_df,exo_col_name,X,Y,X_F,y_min,y_max,y_min_gdp,y_max_gdp)
    print(train_df.head())

    ## Calculate Seasonality

    # Calculate Y - Trend
    train_df['Y - Trend'] = train_df[y_col_name]-train_df[trend_col_name]

    # Get Seasonality by moving avg on Y - T, and average across years for 1 value per week of year
    # PARAM - Moving avg window for S
    window = 10
    smoothed = train_df['Y - Trend'].rolling(window=window,center=True).mean()
    # Named explicitly so it is not shadowed by the ARIMA seasonal period `s` further down
    seasonality_by_week = smoothed.groupby(smoothed.index.week).mean()
    print(seasonality_by_week.head(5))
    print(seasonality_by_week.tail(5))

    # Add Seasonality to df (Assign S to each week of each year)
    # PARAM - S column name
    seasonality_col_name = 'Seasonality'
    train_df[seasonality_col_name] = np.nan
    for i in train_df.index:
        # One .loc call with (row, column): chained `.loc[i][col] = ...` may write to a
        # temporary copy and leave the column untouched.
        train_df.loc[i, seasonality_col_name] = seasonality_by_week[i.week]

    # Should not be required #
    # Fix border dates with Null values
    # PARAM - seasonality period in days
    seas_period_days = 52*7
    # `datetime` is bound to the datetime class in this notebook, so timedelta has to come
    # from the module alias `dt`.
    delta = dt.timedelta(days=-seas_period_days)
    for i in train_df[train_df[seasonality_col_name].isnull()].index:
        # Copy the value from one seasonal period earlier
        train_df.loc[i, seasonality_col_name] = train_df.loc[i+delta, seasonality_col_name]

    # Plot Y, T and S
    plot_y_t_s_with_pred(train_df,trend_col_name,seasonality_col_name,trend_pred_col_name)

    ## Residuals: Calculate, Model and Predict future values

    # Calculate R = Y - Trend - Season
    # PARAM - R column name
    r_col_name = 'Y - T - S'
    train_df[r_col_name] = train_df[y_col_name]-train_df[trend_col_name]-train_df[seasonality_col_name]
    # Create R df
    r = train_df[r_col_name]
    # Plot R
    plot_r(train_df,r_col_name)

    # R Study
    # R shape
    print(r.dropna().shape)
    # Stationarity test
    res = sm.tsa.adfuller(r.dropna(),regression='c')
    print('p-value:{}'.format(res[1]))
    # Verify that p value is low

    # ACF PACF on R
    # PARAM - # lags for acf pacf
    lags = 25
    plot_acf_pacf_r(r,lags)
    # Deduce ARMA(p,q) model for R

    # Create R df for R Model
    columns_to_drop = [y_col_name,exo_col_name]
    col_to_rename = {'index':'Date'}
    r_df = create_r_df(train_df,columns_to_drop,col_to_rename)

    # Fit ARIMA Model on R for R predictions
    # PARAM - p for AR, d for I, q for MA. Set using acf pacf plots above.
    # P,D,Q,s can remain None.
    # n_pred is # future points to forecast
    # (Optional) model - to input an existing loaded model
    # (Optional) exo - to input exogenous regressors
    p,d,q = 3,0,3
    P,D,Q,s = None,None,None,None
    n_pred = 18
    model = None
    exo = None
    # The original call put positional arguments after keyword arguments (a SyntaxError).
    # NOTE(review): the keyword names p, d, q, P, D, Q, s, r_col_name and val_size_perc are
    # assumed to match the helper's parameter names - confirm against get_results_with_val.
    model_r,results_df_r = get_results_with_val(
        df=r_df.dropna(),
        exo=exo,
        p=p,d=d,q=q,
        P=P,D=D,Q=Q,s=s,
        model=model,
        n_predictions=n_pred,
        r_col_name=r_col_name,
        val_size_perc=val_size_perc,
    )

    # Add Predicted R to df
    # PARAM - R column name for df
    r_pred_col_name = 'Predicted R'
    r_class_col_name = 'Predicted R Classification'
    train_df = add_r(train_df,results_df_r,r_pred_col_name,r_class_col_name)

    ## Calculate Total Y Prediction = Predicted T + S + Predicted R

    # PARAM - y pred column names
    y_pred_col_name = 'Y Prediction'
    train_df = calc_y_pred(train_df,y_pred_col_name,trend_pred_col_name,seasonality_col_name,r_class_col_name)
    print(train_df.tail(n=20))
    # Plot and show Final Df with predictions
    plot_final_y_t_s_r_with_pred(train_df,trend_col_name,seasonality_col_name,r_pred_col_name,trend_pred_col_name,y_pred_col_name)

    return train_df

#### Function: Format Output for DM Export

In [None]:
# Reformat results for Export to DM
def prepare_export_df(train_df,output_col_names,y_pred_col_name,r_class_col_name='Predicted R Classification'):
    """Build the export table for one subset from its prediction results.

    Keeps the actual Y column (module-level `y_col_name`), the predicted Y column
    and the classification column, turns the index into a regular column and
    renames columns according to `output_col_names`. The input frame is not
    modified. The shape of the result is printed.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Prediction results containing the three columns listed above.
    output_col_names : dict
        Mapping of current column name to exported column name. An unnamed
        index becomes the column 'index' after the reset, so it can be renamed
        through this mapping as well.
    y_pred_col_name : str
        Name of the predicted Y column in `train_df`.
    r_class_col_name : str, optional
        Name of the classification column in `train_df`. The default is the name
        used by run_predictions_model. It is a parameter because the original
        read a global of that name which no visible cell defined before the call.

    Returns
    -------
    pandas.DataFrame
        The reformatted export frame.
    """
    kept_columns = [y_col_name,y_pred_col_name,r_class_col_name]
    # Chained, non-inplace calls: each step returns a new frame
    export_df = (
        train_df[kept_columns]
        .reset_index()
        .rename(columns=output_col_names)
    )
    print(export_df.shape)
    return export_df

#### Run Predictions Model for selected Subsets

In [None]:
#### INPUT Product Families
#### OUTPUT Exported Predictions for DM

## INPUTS
subsets = ['Subset1','Subset2']
# Subsets whose history needs the baseline adjustment inside run_predictions_model
subset_needs_adjusts = ['Subset2']
subset_col_name = 'TBD'
y_col_name = 'Y Value'
val_size_perc = 0.2
# Column names of the model output. They must equal the names run_predictions_model
# passes to calc_y_pred / add_r; they are defined here because the rename mapping and
# the export step below read them at module level.
y_pred_col_name = 'Y Prediction'
r_class_col_name = 'Predicted R Classification'
### OUTPUTS
all_subset_results = {}
all_subset_exports = {}
output_col_names = {'index':'Date'
                              ,y_col_name:'Actual Y Value'
                              ,y_pred_col_name:'Predicted Y Value'
                             ,r_class_col_name:'Classification'}

### Run Predictions for each selected subset
for subset in subsets:
    print('Running model for ',subset)
    # Check if subset needs baseline adjustment
    to_adjust = subset in subset_needs_adjusts

    # Filter the loaded input data for this subset and drop the subset column.
    # Non-inplace drop: the filtered frame is a slice of `df`.
    # NOTE(review): the original filtered an undefined `train_df` by an undefined
    # `prod_fam`; `df` (loaded above) and `subset` are assumed to be the intended names.
    subset_train_df = df[df[subset_col_name]==subset].drop(columns=[subset_col_name])
    # Run Predictions model for this subset (all four arguments, in signature order)
    subset_results = run_predictions_model(subset_train_df,ext_data,val_size_perc,to_adjust)
    # Store Output (subset Predictions)
    all_subset_results[subset] = subset_results
    print(subset,all_subset_results[subset].shape)
    # Store export-version of the Output (subset Predictions)
    all_subset_exports[subset] = prepare_export_df(subset_results,output_col_names,y_pred_col_name)

# Note: the shape printed here is that of the last subset processed
print('Finished running predictions for all subsets, total output shape is ',all_subset_results[subset].shape)
print('Subsets are ',all_subset_exports.keys())

### Combine Results into single Export table

In [None]:
# Add new 'subset name' column to the export-version of Predictions
# Frames are collected first and concatenated once, instead of growing a DataFrame
# inside the loop starting from an empty frame.
subset_frames = []
for key, subset_df in all_subset_exports.items():
    print('Adding ',key,' value in new column')
    # Writes the column into the frame stored in all_subset_exports as well
    subset_df[subset_col_name] = key
    print('shape of subset export-version is ',subset_df.shape)
    subset_frames.append(subset_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no exports
export_df = pd.concat(subset_frames,axis=0) if subset_frames else pd.DataFrame()

# Preview Export Df
export_df.tail(5)

#### Verify shape of export-version Predictions

In [None]:
# Shape of export-version of all predictions, as (rows, columns)
export_df.shape
# VALIDATION - row count should be # subsets x Timeframe (train, val and future)

In [None]:
# Tail of export-version of predictions (last rows of the combined table)
export_df.tail()

#### Export to DM (Disable during WIP)

In [None]:
# Get DM for export
# The original read `a.datamodel`, but no `a` is defined in any visible cell.
# Reuse the datamodel looked up in the "Load input data" cell.
# NOTE(review): assumes predictions are exported to the same datamodel the input was
# read from - confirm.
dm = datamodel

In [None]:
# Export table to DM
# PARAM - table name for exported predictions in DM
dm_export_table_name = 'Predictions_Output'
# Push the combined predictions under that name, passing if_exists='replace' and
# reload_datamodel=False to push_table.
tablecombine = dm.push_table(
    export_df,
    dm_export_table_name,
    reload_datamodel=False,
    if_exists='replace',
)