In [205]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [206]:
## Load the raw transaction history from the 'HistoricalTransactions' sheet
df = pd.read_excel(
    "AIB_History_Training_2014-2020.xlsx",
    sheet_name='HistoricalTransactions',
)
df.head()

Unnamed: 0,FileFolder,FileName,Report_Title,Report_RunDate,Report_AsOfDate,Report_TransactionEffectiveDate,Currency,Institution,Branch,TransactionGroup,TransactionCode,TransactionBackdateFlag,TransactionAmount
0,Generic Random Data,AIBC0101.001,AIBC 0809TTTBB,2014-01-01,2013-12-31,2013-12-30,CAD,1,1,D,CP,Yes,-972.619715
1,Generic Random Data,AIBC0101.001,AIBC 0809TTTBB,2014-01-01,2013-12-31,2013-12-30,CAD,1,1,D,UR,Yes,-75.956876
2,Generic Random Data,AIBC0101.001,AIBC 0809TTTBB,2014-01-01,2013-12-31,2013-12-30,CAD,1,1,C,CP,Yes,127.43047
3,Generic Random Data,AIBC0101.001,AIBC 0809TTTBB,2014-01-01,2013-12-31,2013-12-30,CAD,1,1,D,UR,Yes,-38.300469
4,Generic Random Data,AIBC0101.001,AIBC 0809TTTBB,2014-01-01,2013-12-31,2013-12-30,CAD,1,1,D,UR,Yes,-117.467137


In [207]:
## Keep only the three columns used downstream (effective date, group, amount).
## Selecting the columns to keep, instead of dropping the other ten, makes this
## cell safe to re-run: a second `drop` on the already-trimmed frame would raise
## KeyError because the dropped columns no longer exist.
df = df[["Report_TransactionEffectiveDate", "TransactionGroup", "TransactionAmount"]]
df.shape

(67731, 3)

In [208]:
## Separate the transactions by group: "C" rows are credits, "D" rows are debits.
## Each subset is copied so it is independent of `df`.
credit_df = df.loc[df["TransactionGroup"].eq("C")].copy()
debit_df = df.loc[df["TransactionGroup"].eq("D")].copy()

print("Credit Shape: ", credit_df.shape)
print("Debit Shape: ", debit_df.shape)

Credit Shape:  (43402, 3)
Debit Shape:  (24329, 3)


In [209]:
## Total the amounts per effective date, then re-index onto a daily calendar.
## Dates with no transactions show up as NaN and are counted below.
daily_credit = (
    credit_df
    .groupby("Report_TransactionEffectiveDate")["TransactionAmount"]
    .sum()
    .asfreq("D")
)
daily_debit = (
    debit_df
    .groupby("Report_TransactionEffectiveDate")["TransactionAmount"]
    .sum()
    .asfreq("D")
)

print("Daily Credit Missing Values: ", daily_credit.isna().sum())
print("Daily Debit Missing Values: ", daily_debit.isna().sum())

Daily Credit Missing Values:  809
Daily Debit Missing Values:  787


In [210]:
## Fill missing days by carrying the previous day's total forward; any gap at
## the very start of the series is back-filled from the first observed value.
## `.ffill()` / `.bfill()` replace `fillna(method=...)`, which is deprecated in
## pandas >= 2.1; the result is the same.
## NOTE(review): this cell was previously labelled "Fill n/a with 0", but the
## code has always forward/backward filled. If a day without transactions
## should count as a zero total, use `.fillna(0)` instead -- confirm intent.
daily_credit = daily_credit.ffill().bfill()
daily_debit = daily_debit.ffill().bfill()

print("Daily Credit Missing Values: ", daily_credit.isnull().sum())
print("Daily Debit Missing Values: ", daily_debit.isnull().sum())

Daily Credit Missing Values:  0
Daily Debit Missing Values:  0


In [211]:
## Number of daily observations in each series after gap filling
print("Daily Credit Rows: ", len(daily_credit))
print("Daily Debit Rows: ", len(daily_debit))

Daily Credit Rows:  2559
Daily Debit Rows:  2559


In [212]:
## Function to split train and test sets
def split_train_test_time_series(data, n_splits=5, test_size=None):
    """Split a time-ordered series into one chronological train/test pair.

    The last ``test_size`` observations form the test set and everything
    before them forms the training set, so the test data is always strictly
    later than the training data. This is the same pair as the final fold of
    scikit-learn's ``TimeSeriesSplit(n_splits, test_size=test_size)`` with the
    default ``gap=0`` and ``max_train_size=None``.

    The split is taken directly instead of generating every fold. Generating
    folds requires ``len(data) > n_splits * test_size``, which made a 20%
    hold-out with the default ``n_splits=5`` fail whenever the series length
    was a multiple of 5.

    Parameters
    ----------
    data : pandas.Series
        Observations in chronological order. Anything that supports ``len()``
        and positional ``.iloc`` slicing works.
    n_splits : int, default 5
        Only used when ``test_size`` is None, to derive the default test
        length ``len(data) // (n_splits + 1)``.
    test_size : int or None, default None
        Number of trailing observations to hold out for testing.

    Returns
    -------
    (train, test)
        Two independent copies: ``data`` without its last ``test_size`` rows,
        and those last ``test_size`` rows.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1, or if the resulting test size is
        not strictly between 0 and ``len(data)`` (this covers empty input).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits!r}")

    n_samples = len(data)
    if test_size is None:
        # Same default as TimeSeriesSplit: one equal share out of n_splits + 1.
        test_size = n_samples // (n_splits + 1)

    # Both partitions must be non-empty.
    if not 0 < test_size < n_samples:
        raise ValueError(
            f"test_size={test_size!r} must be between 1 and {n_samples - 1} "
            f"for a series of length {n_samples}"
        )

    cut = n_samples - test_size
    # Copy so the returned pieces never alias `data` (matches the previous
    # integer-array indexing, which also produced copies).
    return data.iloc[:cut].copy(), data.iloc[cut:].copy()

## Hold out the most recent 20% of each daily series as the test set
credit_train, credit_test = split_train_test_time_series(
    daily_credit,
    test_size=int(len(daily_credit) * 0.2),
)
debit_train, debit_test = split_train_test_time_series(
    daily_debit,
    test_size=int(len(daily_debit) * 0.2),
)


(2048,) (511,)


In [None]:
## Pick the (p, d, q) order for the ARIMA model using auto_arima to search for the best parameters
