**Self-Organised Direction Aware Data Partitioning Algorithm Combined With Type-2 Fuzzy Time Series**


Arthur C. V. e Pinto, Thiago E. Fernandes, Petrônio C. L. Silva, Frederico G. Guimarães, Christian Wagner, Eduardo P. de Aguiar. Publication forthcoming.

# SODA-T2FTS

In [None]:
pip install --upgrade numpy==1.18.5

In [None]:
!git clone https://github.com/arthurcaio92/SODA_T2FTS.git

In [None]:
pip install XlsxWriter

## Experiments

**CAUTION:** This task is computationally expensive and takes several hours to run.

In [None]:
# -*- coding: utf-8 -*-
from SODA_T2FTS.SODA_T2FTS.sliding_window import run_sliding_window
from SODA_T2FTS.SODA_T2FTS.datasets import get_TAIEX,get_NASDAQ,get_SP500
import numpy as np
import pandas as pd

# ------------------------------ Data set import ------------------------------
# Each loader result exposes an average-price column (`avg` / `Avg`); only that
# column is kept, converted to a NumPy array.

taiex_df = get_TAIEX()
taiex = taiex_df.avg.to_numpy()

nasdaq_df = get_NASDAQ()
nasdaq = nasdaq_df.avg.to_numpy()

sp500_df = get_SP500()
# The S&P 500 column is capitalised ('Avg'), and only the slice 11500:16000
# of the series is used.
sp500 = sp500_df.Avg[11500:16000].to_numpy()

# --------------------------- Grid-search parameters ---------------------------
# Everything below is handed unchanged to run_sliding_window.

datasets = [taiex, nasdaq, sp500]
dataset_names = ['TAIEX', 'NASDAQ', 'SP500']   # same order as `datasets`
diff = 1                                       # NOTE(review): presumably the differencing setting - confirm in run_sliding_window
partition_parameters = np.arange(1, 11)        # values 1..10
orders = [1, 2, 3]
partitioners = ['SODA']
mfs = ['triangular']                           # NOTE(review): presumably membership-function names - confirm

# ----------------------------- Running the model -----------------------------
# Builds and runs the model.
run_sliding_window(datasets, dataset_names, diff, partition_parameters, orders, partitioners, mfs)

'When done, excel files (.xlsx) will be generated for each data set with error metrics'

# FTS models

## Setup


In [None]:
# pyFTS is pinned to a specific commit. GitHub no longer serves the
# unauthenticated git:// protocol, so the repository is fetched over https.
!pip3 install -U git+https://github.com/PYFTS/pyFTS.git@5563af3079a91aac1717557224f8629370ea9f22

!pip3 install -U git+https://github.com/nicolaskruchten/jupyter_pivottablejs.git
!git clone https://github.com/petroniocandido/stac
!pip3 install dill
!wget https://github.com/petroniocandido/PWFTS/raw/master/benchmarks.db.gz


In [None]:
pip install XlsxWriter

In [None]:
!gunzip benchmarks.db.gz

## Data set import

In [None]:
from pyFTS.common import Util as cUtil
from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil
from pyFTS.models import pwfts
from pyFTS.common import Transformations
tdiff = Transformations.Differential(1)
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import time

from pyFTS.data import TAIEX, NASDAQ, SP500

dataset_names = ["TAIEX", "SP500","NASDAQ"]

def get_dataset(name):
    """Return the series registered under *name*.

    The lookup uses the *name* argument itself rather than a global variable,
    so the function can be called from anywhere, not only from a loop whose
    variable happens to be called ``dataset_name``.

    Args:
        name: one of ``"TAIEX"``, ``"SP500"`` or ``"NASDAQ"``.

    Returns:
        The result of the matching pyFTS ``get_data()`` call; for ``"SP500"``
        only the slice ``[11500:16000]`` is returned.  ``None`` is returned
        for any other name.
    """
    if name == "TAIEX":
        return TAIEX.get_data()
    elif name == "SP500":
        return SP500.get_data()[11500:16000]
    elif name == "NASDAQ":
        return NASDAQ.get_data()
    # Unknown names fall through, as before.
    return None

# `plt` is used below but was not imported anywhere in this cell.
import matplotlib.pyplot as plt

# Not used in this cell; presumably consumed by later cells.
train_split = 2000
test_length = 200

# One column per data set: raw series on the first row, differenced series on
# the second.  squeeze=False keeps `ax` two-dimensional for any column count.
fig, ax = plt.subplots(nrows=2, ncols=len(dataset_names), figsize=[10,5], squeeze=False)

for count, dataset_name in enumerate(dataset_names):
    dataset = get_dataset(dataset_name)
    dataset_diff = tdiff.apply(dataset)

    ax[0][count].plot(dataset)
    ax[1][count].plot(dataset_diff)
    ax[0][count].set_title(dataset_name)

## Auxiliary Functions

In [None]:
def synthetic_dataframe(file, tag, measure, transformation, benchmark_model):
    """Summarise benchmark results and keep the best setting per model/order.

    Two result sets are read through ``bUtil.get_dataframe_from_bd``: the rows
    of the requested *measure* and the rows of the ``'time'`` measure, both
    restricted to *tag* and to the same transformation/partition condition.
    For every (dataset, model, order) - and, for non-benchmark models, every
    partition count - the measure values are aggregated.  Only the row with
    the lowest ``AVG`` (ties broken by ``STD``) is kept for each
    (dataset, model, order).

    Args:
        file: database location handed to ``bUtil.get_dataframe_from_bd``.
        tag: value matched against the ``tag`` column.
        measure: value matched against the ``measure`` column.
        transformation: truthy selects rows whose ``transformation`` is not
            null, falsy selects rows where it is null.  The value is also
            copied as-is into the ``Transformation`` output column.
        benchmark_model: truthy selects rows whose ``partitions`` is null and
            aggregates per order only; falsy selects rows with non-null
            ``partitions`` and aggregates per partition count as well.

    Returns:
        A DataFrame with columns ``Dataset, Model, Transformation, Order,
        Partitions, AVG, STD, Rules, Tempo`` sorted by ``AVG`` then ``STD``.
        ``AVG``/``STD`` are the NaN-ignoring mean and standard deviation of
        ``Value``; ``Rules`` is the NaN-ignoring mean of ``Size`` (the literal
        ``"No"`` for benchmark models); ``Tempo`` is the sum of the ``'time'``
        values of the same group.
    """
    # NOTE(review): the filter is assembled by string concatenation, so `tag`
    # and `measure` must come from trusted code, never from user input.
    where_suffix = ("and transformation is " + (" not null " if transformation else " null ") +
                    "and partitions is " + ("null " if benchmark_model else "not null"))

    def _query(measure_name):
        # Both result sets share everything except the measure name.
        return bUtil.get_dataframe_from_bd(
            file, "tag = '" + tag + "' and measure = '" + measure_name + "' " + where_suffix)

    def _rows(frame, dataset, model, order, partition=None):
        # The mask is always built from `frame` itself.  Re-using a mask
        # computed on another frame only works while both queries happen to
        # return their rows in the same positions.
        mask = (frame.Dataset == dataset) & (frame.Model == model) & (frame.Order == order)
        if partition is not None:
            mask = mask & (frame.Partitions == partition)
        return frame[mask]

    df_time = _query('time')
    df = _query(measure)

    data = []
    models = df.Model.unique()
    datasets = df.Dataset.unique()
    for dataset in datasets:
        for model in models:
            group = df[(df.Dataset == dataset) & (df.Model == model)]
            orders = group.Order.unique()
            # Benchmark models have no partitioning, so one pass with
            # partition=None is made for each order.
            partitions = [None] if benchmark_model else group.Partitions.unique()
            for order in orders:
                for partition in partitions:
                    scores = _rows(df, dataset, model, order, partition)
                    timings = _rows(df_time, dataset, model, order, partition)
                    avg = np.nanmean(scores.Value)
                    std = np.nanstd(scores.Value)
                    rules = "No" if benchmark_model else np.nanmean(scores.Size)
                    tempo = np.sum(timings.Value)
                    data.append([dataset, model, transformation, order, partition, avg, std, rules, tempo])

    dat = pd.DataFrame(data, columns=['Dataset','Model','Transformation','Order','Partitions','AVG','STD','Rules','Tempo'])
    # After sorting, the first row of any subset is its best (lowest AVG) row.
    dat = dat.sort_values(['AVG','STD'])

    best = []
    for dataset in datasets:
        for model in models:
            candidates = dat[(dat.Dataset == dataset) & (dat.Model == model)]
            for order in candidates.Order.unique():
                best.append(candidates[candidates.Order == order].index[0])

    return dat.loc[best].sort_values(['AVG','STD'])
    
def filter_db_by(file, df, tag, measure):
    """Fetch the raw benchmark rows behind every summary row of *df*.

    Each row of *df* becomes one parenthesised group of SQL conditions; the
    groups are OR-ed together and combined with the *tag* and *measure*
    conditions.  The resulting filter is passed to
    ``bUtil.get_dataframe_from_bd``.

    Args:
        file: database location handed to ``bUtil.get_dataframe_from_bd``.
        df: summary frame.  It must contain the columns ``AVG``, ``STD``,
            ``Partitions``, ``Transformation`` and ``Order``; every remaining
            column is matched by equality against a database column of the
            same name.  The frame is not modified.
        tag: value matched against the ``tag`` column.
        measure: value matched against the ``measure`` column.

    Returns:
        Whatever ``bUtil.get_dataframe_from_bd`` returns for the filter.
        Rows of *df* that cannot be converted are printed and skipped.
    """
    # NOTE(review): the filter is built by string concatenation; values must
    # come from trusted code, never from user input.
    groups = []
    # iterrows() yields one Series per row whatever the index looks like, so
    # the caller's index no longer has to be overwritten.
    for _, row in df.iterrows():
        try:
            # Aggregates have no per-run counterpart to match against.
            row.pop("AVG")
            row.pop("STD")
            part = row.pop('Partitions')
            transf = row.pop('Transformation')
            order = row.pop('Order')
            # 'Rules' and 'Tempo' are aggregate columns added by
            # synthetic_dataframe, so they are dropped when present.
            # NOTE(review): assumes the benchmarks table has no columns with
            # these names - confirm against the pyFTS schema.
            row = row.drop(labels=['Rules', 'Tempo'], errors='ignore')

            conditions = [key + "='" + str(row[key]) + "'" for key in row.keys()]
            conditions.append("benchmarks.'order' = " + str(order))
            # pd.isnull covers both None and NaN, which is what None turns
            # into inside a numeric column.
            conditions.append("Partitions " + (" is null " if pd.isnull(part) else "= " + str(part)))
            conditions.append("Transformation " + (" is not null " if transf else " is null "))
            groups.append("(" + " and ".join(conditions) + ")")
        except Exception as ex:
            # Best effort: report the offending row and carry on.
            print(ex)
            print(row)

    # With no usable row the group list is empty; "()" is not valid SQL, so
    # an always-false condition is used and no rows are selected.
    selection = " or ".join(groups) if groups else "1 = 0"
    sql = "tag = '" + tag + "' and measure = '" + measure + "' and (" + selection + ")"

    return bUtil.get_dataframe_from_bd(file, sql)
            
    
def split_measurements_by(df, dataset, exclude=[], dump=False):
    """Split the ``Value`` column of *df* into one array per model and order.

    Args:
        df: frame with the columns ``Dataset``, ``Model``, ``Order`` and
            ``Value``.
        dataset: only rows whose ``Dataset`` equals this value are used.
        exclude: substrings; a model whose name contains any of them is
            skipped.  The default list is only read, never modified.
        dump: when true, print each model/order and its number of values.

    Returns:
        A tuple ``(ret, mods)`` of two lists with the same length.  ``ret[i]``
        is the array of values and ``mods[i]`` its label: the model name
        followed by the order.  A model without rows for *dataset* contributes
        an empty array labelled with the bare model name.
    """
    ret = []
    mods = []
    for model in df.Model.unique():
        if any(k in model for k in exclude):
            continue

        of_model = df[(df.Dataset == dataset) & (df.Model == model)]
        orders = of_model.Order.unique()
        if len(orders) > 0:
            for order in orders:
                if dump: print(model, order)
                values = of_model[of_model.Order == order]["Value"].values
                ret.append(values)
                # One label per appended array keeps `mods` aligned with `ret`.
                mods.append(model + str(order))
                if dump: print(len(values))
        else:
            if dump: print(model)
            values = of_model["Value"].values
            ret.append(values)
            # No order exists here, so the label is the model name alone.
            mods.append(model)
            if dump: print(len(values))

    return (ret, mods)
  

## Experiments

**CAUTION:** This task is computationally expensive and takes several hours to run.

In [None]:
from pyFTS.common import Transformations
from pyFTS.data import TAIEX, SP500, NASDAQ
from pyFTS.benchmarks import benchmarks as bchmk, Util as bUtil
from pyFTS.benchmarks import Measures, naive, arima, ResidualAnalysis, quantreg, knn

# First-order differencing, applied as a transformation in the benchmark run.
tdiff = Transformations.Differential(1)
# Forecast types to benchmark; position i selects entry i of the two lists below.
types = ['point']
dataset_names = ["SP500", "TAIEX","NASDAQ"]

# Label stored with every result row of this run.
tag = "fre"

# ARIMA orders and alpha levels shared by the parameter lists.
_arima_orders = [(1, 0, 0), (1, 0, 1), (2, 0, 1), (2, 0, 2)]
_alphas = (.05, .25)

# One list of model classes per entry of `types`.
benchmark_methods = [
    [arima.ARIMA] * 4 + [naive.Naive],
    [arima.ARIMA] * 8 + [quantreg.QuantileRegression] * 4,
    [arima.ARIMA] * 4 + [quantreg.QuantileRegression] * 2 + [knn.KNearestNeighbors] * 3,
]

# Keyword arguments for the classes above, matched by position.
benchmark_methods_parameters = [
    # four ARIMA orders, then Naive without parameters
    [{'order': arima_order} for arima_order in _arima_orders] + [{}],
    # every ARIMA order at both alphas, then quantile regression of order 1
    # and 2 at both alphas
    [{'order': arima_order, 'alpha': alpha} for arima_order in _arima_orders for alpha in _alphas]
    + [{'order': lag, 'alpha': alpha} for lag in (1, 2) for alpha in _alphas],
    # four ARIMA orders, quantile regression with dist=True, then kNN
    [{'order': arima_order} for arima_order in _arima_orders]
    + [{'order': lag, 'dist': True} for lag in (1, 2)]
    + [{'order': lag} for lag in (1, 2, 3)],
]

# Run the sliding-window benchmark for every data set and forecast type; the
# results are written to benchmarks.db under `tag`.
for dataset_name in dataset_names:
    dataset = get_dataset(dataset_name)

    for ct, _type in enumerate(types):
        # Rebuilt on every pass so each call receives fresh argument objects.
        # To run distributed, add e.g.
        #   distributed=True, nodes=['192.168.0.110', '192.168.0.107', '192.168.0.106']
        run_options = dict(
            train=0.8,
            inc=0.2,
            benchmark_models=True,
            benchmark_methods=benchmark_methods[ct],
            benchmark_methods_parameters=benchmark_methods_parameters[ct],
            transformations=[tdiff],
            orders=[1,2,3],
            partitions=np.arange(1, 51),
            progress=False,
            type=_type,
            steps_ahead=[1],
            file="benchmarks.db",
            dataset=dataset_name,
            tag=tag,
        )
        bchmk.sliding_window_benchmarks(dataset, 1000, **run_options)
 

# Generate the dataframes from the sql database: one summary for the
# partitioned models and one for the benchmark models, both for the rmse
# measure with a transformation applied.  `tag` is the label the benchmark
# run above stored its results under.
abc1 = synthetic_dataframe("benchmarks.db", tag, "rmse", True, False)
abc2 = synthetic_dataframe("benchmarks.db", tag, "rmse", True, True)
_filter = pd.concat([abc1, abc2])

# Save to excel.  The context manager writes and closes the workbook on exit;
# ExcelWriter.save() no longer exists in current pandas releases.
nome_arquivo = 'PYFTS_all_models.xlsx'
with pd.ExcelWriter(nome_arquivo, engine='xlsxwriter') as writer:
    _filter.to_excel(writer, sheet_name='Results')