In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from glob import glob
import warnings
warnings.filterwarnings("ignore")
from tqdm import tnrange, tqdm_notebook

import ruptures as rpt
from utils.custom_cost import CostNew

# 1 Data importing

TEP

In [2]:
files = sorted(glob('TEP_data/*_te.dat'))

In [3]:
columns=[]
for i in range(1, 42):
    columns.append("XMEAS({})".format(i))
for i in range(1, 12):
    columns.append("XMV({})".format(i))

In [4]:
# We do not use the first file because it does not contain a defect
test = {}
for i, j in enumerate(files[1:], start=1):
    test[i] = pd.read_table(j, sep="\s+", names=columns)

# 2 Testing ensembling functions

In [5]:
signal = StandardScaler().fit_transform(test[1].values)

## 2.1 Testing Dynp and DynpEnsembling algorithms

In [6]:
print('Dynp algorithm:')
algo = rpt.Dynp(custom_cost='l1', jump=1).fit(signal)

my_bkps = algo.predict(n_bkps=1)
print(f'Detection delay = {(my_bkps[0]-160)*3} min')

Dynp algorithm:
Detection delay = 153 min


In [7]:
print('DynpEnsembling algorithm:')
c = CostNew()
algo = rpt.DynpEnsembling(custom_cost=c, jump=1, ensembling=5).fit(signal)

my_bkps = algo.predict(n_bkps=1)
print(f'Detection delay = {(my_bkps[0]-160)*3} min')

DynpEnsembling algorithm:
Detection delay = 153 min


## 2.2 Testing Window and WindowEnsembling algorithms

In [8]:
print('Window algorithm:')
algo = rpt.Window(custom_cost='ar', jump=1, width=40).fit(signal)

my_bkps = algo.predict(n_bkps=1)
print(f'Detection delay = {(my_bkps[0]-160)*3} min')

Window algorithm:
Detection delay = 168 min


In [9]:
print('WindowEnsembling algorithm:')
c = CostNew()
algo = rpt.WindowEnsembling(custom_cost=c, jump=1, width=40, ensembling=5).fit(signal)

my_bkps = algo.predict(n_bkps=1)
print(f'Detection delay = {(my_bkps[0]-160)*3} min')

WindowEnsembling algorithm:
Detection delay = 180 min


## 2.3 Testing BinSeg and BinSegEnsembling algorithms

In [10]:
print('BinSeg algorithm:')
algo = rpt.Binseg(custom_cost='l2', jump=1).fit(signal)

my_bkps = algo.predict(n_bkps=1)
print(f'Detection delay = {(my_bkps[0]-160)*3} min')

BinSeg algorithm:
Detection delay = 153 min


In [11]:
print('BinSegEnsembling algorithm:')
c = CostNew()
algo = rpt.BinsegEnsembling(custom_cost=c, jump=1, ensembling=5).fit(signal)

my_bkps = algo.predict(n_bkps=1)
print(f'Detection delay = {(my_bkps[0]-160)*3} min')

BinSegEnsembling algorithm:
Detection delay = 153 min


In [12]:
NUM_CPDE = 17

# 3 Dynp experiment

## 3.1 Individual algorithms

In [13]:
def dynp(cost, data, params):
    ttf = []
    for i in data.keys():
        stsc = StandardScaler()
        signal = stsc.fit_transform(data[i])
        algo = rpt.Dynp(model=cost, 
                        params=params, 
                        jump=1)
        algo.fit(signal)

        my_bkps = algo.predict(n_bkps=1)
        ttf.append(my_bkps[0]-160)
    return pd.DataFrame({(cost+' '+str(*params.values())): ttf}).T

In [14]:
models = (
    {'cost':'ar', 'params':{'order':1}},
    {'cost':'mahalanobis', 'params':{}},
    {'cost':'l1', 'params':{}},
    {'cost':'l2', 'params':{}},
    {'cost':'linear', 'params':{}},
#     {'model':'rbf', 'params':{}},
)

In [15]:
table = []
for model in tqdm_notebook(models):
    results = dynp(data=test, **model)
    table.append(results)

  0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# (pd.concat(table)*3).style.background_gradient(cmap='Blues')

In [17]:
final_table = (pd.concat(table)*3)

pd.DataFrame({'FDR, %':(final_table[(final_table>=0) & (final_table<96)].count(axis=1) / 21 * 100).round(1),
              'ADD (detected)':final_table[(final_table>=0) & (final_table<96)].mean(axis=1).round(1)}).style.background_gradient(cmap='Blues')

Unnamed: 0,"FDR, %",ADD (detected)
ar 1,9.5,46.5
mahalanobis,28.6,21.0
l1,19.0,34.5
l2,9.5,30.0
linear,4.8,0.0


## 3.2 Ensemble

In [18]:
def dynpEnsemble(cost, data, num_agg_func):
    ttf = []
    for i in data.keys():
        stsc = StandardScaler()
        signal = stsc.fit_transform(data[i].values)
        algo = rpt.DynpEnsembling(custom_cost=cost, 
                                  jump=1, 
                                  ensembling=num_agg_func)
        algo.fit(signal)

        try:
            my_bkps = algo.predict(n_bkps=1)
            ttf.append(my_bkps[0]-160)
        except:
            ttf.append(np.NaN)
    return pd.DataFrame({(num_agg_func): ttf}).T

In [19]:
%%time
cost = CostNew()
table1 = []

for n in tnrange(1, NUM_CPDE, desc='agg functions loop'):
    table1.append(dynpEnsemble(cost=cost, data=test, num_agg_func=n))

agg functions loop:   0%|          | 0/16 [00:00<?, ?it/s]

CPU times: user 1h 22min 54s, sys: 14min 7s, total: 1h 37min 2s
Wall time: 16min 22s


In [20]:
# (pd.concat(table1)*3).style.background_gradient(cmap='Blues')

In [21]:
final_table = (pd.concat(table1)*3)

pd.DataFrame({'FDR, %':(final_table[(final_table>=0) & (final_table<96)].count(axis=1) / 21 * 100).round(1),
              'ADD (detected)':final_table[(final_table>=0) & (final_table<96)].mean(axis=1).round(1)}).style.background_gradient(cmap='Blues')

Unnamed: 0,"FDR, %",ADD (detected)
1,28.6,22.5
2,9.5,28.5
3,4.8,0.0
4,28.6,22.5
5,9.5,30.0
6,9.5,30.0
7,19.0,27.8
8,14.3,51.0
9,9.5,30.0
10,9.5,30.0


# 4 Win experiment

## 4.1 Individual algorithms

In [54]:
models = (
    {'cost':'ar', 'params':{'order':1}, 'width':10},
    {'cost':'ar', 'params':{'order':1}, 'width':15},
    {'cost':'ar', 'params':{'order':5}, 'width':20},
    {'cost':'mahalanobis', 'params':{}, 'width':10},
    {'cost':'mahalanobis', 'params':{}, 'width':15},
    {'cost':'mahalanobis', 'params':{}, 'width':20},
    {'cost':'l1', 'params':{}, 'width':10},
    {'cost':'l1', 'params':{}, 'width':15},
    {'cost':'l1', 'params':{}, 'width':20},
    {'cost':'l2', 'params':{}, 'width':10},
    {'cost':'l2', 'params':{}, 'width':15},
    {'cost':'l2', 'params':{}, 'width':20},
#     {'cost':'linear', 'params':{}, 'width':10},
#     {'cost':'linear', 'params':{}, 'width':40},
#     {'cost':'linear', 'params':{}, 'width':100}
#     {'model':'rbf', 'params':{}, 'width':40},
#     {'model':'rbf', 'params':{}, 'width':100},
)

In [55]:
def wind(cost, data, params, width):
    ttf = []
    for i in data.keys():
        stsc = StandardScaler()
        signal = stsc.fit_transform(data[i].values)
        algo = rpt.Window(model=cost, 
                          params=params, 
                          width=width, 
                          jump=1)
        algo.fit(signal)

        my_bkps = algo.predict(n_bkps=1)
        ttf.append(my_bkps[0]-160)
    return pd.DataFrame({(cost+' '+str(*params.values()), width): ttf}).T

In [56]:
table = []
for model in tqdm_notebook(models, desc='agg functions loop'):
    table.append(wind(**model, data=test))

agg functions loop:   0%|          | 0/12 [00:00<?, ?it/s]

In [57]:
# (pd.concat(table)*3).style.background_gradient(cmap='Blues')

In [58]:
final_table = (pd.concat(table)*3)

pd.DataFrame({'FDR, %':(final_table[(final_table>=0) & (final_table<96)].count(axis=1) / 21 * 100).round(1),
              'ADD (detected)':final_table[(final_table>=0) & (final_table<96)].mean(axis=1).round(1)}).style.background_gradient(cmap='Blues')

Unnamed: 0,Unnamed: 1,"FDR, %",ADD (detected)
ar 1,10,4.8,18.0
ar 1,15,4.8,9.0
ar 5,20,4.8,18.0
mahalanobis,10,19.0,11.2
mahalanobis,15,14.3,16.0
mahalanobis,20,28.6,13.0
l1,10,4.8,18.0
l1,15,4.8,21.0
l1,20,0.0,
l2,10,9.5,31.5


## 4.2 Ensemble

In [59]:
def windowEnsemble(cost, data, num_agg_func, width):
    ttf = []
    for i in data.keys():
        stsc = StandardScaler()
        signal = stsc.fit_transform(data[i].values)
        algo = rpt.WindowEnsembling(custom_cost=cost, 
                                    jump=1, 
                                    ensembling=num_agg_func, 
                                    width=width)
        algo.fit(signal)

        try:
            my_bkps = algo.predict(n_bkps=1)
            ttf.append(my_bkps[0]-160)
        except:
            ttf.append(np.NaN)
    return pd.DataFrame({(num_agg_func, width): ttf}).T

In [60]:
%%time
cost = CostNew()
table1 = []

for n in tnrange(1, NUM_CPDE, desc='agg functions loop'):
    for w in tqdm_notebook([10, 15, 20], desc='width loop', leave=False):
        table1.append(windowEnsemble(cost=cost, data=test, num_agg_func=n, width=w))

agg functions loop:   0%|          | 0/16 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

width loop:   0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 18min 6s, sys: 1min 35s, total: 19min 42s
Wall time: 9min 25s


In [61]:
# (pd.concat(table1)*3).style.background_gradient(cmap='Blues')

In [62]:
final_table = (pd.concat(table1)*3)

pd.DataFrame({'FDR, %':(final_table[(final_table>=0) & (final_table<96)].count(axis=1) / 21 * 100).round(1),
              'ADD (detected)':final_table[(final_table>=0) & (final_table<96)].mean(axis=1).round(1)}).style.background_gradient(cmap='Blues')

Unnamed: 0,Unnamed: 1,"FDR, %",ADD (detected)
1,10,14.3,1.0
1,15,19.0,25.5
1,20,14.3,22.0
2,10,9.5,15.0
2,15,23.8,38.4
2,20,19.0,28.5
3,10,4.8,0.0
3,15,9.5,33.0
3,20,19.0,33.8
4,10,9.5,22.5


# 5 BinSeg experiment

## 5.1 Individual algorithms

In [31]:
models = (
    {'cost':'ar', 'params':{'order':1}},
    {'cost':'mahalanobis', 'params':{}},
    {'cost':'l1', 'params':{}},
    {'cost':'l2', 'params':{}},
    {'cost':'linear', 'params':{}},
#     {'model':'rbf', 'params':{}},
)

In [32]:
def binseg(cost, data, params):
    ttf = []
    for i in data.keys():
        stsc = StandardScaler()
        signal = stsc.fit_transform(data[i].values)
        algo = rpt.Binseg(model=cost, 
                          params=params, 
                          jump=1)
        algo.fit(signal)

        my_bkps = algo.predict(n_bkps=1)
        ttf.append(my_bkps[0]-160)
    return pd.DataFrame({(cost+' '+str(*params.values())): ttf}).T

In [33]:
table = []
for model in tqdm_notebook(models, desc='agg functions loop'):
    table.append(binseg(**model, data=test))

agg functions loop:   0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
# (pd.concat(table)*3).style.background_gradient(cmap='Blues')

In [35]:
final_table = (pd.concat(table)*3)

pd.DataFrame({'FDR, %':(final_table[(final_table>=0) & (final_table<96)].count(axis=1) / 21 * 100).round(1),
              'ADD (detected)':final_table[(final_table>=0) & (final_table<96)].mean(axis=1).round(1)}).style.background_gradient(cmap='Blues')

Unnamed: 0,"FDR, %",ADD (detected)
ar 1,9.5,46.5
mahalanobis,28.6,21.0
l1,19.0,34.5
l2,9.5,30.0
linear,4.8,0.0


## 5.2 Ensemble

In [36]:
def binsegEnsemble(cost, data, num_agg_func):
    ttf = []
    for i in data.keys():
        stsc = StandardScaler()
        signal = stsc.fit_transform(data[i].values)
        algo = rpt.BinsegEnsembling(custom_cost=cost, 
                                    jump=1, 
                                    ensembling=num_agg_func)
        algo.fit(signal)

        try:
            my_bkps = algo.predict(n_bkps=1)
            ttf.append(my_bkps[0]-160)
        except:
            ttf.append(np.NaN)
    return pd.DataFrame({(num_agg_func): ttf}).T

In [37]:
%%time
cost = CostNew()
table1 = []

for n in tnrange(1, NUM_CPDE, desc='agg functions loop'):
    table1.append(binsegEnsemble(cost, data=test, num_agg_func=n))

agg functions loop:   0%|          | 0/16 [00:00<?, ?it/s]

CPU times: user 2h 25min 13s, sys: 22min 18s, total: 2h 47min 32s
Wall time: 28min 2s


In [38]:
# (pd.concat(table1)*3).style.background_gradient(cmap='Blues')

In [39]:
final_table = (pd.concat(table1)*3)

pd.DataFrame({'FDR, %':(final_table[(final_table>=0) & (final_table<96)].count(axis=1) / 21 * 100).round(1),
              'ADD (detected)':final_table[(final_table>=0) & (final_table<96)].mean(axis=1).round(1)}).style.background_gradient(cmap='Blues')

Unnamed: 0,"FDR, %",ADD (detected)
1,28.6,22.5
2,9.5,28.5
3,4.8,0.0
4,28.6,22.5
5,9.5,30.0
6,9.5,30.0
7,9.5,30.0
8,14.3,51.0
9,9.5,30.0
10,9.5,30.0
