In [1]:
import sys, os

In [2]:
from tqdm.notebook import tqdm

In [3]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
import sys, os
import numpy as np
import pandas as pd
import pickle

In [6]:
sys.path.append(os.path.abspath('../../'))
from utils import utils
from utils.utils import evaluate_experiment
sys.path.append(os.path.abspath('../'))
from timeseries_utils import *

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
path_to_data='../../../'

In [9]:
data=np.load(os.path.abspath(path_to_data+'data-002.npy'),allow_pickle=True)
Y=np.load(path_to_data+'Y.npy',allow_pickle=True)
labels =pd.read_csv(path_to_data+'labels.csv')

In [10]:
train_fold=8
val_fold=9
test_fold=10

# 10th fold for testing (9th for now)
X_test = data[labels.strat_fold == test_fold]
y_test = Y[labels.strat_fold == test_fold]
# 9th fold for validation (8th for now)
X_val = data[labels.strat_fold == val_fold]
y_val = Y[labels.strat_fold == val_fold]
# rest for training
X_train = data[labels.strat_fold <= train_fold]
y_train = Y[labels.strat_fold <= train_fold]

In [11]:
X_train.shape ,X_val.shape

((17111, 1000, 12), (2156, 1000, 12))

In [12]:
# Preprocess signal data
X_train, X_val, X_test = utils.preprocess_signals(X_train, X_val, X_test,'/content/')
n_classes = y_train.shape[1]
X_train = np.reshape(X_train,[X_train.shape[0],X_train.shape[2],X_train.shape[1]])
X_val = np.reshape(X_val,[X_val.shape[0],X_val.shape[2],X_val.shape[1]])
X_test = np.reshape(X_test,[X_test.shape[0],X_test.shape[2],X_test.shape[1]])

In [13]:
X_train.shape, X_val.shape

((17111, 12, 1000), (2156, 12, 1000))

In [14]:
y_train.shape, y_val.shape

((17111, 5), (2156, 5))

## Converting data to tsfresh format

In [15]:
list_of_sample_df = []
for sample_number, sample in tqdm(enumerate(X_train)):
    df = pd.DataFrame(sample).T
    df = df.reset_index().rename(columns={"index": "time"})
    df['id'] = sample_number
    list_of_sample_df.append(df)
fresh_train = pd.concat(list_of_sample_df,axis=0)
fresh_train = fresh_train[["id", "time", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
fresh_train.head()

Unnamed: 0,id,time,0,1,2,3,4,5,6,7,8,9,10,11
0,0,0,-0.506445,-0.442161,0.110678,-0.292166,-0.270738,0.277815,-0.31788,2.004902,0.119249,0.020681,0.05068,-0.099315
1,0,1,-0.232168,-0.112172,0.003538,-0.05646,0.316385,0.187818,-0.249311,-0.000747,0.114963,-0.275024,-0.073602,-0.1336
2,0,2,0.277815,-0.103601,-0.125029,0.243531,0.106392,0.179247,0.072108,-1.749262,0.080679,-0.292166,-0.013604,-0.116458
3,0,3,0.372098,-0.000747,0.05068,0.183533,0.342099,0.132106,0.290672,-1.50927,-0.335022,0.136391,-0.09503,0.153534
4,0,4,-0.386449,-0.129314,-0.309309,-0.262167,0.27353,0.123535,-0.193598,0.667803,0.509236,0.162105,-0.035032,0.226388


In [17]:
fresh_train.shape

(17111000, 14)

### Multilabel y to just multiclass y

percorre todos os labels, transforma em string

In [18]:
y_train_tsfresh = pd.DataFrame(y_train).astype(str)

e converte binario pra inteiro, python tem uma funcao de conversao, "0,0,0,0,0" vai vira 0 no caso

int('11111111', 2),
, 2 é base 2
'111111' é o numero bnario, 
entende?

In [19]:
def multilabel_to_int(line):
    line = ''.join(line)
    return int(line, 2)

y_train_tsfresh = y_train_tsfresh.apply(multilabel_to_int,axis=1)

In [20]:
y_train_tsfresh.shape

(17111,)

### The robot example

## Extrating features

In [21]:
ids_per_iteration = 500
lines_per_iteration = ids_per_iteration*1000
print('Lines per it: ',lines_per_iteration)

Lines per it:  500000


## Loading extracted features

In [22]:
n_loops = int(np.ceil(len(fresh_train)/lines_per_iteration))
X_list = []
for i in tqdm(range(n_loops)):
    with open(f'tsfresh_pickles/tsfresh_comprehensive_features_{i}_of_{n_loops}.pickle', 'rb') as handle:
        X_list.append(pickle.load(handle))

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




In [23]:
len(X_list)

35

In [24]:
X = pd.concat(X_list)

In [25]:
X.shape

(17111, 9156)

## Feature selection

Save the selected features

In [26]:
with open('tsfresh_comprehensive_selected_features.pickle', 'rb') as handle:
    X_filtered = pickle.load(handle)

In [27]:
X_filtered.shape

(17111, 4884)

## Validating model

Converting to tsfresh format

In [28]:
list_of_sample_df = []
for sample_number, sample in tqdm(enumerate(X_val)):
    df = pd.DataFrame(sample).T
    df = df.reset_index().rename(columns={"index": "time"})
    df['id'] = sample_number
    list_of_sample_df.append(df)
fresh_val = pd.concat(list_of_sample_df,axis=0)
fresh_val = fresh_val[["id", "time", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [29]:
fresh_val.shape

(2156000, 14)

In [30]:
def extract_parts(df, lines_per_iteration,tag='train'):
    n_loops = int(np.ceil(len(df)/lines_per_iteration))
    extraction_settings = ComprehensiveFCParameters()
    for i in tqdm(range(n_loops)):
        index_start = (lines_per_iteration)*i
        index_end = (lines_per_iteration)*(i+1)
        if index_end > len(df):
            index_end = len(df)
        X = extract_features(df[index_start:index_end], column_id='id', column_sort='time',
             default_fc_parameters=extraction_settings,
             # we impute = remove all NaN features automatically
             impute_function=impute)

        with open(f'tsfresh_pickles/tsfresh_comprehensive_{tag}_features_{i}_of_{n_loops}.pickle', 'wb') as handle:
            pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [31]:
def load_tsfresh_features(df, lines_per_iteration, tag='train'):
    n_loops = int(np.ceil(len(df)/lines_per_iteration))
    X_list = []
    for i in tqdm(range(n_loops)):
        with open(f'tsfresh_pickles/tsfresh_comprehensive_{tag}_features_{i}_of_{n_loops}.pickle', 'rb') as handle:
            X_list.append(pickle.load(handle))
    return pd.concat(X_list)

In [32]:
X_val = load_tsfresh_features(fresh_val, lines_per_iteration ,tag='val')
X_val.shape

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




(2156, 9156)

In [33]:
X_val_filtered = X_val[X_filtered.columns]

In [34]:
y_val.shape

(2156, 5)

## Train and evaluate classifier

In [36]:
classifier_full = DecisionTreeClassifier()
classifier_full.fit(X, y_train)
preds_full = classifier_full.predict(X_val)
print(classification_report(y_val, preds_full))

              precision    recall  f1-score   support

           0       0.40      0.43      0.42       497
           1       0.39      0.38      0.39       271
           2       0.40      0.38      0.39       544
           3       0.65      0.65      0.65       957
           4       0.46      0.49      0.47       534

   micro avg       0.50      0.50      0.50      2803
   macro avg       0.46      0.47      0.46      2803
weighted avg       0.50      0.50      0.50      2803
 samples avg       0.52      0.52      0.50      2803



In [39]:
classifier_filtered = DecisionTreeClassifier()
classifier_filtered.fit(X_filtered, y_train)
preds_filtered = classifier_filtered.predict(X_val_filtered)
print(classification_report(y_val, preds_filtered))

              precision    recall  f1-score   support

           0       0.44      0.46      0.45       497
           1       0.34      0.32      0.33       271
           2       0.42      0.44      0.43       544
           3       0.66      0.65      0.65       957
           4       0.45      0.45      0.45       534

   micro avg       0.50      0.50      0.50      2803
   macro avg       0.46      0.46      0.46      2803
weighted avg       0.50      0.50      0.50      2803
 samples avg       0.52      0.52      0.50      2803



## Train and evaluate multilabel classifier

In [35]:
X.shape, y_train.shape

((17111, 9156), (17111, 5))

In [54]:
classifier_forest_full = RandomForestClassifier()
classifier_forest_full.fit(X, y_train)
preds_forest_full = classifier_forest_full.predict(X_val)
print(classification_report(y_val, preds_forest_full))

              precision    recall  f1-score   support

           0       0.84      0.26      0.40       497
           1       0.77      0.20      0.32       271
           2       0.81      0.22      0.35       544
           3       0.78      0.73      0.75       957
           4       0.79      0.20      0.32       534

   micro avg       0.79      0.39      0.53      2803
   macro avg       0.80      0.32      0.43      2803
weighted avg       0.80      0.39      0.49      2803
 samples avg       0.47      0.43      0.44      2803



In [37]:
X_filtered.shape, y_train.shape

((17111, 4884), (17111, 5))

In [51]:
classifier_forest = RandomForestClassifier()
classifier_forest.fit(X_filtered, y_train)
preds_forest = classifier_forest.predict(X_val_filtered)
print(classification_report(y_val, preds_forest))

              precision    recall  f1-score   support

           0       0.83      0.29      0.43       497
           1       0.79      0.23      0.36       271
           2       0.81      0.25      0.38       544
           3       0.79      0.73      0.76       957
           4       0.76      0.24      0.37       534

   micro avg       0.80      0.42      0.55      2803
   macro avg       0.80      0.35      0.46      2803
weighted avg       0.80      0.42      0.51      2803
 samples avg       0.49      0.45      0.46      2803



## Evaluating

In [45]:
from utils.utils import evaluate_experiment

In [47]:
tr_df_point_full = evaluate_experiment(np.array(y_val,dtype=np.float32), np.array(preds_full,dtype=np.float32))
print('Full: \n',tr_df_point_full)

Full: 
    macro_auc      Fmax
0   0.639361  0.515519


In [48]:
tr_df_point_filter = evaluate_experiment(np.array(y_val,dtype=np.float32), np.array(preds_filtered,dtype=np.float32))
print('Filtered: \n',tr_df_point_filter)

Filtered: 
    macro_auc      Fmax
0   0.639263  0.520243


In [55]:
tr_df_point_forest = evaluate_experiment(np.array(y_val,dtype=np.float32), np.array(preds_forest_full,dtype=np.float32))
print('Filtered Forest: \n',tr_df_point_forest)

Filtered Forest: 
    macro_auc      Fmax
0   0.639165  0.561163


In [52]:
tr_df_point_forest = evaluate_experiment(np.array(y_val,dtype=np.float32), np.array(preds_forest,dtype=np.float32))
print('Filtered Forest: \n',tr_df_point_forest)

Filtered Forest: 
    macro_auc      Fmax
0   0.651533  0.578119
