In [2]:
import os

import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.ensemble import IsolationForest

%matplotlib inline

In [26]:
def load(name):
    """Read one sample archive and return the four arrays stored in it.

    Parameters
    ----------
    name : str or path-like
        Path of a NumPy archive holding the keys ``time``, ``derivative``,
        ``target`` and ``rig_type``.

    Returns
    -------
    tuple
        ``(time, derivative, target, rig_type)`` exactly as stored in the
        archive.

    Raises
    ------
    KeyError
        If one of the four expected keys is missing from the archive.
    """
    # The object returned by np.load for an archive keeps the underlying file
    # open until it is closed; the context manager releases the handle as soon
    # as the four arrays have been read, so looping over hundreds of files
    # does not accumulate open descriptors.
    with np.load(name) as archive:
        return (
            archive['time'],
            archive['derivative'],
            archive['target'],
            archive['rig_type'],
        )

def save_isol_res(data, pathto='isol', contam=0.1, random_state=None):
    """Filter every sample in a directory with an Isolation Forest and save the result.

    For each file in ``data`` an ``IsolationForest`` is fitted on that file's
    ``derivative`` values alone; only the points predicted as inliers are
    kept in ``time`` and ``derivative``. The filtered sample is written to
    ``pathto`` under the same file name.

    Parameters
    ----------
    data : str
        Directory containing the input archives readable by ``load``.
    pathto : str, default 'isol'
        Output directory. It must not exist yet; it is created here.
    contam : float, default 0.1
        Passed to ``IsolationForest`` as ``contamination``.
    random_state : int, RandomState instance or None, default None
        Passed to ``IsolationForest``. Give an integer to make the filtering
        reproducible between runs; ``None`` keeps the unseeded behaviour.

    Raises
    ------
    FileExistsError
        If ``pathto`` already exists, so earlier results are never
        overwritten silently.
    """
    # Sorted so that the processing order does not depend on the filesystem.
    data_names = sorted(os.listdir(data))
    if os.path.isdir(pathto):
        raise FileExistsError("Directory with name `{}` already exists".format(pathto))
    os.makedirs(pathto)

    for name in tqdm(data_names):
        time, derivative, target, rig_type = load(os.path.join(data, name))

        # The estimator expects a 2-D (n_samples, n_features) matrix; the
        # derivative is used as the single feature.
        X = np.asarray(derivative).reshape(-1, 1)
        isol = IsolationForest(contamination=contam, random_state=random_state)
        # predict() labels inliers with 1; compute the mask once and reuse it.
        inliers = isol.fit(X).predict(X) == 1

        # NOTE(review): `target` and `rig_type` are stored unfiltered. That is
        # right if they are per-file values; if `target` is per-sample it
        # would have to be masked as well -- confirm against the data format.
        np.savez(
            os.path.join(pathto, name),
            time=time[inliers],
            derivative=derivative[inliers],
            rig_type=rig_type,
            target=target,
        )

In [27]:
# Filter the training split. The output directory must not exist yet:
# save_isol_res raises FileExistsError otherwise, so remove it before
# re-running this cell.
save_isol_res(
    data='data_splited/train',
    pathto='data_splited/train_isol',
)

100%|██████████| 311/311 [00:52<00:00,  5.95it/s]


In [28]:
# Filter the test split. The output directory must not exist yet:
# save_isol_res raises FileExistsError otherwise, so remove it before
# re-running this cell.
save_isol_res(
    data='data_splited/test',
    pathto='data_splited/test_isol',
)

100%|██████████| 90/90 [00:15<00:00,  5.66it/s]
