In [23]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# Author: Bùi Tiến Thành (@bu1th4nh)
# Date: 2022/12/26 14:13 
# CTTN Toán tin K64

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import os
import re
import ujson as json
import os
import ot
# import pandas_profiling as pp
from dtaidistance import dtw

# Figure dimensions (height, width).
height, width = 20, 8

## Input and Normalization

In [24]:
# Load the cleansed dataset, then keep only the tail of the frame: every row
# from position 4 * (len(df) // 5) onward becomes the working split `Belle`.
df = pd.read_parquet("../DataWater_train_cleansed_phase3.parquet")

split_at = len(df) // 5 * 4

# Ariel = df[:split_at]   (leading part of the frame; not used here)
Belle = df[split_at:].copy()

## Data test

In [25]:
# Column names, per-column mean / standard deviation, and the two dimensions
# of the split (D features, T rows). `describe()` is evaluated once and both
# statistic rows are read from the same summary table.
attributes = Belle.columns
summary    = Belle.describe()
mean       = summary.loc["mean", :].to_numpy()
std        = summary.loc["std", :].to_numpy()
D          = len(attributes)
T          = len(Belle)

for caption, value in (("D: ", D), ("T: ", T), ("Mean: ", mean), ("Std:  ", std)):
    print(caption, value)

D:  10
T:  5660
Mean:  [8.08862542e+00 1.92143155e+04 6.04362828e+00 1.44514268e+02
 3.60454005e+00 7.46178078e+01 1.87313740e+01 4.91225421e+02
 2.78585632e+01 2.98941411e+01]
Std:   [1.99514440e+00 2.08815839e+04 2.09290107e+00 2.92960738e+01
 5.02513002e-01 1.59204979e+01 2.70135568e+00 3.42599435e+02
 4.53129821e+00 1.11563399e+00]


In [26]:
# The data is measured hourly, so the original index can be swapped for
# relative 0-based positions. The first original index value is remembered.
initial_time = Belle.index[0]

# `mean` and `std` hold one entry per column, in column order; guard against a
# silent misalignment (e.g. a non-numeric column skipped by `describe()`).
assert len(mean) == len(std) == Belle.shape[1], "mean/std must have one entry per column"

# Standardise every column in a single vectorised step: the 1-D `mean` / `std`
# arrays broadcast across the rows, giving (x - mean[col]) / std[col] for each
# cell without a Python-level call per element. NaN cells stay NaN.
# NOTE(review): the statistics are computed from this same split; confirm that
# whatever consumes the exported samples expects this scaling.
# NOTE: re-running this cell standardises `Belle` a second time.
Belle = (Belle.reset_index(drop=True) - mean) / std

### Xây dựng ma trận đánh dấu

In [27]:
# Observation mask with the same shape as `Belle`:
# True where a value is present, False where it is missing.
mask = Belle.notna()
display(mask.head())
# pp.ProfileReport(Belle)

Unnamed: 0,pH,EC,DO,TSS,TN,TP,TOC,ORP,Temp,TEMP
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True


### Xây dựng ma trận delta dựa theo số bước


In [28]:
n_steps = 128


def build_delta(start):
    """Return the time-gap ("delta") matrix for the window starting at `start`.

    Row 0 is all zeros. For each later step ``i`` and feature ``d``::

        delta[i, d] = 1                     if row ``start + i - 1`` of `mask` is True
        delta[i, d] = 1 + delta[i - 1, d]   if it is False

    so each entry counts the steps since the feature was last observed.

    Reads the module-level `mask`, `n_steps` and `D`.

    Parameters
    ----------
    start : int
        Position in `mask` of the first row of the window.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_steps, D)``.

    Raises
    ------
    IndexError
        If the window runs past the last row of `mask`.
    """
    # Convert the needed mask rows to one integer array up front instead of
    # doing a pandas row lookup plus two dtype conversions on every iteration.
    # Rows are taken by position; this matches a label lookup as long as `mask`
    # carries a 0..T-1 index (the frame it is derived from had its index reset).
    missing = 1 - mask.iloc[start:start + n_steps - 1].to_numpy().astype('int')

    delta = np.zeros((n_steps, D))
    for i in range(1, n_steps):
        # The gap restarts at 1 after an observed step and keeps growing otherwise.
        delta[i] = 1.0 + missing[i - 1] * delta[i - 1]
    return delta


### Xây dựng mẫu

In [29]:
# Collect (window_start, window_end) pairs. A row qualifies as a window end
# (and prediction target) when its position is a multiple of 3 and none of its
# values is missing; the window then spans the `n_steps` rows before it.
idx_list = []
start = 0
end = 0

for i in tqdm(range(n_steps, T), desc="Building sample index list"):
    if i % 3 != 0:
        continue
    if Belle.loc[i, :].isnull().any():
        continue
    idx_list.append((i - n_steps, i))


print(len(idx_list))

Building sample index list: 100%|██████████| 5532/5532 [00:01<00:00, 2997.69it/s]

903





### Xây dựng JSON

In [31]:
def parse_rec(values, masks, deltas):
    """Pack one direction of a sample window into JSON-serialisable lists.

    Parameters
    ----------
    values : array-like
        2-D window of measurements (rows are time steps); may contain NaN.
    masks : numpy.ndarray
        Observation mask for `values`; it is cast to int32.
    deltas : numpy.ndarray
        Time-gap matrix for the window.

    Returns
    -------
    dict
        ``'values'``   - `values` with NaN replaced by 0 (via ``np.nan_to_num``),
        ``'masks'``    - `masks` as 0/1 integers,
        ``'forwards'`` - `values` forward-filled down each column, with any
                         leading gaps that have nothing to carry forward set to 0.0,
        ``'deltas'``   - `deltas` unchanged,
        each as nested Python lists.
    """
    # Only used in GRU-D. `.ffill()` replaces the deprecated
    # `fillna(method='ffill')` spelling and produces the same result.
    forwards = pd.DataFrame(values).ffill().fillna(0.0).to_numpy()

    return {
        'values': np.nan_to_num(values).tolist(),
        'masks': masks.astype('int32').tolist(),
        'forwards': forwards.tolist(),
        'deltas': deltas.tolist(),
    }



def _delta_from_mask(window_mask):
    """Time-gap matrix for a mask window, read in the row order given.

    Row 0 is zero; afterwards the gap is 1 following an observed step and
    1 + the previous gap following a missing one.
    """
    missing = 1 - np.asarray(window_mask).astype('int')
    gaps = np.zeros(missing.shape)
    for t in range(1, len(missing)):
        gaps[t] = 1.0 + missing[t - 1] * gaps[t - 1]
    return gaps


save_path = 'data/json/split-valid'
os.makedirs(save_path, exist_ok=True)

# `total` lets tqdm draw a real progress bar; enumerate() alone has no length.
for i, (start, end) in tqdm(enumerate(idx_list), total=len(idx_list), desc='Building JSON'):
    values = Belle.iloc[start:end].to_numpy()
    delta  = build_delta(start)
    mask_  = mask.iloc[start:end].to_numpy()
    label  = Belle.loc[end].to_numpy()   # the row right after the window is the target

    rec = {'label': label.tolist()}
    rec['forward'] = parse_rec(values, mask_, delta)

    # The backward record is the same window read in reverse time order, with
    # its time gaps recomputed on the reversed mask. Previously it was built
    # from exactly the same arrays as 'forward', making the two identical.
    # NOTE(review): assumes the consumer reads 'backward' as an already
    # time-reversed sequence and does not reverse it again on load - confirm
    # against the data loader.
    values_rev = values[::-1]
    mask_rev   = mask_[::-1]
    rec['backward'] = parse_rec(values_rev, mask_rev, _delta_from_mask(mask_rev))

    # One JSON document per sample, named by its running index.
    save_file = os.path.join(save_path, str(i))
    with open(save_file, 'w') as f:
        f.write(json.dumps(rec))

Building JSON: 903it [01:55,  7.81it/s]
