In [11]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/


# Author: Bùi Tiến Thành (@bu1th4nh)
# Date: 2022/12/26 14:13 
# CTTN Toán tin K64

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import os
import re
import ujson as json
import os
import ot
# import pandas_profiling as pp
from dtaidistance import dtw

# Default figure dimensions for plots.
# NOTE(review): neither name is used in the cells visible here, and a 20 x 8
# figure is usually wide rather than tall — confirm the two names are not
# swapped at their call sites (e.g. figsize=(height, width)).
height = 20
width  = 8

## Input and Normalization

In [12]:
# Load the cleansed dataset and split it by row order: the first 4/5 of the
# rows go to Ariel, the remaining rows go to Belle.
df = pd.read_parquet("../DataWater_train_cleansed_phase3.parquet")

# Compute the split position once so both parts always agree on it.
split_at = len(df) // 5 * 4

# .iloc states the positional intent explicitly, and .copy() gives each part
# its own data: Ariel is modified by later cells, and those writes must not
# target a slice of df (the resulting SettingWithCopyWarning would be hidden
# by the global warnings filter).
Ariel = df.iloc[:split_at].copy()
Belle = df.iloc[split_at:].copy()

## Training data

In [13]:
# Summary statistics of the Ariel split; the per-column mean and standard
# deviation are pulled out as plain NumPy vectors.
train_stats = Ariel.describe()
attributes  = Ariel.columns
mean        = train_stats.loc["mean"].to_numpy()
std         = train_stats.loc["std"].to_numpy()

# Problem size: T rows (time steps) and D columns (attributes).
T = len(Ariel)
D = len(attributes)

print("Mean: ", mean)
print("Std:  ", std)

Mean:  [7.20219748e+00 2.57050229e+04 5.86370558e+00 9.57432549e+01
 3.60111669e+00 7.31472334e+01 1.86869853e+01 3.35487884e+02
 2.75097170e+01 2.98922879e+01]
Std:   [1.78314075e+00 1.77055381e+04 2.05694640e+00 1.19236342e+02
 5.32550083e-01 1.81974977e+01 2.73506084e+00 2.34139880e+02
 3.66966597e+00 1.10355017e+00]


In [14]:
# The data is measured hourly, so the original index can be replaced by a
# relative, 0-based step index. The first original index value is kept in
# initial_time.
initial_time = Ariel.index[0]

# Standardise every column as (x - mean) / std in a single vectorised
# expression: `mean` and `std` hold one entry per column and broadcast across
# the rows. Rebinding Ariel to the result (instead of assigning column by
# column) avoids writing into a slice of df and avoids one Python-level lambda
# call per cell.
# NOTE: re-running this cell standardises the data a second time; re-run from
# the loading cell to start over.
Ariel = ((Ariel - mean) / std).reset_index(drop=True)

### Xây dựng ma trận đánh dấu

In [15]:
# Observation mask with the same shape as Ariel: True where a value is
# present, False where it is missing.
mask = Ariel.notna()
display(mask.head())

Unnamed: 0,pH,EC,DO,TSS,TN,TP,TOC,ORP,Temp,TEMP
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True


### Xây dựng ma trận delta dựa theo số bước


In [16]:
# Number of consecutive time steps in one sample window.
n_steps = 128

def build_delta(start):
    """Build the delta matrix for the window that begins at row `start`.

    Row 0 is all zeros. For every later step i, a feature's entry is 1 when
    that feature was observed at the previous step (row ``start + i - 1`` of
    the global ``mask``), and ``1 + delta[i - 1]`` when it was missing there.

    Parameters
    ----------
    start : int
        Row position in ``mask`` of the first step of the window.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_steps, D)``.
    """
    # Pull the n_steps - 1 mask rows the recurrence reads out of pandas once,
    # instead of doing a .loc row lookup plus Series arithmetic on every step.
    # Positional slicing matches the label lookup because mask is built from
    # Ariel after its index was reset to 0..T-1.
    missing = 1 - mask.iloc[start:start + n_steps - 1].to_numpy().astype('int')

    delta = np.zeros((n_steps, D))
    for i in range(1, n_steps):
        delta[i] = 1.0 + missing[i - 1] * delta[i - 1]
    return delta


### Xây dựng mẫu

In [17]:
# Each entry is (window_start, label_row): the window covers rows
# [label_row - n_steps, label_row) and label_row is the row right after it.
idx_list = []

# Placeholders kept from the original cell; both names are rebound when
# idx_list is unpacked later.
start = 0
end = 0

# Flag the rows without any missing value once for the whole frame, instead of
# fetching and null-checking one row per loop iteration.
row_complete = Ariel.notna().all(axis=1).to_numpy()

for i in tqdm(range(n_steps, T), desc="Building sample index list"):
    # Keep every third candidate row, and only when that row is fully observed.
    if i % 3 == 0 and row_complete[i]:
        idx_list.append((i - n_steps, i))


print(len(idx_list))

Building sample index list: 100%|██████████| 22496/22496 [00:05<00:00, 4259.35it/s]

5739





### Xây dựng JSON

In [18]:
def parse_rec(values, masks, deltas):
    """Pack one direction of a sample window into a JSON-serialisable dict.

    Parameters
    ----------
    values : array-like, 2-D
        Window values with one row per step; missing entries are NaN.
    masks : numpy.ndarray
        Observation mask for the window; written out as int32 values.
    deltas : numpy.ndarray
        Delta matrix for the window.

    Returns
    -------
    dict
        ``'values'``   - ``values`` passed through ``np.nan_to_num`` (NaN -> 0.0).
        ``'masks'``    - ``masks`` as nested lists of ints.
        ``'forwards'`` - ``values`` forward-filled down each column; entries
        with nothing before them to carry forward become 0.0.
        ``'deltas'``   - ``deltas`` as nested lists.
    """
    # only used in GRU-D
    # DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
    # keyword is deprecated in recent pandas releases.
    forwards = pd.DataFrame(values).ffill().fillna(0.0).to_numpy()

    return {
        'values': np.nan_to_num(values).tolist(),
        'masks': masks.astype('int32').tolist(),
        'forwards': forwards.tolist(),
        'deltas': deltas.tolist(),
    }



save_path = 'data/json/split-train'
# exist_ok=True removes the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)


def _delta_from_mask(window_mask):
    """Run the delta recurrence over a 2-D boolean mask, in the row order given."""
    missing = 1 - window_mask.astype('int')
    delta = np.zeros(window_mask.shape)
    for t in range(1, len(window_mask)):
        delta[t] = 1.0 + missing[t - 1] * delta[t - 1]
    return delta


# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show a real progress bar with an ETA.
for i, (start, end) in enumerate(tqdm(idx_list, desc='Building JSON')):
    values = Ariel.iloc[start:end].to_numpy()
    mask_  = mask.iloc[start:end].to_numpy()
    # The row right after the window is the prediction target.
    label  = Ariel.loc[end].to_numpy()

    rec = {'label': label.tolist()}
    rec['forward'] = parse_rec(values, mask_, build_delta(start))

    # The backward direction must see the window in reverse time order, with
    # deltas recomputed on the reversed mask. Previously it was an exact copy
    # of the forward record.
    # NOTE(review): this follows the BRITS input convention — confirm the
    # downstream loader expects a pre-reversed 'backward' record.
    values_rev = values[::-1]
    mask_rev   = mask_[::-1]
    rec['backward'] = parse_rec(values_rev, mask_rev, _delta_from_mask(mask_rev))

    # One JSON document per sample, stored in a file named after its index.
    save_file = os.path.join(save_path, str(i))
    with open(save_file, 'w') as f:
        f.write(json.dumps(rec))

Building JSON: 5739it [10:44,  8.91it/s]
