# ЛАБОРАТОРИЯ

# Часть I. Сбор и предварительная обработка данных

## Оглавление

[Библиотеки и утилиты](#Библиотеки-и-утилиты)

[Разбор сырых данных](#Разбор-сырых-данных)

[Предварительная обработка и анализ](#Предварительная-обработка-и-анализ)

### Библиотеки и утилиты

In [None]:
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datetime import datetime
pd.set_option('display.max_columns', None)

### Разбор сырых данных

In [None]:
def get_node_dict(node):
    """Flatten an XML element's attributes and text into a flat dict.

    Every attribute is stored under ``'<tag>_<attribute>'`` and the element's
    whitespace-stripped text under ``'<tag>_text'``.

    Args:
        node: An ``xml.etree.ElementTree.Element``.

    Returns:
        dict: The flattened attributes plus the ``'<tag>_text'`` entry.
        An element without attributes yields an empty dict (its text is
        not reported).
    """
    # ElementTree exposes ``text`` as None when the element has no text
    # (e.g. ``<value/>`` or a child directly after the start tag), so fall
    # back to an empty string instead of failing on ``None.strip()``.
    text = (node.text or '').strip()
    text_key = '{}_text'.format(node.tag)
    node_dict = {}
    for name, value in node.items():
        node_dict['{}_{}'.format(node.tag, name)] = value
        # Assigned inside the loop on purpose: the text entry is emitted only
        # for elements that have attributes and sits right after the first
        # attribute, which keeps the key order (and therefore the column
        # order of a DataFrame built from these dicts) stable.
        node_dict[text_key] = text
    return node_dict

def get_data_root(root):
    """Turn one parsed XML document into a list of flat record dicts.

    A shared header is assembled first: the flattened root element, the
    flattened ``area`` children of the root, and, for every other child of
    the root, the ``tag -> text`` pairs of its sub-elements (except those
    tagged ``measuringpoint``).

    One record is then produced for every element found two levels below
    each ``measuringpoint`` child of ``root[2]``. A record merges the header
    with the flattened measuring point, its child, that child's child and
    the first sub-element of the latter.

    Args:
        root: Root element of the parsed document. Only the child at
            index 2 is scanned for ``measuringpoint`` elements.

    Returns:
        list[dict]: One dict per record, in document order.
    """
    header = get_node_dict(root)
    for section in root:
        if section.tag == 'area':
            header.update(get_node_dict(section))
            continue
        for field in section:
            if field.tag != 'measuringpoint':
                header[field.tag] = field.text

    records = []
    for point in root[2]:
        if point.tag != 'measuringpoint':
            continue
        for channel in point:
            for period in channel:
                # Later mappings override earlier ones, exactly as a chain
                # of successive ``update`` calls would.
                records.append({
                    **header,
                    **get_node_dict(point),
                    **get_node_dict(channel),
                    **get_node_dict(period),
                    **get_node_dict(period[0]),
                })
    return records

In [None]:
# Directory that contains the ``data_anmz`` folder with the raw files.
PATH = '.'
files_list = os.listdir(f'{PATH}/data_anmz')
print('total files:', len(files_list))
print('first files:', files_list[:3])
# Flattened records collected from every parsed file.
data_total = []
for file_name in tqdm(files_list):
    if '.xml' in file_name:
        tree = ET.parse(f'{PATH}/data_anmz/{file_name}')
        root = tree.getroot()
        data_total.extend(get_data_root(root))
    else:
        # Report directory entries that are skipped because their name does
        # not contain '.xml'. The loop variable is ``file_name``; printing an
        # undefined ``file`` would raise NameError on the first such entry.
        print(file_name)
print('done records: ', len(data_total))

### Предварительная обработка и анализ

In [None]:
# Build the working frame: one row per flattened record.
df = pd.DataFrame(data_total)
print('dataframe created of shape: ', df.shape)
df['value_text'] = pd.to_numeric(df['value_text'])
# Combine the day and the period start into a single timestamp. One
# vectorized ``pd.to_datetime`` call replaces two row-wise ``apply`` passes
# (string formatting followed by ``datetime.strptime``).
df['timestamp_value'] = pd.to_datetime(
    df['day'] + ' ' + df['period_start'],
    format='%Y%m%d %H%M',
)
df = df.sort_values(['measuringpoint_serial', 'timestamp_value'], ascending=[True, True])
print('timestamps done')
# Count the distinct channels and measuring points once; both numbers are
# reused for the series-length estimate below.
n_channels = len(df['measuringchannel_code'].unique())
n_points = len(df['measuringpoint_serial'].unique())
print(
    'chanels:',
    n_channels
)
print(
    'measure points:',
    n_points
)
print(
    'days:',
    len(df['day'].unique())
)
# Rows divided by the number of channels and the number of measuring points.
print(
    'length of one time series:',
    int(df.shape[0] / n_channels / n_points)
)
df.info()

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
# List the distinct values found in every column of the frame.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f'{column_name}:', distinct_values)

In [None]:
# Draw one line per measuring point serial: value against timestamp.
plt.figure(figsize=(20, 8))
for ch_serial in df['measuringpoint_serial'].unique():
    # Filter the frame once per serial instead of once per plotted axis.
    serial_rows = df[df['measuringpoint_serial'] == ch_serial]
    plt.plot(serial_rows.timestamp_value,
             serial_rows.value_text,
             label=ch_serial)
# Build the legend once, after every line has been added.
plt.legend()
plt.show()

In [None]:
# Save the prepared frame as a tab-separated, UTF-8 encoded file.
df.to_csv('cpt_power_data.csv', sep='\t', encoding='utf-8')