# Хакатон ЛИДЕРЫ ЦИФРОВОЙ ТРАНСФОРМАЦИИ 2023

## Северсталь. Модель раннего обнаружения неисправностей промышленного оборудования
[Задача 15](https://leaders2023.innoagency.ru/task_15) 


In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on the drive can be read and written from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Fetch the project sources and install the Colab requirements file shipped
# with the repository; -q keeps both commands quiet.
!git clone https://github.com/YaninaK/predictive-maintenance.git -q
!pip install -r predictive-maintenance/requirements_Colab.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
# Make the cloned repository the working directory for the rest of the notebook.
%cd /content/predictive-maintenance

/content/predictive-maintenance


In [4]:
import sys
import os

# Put the current working directory and its src/predictive_maintenance
# sub-directory on the import path (presumably so the `data.*` project
# modules imported later resolve — confirm against the repository layout).
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), "src", "predictive_maintenance"))     

In [5]:
import pandas as pd
import numpy as np
import pyspark
from pprint import pprint

from data.make_dataset import (
    load_data, 
    get_new_X_column_names, 
    rename_columns,
    get_unified_tech_places,
)
from data.resample_dataset import (
    save_resampled_X,
    save_resampled_y_train,
    get_equipment_columns,
    resample,
)
from data.EDA_utilities import(
    add_groups_to_messages,
    identify_lack_of_messages_in_y_train,
    get_vectorizer_and_messages_vectors,
)
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
import warnings
# Blanket filter: every Python warning raised from here on is suppressed.
warnings.filterwarnings('ignore')

In [7]:
# Settings passed to the Spark session builder: application name and the
# port for the Spark web UI.
app_name = 'data_preprocessing'
spark_ui_port = 4041

In [8]:
# Build the Spark session with the configured app name and UI port;
# getOrCreate() returns an already running session if there is one.
spark = (
    pyspark.sql.SparkSession.builder
        .appName(app_name)        
        .config("spark.ui.port", spark_ui_port)
        .getOrCreate()
)

In [9]:
# Project folder on the mounted Google Drive; data paths in this notebook
# are built by appending sub-folders such as 'data/01_raw/' to it.
PATH = '/content/drive/MyDrive/ML_projects/predictive_maintenance/'

## 1. Data ingestion

In [10]:
# Load the training signals, the targets, the maintenance messages and the
# unified tech-place table through the project loader.
X_train, y_train, messages, unified_tech_places = load_data(PATH)

# Redistribute both Spark DataFrames across 4 partitions.
X_train = X_train.repartition(4)
y_train = y_train.repartition(4)

# Report (rows, columns); count() runs a Spark job over the full DataFrame.
print(f'X_train.shape = ({X_train.count()}, {len(X_train.columns)})')
print(f'y_train.shape = ({y_train.count()}, {len(y_train.columns)})')

X_train.shape = (9335034, 97)
y_train.shape = (9335034, 176)


In [11]:
# Peek at the first two rows of the maintenance messages table.
messages.head(2)

Unnamed: 0,МАШИНА,ИМЯ_МАШИНЫ,ТЕХ_МЕСТО,НАЗВАНИЕ_ТЕХ_МЕСТА,ВИД_СООБЩЕНИЯ,ОПИСАНИЕ,ДАТА_НАЧАЛА_НЕИСПРАВНОСТИ,ДАТА_УСТРАНЕНИЯ_НЕИСПРАВНОСТИ,ТЕКСТ_ГРУППЫ_КОДОВ,equipment,unified_name
390,AA2/006-006,ЭКСГАУСТЕР А/М №9,AA2/006-006-002-008,ЗАПОРНАЯ АРМАТУРА ЭКСГАУСТЕРА №9,M3,неисправен двигатель,2019-01-21 00:00:00,2019-02-25,,9,ЗАПОРНАЯ АРМАТУРА ЭКСГАУСТЕРА №
391,CH-AGP-AG2/011-005,ЭКСГАУСТЕР А/М №9,CH-AGP-AG2/011-005-002,МАСЛОСТАНЦИЯ ЖИДКОЙ СМАЗКИ ЭКСГ. №9,M3,неисправен двигатель,2019-01-21 12:26:08,2019-02-25,,9,МАСЛОСТАНЦИЯ ЖИДКОЙ СМАЗКИ ЭКСГ. №


In [12]:
# Peek at the first two rows of the unified tech-place table.
unified_tech_places.head(2)

Unnamed: 0,equipment,description,unified_name
0,9,ЗАПОРНАЯ АРМАТУРА ЭКСГАУСТЕРА №9,ЗАПОРНАЯ АРМАТУРА ЭКСГАУСТЕРА №
1,9,МАСЛОСТАНЦИЯ ЖИДКОЙ СМАЗКИ ЭКСГ. №9,МАСЛОСТАНЦИЯ ЖИДКОЙ СМАЗКИ ЭКСГ. №


In [13]:
# Read the raw test signals. Parquet files carry their own schema, so the
# CSV-only reader options `header` / `inferSchema` are not passed.
X_test = spark.read.parquet(PATH + 'data/01_raw/' + 'X_test.parquet')

# Rename the columns with the project helpers.
X_cols = get_new_X_column_names(X_test)
X_test = rename_columns(X_test, X_cols)

# Redistribute the DataFrame across 4 partitions.
X_test = X_test.repartition(4)

# Report (rows, columns); count() runs a Spark job over the full DataFrame.
print(f'X_test.shape = ({X_test.count()}, {len(X_test.columns)})')

X_test.shape = (4008961, 97)


In [14]:
# Read the spreadsheet of test intervals; its first column becomes the index
# and the rows are sorted by that index.
test_intervals = pd.read_excel(
    PATH + 'data/01_raw/' + 'test_intervals.xlsx', index_col=0
).sort_index()

# Show the table size and the first two intervals.
print(f'test_intervals.shape = {test_intervals.shape}')
test_intervals.head(2)

test_intervals.shape = (189, 4)


Unnamed: 0,start,finish,machine,tm
0,2022-01-07 09:05:16,2022-01-07 14:05:15,,
1,2022-02-25 03:44:52,2022-02-25 08:15:03,,


## 2. EDA

### 2.1 Target variable

#### 2.1.1. y_train

In [15]:
# Group the y_train column names with the project helper; the result is
# indexed by exhauster number (presumably a dict keyed 4..9 — TODO confirm).
y_cols = get_equipment_columns(y_train.schema.names)

# Print how many column names were collected for each exhauster 4..9.
for i in range(4, 10): 
  print(len(y_cols[i]))

24
35
28
33
32
29


In [16]:
# Cross-tabulate unified tech places against exhausters, counting the
# descriptions in each cell; margins=True appends the "All" totals.
y_spec = unified_tech_places.pivot_table(
    index='unified_name',
    columns='equipment',
    values='description',
    aggfunc='count',
    margins=True,
)
print(f'y_spec.shape = {y_spec.shape}\n')
y_spec

y_spec.shape = (52, 7)



equipment,4,5,6,7,8,9,All
unified_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.1,1.0,1.0,,,,1.0,3
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.2,,1.0,1.0,1.0,1.0,,4
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.3,,1.0,1.0,1.0,1.0,,4
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.4,,,,1.0,1.0,,2
ГАЗОВАЯ ЗАДВИЖКА ЭКСГАУСТЕРА А/М №,,1.0,1.0,,,,2
ГСМ ЭКСГ. №,1.0,1.0,,1.0,1.0,1.0,5
ДВИГАТЕЛЬ ПУСКОВОГО МАСЛОНАСОСА ЭКСГ. №,,1.0,1.0,,,1.0,3
ДВИГАТЕЛЬ РЕЗЕРВНОГО МАСЛОНАСОСА ЭКСГ.№,,1.0,,,,,1
ЗАДВИЖКА ЭКСГ. №,1.0,1.0,1.0,1.0,1.0,1.0,6
ЗАП. И РЕГ. АРМАТУРА ЭКСГ.№,1.0,,,,,,1


Информация об остановках или аномалиях доступна менее, чем по 60% тех.мест.

In [17]:
# First five y_train rows (ordered by the DT timestamp) for the exhauster 4
# columns, collected into a pandas DataFrame for display.
y_train.select(y_cols[4]).orderBy("DT").limit(5).toPandas() 

Unnamed: 0,DT,4_ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ_№4 Т_1,4_ГСМ ЭКСГ_ №4,4_ЗАДВИЖКА ЭКСГ_ №4,4_ЗАП_ И РЕГ_ АРМАТУРА ЭКСГ_№4,4_КЛ1 ТР№4 ДО ЭД ЭКСГАУСТЕРА №4,4_КЛ2 ТР№4 ДО ЭД ЭКСГАУСТЕРА №4,4_МАСЛОНАСОС РАБОЧИЙ ЭКСГ_ №4,4_МАСЛООХЛАДИТЕЛЬ М-05-1 ЭКСГ_ №4,4_МАСЛОПРОВОДЫ ЭКСГ №4,...,4_РЕДУКТОР ГАЗ_ ЗАДВИЖКИ ЭКСГ_ №4,4_РОТОР ЭКСГ_ №4,4_ТИРИСТ_ ВОЗБУДИТЕЛЬ ВТ-РЭМ-400 ЭКСГ4 ВУ1,4_ТР-Р ТМ-4000-10/6 ЭКСГ_ №4,4_ТСМТ-101-010-50М-400 ТЕРМОПР_ПОДШ_Т_1,4_УЛИТА ЭКСГ_ №4,4_ЭКСГАУСТЕР А/М №4,4_ЭЛ/ДВИГАТЕЛЬ ГАЗ_ ЗАДВИЖКИ ЭКСГ_ №4,4_ЭЛЕКТРОАППАРАТУРА ЭКСГ_ №4,4_ЭЛЕКТРОДВИГАТЕЛЬ ДСПУ-140-84-4 ЭКСГ_ №4
0,2019-01-16 13:21:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-16 13:21:10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-01-16 13:21:20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-16 13:21:30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-01-16 13:21:40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Preview of the resampled targets for exhauster i: the project `resample`
# helper (called with its default period) adds the `dt_resampled` bucket,
# then the maximum of every column is taken per bucket.
i = 4
(
    resample(y_train.select(y_cols[i]))
    .groupBy("dt_resampled")
    .max()
    .orderBy("dt_resampled")
    .limit(5).toPandas()
)

Unnamed: 0,dt_resampled,max(epoch),max(4_ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ_№4 Т_1),max(4_ГСМ ЭКСГ_ №4),max(4_ЗАДВИЖКА ЭКСГ_ №4),max(4_ЗАП_ И РЕГ_ АРМАТУРА ЭКСГ_№4),max(4_КЛ1 ТР№4 ДО ЭД ЭКСГАУСТЕРА №4),max(4_КЛ2 ТР№4 ДО ЭД ЭКСГАУСТЕРА №4),max(4_МАСЛОНАСОС РАБОЧИЙ ЭКСГ_ №4),max(4_МАСЛООХЛАДИТЕЛЬ М-05-1 ЭКСГ_ №4),...,max(4_РЕДУКТОР ГАЗ_ ЗАДВИЖКИ ЭКСГ_ №4),max(4_РОТОР ЭКСГ_ №4),max(4_ТИРИСТ_ ВОЗБУДИТЕЛЬ ВТ-РЭМ-400 ЭКСГ4 ВУ1),max(4_ТР-Р ТМ-4000-10/6 ЭКСГ_ №4),max(4_ТСМТ-101-010-50М-400 ТЕРМОПР_ПОДШ_Т_1),max(4_УЛИТА ЭКСГ_ №4),max(4_ЭКСГАУСТЕР А/М №4),max(4_ЭЛ/ДВИГАТЕЛЬ ГАЗ_ ЗАДВИЖКИ ЭКСГ_ №4),max(4_ЭЛЕКТРОАППАРАТУРА ЭКСГ_ №4),max(4_ЭЛЕКТРОДВИГАТЕЛЬ ДСПУ-140-84-4 ЭКСГ_ №4)
0,2019-01-16 13:00:00,1547643600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-16 13:30:00,1547645400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-01-16 14:00:00,1547647200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-16 14:30:00,1547649000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-01-16 15:00:00,1547650800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


10-секундный интервал между измерениями увеличен до 30-минутного. Для y_train на каждом интервале взято максимальное значение.

In [19]:
# Toggle: set to False to skip the (re)saving of the resampled y_train
# (presumably written under PATH by the project helper — confirm there).
save = True
if save:
  save_resampled_y_train(y_train, path=PATH)

In [20]:
# Load the resampled targets of exhauster i from the intermediate folder and
# show the rows whose columns from the third one onward sum to zero.
i = 4
df = pd.read_parquet(PATH + "data/02_intermediate/" + f'y{i}_resampled.parquet')
df[df.iloc[:, 2:].sum(axis=1) == 0]

Unnamed: 0,dt_resampled,max(epoch),max(4_ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ_№4 Т_1),max(4_ГСМ ЭКСГ_ №4),max(4_ЗАДВИЖКА ЭКСГ_ №4),max(4_ЗАП_ И РЕГ_ АРМАТУРА ЭКСГ_№4),max(4_КЛ1 ТР№4 ДО ЭД ЭКСГАУСТЕРА №4),max(4_КЛ2 ТР№4 ДО ЭД ЭКСГАУСТЕРА №4),max(4_МАСЛОНАСОС РАБОЧИЙ ЭКСГ_ №4),max(4_МАСЛООХЛАДИТЕЛЬ М-05-1 ЭКСГ_ №4),...,max(4_РЕДУКТОР ГАЗ_ ЗАДВИЖКИ ЭКСГ_ №4),max(4_РОТОР ЭКСГ_ №4),max(4_ТИРИСТ_ ВОЗБУДИТЕЛЬ ВТ-РЭМ-400 ЭКСГ4 ВУ1),max(4_ТР-Р ТМ-4000-10/6 ЭКСГ_ №4),max(4_ТСМТ-101-010-50М-400 ТЕРМОПР_ПОДШ_Т_1),max(4_УЛИТА ЭКСГ_ №4),max(4_ЭКСГАУСТЕР А/М №4),max(4_ЭЛ/ДВИГАТЕЛЬ ГАЗ_ ЗАДВИЖКИ ЭКСГ_ №4),max(4_ЭЛЕКТРОАППАРАТУРА ЭКСГ_ №4),max(4_ЭЛЕКТРОДВИГАТЕЛЬ ДСПУ-140-84-4 ЭКСГ_ №4)
0,2019-01-16 13:00:00,1547643600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-01-16 13:30:00,1547645400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019-01-16 14:00:00,1547647200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019-01-16 14:30:00,1547649000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-01-16 15:00:00,1547650800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,2019-01-26 21:30:00,1548538200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,2019-01-26 22:00:00,1548540000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499,2019-01-26 22:30:00,1548541800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
500,2019-01-26 23:00:00,1548543600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


По 4 эксгаустеру доступно 502 получасовых интервала, когда оборудование работало штатно.

#### 2.1.2. Messages

In [21]:
# Number of messages per message type.
messages['ВИД_СООБЩЕНИЯ'].value_counts()

M3    898
M1     83
Name: ВИД_СООБЩЕНИЯ, dtype: int64

In [22]:
# Earliest and latest fault-start timestamps in the messages.
messages['ДАТА_НАЧАЛА_НЕИСПРАВНОСТИ'].min(), messages['ДАТА_НАЧАЛА_НЕИСПРАВНОСТИ'].max()

(Timestamp('2019-01-21 00:00:00'), Timestamp('2021-12-29 16:27:02'))

In [23]:
# Earliest and latest fault-resolution timestamps in the messages.
messages['ДАТА_УСТРАНЕНИЯ_НЕИСПРАВНОСТИ'].min(), messages['ДАТА_УСТРАНЕНИЯ_НЕИСПРАВНОСТИ'].max()

(Timestamp('2019-01-22 00:00:00'), Timestamp('2021-12-30 00:00:00'))

In [24]:
# Summary statistics of the time between fault start and fault resolution.
(messages['ДАТА_УСТРАНЕНИЯ_НЕИСПРАВНОСТИ'] - messages['ДАТА_НАЧАЛА_НЕИСПРАВНОСТИ']).describe()

count                           851
mean     49 days 23:01:29.286721504
std      90 days 01:42:27.596829132
min                 0 days 00:00:00
25%          0 days 07:32:29.500000
50%                12 days 21:28:47
75%         65 days 20:37:10.500000
max               728 days 15:13:55
dtype: object

In [25]:
# Tag every message with its equipment group via the project helper.
messages = add_groups_to_messages(messages)

# Message counts per group and exhauster; margins=True adds the "All" totals.
messages_spec_1 = messages.pivot_table(
    index='groups',
    columns='equipment',
    values='ОПИСАНИЕ',
    aggfunc='count',
    margins=True,
)
messages_spec_1

equipment,4,5,6,7,8,9,All
groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bearings,52.0,53.0,29.0,93.0,53.0,38.0,318
electric_devices,20.0,38.0,10.0,21.0,19.0,12.0,120
fittings,88.0,68.0,82.0,59.0,61.0,39.0,397
oil_system,13.0,38.0,13.0,23.0,28.0,22.0,137
other,2.0,2.0,1.0,3.0,1.0,,9
All,175.0,199.0,135.0,199.0,162.0,111.0,981


In [26]:
# Compare the messages with the unified tech-place table using the project
# helper; it returns a per-tech-place message summary and a table of tech
# places that have no messages. Display the summary.
messages_spec_2, missing_messages = identify_lack_of_messages_in_y_train(
    messages, unified_tech_places
)
messages_spec_2

equipment,4,5,6,7,8,9,All
unified_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
All,175.0,199.0,135.0,199.0,162.0,111.0,981.0
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.1,1.0,2.0,,,,1.0,4.0
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.2,,3.0,,1.0,2.0,,6.0
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.3,,4.0,,4.0,2.0,,10.0
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.4,,,,1.0,2.0,,3.0
ГАЗОВАЯ ЗАДВИЖКА ЭКСГАУСТЕРА А/М №,,1.0,1.0,,,,2.0
ГСМ ЭКСГ. №,5.0,5.0,,5.0,6.0,3.0,24.0
ДВИГАТЕЛЬ ПУСКОВОГО МАСЛОНАСОСА ЭКСГ. №,,2.0,,,,,2.0
ДВИГАТЕЛЬ РЕЗЕРВНОГО МАСЛОНАСОСА ЭКСГ.№,,,,,,,
ЗАДВИЖКА ЭКСГ. №,16.0,23.0,18.0,8.0,12.0,2.0,79.0


40% всех сообщений приходится на 5 и 7 эксгаустеры. Между различными видами оборудования сообщения распределены очень неравномерно.

In [27]:
# Display the table of tech places without messages.
missing_messages

equipment,4,5,6,7,8,9,All
unified_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.2,0,0,1,0,0,0,1
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.3,0,0,1,0,0,0,1
ДВИГАТЕЛЬ ПУСКОВОГО МАСЛОНАСОСА ЭКСГ. №,0,0,1,0,0,1,2
ДВИГАТЕЛЬ РЕЗЕРВНОГО МАСЛОНАСОСА ЭКСГ.№,0,1,0,0,0,0,1
КЛ1 ТР№ ДО ЭД ЭКСГАУСТЕРА №,0,0,1,1,0,0,2
КЛ2 ТР№ ДО ЭД ЭКСГАУСТЕРА №,0,0,1,0,0,0,1
МАСЛОНАСОС ШЕСТЕРЕНЧАТЫЙ (ПУСК.) ЭКСГ.№,0,0,1,0,0,0,1
МАСЛОПРОВОДЫ ЭКСГАУСТЕРА №,1,0,1,0,0,1,3
МЕТРАН-100 ДАТЧИКИ ДАВЛЕНИЯ ЭКСГ.№,0,0,0,1,0,1,2
РЕГУЛИРУЮЩАЯ АППАРАТУРА ЭКСГАУСТЕРА №,0,0,0,1,0,0,1


Единицами отмечены тех.места соответствующих эксгаустеров из y_train, о которых  отсутствует информация в messages.

In [28]:
# Column-wise totals: number of tech places without messages per exhauster.
missing_messages.sum(axis=0)

equipment
4       1
5       4
6       7
7       6
8       0
9       7
All    25
dtype: int64

По 25 видам оборудования y_train фиксирует нештатные ситуации, но информации в messages по ним нет.

##### A. Stoppage M1

In [29]:
# Stoppage (M1) messages counted by code-group text and exhauster;
# margins=True adds the "All" totals.
messages.loc[messages['ВИД_СООБЩЕНИЯ'].eq('M1')].pivot_table(
    index='ТЕКСТ_ГРУППЫ_КОДОВ',
    columns='equipment',
    values='ОПИСАНИЕ',
    aggfunc='count',
    margins=True,
)

equipment,4,5,6,7,8,9,All
ТЕКСТ_ГРУППЫ_КОДОВ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ВНЕШНИЕ ОРГАНИЗАЦИИ,,1.0,,,1.0,,2
ТЕХНИЧЕСКИЕ НЕПЛАНОВЫЕ,17.0,19.0,6.0,11.0,12.0,4.0,69
ТЕХНИЧЕСКИЕ ПЛАНОВЫЕ,2.0,3.0,1.0,2.0,,,8
ТЕХНОЛОГИЧЕСКИЕ НЕПЛАНОВЫЕ,1.0,,,1.0,1.0,,3
ТЕХНОЛОГИЧЕСКИЕ ПЛАНОВЫЕ,,,1.0,,,,1
All,20.0,23.0,8.0,14.0,14.0,4.0,83


83% остановок - технические неплановые.

In [30]:
# Stoppage (M1) messages counted by equipment group and exhauster;
# margins=True adds the "All" totals.
messages.loc[messages['ВИД_СООБЩЕНИЯ'].eq('M1')].pivot_table(
    index='groups',
    columns='equipment',
    values='ОПИСАНИЕ',
    aggfunc='count',
    margins=True,
)

equipment,4,5,6,7,8,9,All
groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bearings,9.0,9.0,,5.0,7.0,,30
electric_devices,4.0,9.0,,3.0,5.0,,21
fittings,5.0,3.0,7.0,3.0,1.0,4.0,23
other,2.0,2.0,1.0,3.0,1.0,,9
All,20.0,23.0,8.0,14.0,14.0,4.0,83


Больше половины остановок М1 приходилось на 4 и 5 эксгаустеры.

In [31]:
# Stoppage (M1) messages counted by unified tech place and exhauster;
# margins=True adds the "All" totals.
messages.loc[messages['ВИД_СООБЩЕНИЯ'].eq('M1')].pivot_table(
    index='unified_name',
    columns='equipment',
    values='ОПИСАНИЕ',
    aggfunc='count',
    margins=True,
)

equipment,4,5,6,7,8,9,All
unified_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ГАЗОВАЯ ЗАДВИЖКА ЭКСГАУСТЕРА А/М №,,1.0,1.0,,,,2
ЗАДВИЖКА ЭКСГ. №,4.0,1.0,6.0,,,,11
КОРПУС ЭКСГ. №,,,,2.0,,,2
ПОДШИПНИК ОПОРНО-УПОРНЫЙ ЭКСГ. №,1.0,,,,,,1
ПОДШИПНИК ОПОРНЫЙ №1,,5.0,,,,,5
ПОДШИПНИК ОПОРНЫЙ №2,3.0,,,1.0,,,4
РЕДУКТОР ГАЗ. ЗАДВИЖКИ ЭКСГ. №,1.0,,,,,,1
РОТОР ЭКСГ. №,5.0,4.0,,4.0,7.0,,20
ТИРИСТ. ВОЗБУДИТЕЛЬ ВТ-РЭМ-400 ЭКСГ ВУ1,,4.0,,,,,4
ТИРИСТ. ВОЗБУДИТЕЛЬ ВТ-РЭМ-400 ЭКСГ ВУ2,,1.0,,,,,1


Причиной четверти всех остановок М1 был ротор.

##### B. Anomalies M3

In [32]:
# Anomaly (M3) messages counted by equipment group and exhauster;
# margins=True adds the "All" totals.
messages.loc[messages['ВИД_СООБЩЕНИЯ'].eq('M3')].pivot_table(
    index='groups',
    columns='equipment',
    values='ОПИСАНИЕ',
    aggfunc='count',
    margins=True,
)

equipment,4,5,6,7,8,9,All
groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bearings,43,44,29,88,46,38,288
electric_devices,16,29,10,18,14,12,99
fittings,83,65,75,56,60,35,374
oil_system,13,38,13,23,28,22,137
All,155,176,127,185,148,107,898


In [33]:
# Anomaly (M3) messages counted by unified tech place and exhauster;
# margins=True adds the "All" totals.
messages.loc[messages['ВИД_СООБЩЕНИЯ'].eq('M3')].pivot_table(
    index='unified_name',
    columns='equipment',
    values='ОПИСАНИЕ',
    aggfunc='count',
    margins=True,
)

equipment,4,5,6,7,8,9,All
unified_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.1,1.0,2.0,,,,1.0,4
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.2,,3.0,,1.0,2.0,,6
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.3,,4.0,,4.0,2.0,,10
ВК 310С ВИБРОПРЕОБРАЗОВАТЕЛЬ ЭКСГ.№ Т.4,,,,1.0,2.0,,3
ГСМ ЭКСГ. №,5.0,5.0,,5.0,6.0,3.0,24
ДВИГАТЕЛЬ ПУСКОВОГО МАСЛОНАСОСА ЭКСГ. №,,2.0,,,,,2
ЗАДВИЖКА ЭКСГ. №,12.0,22.0,12.0,8.0,12.0,2.0,68
ЗАП. И РЕГ. АРМАТУРА ЭКСГ.№,1.0,,,,,,1
ЗАПОРНАЯ АРМАТУРА ЭКСГАУСТЕРА №,,3.0,,,2.0,1.0,6
КЛ1 ТР№ ДО ЭД ЭКСГАУСТЕРА №,2.0,1.0,,,1.0,1.0,5


Наибольшее число аномалий М3 зафиксировано на эксгаустерах 5 и 7.

##### C. Text vectorizing

In [34]:
# Vectorize the message texts with the project helper; it returns the fitted
# vectorizer and the message feature matrix.
vectorizer, X = get_vectorizer_and_messages_vectors(messages)
print(f'X.shape = {X.shape}\n')

# Print the vocabulary (term -> feature index) from the highest index down.
# Iterating the sorted items avoids a second lookup per term and leaves the
# notebook-wide exhauster index `i` untouched.
for term, feature_index in sorted(
    vectorizer.vocabulary_.items(), key=lambda item: item[1], reverse=True
):
    print(term, feature_index)

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


X.shape = (981, 150)

электродвигатель 149
эл 148
эксгаустер 147
эксг ра 146
эксг 145
экс ра 144
экс 143
щетка 142
шток 141
шпилька 140
часть 139
фланец 138
утечка смазка 137
утечка масло 136
утечка 135
уровень масло 134
уровень 133
уплотнение разъем 132
уплотнение 131
улита 130
трансформатор 129
тр 128
течь масло 127
течь 126
температура 125
статор 124
смазка 123
сильный 122
сигнал 121
сгорать электродвигатель 120
сгорать 119
рп 118
ротор 117
резьба 116
редуктор 115
расцентровка 114
разъем улита 113
разъем 112
рабочий 111
работать 110
ра температура 109
ра вибрация 108
ра 107
просос разъем 106
просос 105
примесь 104
показание температура 103
показание вибрация 102
показание 101
подшипник обмотка 100
подшипник 99
подтекать масло 98
подтекать 97
подклинивать 96
подача 95
повышенный 94
патрубок 93
палец 92
отсутствовать 91
открытие 90
остановка эксгаустер 89
остановка эксг 88
остановка ра 87
остановка 86
ослабнуть 85
ослабление 84
обрыв 83
обмотка статор 82
обмотка 81
низкий уровень 80
н

### 2.2. X_train

In [35]:
# Group the X_train column names with the project helper; the result is
# indexed by exhauster number. Note this rebinds X_cols, which previously
# held the list of renamed X_test columns.
X_cols = get_equipment_columns(X_train.schema.names)

In [36]:
# First five X_train rows (ordered by the DT timestamp) for the exhauster 6
# columns, collected into a pandas DataFrame for display.
X_train.select(X_cols[6]).orderBy("DT").limit(5).toPandas()

Unnamed: 0,DT,6 ВИБРАЦИЯ НА ОПОРЕ 1,6 ВИБРАЦИЯ НА ОПОРЕ 2,6 ВИБРАЦИЯ НА ОПОРЕ 3,6 ВИБРАЦИЯ НА ОПОРЕ 3 ПРОДОЛЬНАЯ,6 ВИБРАЦИЯ НА ОПОРЕ 4,6 ВИБРАЦИЯ НА ОПОРЕ 4 ПРОДОЛЬНАЯ,6 ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ,6 ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ,6 ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 1,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 2,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 3,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 4,6 ТОК РОТОРА 1,6 ТОК РОТОРА 2,6 ТОК СТАТОРА
0,2019-01-16 13:21:00,0.80816,1.08,0.83,1.805,0.7,0.763333,162.918,44.62,29.21,45.43,51.11,45.43,47.86,347.495,347.495,245.314
1,2019-01-16 13:21:10,0.830729,1.0625,0.887153,1.735,0.705729,0.715,163.04,44.538548,30.016869,44.62,51.434316,45.43094,47.783604,347.723333,347.723333,245.4825
2,2019-01-16 13:21:20,0.835262,1.051312,0.876736,1.713333,0.660687,1.0,162.653333,44.439393,29.746448,44.619675,51.470372,45.43094,47.86473,347.27,347.27,244.703333
3,2019-01-16 13:21:30,0.790799,1.046007,0.854167,1.823785,0.644965,0.784722,163.101852,44.295168,29.692363,44.619675,51.190936,45.43094,47.86473,347.402345,347.402345,243.8368
4,2019-01-16 13:21:40,0.81684,1.059896,0.862847,1.843333,0.652778,0.706667,163.31,44.376295,28.39,44.619675,51.92,45.835,47.621351,346.58,346.58,243.472


In [37]:
# Preview of the resampled training signals for exhauster i: bucket the rows
# with the project `resample` helper using period=60*30 (30 minutes if the
# helper counts seconds — the 30-minute steps in the output suggest so),
# then average every column per bucket.
i = 4
(
    resample(X_train.select(X_cols[i]), period=60*30)
    .groupBy("dt_resampled")
    .mean()
    .orderBy("dt_resampled")
    .limit(5).toPandas()
)

Unnamed: 0,dt_resampled,avg(epoch),avg(4 ВИБРАЦИЯ НА ОПОРЕ 1),avg(4 ВИБРАЦИЯ НА ОПОРЕ 2),avg(4 ВИБРАЦИЯ НА ОПОРЕ 3),avg(4 ВИБРАЦИЯ НА ОПОРЕ 3 ПРОДОЛЬНАЯ),avg(4 ВИБРАЦИЯ НА ОПОРЕ 4),avg(4 ВИБРАЦИЯ НА ОПОРЕ 4 ПРОДОЛЬНАЯ),avg(4 ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ),avg(4 ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ),avg(4 ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 1),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 2),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 3),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 4),avg(4 ТОК РОТОРА 1),avg(4 ТОК РОТОРА 2),avg(4 ТОК СТАТОРА)
0,2019-01-16 13:00:00,1547644000.0,1.555479,0.734822,0.652846,1.609719,3.297655,0.72314,119.338196,40.327572,33.363486,41.469402,39.766466,39.926317,44.673509,362.470365,362.470365,242.106509
1,2019-01-16 13:30:00,1547645000.0,1.561917,0.738544,0.642654,1.629317,3.260738,0.752766,119.472094,40.326196,33.330226,41.458176,39.716148,39.9583,44.639066,360.290962,360.290962,245.530294
2,2019-01-16 14:00:00,1547647000.0,1.560495,0.742096,0.637606,1.536847,2.505211,0.72298,119.808137,40.18131,32.946006,41.367945,39.645747,39.884937,44.508887,356.424307,356.424307,244.586814
3,2019-01-16 14:30:00,1547649000.0,1.552921,0.742581,0.644131,1.597993,3.308348,0.743605,119.716988,40.155692,33.282268,41.368633,39.600247,39.850154,44.572482,357.412598,357.412598,244.966794
4,2019-01-16 15:00:00,1547651000.0,1.547022,0.732789,0.641007,1.636592,3.374121,0.765043,119.512763,40.228135,33.712776,41.389879,39.618564,39.916285,44.725751,358.037421,358.036736,246.13376


In [38]:
# Toggle: set to False to skip the (re)saving of the resampled X_train
# (presumably written under PATH with the "X_train" file prefix — confirm
# in the project helper).
save = True
if save:
  save_resampled_X(X_train, prefix="X_train", path=PATH)

In [39]:
# Load the mean-resampled training signals of exhauster i from the
# intermediate folder and show the first rows.
i = 4
df = pd.read_parquet(
    PATH + "data/02_intermediate/" + f'X_train{i}_mean_resampled.parquet'
)
df.head()

Unnamed: 0,dt_resampled,avg(epoch),avg(4 ВИБРАЦИЯ НА ОПОРЕ 1),avg(4 ВИБРАЦИЯ НА ОПОРЕ 2),avg(4 ВИБРАЦИЯ НА ОПОРЕ 3),avg(4 ВИБРАЦИЯ НА ОПОРЕ 3 ПРОДОЛЬНАЯ),avg(4 ВИБРАЦИЯ НА ОПОРЕ 4),avg(4 ВИБРАЦИЯ НА ОПОРЕ 4 ПРОДОЛЬНАЯ),avg(4 ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ),avg(4 ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ),avg(4 ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 1),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 2),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 3),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 4),avg(4 ТОК РОТОРА 1),avg(4 ТОК РОТОРА 2),avg(4 ТОК СТАТОРА)
0,2019-01-16 13:00:00,1547644000.0,1.555479,0.734822,0.652846,1.609719,3.297655,0.72314,119.338196,40.327572,33.363486,41.469402,39.766466,39.926317,44.673509,362.470365,362.470365,242.106509
1,2019-01-16 13:30:00,1547645000.0,1.561917,0.738544,0.642654,1.629317,3.260738,0.752766,119.472094,40.326196,33.330226,41.458176,39.716148,39.9583,44.639066,360.290962,360.290962,245.530294
2,2019-01-16 14:00:00,1547647000.0,1.560495,0.742096,0.637606,1.536847,2.505211,0.72298,119.808137,40.18131,32.946006,41.367945,39.645747,39.884937,44.508887,356.424307,356.424307,244.586814
3,2019-01-16 14:30:00,1547649000.0,1.552921,0.742581,0.644131,1.597993,3.308348,0.743605,119.716988,40.155692,33.282268,41.368633,39.600247,39.850154,44.572482,357.412598,357.412598,244.966794
4,2019-01-16 15:00:00,1547651000.0,1.547022,0.732789,0.641007,1.636592,3.374121,0.765043,119.512763,40.228135,33.712776,41.389879,39.618564,39.916285,44.725751,358.037421,358.036736,246.13376


In [40]:
# Summary statistics including the 1st and 99th percentiles, transposed so
# that every signal becomes one row.
df.describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,25%,50%,75%,99%,max
avg(epoch),51862.0,1594318000.0,26948550.0,1547644000.0,1548577000.0,1570981000.0,1594318000.0,1617656000.0,1640060000.0,1640993000.0
avg(4 ВИБРАЦИЯ НА ОПОРЕ 1),50601.0,1.252102,0.4904377,0.0,0.05117647,0.9180708,1.347995,1.564374,2.209234,3.066241
avg(4 ВИБРАЦИЯ НА ОПОРЕ 2),50614.0,0.9307013,0.3730291,0.0,0.09267522,0.6999708,0.8536385,1.124756,1.906169,2.974146
avg(4 ВИБРАЦИЯ НА ОПОРЕ 3),50601.0,1.174368,0.8585902,0.0,0.08367359,0.4964072,0.8609996,1.734835,3.650435,5.146976
avg(4 ВИБРАЦИЯ НА ОПОРЕ 3 ПРОДОЛЬНАЯ),50802.0,3.093527,1.87182,0.0,0.09473722,1.753842,2.549842,4.265988,8.064821,12.80355
avg(4 ВИБРАЦИЯ НА ОПОРЕ 4),50601.0,1.586873,3.047845,-100.0,0.06193548,1.074637,1.449944,2.186662,4.002921,7.681333
avg(4 ВИБРАЦИЯ НА ОПОРЕ 4 ПРОДОЛЬНАЯ),50674.0,1.218909,0.7942266,0.0,0.106833,0.5749755,1.028045,1.691076,3.271966,5.840659
avg(4 ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ),50018.0,131.049,26.63944,0.0,0.3082114,115.4764,129.5671,148.4418,185.2901,339.8845
avg(4 ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ),50530.0,40.98256,5.915142,0.0,17.445,37.7925,41.65001,44.66784,52.3528,55.71711
avg(4 ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ),50569.0,35.63336,7.994236,-0.3998734,11.02224,31.89313,36.86015,41.23573,49.31946,56.16831


### 2.3. X_test

В наборе сигналов тестовой выборки (X_test) присутствуют интервалы с пропущенными значениями. Список интервалов предоставлен в test_intervals. Для этих интервалов необходимо предсказать наличие неисправности. 

In [41]:
# First five X_test rows (ordered by the DT timestamp) for the exhauster 6
# columns, collected into a pandas DataFrame for display.
X_test.select(X_cols[6]).orderBy("DT").limit(5).toPandas()

Unnamed: 0,DT,6 ВИБРАЦИЯ НА ОПОРЕ 1,6 ВИБРАЦИЯ НА ОПОРЕ 2,6 ВИБРАЦИЯ НА ОПОРЕ 3,6 ВИБРАЦИЯ НА ОПОРЕ 3 ПРОДОЛЬНАЯ,6 ВИБРАЦИЯ НА ОПОРЕ 4,6 ВИБРАЦИЯ НА ОПОРЕ 4 ПРОДОЛЬНАЯ,6 ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ,6 ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ,6 ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 1,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 2,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 3,6 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 4,6 ТОК РОТОРА 1,6 ТОК РОТОРА 2,6 ТОК СТАТОРА
0,2022-01-01 00:00:00,1.69,0.65,0.49,1.01,0.82,0.7,147.11,,,,,,,344.44,344.44,266.41
1,2022-01-01 00:00:10,1.78,0.69,0.45,1.04,0.75,0.53,,,35.7,,,,44.62,343.52,343.52,265.54
2,2022-01-01 00:00:20,1.72,0.67,0.43,0.65,0.79,0.43,,,34.88,,47.86,,,,,265.71
3,2022-01-01 00:00:30,1.76,0.69,0.49,0.83,0.78,0.57,147.45,,35.7,,,,,,,265.62
4,2022-01-01 00:00:40,1.69,0.68,0.45,0.77,,0.54,147.22,,34.88,51.92,48.68,,45.43,,,265.36


In [42]:
# Preview of the resampled test signals for exhauster i: bucket the rows with
# the project `resample` helper using period=60*30, then average every
# column per bucket.
i = 4
(
    resample(X_test.select(X_cols[i]), period=60*30)
    .groupBy("dt_resampled")
    .mean()
    .orderBy("dt_resampled")
    .limit(5).toPandas()
)

Unnamed: 0,dt_resampled,avg(epoch),avg(4 ВИБРАЦИЯ НА ОПОРЕ 1),avg(4 ВИБРАЦИЯ НА ОПОРЕ 2),avg(4 ВИБРАЦИЯ НА ОПОРЕ 3),avg(4 ВИБРАЦИЯ НА ОПОРЕ 3 ПРОДОЛЬНАЯ),avg(4 ВИБРАЦИЯ НА ОПОРЕ 4),avg(4 ВИБРАЦИЯ НА ОПОРЕ 4 ПРОДОЛЬНАЯ),avg(4 ДАВЛЕНИЕ МАСЛА В СИСТЕМЕ),avg(4 ТЕМПЕРАТУРА МАСЛА В МАСЛОБЛОКЕ),avg(4 ТЕМПЕРАТУРА МАСЛА В СИСТЕМЕ),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 1),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 2),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 3),avg(4 ТЕМПЕРАТУРА ПОДШИПНИКА НА ОПОРЕ 4),avg(4 ТОК РОТОРА 1),avg(4 ТОК РОТОРА 2),avg(4 ТОК СТАТОРА)
0,2022-01-01 00:00:00,1640995000.0,0.752381,1.20631,0.341274,1.851676,1.8115,1.3764,134.195455,32.861231,31.661316,42.595,43.405,31.213684,34.468955,247.467967,247.467967,269.991734
1,2022-01-01 00:30:00,1640997000.0,0.800872,1.192987,0.37702,1.851579,1.838295,1.359133,133.930952,32.855,32.02475,42.601045,44.053,31.235,34.49,247.529635,247.529635,277.768114
2,2022-01-01 01:00:00,1640999000.0,0.809368,1.160368,0.377548,1.890058,1.892472,1.357263,133.790523,33.317857,32.045,42.680263,44.208571,31.971364,34.64027,247.492705,247.492705,277.371875
3,2022-01-01 01:30:00,1641001000.0,0.760765,1.190577,0.348077,1.874795,1.870508,1.445966,133.71349,33.5975,32.045,42.709231,43.729,31.987143,34.761852,247.809398,247.809398,270.483011
4,2022-01-01 02:00:00,1641002000.0,0.765747,1.191859,0.387059,1.888793,1.846034,1.379261,133.6,33.586842,32.049551,42.714118,44.060986,31.992742,34.766071,247.827431,247.827431,274.081257


In [43]:
# Toggle: set to False to skip the (re)saving of the resampled X_test
# (presumably written under PATH with the 'X_test' file prefix — confirm
# in the project helper).
save = True
if save:
  save_resampled_X(X_test, prefix='X_test', path=PATH)

### 2.4. test_intervals

Для этих интервалов необходимо предсказать наличие неисправности. В случае предсказанной неисправности, определить тип и время начала развития и наступления неисправности.

In [44]:
# (rows, columns) of the test-interval table.
test_intervals.shape

(189, 4)

In [45]:
# Earliest and latest interval start timestamps.
test_intervals['start'].min(), test_intervals['start'].max()

(Timestamp('2022-01-01 10:54:53'), Timestamp('2023-04-20 16:41:58'))

In [46]:
# Earliest and latest interval finish timestamps.
test_intervals['finish'].min(), test_intervals['finish'].max()

(Timestamp('2022-01-01 16:43:39'), Timestamp('2023-04-20 23:58:07'))

In [47]:
# Summary statistics of the interval lengths (finish minus start).
(test_intervals['finish'] - test_intervals['start']).describe()

count                          189
mean     0 days 04:59:35.428571428
std      0 days 03:06:42.587346539
min                0 days 02:42:34
25%                0 days 03:38:37
50%                0 days 04:13:31
75%                0 days 05:05:08
max                1 days 00:58:29
dtype: object

Тестовый период - с 01.01.2022 по 20.04.2023 включительно. В среднем интервал предсказания составляет 5 часов, медиана - 4 часа 13 минут. Максимальный горизонт предсказания - около 25 часов (24 часа 58 минут).