In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
import re
import string
import warnings
# from google.colab import drive



In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / TensorFlow warnings
# that can point at real problems — consider narrowing it to specific
# categories or modules.
warnings.filterwarnings(action='ignore')
# Colab only: drive.mount('/content/drive')

In [3]:
# Location of the dataset CSV. The commented-out path is the Google Drive
# location used when running in Colab; the active path is relative to the
# notebook's working directory.
# data_path = '/content/drive/MyDrive/MLID/lab2_oil_gas_field_construction_data.csv'
data_path = 'lab2_oil_gas_field_construction_data.csv'
data = pd.read_csv(data_path, sep=',')
# Preview the first rows of the raw frame.
data.head(15)

Unnamed: 0,work_name,generalized_work_class,global_work_class,upper_works
0,монтаж ограждения основания блоков управления са,,,
1,монтаж стоек,,,"('Подготовка свай', 'Монтаж ограждения')"
2,монтаж трубопровода г16/1 газ с предохранитель...,,,
3,монтад лестниц,Монтаж лестниц,Строительство зданий,
4,монтаж площадок,,,
5,технологическая обвязка рессиверов газа v-25м3...,,,
6,электромонтажные работы,,,
7,кипиа,Монтаж приборов,Монтаж,
8,монтаж маслоприемников 1.6.2,,,
9,электромонтажные работы 1.4.2,ПНР,ПНР,


In [4]:
# (rows, columns) of the raw frame.
data.shape

(716067, 4)

In [5]:
# Patterns are compiled once and written as raw strings: a plain '\W+' literal
# is an invalid escape sequence and triggers a Deprecation/SyntaxWarning.
_NON_WORD_RE = re.compile(r'\W+')
_DIGITS_RE = re.compile(r'[0-9]+')

# Stemmers built by _default_stemmer, keyed by language name.
_DEFAULT_STEMMERS = {}


def _default_stemmer(language='russian'):
    """Return a SnowballStemmer for ``language``, constructing it on first use only."""
    if language not in _DEFAULT_STEMMERS:
        _DEFAULT_STEMMERS[language] = nltk.SnowballStemmer(language)
    return _DEFAULT_STEMMERS[language]


def cleaning(text, stemmer=None):
    """Normalise a work-name string for text classification.

    Steps, in order:
      1. every run of non-word characters is replaced by a single space;
      2. ASCII digits are removed;
      3. the text is lower-cased, stripped and split into words;
      4. each word is stemmed and the stems are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw work name.
    stemmer : object with a ``stem(word) -> str`` method, optional
        Stemmer applied to every word. When omitted, a Russian Snowball
        stemmer is used; it is created once and reused across calls instead
        of being rebuilt for every row.

    Returns
    -------
    str
        The space-joined stems (an empty string for empty input).
    """
    if stemmer is None:
        stemmer = _default_stemmer()

    text = _NON_WORD_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)

    # Removing digits can leave consecutive spaces, hence the regex split
    # rather than a split on a single space.
    words = _NON_WORD_RE.split(text.lower().strip())

    return ' '.join(stemmer.stem(word) for word in words)

In [6]:
# Keep only the rows that carry a generalized_work_class label, then normalise
# the text columns. dropna() + assign() build a new frame, so no column is
# written onto a filtered slice of `data` (the pattern pandas flags with
# SettingWithCopyWarning). The .str / .map accessors work column-wise instead
# of calling a Python lambda once per row, and .str.lower() leaves missing
# values as NaN rather than raising.
data_cleaned = (
    data
    .dropna(subset=['generalized_work_class'])
    .assign(
        generalized_work_class=lambda df: df['generalized_work_class'].str.lower(),
        global_work_class=lambda df: df['global_work_class'].str.lower(),
        work_name=lambda df: df['work_name'].map(cleaning),
    )
)

print(data_cleaned.shape)
display(data_cleaned.head(10))

(296858, 4)


Unnamed: 0,work_name,generalized_work_class,global_work_class,upper_works
3,монтад лестниц,монтаж лестниц,строительство зданий,
7,кип,монтаж приборов,монтаж,
9,электромонтажн работ,пнр,пнр,
10,погружен сва св хх,погружение свай,монтаж свай,
12,монтаж систем вк,испытания системы водоснабжения,испытания систем,
13,монтаж стоек,монтаж мк,монтаж мк,"('Подготовка свай', 'Монтаж ограждения')"
15,монтаж балок под эстакад,монтаж мк,монтаж мк,
16,монтаж стоек,монтаж мк,монтаж мк,"('Подготовка свай', 'Монтаж ограждения')"
17,погружен сва,погружение свай,монтаж свай,"('Подготовка свай', 'Монтаж ограждения')"
20,монтаж балок под эстакад,монтаж мк,монтаж мк,


In [9]:
# Summary statistics (count / unique / top / freq) for the cleaned frame.
data_cleaned.describe()

Unnamed: 0,work_name,generalized_work_class,global_work_class,upper_works
count,296858,296858,296858,14361
unique,8292,207,56,5
top,пнр,монтаж мк,монтаж,"('Подготовка свай', 'Монтаж ограждения')"
freq,7498,26081,38286,14222


In [7]:
# investigate gen_work_classes
tmp_df = data_cleaned.groupby('generalized_work_class')['generalized_work_class'].count()
tmp_df.describe()

count      207.000000
mean      1434.096618
std       3634.316107
min          1.000000
25%         24.500000
50%        106.000000
75%        786.000000
max      26081.000000
Name: generalized_work_class, dtype: float64

In [8]:
# The ten generalized work classes with the most rows.
tmp_df.sort_values(ascending=False).head(10)

generalized_work_class
монтаж мк               26081
монтаж кабеля           21547
монтаж трубопровода     20943
погружение свай         16096
акз свай                13918
пнр                     12937
обратная засыпка        11258
разработка грунта       10577
монтаж теплоизоляции     9319
акз трубопровода         9154
Name: generalized_work_class, dtype: int64

In [10]:
# investigate glob_work_classes
tmp_glob_df = data_cleaned.groupby('global_work_class')['global_work_class'].count()
tmp_glob_df.describe()

count       56.000000
mean      5301.035714
std       8561.257900
min          2.000000
25%         79.250000
50%        648.500000
75%       7862.750000
max      38286.000000
Name: global_work_class, dtype: float64

In [11]:
# The ten global work classes with the most rows.
tmp_glob_df.sort_values(ascending=False).head(10)

global_work_class
монтаж                    38286
монтаж мк                 26081
монтаж электрики          24826
акз                       24045
монтаж свай               23612
прокладка трубопровода    22604
засыпка                   15536
пнр                       12937
изготовление              11185
разработка грунта         11036
Name: global_work_class, dtype: int64

In [12]:
# investigate upper_classes
tmp_up_df = data_cleaned.groupby('upper_works')['upper_works'].count()
tmp_up_df.describe()

count        5.000000
mean      2872.200000
std       6344.821408
min          1.000000
25%          2.000000
50%         59.000000
75%         77.000000
max      14222.000000
Name: upper_works, dtype: float64

In [13]:
# The most frequent upper_works values (at most ten are shown).
tmp_up_df.sort_values(ascending=False).head(10)

upper_works
('Подготовка свай', 'Монтаж ограждения')                     14222
()                                                              77
('Устройство свайного основания',)                              59
('КР-8 Замена теплообменной камеры печи блочной ПТБ №4',)        2
('Монтаж  мачт (№ 53,60)',)                                      1
Name: upper_works, dtype: int64

# BERT

In [17]:
# Lookup table label -> integer id: the distinct generalized work classes are
# sorted and numbered 0..n-1 in that order.
gwc_labels = sorted(data_cleaned['generalized_work_class'].unique())
gwc_ids_df = pd.DataFrame({
    'generalized_work_class': gwc_labels,
    'gwc_id': range(len(gwc_labels)),
})

gwc_ids_df.head()

Unnamed: 0,generalized_work_class,gwc_id
0,"автоматизация, монтаж",0
1,акз мк,1
2,акз резервуара,2
3,акз свай,3
4,акз трубопровода,4


In [20]:
# Attach the integer class id to every row with an inner join on the label.
data_cleaned_ids = data_cleaned.merge(
    gwc_ids_df,
    how='inner',
    on='generalized_work_class',
)
data_cleaned_ids.head(5)

Unnamed: 0,work_name,generalized_work_class,global_work_class,upper_works,gwc_id
0,монтад лестниц,монтаж лестниц,строительство зданий,,130
1,монтад лестниц,монтаж лестниц,строительство зданий,,130
2,монтад лестниц,монтаж лестниц,строительство зданий,,130
3,монтаж лестниц,монтаж лестниц,строительство зданий,,130
4,монтаж лестниц,монтаж лестниц,строительство зданий,,130


In [21]:
# Number of distinct class ids present in the labelled data.
num_classes = data_cleaned_ids['gwc_id'].nunique()

### Train and test data preparation

In [22]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition (and every metric computed on it) is
# the same on each run of the notebook.
SPLIT_SEED = 42

# One-hot encode the integer class ids: one row per sample, num_classes columns.
y = tf.keras.utils.to_categorical(data_cleaned_ids['gwc_id'].values, num_classes=num_classes)

# Hold out 25% of the rows for evaluation.
# NOTE(review): the split is not stratified; the class-count summary earlier in
# the notebook shows classes with a single row, which stratify= would reject.
x_train, x_test, y_train, y_test = train_test_split(
    data_cleaned_ids['work_name'],
    y,
    test_size=0.25,
    random_state=SPLIT_SEED,
)

In [26]:
# %pip install tensorflow_hub
# %pip install tensorflow-text

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting tensorflow-textNote: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 5] Отказано в доступе: 'c:\\Users\\petka\\AppData\\Local\\Programs\\Python\\Python39\\Lib\\site-packages\\tensorflow\\compiler\\tf2tensorrt\\_pywrap_py_utils.pyd'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip



  Using cached tensorflow_text-2.10.0-cp39-cp39-win_amd64.whl (5.0 MB)
Collecting tensorflow<2.11,>=2.10.0 (from tensorflow-text)
  Using cached tensorflow-2.10.1-cp39-cp39-win_amd64.whl (455.9 MB)
Collecting keras-preprocessing>=1.1.1 (from tensorflow<2.11,>=2.10.0->tensorflow-text)
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting tensorboard<2.11,>=2.10 (from tensorflow<2.11,>=2.10.0->tensorflow-text)
  Using cached tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard<2.11,>=2.10->tensorflow<2.11,>=2.10.0->tensorflow-text)
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Installing collected packages: keras-preprocessing, google-auth-oauthlib, tensorboard, tensorflow, tensorflow-text
  Attempting uninstall: google-auth-oauthlib
    Found existing installation: google-auth-oauthlib 1.0.0
    Uninstalling google-auth-oauthlib-1.0.0:
      Successfully uninstalled google-auth-oau

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# Not referenced by name below; presumably imported for its side effect of
# registering the custom text ops that the hub preprocessing model needs —
# TODO confirm before removing.
import tensorflow_text

# Text preprocessing and sentence-encoder layers loaded from TF Hub
# (multilingual universal-sentence-encoder CMLM models, per the URLs).
preprocessor = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2')
encoder = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1')

In [None]:
# Classifier graph: raw string -> hub preprocessor -> hub encoder ->
# dropout on the encoder's 'pooled_output' -> softmax over num_classes.
# Each sample is a single scalar string (shape=()).
i = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
x = preprocessor(i)
x = encoder(x)
x = tf.keras.layers.Dropout(0.2, name="dropout")(x['pooled_output'])
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(x)

# NOTE(review): only the Model constructor runs inside this device scope; the
# layers above were already applied outside it — confirm the pinning has the
# intended effect.
with tf.device("/GPU:0"):
  model = tf.keras.Model(i, x)

In [None]:
from keras import backend as K

def balanced_recall(y_true, y_pred):
    """Class-balanced (macro-averaged) recall of one batch.

    For every class column, recall = TP / (TP + FN); the per-class values are
    then averaged with equal weight.

    Both inputs are expected to be 2-D, (samples, classes): ``y_true`` one-hot
    targets and ``y_pred`` per-class scores. Scores are clipped to [0, 1] and
    rounded, so a sample counts as predicted for a class only when its score
    rounds to 1. A class with no positive sample in the batch contributes a
    recall of 0 (the epsilon only prevents the division by zero).

    The reduction runs over the sample axis for all classes at once instead of
    slicing one column per Python-loop iteration, so it does not add a set of
    graph ops per class and does not need the class dimension to be static.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    recall_by_class = true_positives / (possible_positives + K.epsilon())
    # Unweighted mean over classes.
    return K.mean(recall_by_class)

def balanced_precision(y_true, y_pred):
    """Class-balanced (macro-averaged) precision of one batch.

    For every class column, precision = TP / (TP + FP); the per-class values
    are then averaged with equal weight.

    Both inputs are expected to be 2-D, (samples, classes): ``y_true`` one-hot
    targets and ``y_pred`` per-class scores. Scores are clipped to [0, 1] and
    rounded, so a sample counts as predicted for a class only when its score
    rounds to 1. A class that is never predicted in the batch contributes a
    precision of 0 (the epsilon only prevents the division by zero).

    The reduction runs over the sample axis for all classes at once instead of
    slicing one column per Python-loop iteration, so it does not add a set of
    graph ops per class and does not need the class dimension to be static.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)
    precision_by_class = true_positives / (predicted_positives + K.epsilon())
    # Unweighted mean over classes.
    return K.mean(precision_by_class)

def balanced_f1_score(y_true, y_pred):
    """Harmonic mean of balanced_precision and balanced_recall.

    A backend epsilon is added to the denominator so the result is 0 rather
    than a division by zero when both inputs are 0.
    """
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

In [None]:
# Materialise the training texts as a 1-D NumPy array of strings.
import numpy as np
x_train = np.array(list(x_train))

In [None]:
# Number of passes over the training data (used by the fit call below).
n_epochs = 1

# Metrics reported during training/evaluation: plain categorical accuracy plus
# the class-balanced recall / precision / F1 functions defined in this notebook.
METRICS = [
      tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
      balanced_recall,
      balanced_precision,
      balanced_f1_score
]

# Stop when val_loss has not improved for 3 epochs and restore the best weights.
# NOTE(review): with n_epochs = 1 this callback can never trigger — raise
# n_epochs for it to have any effect.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss",
                                                      patience = 3,
                                                      restore_best_weights = True)

# The loss matches the one-hot targets and the softmax output layer.
model.compile(optimizer = "adam",
              loss = "categorical_crossentropy",
              metrics = METRICS)

# Training is currently disabled, so the model below predicts with an
# untrained classification head.
# model_fit = model.fit(x_train,
#                       y_train,
#                       epochs = n_epochs,
#                       validation_data = (x_test, y_test),
#                       callbacks = [earlystop_callback])



In [None]:
# Sample inputs for a smoke test. They are already in cleaned/stemmed form
# (they match work_name values shown after cleaning); raw work names would
# need to go through cleaning() first.
sentences = [
    "монтад лестниц",
    "кип",
    "электромонтажн работ"
]

def predict_class(sentences):
  """Return, for each sentence, the index of the class the model scores highest."""
  scores = model.predict(sentences)
  return list(np.argmax(scores, axis=1))

# Predicted class index for each sample sentence.
predict_class(sentences)