In [8]:
import xml.etree.ElementTree as ET

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
from IPython.display import Image

# Raise the TensorFlow Python logger threshold to ERROR so that lower-severity
# records emitted through this logger are not shown in the notebook.
tf.get_logger().setLevel('ERROR')

2021-11-25 22:55:35.587680: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-25 22:55:35.587732: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Формирование датасета

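Источник датасетов (здесь используется сбалансированный набор отзывов на женскую одежду и аксессуары с тремя классами тональности): https://itnan.ru/post.php?c=1&p=516730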

In [2]:
# Load the tab-separated review corpus and preview its first rows.
cloth_data = pd.read_csv(
    'women-clothing-accessories.3-class.balanced.csv',
    delimiter='\t',
)
cloth_data.head(5)

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative


In [3]:
# Number of reviews per sentiment label.
cloth_data['sentiment'].value_counts()

negative    30000
neautral    30000
positive    30000
Name: sentiment, dtype: int64

In [12]:
# Keep only the two polar classes and derive a 0/1 target column from them.
is_polar = cloth_data['sentiment'].isin(['positive', 'negative'])
pd_data = cloth_data[is_polar].copy()
pd_data['positive'] = (pd_data['sentiment'] == 'positive').astype('int64')

pd_data['positive'].value_counts()

0    30000
1    30000
Name: positive, dtype: int64

In [22]:
# Hold out 5% of the reviews for validation.
# random_state pins the shuffle so the split is identical after a kernel
# restart; stratify keeps the positive/negative ratio equal in both parts.
train_reviews, validate_reviews, train_target, validate_target = train_test_split(
    pd_data.review.values,
    pd_data.positive.values,
    test_size=0.05,
    random_state=42,
    stratify=pd_data.positive.values,
)

In [23]:
BATCH_SIZE = 32


def make_dataset(reviews, targets, shuffle=False):
    """Build a batched ``tf.data.Dataset`` of (review text, label) pairs.

    Args:
        reviews: sequence of review strings.
        targets: sequence of labels aligned with ``reviews``.
        shuffle: when True, reshuffle all examples at the start of every epoch.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` (text, label) pairs.
    """
    # Convert the whole collection in one call instead of creating one tensor
    # per review in a Python loop.
    texts = tf.convert_to_tensor(list(reviews), dtype=tf.string, name='inputs')
    dataset = tf.data.Dataset.from_tensor_slices((texts, targets))
    if shuffle:
        # Buffer covers the full dataset, so every epoch sees a new uniform order.
        dataset = dataset.shuffle(len(reviews), reshuffle_each_iteration=True)
    return dataset.batch(BATCH_SIZE)


train_dataset = make_dataset(train_reviews, train_target, shuffle=True)
validate_dataset = make_dataset(validate_reviews, validate_target)

## Модель

In [24]:
# Both TF-Hub handles live under the same publisher prefix.
TFHUB_BASE = 'https://tfhub.dev/tensorflow'

# Multilingual cased BERT encoder and its matching text preprocessing model.
BERT_ENCODER = TFHUB_BASE + '/bert_multi_cased_L-12_H-768_A-12/4'
BERT_PREPROCESSOR = TFHUB_BASE + '/bert_multi_cased_preprocess/3'

In [25]:
def build_classifier_model():
    """Assemble the sentiment classifier on top of a trainable BERT encoder.

    Pipeline: raw string input -> hub preprocessing layer -> hub BERT encoder
    (``pooled_output``) -> dropout (rate 0.1) -> single-unit dense layer with
    no activation.

    Returns:
        A ``tf.keras.Model`` mapping a batch of strings to one un-activated
        output value per example.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Text preprocessing happens inside the model, so it accepts plain strings.
    to_bert_inputs = hub.KerasLayer(BERT_PREPROCESSOR, name='preprocessing')
    bert_inputs = to_bert_inputs(raw_text)

    # trainable=True: encoder weights are updated during fine-tuning.
    bert = hub.KerasLayer(BERT_ENCODER, trainable=True, name='BERT_encoder')
    pooled = bert(bert_inputs)['pooled_output']

    regularized = tf.keras.layers.Dropout(0.1)(pooled)
    logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularized)
    return tf.keras.Model(raw_text, logits)

In [26]:
# Instantiate the model; this loads both TF-Hub layers referenced by
# build_classifier_model().
classifier_model = build_classifier_model()

2021-11-25 23:12:31.072519: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 367248384 exceeds 10% of free system memory.


## Дообучение на отзывах

In [27]:
# The classifier head has no activation, so the model outputs raw logits.
# The loss is told so via from_logits=True.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy thresholds predictions at 0.5 by default, which assumes
# probabilities. For logits the decision boundary sigmoid(x) = 0.5 is x = 0,
# so the threshold must be 0.0 for the reported accuracy to be correct.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

In [28]:
# Number of passes over the training set.
epochs = 2
# Number of batches in train_dataset, i.e. optimizer steps per epoch.
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
# Use the first 10% of all training steps for warm-up.
num_warmup_steps = int(0.1 * num_train_steps)

# Initial learning rate handed to the optimizer factory.
init_lr = 3e-5
# AdamW optimizer from the official.nlp optimization helpers.
# NOTE(review): presumably applies LR warm-up followed by decay over
# num_train_steps — confirm against the official.nlp documentation.
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [29]:
# Attach the optimizer, loss and metric objects created in the cells above.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [30]:
# Fine-tune on the training batches and evaluate on the validation batches
# after each epoch; the returned History object is used for plotting below.
history = classifier_model.fit(x=train_dataset,
                               validation_data=validate_dataset,
                               epochs=epochs)

Epoch 1/2


2021-11-25 23:12:40.454660: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 367248384 exceeds 10% of free system memory.


In [None]:
# Per-epoch training curves recorded by fit().
history_dict = history.history
acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Use dedicated names here: `loss` and `epochs` are the loss object and the
# epoch count consumed by the compile/fit cells, and overwriting them would
# break those cells when they are re-run after this one.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epoch_numbers = range(1, len(acc) + 1)
fig = plt.figure(figsize=(10, 6))

# Top panel: loss ('r' = solid red line, 'b' = solid blue line).
plt.subplot(2, 1, 1)
plt.plot(epoch_numbers, train_loss, 'r', label='Training loss')
plt.plot(epoch_numbers, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.ylabel('Loss')
plt.legend()

# Bottom panel: accuracy; carries the x-axis label for the figure.
plt.subplot(2, 1, 2)
plt.plot(epoch_numbers, acc, 'r', label='Training acc')
plt.plot(epoch_numbers, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

# Layout can only be adjusted once the subplots exist, so this goes last.
fig.tight_layout()