## Тащим все необходимые пакеты


In [1]:
import re
from string import digits

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub


### Скачиваем файлик токенизатора
https://github.com/google-research/bert — здесь есть готовые классификаторы и прочие примеры использования BERT с TensorFlow

In [2]:
# Download the BERT `tokenization.py` helper into the working directory so that
# the next cell can `import tokenization`.
# NOTE(review): the URL tracks the moving `master` branch and `--quiet` hides
# wget's output, so a moved or deleted file would only show up as a failing
# import in the next cell — consider pinning a tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [3]:
import tokenization

## Вспомогательные функции для токенизации и расширения модели

In [4]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row, special tokens included.

    Returns:
        A 3-tuple of ``np.ndarray``: token ids, attention masks (1 for real
        tokens, 0 for padding) and segment ids (always 0).
    """
    id_rows, mask_rows, segment_rows = [], [], []
    # Two positions are reserved for the [CLS] and [SEP] markers.
    token_budget = max_len - 2

    for raw_text in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw_text)[:token_budget], "[SEP]"]
        used = len(pieces)
        padding = max_len - used

        id_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * padding)
        mask_rows.append([1] * used + [0] * padding)
        # Single-sentence input: every position belongs to segment 0.
        segment_rows.append([0] * max_len)

    return tuple(np.array(rows) for rows in (id_rows, mask_rows, segment_rows))

In [5]:
def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT layer.

    Args:
        bert_layer: callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs; the second one is used as the per-token sequence output.
        max_len: fixed sequence length of the three integer inputs; must match
            the ``max_len`` used when encoding the texts.
        learning_rate: step size for the Adam optimizer. Defaults to the
            previously hard-coded ``2e-6``.

    Returns:
        A compiled ``Model`` with a single sigmoid output unit, trained with
        binary cross-entropy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first output of the BERT layer is discarded; only the sequence
    # output is used.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 holds the [CLS] token that bert_encode puts in front of every
    # text; its vector serves as the representation of the whole input.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', 'binary_accuracy'],
    )

    return model

## Хелпер для очистки данных

In [6]:
# Translation table that deletes every ASCII digit.
_DIGIT_STRIPPER = str.maketrans('', '', digits)


def _clean_text(text):
    """Return a normalized copy of one text string (see dataframe_cleaner)."""
    # Links: remove only up to the next whitespace. The previous pattern used a
    # greedy `.*` and threw away everything that followed a URL on its line.
    text = re.sub(r'https?://\S+', '', text)
    # Attribution phrases such as " via YouTube".
    text = re.sub(r' via .\w+', ' ', text, flags=re.I)
    # User mentions.
    text = re.sub(r'@\w+', '', text)
    # Stand-alone single letters surrounded by whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Single letter at the very start of the text. The anchor must be unescaped:
    # `\^` matched a literal caret character, so this rule never fired.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    # Every run of non-word characters (punctuation, '#', whitespace) -> space.
    text = re.sub(r'\W+', ' ', text)
    text = text.lower().translate(_DIGIT_STRIPPER)
    # Collapse whitespace last so that the gaps left by removed digits, URLs
    # and mentions do not survive as double, leading or trailing spaces.
    return ' '.join(text.split())


def dataframe_cleaner(df):
    """Normalize the ``text`` column of ``df`` in place.

    Removes URLs, " via X" phrases, @mentions, stand-alone single letters,
    non-word characters and ASCII digits, lower-cases the result and collapses
    whitespace.

    Args:
        df: DataFrame with a ``text`` column of strings. Missing values
            (None/NaN) are left untouched.

    Returns:
        None. The ``text`` column of ``df`` is overwritten.
    """
    # Series.map works row by row regardless of index labels, unlike the
    # previous iterrows + df.at write-back, which relied on unique labels.
    df['text'] = df['text'].map(_clean_text, na_action='ignore')

## Скачиваем предобученную модельку

In [7]:
%%time
# TF-Hub handle of the pretrained BERT module to load.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# Wrap the module as a Keras layer; trainable=True is passed so that the BERT
# weights can presumably be updated during fit() rather than staying frozen.
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 1min 25s, sys: 9.36 s, total: 1min 35s
Wall time: 1min 37s


## Загружаем данные из csv и чистим

In [8]:
# Load the train and test CSV files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [9]:
# dataframe_cleaner overwrites the `text` column of the frame it receives and
# returns nothing, so `train` and `test` are modified in place here.
dataframe_cleaner(train)
dataframe_cleaner(test)

## Извлекаем из предобученной модели файл словарика и флаг регистрозависимости модели

In [10]:
# Read the vocabulary file path and the lower-casing flag from the loaded hub
# module, then build a tokenizer from them.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

## Перегоняем входные данные в понятные BERT значения

In [11]:
# Each *_input is the (token ids, masks, segment ids) tuple returned by
# bert_encode. max_len=160 has to match the value passed to build_model below.
train_input = bert_encode(train.text.values, tokenizer, max_len=160)
test_input = bert_encode(test.text.values, tokenizer, max_len=160)
# Target labels taken from the `target` column of the training frame.
train_labels = train.target.values

In [12]:
# Build the classifier with the same sequence length (160) used for encoding,
# then print its layer summary.
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [13]:
# Fine-tune for 5 epochs in batches of 16. validation_split=0.2 holds out part
# of the training rows for validation (1523 of 7613 in the recorded run).
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=5,
    batch_size=16
)

# Persist the model as it is after the last epoch.
# NOTE(review): ModelCheckpoint is imported at the top of the notebook but not
# used here, so no "best epoch" weights are kept — confirm this is intended.
model.save('model.h5')

Train on 6090 samples, validate on 1523 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# Run the trained model on the encoded test set; the model ends in a single
# sigmoid unit, so these are scores rather than hard 0/1 labels.
test_pred = model.predict(test_input)

In [15]:
# Start from the sample submission file and overwrite its `target` column.
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
# Round the sigmoid scores to the nearest integer to obtain 0/1 labels.
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission3.csv', index=False)
print('Result is ready')

Result is ready


In [16]:
# Show the first rows of the submission as a quick sanity check.
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
