In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import string
from string import punctuation
from nltk.tokenize import word_tokenize
from sklearn.metrics import roc_auc_score as roc_auc
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import tensorflow.keras as keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
stop_words = set(stopwords.words('russian'))
from keras.models import model_from_json

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import zipfile
with zipfile.ZipFile('/content/drive/MyDrive/train.csv.zip', 'r') as zip_ref:
  zip_ref.extractall('content/train.csv')

In [None]:
df = pd.read_csv('content/train.csv/train.csv')

In [None]:
x_val = pd.read_csv('val.csv')

In [None]:
df['title_description'] = df.title + ' ' + df.description
train = df[['title_description', 'category', 'is_bad']]
x_val['title_description'] = x_val.title + ' ' + x_val.description
test = x_val[['is_bad', 'title_description', 'category']]

In [None]:
del df
del x_val

### Кастомный стандартизатор

In [None]:
@tf.keras.utils.register_keras_serializable(package='Custom', name='standart')
def standartizer(data):

    url = "(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    email = '[\w\.-]+@[\w\.-]+'
    mention = '@\w+'
    phone = '\d{10}|\d{3} \d{3} \d{2} \d{2}|\d{3} \d{3} \d{2}-\d{2}|\d{3} \d{3}-\d{2}-\d{2}|\d{3}-\d{3}-\d{2}-\d{2}|\d{6}|\d{2}-\d{2}-\d{2}|\d{2} \d{2} \d{2}'
    stopwords = ' | '.join(r'\b{}\b'.format(w) for w in stop_words)
    stopwords = re.compile(' | '.join(r'\b{}\b'.format(w) for w in stop_words), flags=re.I | re.X).pattern
    digits = '[0-9]+'
    word_length = r'\b\w{1,3}\b'
    
    emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags=re.UNICODE).pattern
    
    data = tf.strings.lower(data)
    data = tf.strings.regex_replace(data, url, 'URL')
    data = tf.strings.regex_replace(data, email, 'EMAIL')
    data = tf.strings.regex_replace(data, mention, 'MENTION')
    data = tf.strings.regex_replace(data, phone, 'PHONE')
    data = tf.strings.regex_replace(data, emoji_pattern, '') 
    data = tf.strings.regex_replace(data, '[%s]' % re.escape(string.punctuation), '')

    return data

Модель векторизатора

In [None]:
vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=25000, 
        standardize=standartizer, 
        output_sequence_length = 1000,
        output_mode='int', pad_to_max_tokens=True
    )
vectorizer.adapt(train.title_description.values)

In [11]:
vectorize_layer_model = tf.keras.models.Sequential()
vectorize_layer_model.add(tf.keras.Input(shape=(1, ), dtype=tf.string))
vectorize_layer_model.add(vectorizer)

In [12]:
vectorize_layer_model.save('vectorizer_model')

INFO:tensorflow:Assets written to: vectorizer_model/assets


In [13]:
vector_load = tf.keras.models.load_model('vectorizer_model')



### Архитектура самой модели

In [14]:
model = tf.keras.models.Sequential([
      
        tf.keras.layers.Embedding(input_dim=25000, output_dim=64, input_length=1000),

        tf.keras.layers.Conv1D(64, 5, activation='relu'),
        tf.keras.layers.MaxPooling1D(),

        tf.keras.layers.Conv1D(32, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(),

        tf.keras.layers.Conv1D(16, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(),

        tf.keras.layers.Flatten(),

        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(8, activation='relu'),

        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC()])

### Обучаем векторизатор и потом модель

In [15]:
tran = pad_sequences(vector_load.predict(train.title_description).numpy(), maxlen=1000, padding='post')

In [16]:
model.fit(tran, train.is_bad, batch_size=128, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fdd7deab8d0>

In [17]:
model.save('model_finally')

INFO:tensorflow:Assets written to: model_finally/assets


In [18]:
model_load = tf.keras.models.load_model('model_finally')

### Проверим качество на валидации

In [19]:
val_data = pd.read_csv('val.csv')

In [21]:
val_data['title_description'] = val_data.title + ' ' + val_data.description
val_data = val_data[['title_description', 'category', 'is_bad']]

In [25]:
aucs = []
for category in pd.unique(val_data['category']):
        df = val_data[val_data['category'] == category]
        descriptions = df['title_description'].values
        target = df['is_bad'].values

        y_vect = pad_sequences(vector_load.predict(descriptions).numpy(), maxlen=1000, padding='post')
        y_pred = model_load.predict(y_vect)
        auc = roc_auc(target, y_pred)
        print(f'{category} -- {auc}')
        aucs.append(auc)
print(np.mean(aucs))

Транспорт -- 0.987089027841892
Для бизнеса -- 0.8298463550179797
Для дома и дачи -- 0.9380185605723277
Личные вещи -- 0.8425747070345594
Услуги -- 0.910137079031985
Бытовая электроника -- 0.950951293759513
Недвижимость -- 0.9596301973060386
Хобби и отдых -- 0.8845830805842005
Работа -- 0.8872397945390647
Животные -- 0.927437641723356
0.9117507737410918


### Ниже код для сохранения моделей на гугл диск
Filename - название, которое будет на гугл диске. Folders_or_files_to_save - соответствующие папке в коллабе


In [None]:
#@title save yo data to drive
filename = "model_finally" #@param {type:"string"}
folders_or_files_to_save = "model_finally" #@param {type:"string"}
from google.colab import files
from google.colab import auth
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build

def save_file_to_drive(name, path):
    file_metadata = {
    'name': name,
    'mimeType': 'application/octet-stream'
    }

    media = MediaFileUpload(path, 
                  mimetype='application/octet-stream',
                  resumable=True)

    created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()

    print('File ID: {}'.format(created.get('id')))

    return created


extension_zip = ".zip"

zip_file = filename + extension_zip

# !rm -rf $zip_file
!zip -r $zip_file {folders_or_files_to_save} # FOLDERS TO SAVE INTO ZIP FILE

auth.authenticate_user()
drive_service = build('drive', 'v3')

destination_name = zip_file
path_to_file = zip_file
save_file_to_drive(destination_name, path_to_file)

  adding: model_finally/ (stored 0%)
  adding: model_finally/saved_model.pb (deflated 88%)
  adding: model_finally/variables/ (stored 0%)
  adding: model_finally/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: model_finally/variables/variables.index (deflated 68%)
  adding: model_finally/assets/ (stored 0%)
  adding: model_finally/keras_metadata.pb (deflated 92%)
File ID: 1d_fgMdydWLDfxTdelGGrgLVxXpFoI2pI


{'id': '1d_fgMdydWLDfxTdelGGrgLVxXpFoI2pI'}

In [None]:
#@title save yo data to drive
filename = "vectorizer_model" #@param {type:"string"}
folders_or_files_to_save = "vectorizer_model" #@param {type:"string"}
from google.colab import files
from google.colab import auth
from googleapiclient.http import MediaFileUpload
from googleapiclient.discovery import build

def save_file_to_drive(name, path):
    file_metadata = {
    'name': name,
    'mimeType': 'application/octet-stream'
    }

    media = MediaFileUpload(path, 
                  mimetype='application/octet-stream',
                  resumable=True)

    created = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()

    print('File ID: {}'.format(created.get('id')))

    return created


extension_zip = ".zip"

zip_file = filename + extension_zip

# !rm -rf $zip_file
!zip -r $zip_file {folders_or_files_to_save} # FOLDERS TO SAVE INTO ZIP FILE

auth.authenticate_user()
drive_service = build('drive', 'v3')

destination_name = zip_file
path_to_file = zip_file
save_file_to_drive(destination_name, path_to_file)

  adding: vectorizer_model/ (stored 0%)
  adding: vectorizer_model/saved_model.pb (deflated 72%)
  adding: vectorizer_model/variables/ (stored 0%)
  adding: vectorizer_model/variables/variables.data-00000-of-00001 (deflated 44%)
  adding: vectorizer_model/variables/variables.index (deflated 20%)
  adding: vectorizer_model/assets/ (stored 0%)
  adding: vectorizer_model/keras_metadata.pb (deflated 79%)
File ID: 1W7qfUJngCwKS0BfxIXXdZS-VOiMNq4XJ


{'id': '1W7qfUJngCwKS0BfxIXXdZS-VOiMNq4XJ'}