In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf

# One seed shared by the NumPy and TensorFlow global generators.
SEED = 17

np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# Load the posts frame (title/text hold token lists, `bad` is the label column).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
posts_path = 'df_pikabu_spam_posts.pd'
df = pd.read_pickle(posts_path)
df.head()

Unnamed: 0,title,text,bad
0,"[треб, помощник, работ, оффлайн, удаленк]","[знает, процесс, регистрац, профсоюз, https, а...",1
1,"[хоч, прода, аккаунт, pornhubpremium, реальн]","[появ, больш, количеств, аккаунт, сайт, реальн...",1
2,"[нужн, помощ, кумерта]","[здравств, декабр, дедушк, упа, сво, квартир, ...",1
3,"[щенок, хаск, ищет, хозяин, платн]","[мам, пап, хаск, подробн, телефон]",1
4,"[песик, пройд, опросик]","[привет, всем, обитател, дан, платформ, провож...",1


In [3]:
# Shuffle every row (frac=1.0 keeps the full frame); the train/val/test
# slices taken later are purely positional.
df = df.sample(frac=1.0)
df.head()

Unnamed: 0,title,text,bad
10952,"[сапожник, сапог]","[родственник, отправ, сын, лет, котор, плох, у...",0
11334,"[кредитн, карт]","[обрат, человек, помощ, дают, ипотек, слов, сл...",0
2758,"[закаля, умр]","[привет, всем, интерес, сочувств, прост, загля...",0
3952,"[нов, закон, действ]",[https],0
676,"[нужн, помощ, кемеров]","[добр, ден, знаком, обрат, просьб, помоч, пере...",1


In [6]:
# Gather every distinct token that occurs in a title or a text.
words_set = set()

for post in df.itertuples():
    words_set.update(post.title)
    words_set.update(post.text)

print(len(words_set))

59073


In [5]:
# Count how often each token occurs across all titles and texts.
# Counter keeps tokens in first-seen order, so tokens with equal counts are
# ranked the same way on every run. Seeding the dict from a set (as before)
# left tie order to string hashing, which can change between kernel sessions,
# and made this cell depend on `words_set` having been built first.
words_counter = Counter()

for row in df.itertuples():
    words_counter.update(row.title)
    words_counter.update(row.text)

# (token, count) pairs sorted by count, most frequent first.
words_list = words_counter.most_common()

In [6]:
# Number of most frequent tokens kept as features.
VOCAB_SIZE = 5000

# Rank directly from `words_counter` rather than re-slicing `words_list` in
# place: re-running the previous version applied `k[0]` to plain strings and
# silently reduced every token to its first character.
ranked_tokens = sorted(
    words_counter.items(), key=lambda item: item[1], reverse=True
)
words_list = [token for token, _count in ranked_tokens[:VOCAB_SIZE]]

# token -> column index in the encoded count vectors
words_ohe_positions = {token: idx for idx, token in enumerate(words_list)}

In [7]:
def count_encode(token_lists, positions, vocab_size):
    """Turn token lists into a matrix of per-token occurrence counts.

    Args:
        token_lists: Sequence with one iterable of tokens per row.
        positions: Mapping from token to its column index.
        vocab_size: Number of columns in the result.

    Returns:
        Integer array of shape ``(len(token_lists), vocab_size)`` where entry
        ``[i, j]`` is how many times the token mapped to column ``j`` occurs
        in row ``i``. Tokens missing from ``positions`` are ignored.
    """
    counts = np.zeros((len(token_lists), vocab_size), dtype=int)
    for row_idx, tokens in enumerate(token_lists):
        for token in tokens:
            # Explicit lookup instead of a bare `except:` so that only
            # out-of-vocabulary tokens are skipped and real errors surface.
            col = positions.get(token)
            if col is not None:
                counts[row_idx, col] += 1
    return counts


titles = count_encode(df['title'], words_ohe_positions, len(words_list))
texts = count_encode(df['text'], words_ohe_positions, len(words_list))

In [8]:
# Sanity check: display the shapes of both encoded matrices side by side.
titles.shape, texts.shape

((7443, 5000), (7443, 5000))

In [9]:
# Length of the first encoded title row (the feature width fed to the model).
len(titles[0])

5000

In [10]:
# Target vector: the `bad` column as a NumPy array, in the frame's row order.
y = np.array(df.loc[:, 'bad'])
y.shape

(7443,)

In [11]:
def train_val_test_split(x, val_frac=.15, test_frac=.15):
    """Cut a sequence into consecutive train / validation / test slices.

    Nothing is shuffled here: the leading part of ``x`` becomes the training
    slice, followed by the validation slice and finally the test slice.
    Applying the function to several sequences of equal length therefore
    produces splits whose rows line up.

    Args:
        x: Any sliceable object supporting ``len`` (list, NumPy array, ...).
        val_frac: Share of the items that goes to the validation slice.
        test_frac: Share of the items that goes to the test slice.

    Returns:
        Tuple ``(x_train, x_val, x_test)`` of slices of ``x``.

    Raises:
        ValueError: If a fraction is negative or the two together exceed 1.
            Such values used to produce negative slice bounds and silently
            overlapping train/test slices.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            'val_frac and test_frac must be non-negative and sum to at most 1'
        )

    n_items = len(x)
    # Boundaries are computed once so the three slices always tile x exactly.
    train_end = round((1 - val_frac - test_frac) * n_items)
    val_end = round((1 - test_frac) * n_items)

    return x[:train_end], x[train_end:val_end], x[val_end:]
    
# Run the same positional split over every array so rows stay aligned.
(
    (titles_train, titles_val, titles_test),
    (texts_train, texts_val, texts_test),
    (y_train, y_val, y_test),
) = [train_val_test_split(arr) for arr in (titles, texts, y)]

In [12]:
# Two-branch network: titles and texts are embedded by separate dense stacks,
# summed element-wise, and passed through a shared classification head.

# `shape` must be a tuple. The previous `(len(titles[0],))` had the comma
# inside the `len(...)` call and therefore passed a bare int.
title_input = tf.keras.layers.Input(shape=(titles.shape[1],))
text_input = tf.keras.layers.Input(shape=(texts.shape[1],))

# Title branch: one dense block.
title_dense_1 = tf.keras.layers.Dense(500, activation='relu')(title_input)
title_bn_1 = tf.keras.layers.BatchNormalization()(title_dense_1)

# Text branch: two dense blocks.
text_dense_1 = tf.keras.layers.Dense(500, activation='relu')(text_input)
text_bn_1 = tf.keras.layers.BatchNormalization()(text_dense_1)

text_dense_2 = tf.keras.layers.Dense(500, activation='relu')(text_bn_1)
text_bn_2 = tf.keras.layers.BatchNormalization()(text_dense_2)

# Both branches end with 500 units, so they can be merged by addition.
add = tf.keras.layers.Add()([title_bn_1, text_bn_2])

# Shared head.
# NOTE(review): a dropout rate of 0.8 drops most activations — confirm it is intended.
main_dense_1 = tf.keras.layers.Dense(300, activation='relu')(add)
main_bn_1 = tf.keras.layers.BatchNormalization()(main_dense_1)
main_dropout_1 = tf.keras.layers.Dropout(0.8)(main_bn_1)

main_dense_2 = tf.keras.layers.Dense(100, activation='relu')(main_dropout_1)
main_bn_2 = tf.keras.layers.BatchNormalization()(main_dense_2)
main_dropout_2 = tf.keras.layers.Dropout(0.8)(main_bn_2)

# Single sigmoid unit: one probability per post.
output = tf.keras.layers.Dense(1, activation='sigmoid')(main_dropout_2)

model = tf.keras.Model(
    inputs=[title_input, text_input], outputs=output
)

In [13]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 5000)]       0           []                               
                                                                                                  
 dense_1 (Dense)                (None, 500)          2500500     ['input_2[0][0]']                
                                                                                                  
 input_1 (InputLayer)           [(None, 5000)]       0           []                               
                                                                                                  
 batch_normalization_1 (BatchNo  (None, 500)         2000        ['dense_1[0][0]']                
 rmalization)                                                                                 

In [14]:
# Training configuration for the binary classifier: Adam with its default
# settings, binary cross-entropy loss, binary accuracy as the tracked metric.
accuracy = tf.keras.metrics.binary_accuracy
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.binary_crossentropy

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy])

In [15]:
# Create the TensorBoard log root. `exist_ok=True` keeps the cell re-runnable:
# a plain `os.mkdir` raises FileExistsError once the folder is there.
os.makedirs('logs_hw', exist_ok=True)

In [16]:
# TensorBoard logging for this run, with histogram_freq set to 1.
tb_settings = {'log_dir': 'logs_hw/first', 'histogram_freq': 1}
tb_callback = tf.keras.callbacks.TensorBoard(**tb_settings)

In [17]:
# Bundle the two-branch inputs so the fit call stays compact.
train_inputs = [titles_train, texts_train]
val_inputs = [titles_val, texts_val]

model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    batch_size=256,
    epochs=10,
    callbacks=[tb_callback],
)

Epoch 1/10


2023-04-02 23:14:18.910167: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14cee7ac0>

In [18]:
# NOTE(review): 0.9069 is a hard-coded value, presumably an accuracy copied
# from the training log — confirm its origin or compute it from the model.
round(0.9069, 1)

0.9