In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the labelled documents and report how many rows the raw file holds.
df = pd.read_csv("train_input_data_pl.csv")
print(df.shape[0])

# Discard incomplete rows, then record a per-document whitespace token count.
df = df.dropna()
df["words_count"] = df["text"].map(lambda document: len(document.split()))

# A single feature column and a single target column are used.
# Multi-column variant, kept for reference:
#   x_columns = ["text", "words_count"]
#   y_columns = ["label", "label_id"]
x_columns = "text"
y_columns = "label_id"
x = df[x_columns]
y = df[y_columns]

# Stratified split on the target; test_size is left at scikit-learn's default.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=4,
)

# Re-attach each split's labels to its texts, column-wise.
training = pd.concat([x_train, y_train], axis=1)
testing = pd.concat([x_test, y_test], axis=1)

1835


In [2]:
# Display the training split: the "text" column joined with its "label_id" column.
training

Unnamed: 0,text,label_id
1062,colowierzchy kingdoj fennig goluchow krolewska...,18
337,drukowanyn literami czarnym lub nieb eskim kol...,10
653,iurkiv sefil krasvikoea bigalrillmloti powiato...,11
284,etn alal dhuowal utlaam kolorll pitaxpl poua m...,10
1171,umowa dzielo kielbasiu zawarta dniu abdulvosit...,19
...,...,...
1774,umowa sprzedazy samochodu lad zwar dniu garbcz...,20
595,sschab biaioglotskz pawlowsk crabarka kowalska...,11
1556,emowa sprzedaży samochodu zawara dniu dzwinia€...,20
1412,umowa dzfio kukawka zawarta dniu zaboronek arp...,19


In [3]:
# Display the test split: the "text" column joined with its "label_id" column.
testing

Unnamed: 0,text,label_id
1570,uiuwa sprzedaży w dniu wark aiied ali pomiędzy...,20
895,sansaneesjegireva iax inaeko konunruta mrjscou...,18
635,senchank kozlotsk afysock sodvom h powiatowy i...,11
947,procopie ishtriakov bereznica wyzua zytnia ada...,18
977,moczyly miatwej sebeszuk bauska nizna gornosla...,18
...,...,...
1733,umowa sprzedaży samochodu pomiędzy zawarta w d...,20
731,van lan lai lami szczejkowice j kruteczek fran...,18
1797,umowa sprzfdazy samochodu kocicrz pomiedzy zaw...,20
1364,vmowa odzieło dniu pomiędzy qhochu darki swicm...,19


# Tokenization

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Number of output classes; also the width of the one-hot label matrix.
LABEL_N = 21

# Plain Python list of the training texts, used below to fit the tokenizer.
training_sentences = x_train.tolist()

# First approach
# LEARNING_RATE = 0.01
# VOCAB_SIZE = 10_000
# EMBEDDING_DIM = 16
# DENSE_LAYER_COUNT = 128
# max_length = 600
# trunc_type='post'
# padding_type='post'

def pad(sequences):
    """Pad or truncate token sequences to a common length.

    The target length and the padding/truncation sides come from the
    notebook-level ``max_length``, ``padding_type`` and ``trunc_type``
    settings, which are looked up each time the function is called.

    Returns whatever ``pad_sequences`` produces for those settings.
    """
    settings = {
        "maxlen": max_length,
        "padding": padding_type,
        "truncating": trunc_type,
    }
    return pad_sequences(sequences, **settings)

# Optimized hyper-parameters (the values actually used by this notebook).
LEARNING_RATE = 0.005
VOCAB_SIZE = 2_000
EMBEDDING_DIM = 128  # previously 16
DENSE_LAYER_COUNT = 128
max_length = 400
# Padding and truncation are both applied at the end of a sequence.
trunc_type = "post"
padding_type = "post"

# Build the vocabulary from the training texts only. Words that fall outside
# the num_words limit are represented by the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(training_sentences)
# print(tokenizer.word_index)

In [6]:
# Encode the training texts as lists of vocabulary indices, then bring every
# list to the same length with pad().
training_sequences = tokenizer.texts_to_sequences(texts=training_sentences)
training_padded = pad(sequences=training_sequences)

In [7]:
# Encode the test texts with the tokenizer that was fitted on the training
# texts (no re-fitting here), then pad them with the same settings.
testing_sentences = x_test.tolist()
testing_sequences = tokenizer.texts_to_sequences(texts=testing_sentences)
testing_padded = pad(sequences=testing_sequences)

In [8]:
import numpy as np
def create_all_categories_columns(input_array: np.ndarray, num_classes=None) -> np.ndarray:
    """One-hot encode an array of integer class ids.

    Args:
        input_array: class ids, one per sample (array or list). Every id must
            satisfy ``0 <= id < num_classes``.
        num_classes: number of columns in the result. When omitted, the
            notebook-level ``LABEL_N`` is used.

    Returns:
        Integer array of shape ``(len(input_array), num_classes)`` holding a
        single 1 per row, in the column given by that row's class id.

    Raises:
        IndexError: if any id lies outside ``[0, num_classes)``.
    """
    if num_classes is None:
        # Fall back to the notebook-wide class count.
        num_classes = LABEL_N

    class_ids = np.asarray(input_array)
    n_samples = class_ids.shape[0]

    # NumPy indexing would silently wrap negative ids around to the last
    # columns, so reject anything outside [0, num_classes) explicitly.
    if class_ids.size and (class_ids.min() < 0 or class_ids.max() >= num_classes):
        raise IndexError(
            f"class ids must be in [0, {num_classes}); "
            f"got values in [{class_ids.min()}, {class_ids.max()}]"
        )

    result = np.zeros(shape=(n_samples, num_classes), dtype=int)
    # Fancy indexing: for row i, set column class_ids[i] to 1.
    result[np.arange(n_samples), class_ids] = 1
    return result

In [9]:
# Features: make sure both padded matrices are NumPy arrays.
training_padded = np.array(training_padded)
testing_padded = np.array(testing_padded)

# Targets: one-hot encode the label ids of each split.
training_labels = create_all_categories_columns(y_train.to_numpy())
testing_labels = create_all_categories_columns(y_test.to_numpy())

In [10]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential

In [11]:
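# Earlier LSTM experiment, kept for reference only (disabled).
# NOTE: before re-enabling, change Dense(20) to Dense(LABEL_N) -- the one-hot labels
# have LABEL_N = 21 columns -- and store the fit() result under a name other than
# `training`, which already holds the training DataFrame.
# model = Sequential()
# model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=training_padded.shape[1]))
# model.add(SpatialDropout1D(0.7))
# model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
# model.add(Dense(20, activation='softmax'))
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# print(model.summary())
# training = model.fit(training_padded, training_labels, epochs=1_00, batch_size=256, validation_split=0.2)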

In [12]:
# Fix TensorFlow's global seed so that weight initialisation (and other
# seed-dependent steps) repeat across "Restart & Run All". The value 4 mirrors
# the random_state passed to train_test_split.
# NOTE(review): some GPU kernels can still be non-deterministic.
tf.random.set_seed(4)

# Bag-of-embeddings classifier:
#   1. Embedding: token id -> EMBEDDING_DIM-dimensional vector.
#   2. GlobalAveragePooling1D: average the vectors over the sequence axis.
#   3. Dense + ReLU hidden layer.
#   4. Dense + softmax over the LABEL_N classes.
# NOTE(review): mask_zero is not set on the Embedding layer, so padding
# positions appear to be included in the average -- confirm this is intended.
model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(DENSE_LAYER_COUNT, activation="relu"),
        tf.keras.layers.Dense(LABEL_N, activation='softmax'),
])

# categorical_crossentropy matches the one-hot label matrices built earlier.
adam = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         256000    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 21)                2709      
                                                                 
Total params: 275,221
Trainable params: 275,221
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train for a fixed number of epochs. The test split doubles as the validation
# set, so the val_* metrics are computed on it after every epoch.
num_epochs = 10
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,  # one summary line per epoch
)

Epoch 1/10
43/43 - 1s - loss: 1.3194 - accuracy: 0.6221 - val_loss: 0.4352 - val_accuracy: 0.9477 - 1s/epoch - 34ms/step
Epoch 2/10
43/43 - 0s - loss: 0.1406 - accuracy: 0.9862 - val_loss: 0.0317 - val_accuracy: 1.0000 - 358ms/epoch - 8ms/step
Epoch 3/10
43/43 - 0s - loss: 0.0169 - accuracy: 0.9993 - val_loss: 0.0146 - val_accuracy: 1.0000 - 365ms/epoch - 8ms/step
Epoch 4/10
43/43 - 0s - loss: 0.0075 - accuracy: 1.0000 - val_loss: 0.0075 - val_accuracy: 1.0000 - 379ms/epoch - 9ms/step
Epoch 5/10
43/43 - 0s - loss: 0.0040 - accuracy: 1.0000 - val_loss: 0.0057 - val_accuracy: 1.0000 - 355ms/epoch - 8ms/step
Epoch 6/10
43/43 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.0036 - val_accuracy: 1.0000 - 382ms/epoch - 9ms/step
Epoch 7/10
43/43 - 0s - loss: 0.0021 - accuracy: 1.0000 - val_loss: 0.0037 - val_accuracy: 1.0000 - 372ms/epoch - 9ms/step
Epoch 8/10
43/43 - 0s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.0028 - val_accuracy: 1.0000 - 355ms/epoch - 8ms/step
Epoch 9/10
43/43 -

In [14]:
def get_model_prediction(text):
    """Return the index of the highest-scoring class for a single text.

    The text is tokenised with the notebook-level ``tokenizer``, padded with
    ``pad`` and passed through ``model``.

    Args:
        text: the raw document string to classify.

    Returns:
        int: position of the largest value in the model's output row (the
        first such position if several values are equal).
    """
    # The model works on batches, so wrap the single text in a one-item list.
    batch = np.array(pad(tokenizer.texts_to_sequences([text])))
    scores = model.predict(batch)[0]
    # np.argmax replaces the list()/max()/index() round trip; int() keeps the
    # return value a plain Python int.
    return int(np.argmax(scores))

# Attach the prediction helper to the model object as a plain attribute.
# NOTE(review): presumably so the helper travels with the saved model -- confirm
# that the attribute actually survives the pickle round-trip.
model.model_reader = get_model_prediction
# Bare expression: shows the attached function's repr as the cell output.
model.model_reader

In [16]:
import pickle
# Write inside a context manager so the file handle is flushed and closed
# deterministically, even if pickling raises.
# NOTE(review): get_model_prediction relies on the notebook-level `tokenizer`
# and `pad`, which are not written to this file -- confirm how consumers of
# model_pl.pkl obtain them.
with open('model_pl.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)